mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-07 14:32:23 +00:00
md updates for 3.9
mostly little bugfixes. Only "feature" is a new RAID10 layout which slightly improves the number of sets of devices that can concurrently fail, without data loss. -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.19 (GNU/Linux) iQIVAwUAUTPm+znsnt1WYoG5AQLLsw/+PMqr8roC4twgxTWV1NRbU8NtOcRi9Rj9 uvBS63uYAaLdi/D3UBKFYczmNCu9knuXbcp9SgFDxH7LlthQsWN/GYnif06pPo3w 9Agu5M8c062TJEG1vrnX6FhPO6pNgrWFr3h+CKkTiD3179i9DoQpP8LXQToeyMtI YRMQf/zCkxYtDvWAP0iwsEWtw8cf+q9I/uGPhQ1L+DnZapXYdbtnqWBRz9q6mrDt orcGrP41aZHvnOHUaTbwmaorCKkf/Ys4SMaGenrSFpnpQMypt7VgNuwHC59LxvJT 5eiFG/26zIsv7Wk0jv/TvFP5qzUPo0/PFkd5ug0ArvbVRiXS2cMJDwQvMdO1toxD i5Bb+P9DptadvoWhOTgIpxnG77yRH45wJvyJOk+ZfS1/IO87nCRa3d0yiNOU5e2/ o0VdXPZRr72sdKKTK6kQuYfwCPb+Z2Pz6Q8BJdk6GxlmTXyP6sKhIgwUX86534fE LrOxfK8qV+GetVu3X02RoX2CyJJRQHXyXmbHuSzXuo/JiOYtDigAydwNZChvf+tf OoMY9K8vgNbhnGsUG6la7XPvZ+6dZMjdnxp2HB99Ml5A3PWZd75i5T6IHHxIQFbD C3z9PWTWP+hK4k15DEyjlELtsE9WduGTXG4kUcf328xJ/7lj4VIImVugdCz+1B6z +HlI6BiLwzY= =YdVD -----END PGP SIGNATURE----- Merge tag 'md-3.9' of git://neil.brown.name/md Pull md updates from NeilBrown: "Mostly little bugfixes. Only "feature" is a new RAID10 layout which slightly improves the number of sets of devices that can concurrently fail, without data loss." * tag 'md-3.9' of git://neil.brown.name/md: md: expedite metadata update when switching read-auto -> active md: remove CONFIG_MULTICORE_RAID456 md/raid1,raid10: fix deadlock with freeze_array() md/raid0: improve error message when converting RAID4-with-spares to RAID0 md: raid0: fix error return from create_stripe_zones. md: fix two bugs when attempting to resize RAID0 array. DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 2) MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1) MD RAID10: Minor non-functional code changes md: raid1,10: Handle REQ_WRITE_SAME flag in write bios md: protect against crash upon fsync on ro array
This commit is contained in:
commit
a5e0d73163
@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters:
|
||||
raid10 Various RAID10 inspired algorithms chosen by additional params
|
||||
- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
|
||||
- RAID1E: Integrated Adjacent Stripe Mirroring
|
||||
- RAID1E: Integrated Offset Stripe Mirroring
|
||||
- and other similar RAID10 variants
|
||||
|
||||
Reference: Chapter 4 of
|
||||
@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters:
|
||||
synchronisation state for each region.
|
||||
|
||||
[raid10_copies <# copies>]
|
||||
[raid10_format near]
|
||||
[raid10_format <near|far|offset>]
|
||||
These two options are used to alter the default layout of
|
||||
a RAID10 configuration. The number of copies can be
|
||||
specified, but the default is 2. There are other variations
|
||||
to how the copies are laid down - the default and only current
|
||||
option is "near". Near copies are what most people think of
|
||||
with respect to mirroring. If these options are left
|
||||
unspecified, or 'raid10_copies 2' and/or 'raid10_format near'
|
||||
are given, then the layouts for 2, 3 and 4 devices are:
|
||||
specified, but the default is 2. There are also three
|
||||
variations to how the copies are laid down - the default
|
||||
is "near". Near copies are what most people think of with
|
||||
respect to mirroring. If these options are left unspecified,
|
||||
or 'raid10_copies 2' and/or 'raid10_format near' are given,
|
||||
then the layouts for 2, 3 and 4 devices are:
|
||||
2 drives 3 drives 4 drives
|
||||
-------- ---------- --------------
|
||||
A1 A1 A1 A1 A2 A1 A1 A2 A2
|
||||
@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters:
|
||||
3-device layout is what might be called a 'RAID1E - Integrated
|
||||
Adjacent Stripe Mirroring'.
|
||||
|
||||
If 'raid10_copies 2' and 'raid10_format far', then the layouts
|
||||
for 2, 3 and 4 devices are:
|
||||
2 drives 3 drives 4 drives
|
||||
-------- -------------- --------------------
|
||||
A1 A2 A1 A2 A3 A1 A2 A3 A4
|
||||
A3 A4 A4 A5 A6 A5 A6 A7 A8
|
||||
A5 A6 A7 A8 A9 A9 A10 A11 A12
|
||||
.. .. .. .. .. .. .. .. ..
|
||||
A2 A1 A3 A1 A2 A2 A1 A4 A3
|
||||
A4 A3 A6 A4 A5 A6 A5 A8 A7
|
||||
A6 A5 A9 A7 A8 A10 A9 A12 A11
|
||||
.. .. .. .. .. .. .. .. ..
|
||||
|
||||
If 'raid10_copies 2' and 'raid10_format offset', then the
|
||||
layouts for 2, 3 and 4 devices are:
|
||||
2 drives 3 drives 4 drives
|
||||
-------- ------------ -----------------
|
||||
A1 A2 A1 A2 A3 A1 A2 A3 A4
|
||||
A2 A1 A3 A1 A2 A2 A1 A4 A3
|
||||
A3 A4 A4 A5 A6 A5 A6 A7 A8
|
||||
A4 A3 A6 A4 A5 A6 A5 A8 A7
|
||||
A5 A6 A7 A8 A9 A9 A10 A11 A12
|
||||
A6 A5 A9 A7 A8 A10 A9 A12 A11
|
||||
.. .. .. .. .. .. .. .. ..
|
||||
Here we see layouts closely akin to 'RAID1E - Integrated
|
||||
Offset Stripe Mirroring'.
|
||||
|
||||
<#raid_devs>: The number of devices composing the array.
|
||||
Each device consists of two entries. The first is the device
|
||||
containing the metadata (if any); the second is the one containing the
|
||||
@ -142,3 +170,5 @@ Version History
|
||||
1.3.0 Added support for RAID 10
|
||||
1.3.1 Allow device replacement/rebuild for RAID 10
|
||||
1.3.2 Fix/improve redundancy checking for RAID10
|
||||
1.4.0 Non-functional change. Removes arg from mapping function.
|
||||
1.4.1 Add RAID10 "far" and "offset" algorithm support.
|
||||
|
@ -154,17 +154,6 @@ config MD_RAID456
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MULTICORE_RAID456
|
||||
bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
|
||||
depends on MD_RAID456
|
||||
depends on SMP
|
||||
depends on EXPERIMENTAL
|
||||
---help---
|
||||
Enable the raid456 module to dispatch per-stripe raid operations to a
|
||||
thread pool.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config MD_MULTIPATH
|
||||
tristate "Multipath I/O support"
|
||||
depends on BLK_DEV_MD
|
||||
|
@ -91,15 +91,44 @@ static struct raid_type {
|
||||
{"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
|
||||
};
|
||||
|
||||
static char *raid10_md_layout_to_format(int layout)
|
||||
{
|
||||
/*
|
||||
* Bit 16 and 17 stand for "offset" and "use_far_sets"
|
||||
* Refer to MD's raid10.c for details
|
||||
*/
|
||||
if ((layout & 0x10000) && (layout & 0x20000))
|
||||
return "offset";
|
||||
|
||||
if ((layout & 0xFF) > 1)
|
||||
return "near";
|
||||
|
||||
return "far";
|
||||
}
|
||||
|
||||
static unsigned raid10_md_layout_to_copies(int layout)
|
||||
{
|
||||
return layout & 0xFF;
|
||||
if ((layout & 0xFF) > 1)
|
||||
return layout & 0xFF;
|
||||
return (layout >> 8) & 0xFF;
|
||||
}
|
||||
|
||||
static int raid10_format_to_md_layout(char *format, unsigned copies)
|
||||
{
|
||||
/* 1 "far" copy, and 'copies' "near" copies */
|
||||
return (1 << 8) | (copies & 0xFF);
|
||||
unsigned n = 1, f = 1;
|
||||
|
||||
if (!strcmp("near", format))
|
||||
n = copies;
|
||||
else
|
||||
f = copies;
|
||||
|
||||
if (!strcmp("offset", format))
|
||||
return 0x30000 | (f << 8) | n;
|
||||
|
||||
if (!strcmp("far", format))
|
||||
return 0x20000 | (f << 8) | n;
|
||||
|
||||
return (f << 8) | n;
|
||||
}
|
||||
|
||||
static struct raid_type *get_raid_type(char *name)
|
||||
@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
|
||||
{
|
||||
unsigned i, rebuild_cnt = 0;
|
||||
unsigned rebuilds_per_group, copies, d;
|
||||
unsigned group_size, last_group_start;
|
||||
|
||||
for (i = 0; i < rs->md.raid_disks; i++)
|
||||
if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
|
||||
@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
|
||||
* as long as the failed devices occur in different mirror
|
||||
* groups (i.e. different stripes).
|
||||
*
|
||||
* Right now, we only allow for "near" copies. When other
|
||||
* formats are added, we will have to check those too.
|
||||
*
|
||||
* When checking "near" format, make sure no adjacent devices
|
||||
* have failed beyond what can be handled. In addition to the
|
||||
* simple case where the number of devices is a multiple of the
|
||||
@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs)
|
||||
* A A B B C
|
||||
* C D D E E
|
||||
*/
|
||||
for (i = 0; i < rs->md.raid_disks * copies; i++) {
|
||||
if (!(i % copies))
|
||||
if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
|
||||
for (i = 0; i < rs->md.raid_disks * copies; i++) {
|
||||
if (!(i % copies))
|
||||
rebuilds_per_group = 0;
|
||||
d = i % rs->md.raid_disks;
|
||||
if ((!rs->dev[d].rdev.sb_page ||
|
||||
!test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
|
||||
(++rebuilds_per_group >= copies))
|
||||
goto too_many;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* When checking "far" and "offset" formats, we need to ensure
|
||||
* that the device that holds its copy is not also dead or
|
||||
* being rebuilt. (Note that "far" and "offset" formats only
|
||||
* support two copies right now. These formats also only ever
|
||||
* use the 'use_far_sets' variant.)
|
||||
*
|
||||
* This check is somewhat complicated by the need to account
|
||||
* for arrays that are not a multiple of (far) copies. This
|
||||
* results in the need to treat the last (potentially larger)
|
||||
* set differently.
|
||||
*/
|
||||
group_size = (rs->md.raid_disks / copies);
|
||||
last_group_start = (rs->md.raid_disks / group_size) - 1;
|
||||
last_group_start *= group_size;
|
||||
for (i = 0; i < rs->md.raid_disks; i++) {
|
||||
if (!(i % copies) && !(i > last_group_start))
|
||||
rebuilds_per_group = 0;
|
||||
d = i % rs->md.raid_disks;
|
||||
if ((!rs->dev[d].rdev.sb_page ||
|
||||
!test_bit(In_sync, &rs->dev[d].rdev.flags)) &&
|
||||
if ((!rs->dev[i].rdev.sb_page ||
|
||||
!test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
|
||||
(++rebuilds_per_group >= copies))
|
||||
goto too_many;
|
||||
goto too_many;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@ -433,7 +487,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
|
||||
*
|
||||
* RAID10-only options:
|
||||
* [raid10_copies <# copies>] Number of copies. (Default: 2)
|
||||
* [raid10_format <near>] Layout algorithm. (Default: near)
|
||||
* [raid10_format <near|far|offset>] Layout algorithm. (Default: near)
|
||||
*/
|
||||
static int parse_raid_params(struct raid_set *rs, char **argv,
|
||||
unsigned num_raid_params)
|
||||
@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
|
||||
rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
|
||||
return -EINVAL;
|
||||
}
|
||||
if (strcmp("near", argv[i])) {
|
||||
if (strcmp("near", argv[i]) &&
|
||||
strcmp("far", argv[i]) &&
|
||||
strcmp("offset", argv[i])) {
|
||||
rs->ti->error = "Invalid 'raid10_format' value given";
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the format is not "near", we only support
|
||||
* two copies at the moment.
|
||||
*/
|
||||
if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
|
||||
rs->ti->error = "Too many copies for given RAID10 format.";
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* (Len * #mirrors) / #devices */
|
||||
sectors_per_dev = rs->ti->len * raid10_copies;
|
||||
sector_div(sectors_per_dev, rs->md.raid_disks);
|
||||
@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
|
||||
/*
|
||||
* Reshaping is not currently allowed
|
||||
*/
|
||||
if ((le32_to_cpu(sb->level) != mddev->level) ||
|
||||
(le32_to_cpu(sb->layout) != mddev->layout) ||
|
||||
(le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
|
||||
DMERR("Reshaping arrays not yet supported.");
|
||||
if (le32_to_cpu(sb->level) != mddev->level) {
|
||||
DMERR("Reshaping arrays not yet supported. (RAID level change)");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (le32_to_cpu(sb->layout) != mddev->layout) {
|
||||
DMERR("Reshaping arrays not yet supported. (RAID layout change)");
|
||||
DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
|
||||
DMERR(" Old layout: %s w/ %d copies",
|
||||
raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
|
||||
raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
|
||||
DMERR(" New layout: %s w/ %d copies",
|
||||
raid10_md_layout_to_format(mddev->layout),
|
||||
raid10_md_layout_to_copies(mddev->layout));
|
||||
return -EINVAL;
|
||||
}
|
||||
if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
|
||||
DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* We can only change the number of devices in RAID1 right now */
|
||||
if ((rs->raid_type->level != 1) &&
|
||||
(le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
|
||||
DMERR("Reshaping arrays not yet supported.");
|
||||
DMERR("Reshaping arrays not yet supported. (device count change)");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
|
||||
raid10_md_layout_to_copies(rs->md.layout));
|
||||
|
||||
if (rs->print_flags & DMPF_RAID10_FORMAT)
|
||||
DMEMIT(" raid10_format near");
|
||||
DMEMIT(" raid10_format %s",
|
||||
raid10_md_layout_to_format(rs->md.layout));
|
||||
|
||||
DMEMIT(" %d", rs->md.raid_disks);
|
||||
for (i = 0; i < rs->md.raid_disks; i++) {
|
||||
@ -1418,6 +1497,10 @@ static struct target_type raid_target = {
|
||||
|
||||
static int __init dm_raid_init(void)
|
||||
{
|
||||
DMINFO("Loading target version %u.%u.%u",
|
||||
raid_target.version[0],
|
||||
raid_target.version[1],
|
||||
raid_target.version[2]);
|
||||
return dm_register_target(&raid_target);
|
||||
}
|
||||
|
||||
|
@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
|
||||
bio_io_error(bio);
|
||||
return;
|
||||
}
|
||||
if (mddev->ro == 1 && unlikely(rw == WRITE)) {
|
||||
bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
|
||||
return;
|
||||
}
|
||||
smp_rmb(); /* Ensure implications of 'active' are visible */
|
||||
rcu_read_lock();
|
||||
if (mddev->suspended) {
|
||||
@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
|
||||
} else if (!sectors)
|
||||
sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
|
||||
rdev->data_offset;
|
||||
if (!my_mddev->pers->resize)
|
||||
/* Cannot change size for RAID0 or Linear etc */
|
||||
return -EINVAL;
|
||||
}
|
||||
if (sectors < my_mddev->dev_sectors)
|
||||
return -EINVAL; /* component must fit device */
|
||||
@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
mddev->ro = 0;
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_state);
|
||||
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
/* mddev_unlock will wake thread */
|
||||
/* If a device failed while we were read-only, we
|
||||
* need to make sure the metadata is updated now.
|
||||
*/
|
||||
if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
|
||||
mddev_unlock(mddev);
|
||||
wait_event(mddev->sb_wait,
|
||||
!test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
|
||||
!test_bit(MD_CHANGE_PENDING, &mddev->flags));
|
||||
mddev_lock(mddev);
|
||||
}
|
||||
} else {
|
||||
err = -EROFS;
|
||||
goto abort_unlock;
|
||||
|
@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
|
||||
rdev1->new_raid_disk = j;
|
||||
}
|
||||
|
||||
if (j < 0 || j >= mddev->raid_disks) {
|
||||
if (j < 0) {
|
||||
printk(KERN_ERR
|
||||
"md/raid0:%s: remove inactive devices before converting to RAID0\n",
|
||||
mdname(mddev));
|
||||
goto abort;
|
||||
}
|
||||
if (j >= mddev->raid_disks) {
|
||||
printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
|
||||
"aborting!\n", mdname(mddev), j);
|
||||
goto abort;
|
||||
@ -289,7 +295,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
|
||||
kfree(conf->strip_zone);
|
||||
kfree(conf->devlist);
|
||||
kfree(conf);
|
||||
*private_conf = NULL;
|
||||
*private_conf = ERR_PTR(err);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
|
||||
"%s does not support generic reshape\n", __func__);
|
||||
|
||||
rdev_for_each(rdev, mddev)
|
||||
array_sectors += rdev->sectors;
|
||||
array_sectors += (rdev->sectors &
|
||||
~(sector_t)(mddev->chunk_sectors-1));
|
||||
|
||||
return array_sectors;
|
||||
}
|
||||
|
@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
|
||||
bio_list_merge(&conf->pending_bio_list, &plug->pending);
|
||||
conf->pending_count += plug->pending_cnt;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
wake_up(&conf->wait_barrier);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
kfree(plug);
|
||||
return;
|
||||
@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
|
||||
const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
|
||||
const unsigned long do_discard = (bio->bi_rw
|
||||
& (REQ_DISCARD | REQ_SECURE));
|
||||
const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
|
||||
struct md_rdev *blocked_rdev;
|
||||
struct blk_plug_cb *cb;
|
||||
struct raid1_plug_cb *plug = NULL;
|
||||
@ -1301,7 +1303,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
|
||||
conf->mirrors[i].rdev->data_offset);
|
||||
mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
|
||||
mbio->bi_end_io = raid1_end_write_request;
|
||||
mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard;
|
||||
mbio->bi_rw =
|
||||
WRITE | do_flush_fua | do_sync | do_discard | do_same;
|
||||
mbio->bi_private = r1_bio;
|
||||
|
||||
atomic_inc(&r1_bio->remaining);
|
||||
@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev)
|
||||
if (IS_ERR(conf))
|
||||
return PTR_ERR(conf);
|
||||
|
||||
if (mddev->queue)
|
||||
blk_queue_max_write_same_sectors(mddev->queue,
|
||||
mddev->chunk_sectors);
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if (!mddev->gendisk)
|
||||
continue;
|
||||
|
@ -38,21 +38,36 @@
|
||||
* near_copies (stored in low byte of layout)
|
||||
* far_copies (stored in second byte of layout)
|
||||
* far_offset (stored in bit 16 of layout )
|
||||
* use_far_sets (stored in bit 17 of layout )
|
||||
*
|
||||
* The data to be stored is divided into chunks using chunksize.
|
||||
* Each device is divided into far_copies sections.
|
||||
* In each section, chunks are laid out in a style similar to raid0, but
|
||||
* near_copies copies of each chunk is stored (each on a different drive).
|
||||
* The starting device for each section is offset near_copies from the starting
|
||||
* device of the previous section.
|
||||
* Thus they are (near_copies*far_copies) of each chunk, and each is on a different
|
||||
* drive.
|
||||
* near_copies and far_copies must be at least one, and their product is at most
|
||||
* raid_disks.
|
||||
* The data to be stored is divided into chunks using chunksize. Each device
|
||||
* is divided into far_copies sections. In each section, chunks are laid out
|
||||
* in a style similar to raid0, but near_copies copies of each chunk are stored
|
||||
* (each on a different drive). The starting device for each section is offset
|
||||
* near_copies from the starting device of the previous section. Thus there
|
||||
* are (near_copies * far_copies) of each chunk, and each is on a different
|
||||
* drive. near_copies and far_copies must be at least one, and their product
|
||||
* is at most raid_disks.
|
||||
*
|
||||
* If far_offset is true, then the far_copies are handled a bit differently.
|
||||
* The copies are still in different stripes, but instead of be very far apart
|
||||
* on disk, there are adjacent stripes.
|
||||
* The copies are still in different stripes, but instead of being very far
|
||||
* apart on disk, they are in adjacent stripes.
|
||||
*
|
||||
* The far and offset algorithms are handled slightly differently if
|
||||
* 'use_far_sets' is true. In this case, the array's devices are grouped into
|
||||
* sets that are (near_copies * far_copies) in size. The far copied stripes
|
||||
* are still shifted by 'near_copies' devices, but this shifting stays confined
|
||||
* to the set rather than the entire array. This is done to improve the number
|
||||
* of device combinations that can fail without causing the array to fail.
|
||||
* Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
|
||||
* on a device):
|
||||
* A B C D A B C D E
|
||||
* ... ...
|
||||
* D A B C E A B C D
|
||||
* Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
|
||||
* [A B] [C D] [A B] [C D E]
|
||||
* |...| |...| |...| | ... |
|
||||
* [B A] [D C] [B A] [E C D]
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
|
||||
sector_t stripe;
|
||||
int dev;
|
||||
int slot = 0;
|
||||
int last_far_set_start, last_far_set_size;
|
||||
|
||||
last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
|
||||
last_far_set_start *= geo->far_set_size;
|
||||
|
||||
last_far_set_size = geo->far_set_size;
|
||||
last_far_set_size += (geo->raid_disks % geo->far_set_size);
|
||||
|
||||
/* now calculate first sector/dev */
|
||||
chunk = r10bio->sector >> geo->chunk_shift;
|
||||
@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
|
||||
/* and calculate all the others */
|
||||
for (n = 0; n < geo->near_copies; n++) {
|
||||
int d = dev;
|
||||
int set;
|
||||
sector_t s = sector;
|
||||
r10bio->devs[slot].addr = sector;
|
||||
r10bio->devs[slot].devnum = d;
|
||||
r10bio->devs[slot].addr = s;
|
||||
slot++;
|
||||
|
||||
for (f = 1; f < geo->far_copies; f++) {
|
||||
set = d / geo->far_set_size;
|
||||
d += geo->near_copies;
|
||||
if (d >= geo->raid_disks)
|
||||
d -= geo->raid_disks;
|
||||
|
||||
if ((geo->raid_disks % geo->far_set_size) &&
|
||||
(d > last_far_set_start)) {
|
||||
d -= last_far_set_start;
|
||||
d %= last_far_set_size;
|
||||
d += last_far_set_start;
|
||||
} else {
|
||||
d %= geo->far_set_size;
|
||||
d += geo->far_set_size * set;
|
||||
}
|
||||
s += geo->stride;
|
||||
r10bio->devs[slot].devnum = d;
|
||||
r10bio->devs[slot].addr = s;
|
||||
@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
|
||||
* or recovery, so reshape isn't happening
|
||||
*/
|
||||
struct geom *geo = &conf->geo;
|
||||
int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
|
||||
int far_set_size = geo->far_set_size;
|
||||
int last_far_set_start;
|
||||
|
||||
if (geo->raid_disks % geo->far_set_size) {
|
||||
last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1;
|
||||
last_far_set_start *= geo->far_set_size;
|
||||
|
||||
if (dev >= last_far_set_start) {
|
||||
far_set_size = geo->far_set_size;
|
||||
far_set_size += (geo->raid_disks % geo->far_set_size);
|
||||
far_set_start = last_far_set_start;
|
||||
}
|
||||
}
|
||||
|
||||
offset = sector & geo->chunk_mask;
|
||||
if (geo->far_offset) {
|
||||
@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
|
||||
chunk = sector >> geo->chunk_shift;
|
||||
fc = sector_div(chunk, geo->far_copies);
|
||||
dev -= fc * geo->near_copies;
|
||||
if (dev < 0)
|
||||
dev += geo->raid_disks;
|
||||
if (dev < far_set_start)
|
||||
dev += far_set_size;
|
||||
} else {
|
||||
while (sector >= geo->stride) {
|
||||
sector -= geo->stride;
|
||||
if (dev < geo->near_copies)
|
||||
dev += geo->raid_disks - geo->near_copies;
|
||||
if (dev < (geo->near_copies + far_set_start))
|
||||
dev += far_set_size - geo->near_copies;
|
||||
else
|
||||
dev -= geo->near_copies;
|
||||
}
|
||||
@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
|
||||
bio_list_merge(&conf->pending_bio_list, &plug->pending);
|
||||
conf->pending_count += plug->pending_cnt;
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
wake_up(&conf->wait_barrier);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
kfree(plug);
|
||||
return;
|
||||
@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
|
||||
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
|
||||
const unsigned long do_discard = (bio->bi_rw
|
||||
& (REQ_DISCARD | REQ_SECURE));
|
||||
const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
|
||||
unsigned long flags;
|
||||
struct md_rdev *blocked_rdev;
|
||||
struct blk_plug_cb *cb;
|
||||
@ -1460,7 +1508,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
|
||||
rdev));
|
||||
mbio->bi_bdev = rdev->bdev;
|
||||
mbio->bi_end_io = raid10_end_write_request;
|
||||
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
|
||||
mbio->bi_rw =
|
||||
WRITE | do_sync | do_fua | do_discard | do_same;
|
||||
mbio->bi_private = r10_bio;
|
||||
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
@ -1502,7 +1551,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
|
||||
r10_bio, rdev));
|
||||
mbio->bi_bdev = rdev->bdev;
|
||||
mbio->bi_end_io = raid10_end_write_request;
|
||||
mbio->bi_rw = WRITE | do_sync | do_fua | do_discard;
|
||||
mbio->bi_rw =
|
||||
WRITE | do_sync | do_fua | do_discard | do_same;
|
||||
mbio->bi_private = r10_bio;
|
||||
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
|
||||
disks = mddev->raid_disks + mddev->delta_disks;
|
||||
break;
|
||||
}
|
||||
if (layout >> 17)
|
||||
if (layout >> 18)
|
||||
return -1;
|
||||
if (chunk < (PAGE_SIZE >> 9) ||
|
||||
!is_power_of_2(chunk))
|
||||
@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
|
||||
geo->near_copies = nc;
|
||||
geo->far_copies = fc;
|
||||
geo->far_offset = fo;
|
||||
geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
|
||||
geo->chunk_mask = chunk - 1;
|
||||
geo->chunk_shift = ffz(~chunk);
|
||||
return nc*fc;
|
||||
@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev)
|
||||
if (mddev->queue) {
|
||||
blk_queue_max_discard_sectors(mddev->queue,
|
||||
mddev->chunk_sectors);
|
||||
blk_queue_max_write_same_sectors(mddev->queue,
|
||||
mddev->chunk_sectors);
|
||||
blk_queue_io_min(mddev->queue, chunk_size);
|
||||
if (conf->geo.raid_disks % conf->geo.near_copies)
|
||||
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
|
||||
|
@ -33,6 +33,11 @@ struct r10conf {
|
||||
* far_offset, in which case it is
|
||||
* 1 stripe.
|
||||
*/
|
||||
int far_set_size; /* The number of devices in a set,
|
||||
* where a 'set' is the devices that
|
||||
* contain far/offset copies of
|
||||
* each other.
|
||||
*/
|
||||
int chunk_shift; /* shift from chunks to sectors */
|
||||
sector_t chunk_mask;
|
||||
} prev, geo;
|
||||
|
@ -1403,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
|
||||
&sh->ops.zero_sum_result, percpu->spare_page, &submit);
|
||||
}
|
||||
|
||||
static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
||||
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
||||
{
|
||||
int overlap_clear = 0, i, disks = sh->disks;
|
||||
struct dma_async_tx_descriptor *tx = NULL;
|
||||
@ -1468,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MULTICORE_RAID456
|
||||
static void async_run_ops(void *param, async_cookie_t cookie)
|
||||
{
|
||||
struct stripe_head *sh = param;
|
||||
unsigned long ops_request = sh->ops.request;
|
||||
|
||||
clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
|
||||
wake_up(&sh->ops.wait_for_ops);
|
||||
|
||||
__raid_run_ops(sh, ops_request);
|
||||
release_stripe(sh);
|
||||
}
|
||||
|
||||
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
|
||||
{
|
||||
/* since handle_stripe can be called outside of raid5d context
|
||||
* we need to ensure sh->ops.request is de-staged before another
|
||||
* request arrives
|
||||
*/
|
||||
wait_event(sh->ops.wait_for_ops,
|
||||
!test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
|
||||
sh->ops.request = ops_request;
|
||||
|
||||
atomic_inc(&sh->count);
|
||||
async_schedule(async_run_ops, sh);
|
||||
}
|
||||
#else
|
||||
#define raid_run_ops __raid_run_ops
|
||||
#endif
|
||||
|
||||
static int grow_one_stripe(struct r5conf *conf)
|
||||
{
|
||||
struct stripe_head *sh;
|
||||
@ -1506,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf)
|
||||
return 0;
|
||||
|
||||
sh->raid_conf = conf;
|
||||
#ifdef CONFIG_MULTICORE_RAID456
|
||||
init_waitqueue_head(&sh->ops.wait_for_ops);
|
||||
#endif
|
||||
|
||||
spin_lock_init(&sh->stripe_lock);
|
||||
|
||||
@ -1627,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
|
||||
break;
|
||||
|
||||
nsh->raid_conf = conf;
|
||||
#ifdef CONFIG_MULTICORE_RAID456
|
||||
init_waitqueue_head(&nsh->ops.wait_for_ops);
|
||||
#endif
|
||||
spin_lock_init(&nsh->stripe_lock);
|
||||
|
||||
list_add(&nsh->lru, &newstripes);
|
||||
|
Loading…
Reference in New Issue
Block a user