From c5d736f548ec5aab7e877872417ac23a5c42f1fd Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:54 +0800
Subject: [PATCH 1/7] md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io()

After allow_barrier(), a concurrent raid1_reshape() will replace the old
mempool and r1conf::raid_disks. Move allow_barrier() to the end of
raid_end_bio_io(), so that the r1bio can be freed safely.

Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-2-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c18b7c096c8d..642c8bae0df0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -311,6 +311,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 {
         struct bio *bio = r1_bio->master_bio;
         struct r1conf *conf = r1_bio->mddev->private;
+        sector_t sector = r1_bio->sector;
 
         /* if nobody has done the final endio yet, do it now */
         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -321,13 +322,13 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
                 call_bio_endio(r1_bio);
         }
+
+        free_r1bio(r1_bio);
         /*
          * Wake up any possible resync thread that waits for the device
          * to go idle. All I/Os, even write-behind writes, are done.
          */
-        allow_barrier(conf, r1_bio->sector);
-
-        free_r1bio(r1_bio);
+        allow_barrier(conf, sector);
 }
 
 /*

From 992db13a4aee766c8bfbf046ad15c2db5fa7cab8 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:55 +0800
Subject: [PATCH 2/7] md/raid1: free the r1bio before waiting for blocked rdev

A raid1 reshape will change the mempool and r1conf::raid_disks, which are
needed to free an r1bio. allow_barrier() makes a concurrent raid1_reshape()
possible. So, free the in-flight r1bio before waiting for the blocked rdev.

Fixes: 6bfe0b499082 ("md: support blocking writes to an array on device failure")
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-3-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 642c8bae0df0..b3fc44157cfc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1378,6 +1378,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                 return;
         }
 
+ retry_write:
         r1_bio = alloc_r1bio(mddev, bio);
         r1_bio->sectors = max_write_sectors;
 
@@ -1393,7 +1394,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
          */
         disks = conf->raid_disks * 2;
- retry_write:
         blocked_rdev = NULL;
         rcu_read_lock();
         max_sectors = r1_bio->sectors;
@@ -1473,7 +1473,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                 for (j = 0; j < i; j++)
                         if (r1_bio->bios[j])
                                 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-                r1_bio->state = 0;
+                free_r1bio(r1_bio);
                 allow_barrier(conf, bio->bi_iter.bi_sector);
 
                 if (bio->bi_opf & REQ_NOWAIT) {

From c069da449a13669ffa754fd971747e7e17e7d691 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:56 +0800
Subject: [PATCH 3/7] md/raid1: hold the barrier until handle_read_error() finishes

handle_read_error() calls allow_barrier() to match the earlier barrier
raise. However, allow_barrier() should be called at the end of the function
to avoid racing with a concurrent raid1 reshape.

Fixes: 689389a06ce7 ("md/raid1: simplify handle_read_error().")
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-4-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b3fc44157cfc..56f2725a996f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2511,6 +2511,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
         struct mddev *mddev = conf->mddev;
         struct bio *bio;
         struct md_rdev *rdev;
+        sector_t sector;
 
         clear_bit(R1BIO_ReadError, &r1_bio->state);
         /* we got a read error. Maybe the drive is bad. Maybe just
@@ -2540,12 +2541,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
         }
 
         rdev_dec_pending(rdev, conf->mddev);
-        allow_barrier(conf, r1_bio->sector);
+        sector = r1_bio->sector;
         bio = r1_bio->master_bio;
 
         /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
         r1_bio->state = 0;
         raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
+        allow_barrier(conf, sector);
 }
 
 static void raid1d(struct md_thread *thread)
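
[Editor's note -- illustration only, not part of the series] Patches 1-3
above all enforce the same ordering: anything that still depends on the old
r1bio mempool or on r1conf::raid_disks must finish before allow_barrier()
lets a waiting raid1_reshape() swap them. The small standalone C model below
sketches that rule under very loose assumptions; every name in it (struct
conf, io_alloc(), barrier_allow(), reshape(), ...) is a made-up stand-in,
not the kernel API.

  #include <stdio.h>
  #include <stdlib.h>

  struct conf {
          int raid_disks;    /* geometry the free path still reads          */
          int nr_pending;    /* in-flight I/O currently holding the barrier */
  };

  /* stand-in for raise_barrier() + mempool_alloc(): size follows raid_disks */
  static void *io_alloc(struct conf *c)
  {
          c->nr_pending++;
          return calloc(c->raid_disks * 2, sizeof(void *));
  }

  /* stand-in for free_r1bio(): the free path consults c->raid_disks */
  static void io_free(struct conf *c, void *io)
  {
          printf("freeing a %d-slot object\n", c->raid_disks * 2);
          free(io);
  }

  /* stand-in for allow_barrier(): at zero, a reshape may proceed */
  static void barrier_allow(struct conf *c)
  {
          c->nr_pending--;
  }

  /* stand-in for raid1_reshape(): only acts once the array is idle */
  static void reshape(struct conf *c, int new_disks)
  {
          if (c->nr_pending == 0)
                  c->raid_disks = new_disks;
  }

  int main(void)
  {
          struct conf c = { .raid_disks = 2, .nr_pending = 0 };
          void *io = io_alloc(&c);

          io_free(&c, io);   /* free against the old geometry first ...      */
          barrier_allow(&c); /* ... then release the barrier, so the reshape */
          reshape(&c, 4);    /* cannot change the geometry in between.       */
          return 0;
  }
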
From 6b2460e66ce6d483b5ff77227ac799d6e8a9ebd6 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen
Date: Mon, 14 Aug 2023 14:01:15 +0200
Subject: [PATCH 4/7] md raid1: allow writebehind to work on any leg device set WriteMostly

As the WriteMostly flag can be set on any component device of a RAID1
array, remove the constraint that it only works if set on the first one.

Signed-off-by: Heinz Mauelshagen
Tested-by: Xiao Ni
Link: https://lore.kernel.org/r/2a9592bf3340f34bf588eec984b23ee219f3985e.1692013451.git.heinzm@redhat.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 56f2725a996f..4b30a1742162 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1523,8 +1523,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                          * Not if there are too many, or cannot
                          * allocate memory, or a reader on WriteMostly
                          * is waiting for behind writes to flush */
-                        if (bitmap &&
-                            test_bit(WriteMostly, &rdev->flags) &&
+                        if (bitmap && write_behind &&
                             (atomic_read(&bitmap->behind_writes)
                              < mddev->bitmap_info.max_write_behind) &&
                             !waitqueue_active(&bitmap->behind_wait)) {

From af50e20afb401cc203bd2a9ff62ece0ae4976103 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 14 Aug 2023 11:27:07 +0200
Subject: [PATCH 5/7] md/raid0: Factor out helper for mapping and submitting a bio

Factor out helper function for mapping and submitting a bio out of
raid0_make_request(). We will use it later for submitting both parts of
a split bio.

Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-1-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 83 +++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 4106d943aae7..91b24c510b7f 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -545,14 +545,51 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
         bio_endio(bio);
 }
 
-static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
 {
         struct r0conf *conf = mddev->private;
         struct strip_zone *zone;
         struct md_rdev *tmp_dev;
-        sector_t bio_sector;
+        sector_t bio_sector = bio->bi_iter.bi_sector;
+        sector_t sector = bio_sector;
+
+        if (bio->bi_pool != &mddev->bio_set)
+                md_account_bio(mddev, &bio);
+
+        zone = find_zone(mddev->private, &sector);
+        switch (conf->layout) {
+        case RAID0_ORIG_LAYOUT:
+                tmp_dev = map_sector(mddev, zone, bio_sector, &sector);
+                break;
+        case RAID0_ALT_MULTIZONE_LAYOUT:
+                tmp_dev = map_sector(mddev, zone, sector, &sector);
+                break;
+        default:
+                WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
+                bio_io_error(bio);
+                return;
+        }
+
+        if (unlikely(is_rdev_broken(tmp_dev))) {
+                bio_io_error(bio);
+                md_error(mddev, tmp_dev);
+                return;
+        }
+
+        bio_set_dev(bio, tmp_dev->bdev);
+        bio->bi_iter.bi_sector = sector + zone->dev_start +
+                tmp_dev->data_offset;
+
+        if (mddev->gendisk)
+                trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+                                      bio_sector);
+        mddev_check_write_zeroes(mddev, bio);
+        submit_bio_noacct(bio);
+}
+
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+{
         sector_t sector;
-        sector_t orig_sector;
         unsigned chunk_sects;
         unsigned sectors;
 
@@ -565,8 +602,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
                 return true;
         }
 
-        bio_sector = bio->bi_iter.bi_sector;
-        sector = bio_sector;
+        sector = bio->bi_iter.bi_sector;
         chunk_sects = mddev->chunk_sectors;
 
         sectors = chunk_sects -
@@ -574,9 +610,6 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
                  ? (sector & (chunk_sects-1))
                  : sector_div(sector, chunk_sects));
 
-        /* Restore due to sector_div */
-        sector = bio_sector;
-
         if (sectors < bio_sectors(bio)) {
                 struct bio *split = bio_split(bio, sectors, GFP_NOIO,
                                               &mddev->bio_set);
                 bio_chain(split, bio);
                 submit_bio_noacct(bio);
                 bio = split;
         }
 
-        if (bio->bi_pool != &mddev->bio_set)
-                md_account_bio(mddev, &bio);
-
-        orig_sector = sector;
-        zone = find_zone(mddev->private, &sector);
-        switch (conf->layout) {
-        case RAID0_ORIG_LAYOUT:
-                tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
-                break;
-        case RAID0_ALT_MULTIZONE_LAYOUT:
-                tmp_dev = map_sector(mddev, zone, sector, &sector);
-                break;
-        default:
-                WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
-                bio_io_error(bio);
-                return true;
-        }
-
-        if (unlikely(is_rdev_broken(tmp_dev))) {
-                bio_io_error(bio);
-                md_error(mddev, tmp_dev);
-                return true;
-        }
-
-        bio_set_dev(bio, tmp_dev->bdev);
-        bio->bi_iter.bi_sector = sector + zone->dev_start +
-                tmp_dev->data_offset;
-
-        if (mddev->gendisk)
-                trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
-                                      bio_sector);
-        mddev_check_write_zeroes(mddev, bio);
-        submit_bio_noacct(bio);
+        raid0_map_submit_bio(mddev, bio);
         return true;
 }
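
[Editor's note -- illustration only, not part of the series] The standalone
sketch below models, in plain userspace C, the flow that patch 5 prepares
and patch 6 (next) completes: split an incoming request once at the chunk
boundary and map/submit both pieces right away instead of leaving the
remainder unmapped. map_submit() and make_request() are hypothetical
stand-ins for raid0_map_submit_bio() and raid0_make_request(), a power-of-2
chunk size is assumed, and striping is reduced to a simple modulo.

  #include <assert.h>
  #include <stdio.h>

  struct req {
          unsigned long long sector;
          unsigned int sectors;
  };

  /* "map" the request onto a member device and submit it */
  static void map_submit(struct req r, unsigned int chunk_sects, int nr_legs)
  {
          unsigned long long chunk = r.sector / chunk_sects;

          printf("submit: leg %llu, %u sectors at %llu\n",
                 chunk % nr_legs, r.sectors, r.sector);
  }

  static void make_request(struct req r, unsigned int chunk_sects, int nr_legs)
  {
          /* callers hand us at most chunk-sized requests (see patch 6) */
          assert(r.sectors <= chunk_sects);

          /* sectors that still fit before the next chunk boundary */
          unsigned int fit = chunk_sects - (r.sector & (chunk_sects - 1));

          if (fit < r.sectors) {
                  struct req first = { r.sector, fit };

                  map_submit(first, chunk_sects, nr_legs); /* first piece    */
                  r.sector += fit;                         /* remainder gets */
                  r.sectors -= fit;                        /* mapped too     */
          }
          map_submit(r, chunk_sects, nr_legs);
  }

  int main(void)
  {
          struct req r = { .sector = 1000, .sectors = 600 };

          make_request(r, 1024, 4);  /* 1024-sector chunks, four legs */
          return 0;
  }

Submitting the remainder already mapped is what avoids the re-sorting churn
described in the next patch.
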
From 319ff40a542736d67e5bce18635de35d0e7a0bff Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 14 Aug 2023 11:27:08 +0200
Subject: [PATCH 6/7] md/raid0: Fix performance regression for large sequential writes

Commit f00d7c85be9e ("md/raid0: fix up bio splitting.") among other things
changed how a bio that needs to be split is submitted. Before this commit,
we split the bio, then mapped and submitted each part. After this commit,
we map only the first part of the split bio and submit the second part
unmapped. Due to bio sorting in __submit_bio_noacct() this results in the
following request ordering:

9,0   18     1181     0.525037895 15995  Q  WS 1479315464 + 63392

Split off chunk-sized (1024 sectors) request:

9,0   18     1182     0.629019647 15995  X  WS 1479315464 / 1479316488

Request is unaligned to the chunk so it's split in raid0_make_request().
This is the first part mapped and punted to bio_list:

8,0   18     7053     0.629020455 15995  A  WS 739921928 + 1016 <- (9,0) 1479315464

Now raid0_make_request() returns, second part is postponed on bio_list.
__submit_bio_noacct() resorts the bio_list, mapped request is submitted
to the underlying device:

8,0   18     7054     0.629022782 15995  G  WS 739921928 + 1016

Now we take another request from the bio_list which is the remainder of
the original huge request. Split off another chunk-sized bit from it and
the situation repeats:

9,0   18     1183     0.629024499 15995  X  WS 1479316488 / 1479317512
8,16  18     6998     0.629025110 15995  A  WS 739921928 + 1016 <- (9,0) 1479316488
8,16  18     6999     0.629026728 15995  G  WS 739921928 + 1016
...
9,0   18     1184     0.629032940 15995  X  WS 1479317512 / 1479318536 [libnetacq-write]
8,0   18     7059     0.629033294 15995  A  WS 739922952 + 1016 <- (9,0) 1479317512
8,0   18     7060     0.629033902 15995  G  WS 739922952 + 1016
...

This repeats until we consume the whole original huge request. Now we
finally get to processing the second parts of the split off requests
(in reverse order):

8,16  18     7181     0.629161384 15995  A  WS 739952640 + 8 <- (9,0) 1479377920
8,0   18     7239     0.629162140 15995  A  WS 739952640 + 8 <- (9,0) 1479376896
8,16  18     7186     0.629163881 15995  A  WS 739951616 + 8 <- (9,0) 1479375872
8,0   18     7242     0.629164421 15995  A  WS 739951616 + 8 <- (9,0) 1479374848
...

I guess it is obvious that this IO pattern is an extremely inefficient way
to perform sequential IO. It also makes the bio_list grow to rather long
lengths.

Change raid0_make_request() to map both parts of the split bio. Since we
know we are provided with at most chunk-sized bios, we will always need
to split the incoming bio at most once.

Fixes: f00d7c85be9e ("md/raid0: fix up bio splitting.")
Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-2-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 91b24c510b7f..abbd77977f98 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -614,7 +614,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
                 struct bio *split = bio_split(bio, sectors, GFP_NOIO,
                                               &mddev->bio_set);
                 bio_chain(split, bio);
-                submit_bio_noacct(bio);
+                raid0_map_submit_bio(mddev, bio);
                 bio = split;
         }
 
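
[Editor's note -- illustration only, not part of the series] The fragment
sizes in the trace quoted in patch 6 can be checked with a few lines of
standalone C: with 1024-sector chunks, a request starting at sector
1479315464 has room for only 1016 sectors before the first chunk boundary,
which is exactly the "+ 1016" size of the first mapped piece.

  #include <stdio.h>

  int main(void)
  {
          unsigned long long start = 1479315464ULL; /* Q event in the trace    */
          unsigned int chunk_sects = 1024;          /* chunk size in sectors   */
          unsigned int total = 63392;               /* request size in sectors */

          /* power-of-2 chunk: offset inside the chunk is start & (chunk - 1) */
          unsigned int first = chunk_sects - (start & (chunk_sects - 1));

          printf("first fragment: %u sectors\n", first);        /* 1016  */
          printf("still to split: %u sectors\n", total - first); /* 62376 */
          return 0;
  }
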
From cc22b5407e9ca76adb7efeed843146510b1b72a5 Mon Sep 17 00:00:00 2001
From: David Jeffery
Date: Wed, 16 Aug 2023 14:13:55 -0400
Subject: [PATCH 7/7] md: raid0: account for split bio in iostat accounting

When a bio is split by md raid0, the newly created bio will not be tracked
by md for I/O accounting. Only the portion of I/O still assigned to the
original bio, which was reduced by the split, will be accounted for. This
results in md iostat data sometimes showing I/O values far below the
actual amount of data being sent through md.

md_account_bio() needs to be called for all bios generated by the bio
split.

A simple example of the issue was generated using a raid0 device on
partitions of the same device. Since all raid0 I/O then goes to one
device, it is easy to see a gap between the md device and its sd storage.
Reading an lvm device on top of the md device, the iostat output (some 0
columns and extra devices removed to make the data more compact) was:

Device            tps    kB_read/s    kB_wrtn/s    kB_dscd/s    kB_read
md2              0.00         0.00         0.00         0.00          0
sde              0.00         0.00         0.00         0.00          0
md2           1364.00    411496.00         0.00         0.00     411496
sde           1734.00    646144.00         0.00         0.00     646144
md2           1699.00    510680.00         0.00         0.00     510680
sde           2155.00    802784.00         0.00         0.00     802784
md2            803.00    241480.00         0.00         0.00     241480
sde           1016.00    377888.00         0.00         0.00     377888
md2              0.00         0.00         0.00         0.00          0
sde              0.00         0.00         0.00         0.00          0

I/O was generated by doing large direct I/O reads (12M) with dd to a
linear lvm volume on top of the 4-leg raid0 device.

The md2 reads were showing as roughly 2/3 of the reads to the sde device
containing all of md2's raid partitions. The sum of reads to sde was
1826816 kB, which was the expected amount as it was the amount read by
dd. With the patch, the total reads from md will match the reads from
sde and be consistent with the amount of I/O generated.

Fixes: 10764815ff47 ("md: add io accounting for raid0 and raid5")
Signed-off-by: David Jeffery
Tested-by: Laurence Oberman
Reviewed-by: Laurence Oberman
Reviewed-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20230816181433.13289-1-djeffery@redhat.com
---
 drivers/md/raid0.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index abbd77977f98..c50a7abda744 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -553,8 +553,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
         sector_t bio_sector = bio->bi_iter.bi_sector;
         sector_t sector = bio_sector;
 
-        if (bio->bi_pool != &mddev->bio_set)
-                md_account_bio(mddev, &bio);
+        md_account_bio(mddev, &bio);
 
         zone = find_zone(mddev->private, &sector);
         switch (conf->layout) {
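
[Editor's note -- illustration only, not part of the series] The "roughly
2/3" gap described in patch 7 can be recomputed from the iostat sample
itself; the short standalone program below just sums the non-zero kB_read
rows quoted above.

  #include <stdio.h>

  int main(void)
  {
          /* kB_read totals from the non-zero iostat rows in patch 7 */
          unsigned long md2[] = { 411496, 510680, 241480 };
          unsigned long sde[] = { 646144, 802784, 377888 };
          unsigned long md2_sum = 0, sde_sum = 0;

          for (int i = 0; i < 3; i++) {
                  md2_sum += md2[i];
                  sde_sum += sde[i];
          }

          /* prints: md2 1163656 kB, sde 1826816 kB, ratio 0.64 */
          printf("md2 %lu kB, sde %lu kB, ratio %.2f\n",
                 md2_sum, sde_sum, (double)md2_sum / sde_sum);
          return 0;
  }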