From c5d736f548ec5aab7e877872417ac23a5c42f1fd Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:54 +0800
Subject: [PATCH 1/7] md/raid1: call free_r1bio() before allow_barrier() in raid_end_bio_io()

After allow_barrier(), a concurrent raid1_reshape() will replace the old
mempool and r1conf::raid_disks. Move allow_barrier() to the end of
raid_end_bio_io(), so that the r1bio can be freed safely.

Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-2-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c18b7c096c8d..642c8bae0df0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -311,6 +311,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 {
         struct bio *bio = r1_bio->master_bio;
         struct r1conf *conf = r1_bio->mddev->private;
+        sector_t sector = r1_bio->sector;
 
         /* if nobody has done the final endio yet, do it now */
         if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -321,13 +322,13 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
                 call_bio_endio(r1_bio);
         }
+
+        free_r1bio(r1_bio);
         /*
          * Wake up any possible resync thread that waits for the device
          * to go idle. All I/Os, even write-behind writes, are done.
          */
-        allow_barrier(conf, r1_bio->sector);
-
-        free_r1bio(r1_bio);
+        allow_barrier(conf, sector);
 }
 
 /*

From 992db13a4aee766c8bfbf046ad15c2db5fa7cab8 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:55 +0800
Subject: [PATCH 2/7] md/raid1: free the r1bio before waiting for blocked rdev

A raid1 reshape will change the mempool and r1conf::raid_disks, which are
needed to free an r1bio. allow_barrier() makes a concurrent raid1_reshape()
possible. So, free the in-flight r1bio before waiting for the blocked rdev.

Fixes: 6bfe0b499082 ("md: support blocking writes to an array on device failure")
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-3-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 642c8bae0df0..b3fc44157cfc 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1378,6 +1378,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                 return;
         }
 
+ retry_write:
         r1_bio = alloc_r1bio(mddev, bio);
         r1_bio->sectors = max_write_sectors;
 
@@ -1393,7 +1394,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
          */
         disks = conf->raid_disks * 2;
- retry_write:
         blocked_rdev = NULL;
         rcu_read_lock();
         max_sectors = r1_bio->sectors;
@@ -1473,7 +1473,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                 for (j = 0; j < i; j++)
                         if (r1_bio->bios[j])
                                 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-                r1_bio->state = 0;
+                free_r1bio(r1_bio);
                 allow_barrier(conf, bio->bi_iter.bi_sector);
 
                 if (bio->bi_opf & REQ_NOWAIT) {

From c069da449a13669ffa754fd971747e7e17e7d691 Mon Sep 17 00:00:00 2001
From: Xueshi Hu
Date: Mon, 14 Aug 2023 21:53:56 +0800
Subject: [PATCH 3/7] md/raid1: hold the barrier until handle_read_error() finishes

handle_read_error() calls allow_barrier() to match the earlier barrier
raise. However, allow_barrier() should be called at the end of the function
to avoid racing with a concurrent raid1 reshape.

Fixes: 689389a06ce7 ("md/raid1: simplify handle_read_error().")
Reviewed-by: Yu Kuai
Signed-off-by: Xueshi Hu
Link: https://lore.kernel.org/r/20230814135356.1113639-4-xueshi.hu@smartx.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b3fc44157cfc..56f2725a996f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2511,6 +2511,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
         struct mddev *mddev = conf->mddev;
         struct bio *bio;
         struct md_rdev *rdev;
+        sector_t sector;
 
         clear_bit(R1BIO_ReadError, &r1_bio->state);
         /* we got a read error. Maybe the drive is bad. Maybe just
@@ -2540,12 +2541,13 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
         }
 
         rdev_dec_pending(rdev, conf->mddev);
-        allow_barrier(conf, r1_bio->sector);
+        sector = r1_bio->sector;
         bio = r1_bio->master_bio;
 
         /* Reuse the old r1_bio so that the IO_BLOCKED settings are preserved */
         r1_bio->state = 0;
         raid1_read_request(mddev, bio, r1_bio->sectors, r1_bio);
+        allow_barrier(conf, sector);
 }
 
 static void raid1d(struct md_thread *thread)
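
[Editor's note -- illustration only, not part of the series] Patches 1-3
above all enforce the same ordering: anything that still depends on the old
r1bio mempool or on r1conf::raid_disks must finish before allow_barrier()
lets a waiting raid1_reshape() swap them. The small standalone C model below
sketches that rule under very loose assumptions; every name in it (struct
conf, io_alloc(), barrier_allow(), reshape(), ...) is a made-up stand-in,
not the kernel API.

  #include <stdio.h>
  #include <stdlib.h>

  struct conf {
          int raid_disks;    /* geometry the free path still reads          */
          int nr_pending;    /* in-flight I/O currently holding the barrier */
  };

  /* stand-in for raise_barrier() + mempool_alloc(): size follows raid_disks */
  static void *io_alloc(struct conf *c)
  {
          c->nr_pending++;
          return calloc(c->raid_disks * 2, sizeof(void *));
  }

  /* stand-in for free_r1bio(): the free path consults c->raid_disks */
  static void io_free(struct conf *c, void *io)
  {
          printf("freeing a %d-slot object\n", c->raid_disks * 2);
          free(io);
  }

  /* stand-in for allow_barrier(): at zero, a reshape may proceed */
  static void barrier_allow(struct conf *c)
  {
          c->nr_pending--;
  }

  /* stand-in for raid1_reshape(): only acts once the array is idle */
  static void reshape(struct conf *c, int new_disks)
  {
          if (c->nr_pending == 0)
                  c->raid_disks = new_disks;
  }

  int main(void)
  {
          struct conf c = { .raid_disks = 2, .nr_pending = 0 };
          void *io = io_alloc(&c);

          io_free(&c, io);   /* free against the old geometry first ...      */
          barrier_allow(&c); /* ... then release the barrier, so the reshape */
          reshape(&c, 4);    /* cannot change the geometry in between.       */
          return 0;
  }
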
From 6b2460e66ce6d483b5ff77227ac799d6e8a9ebd6 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen
Date: Mon, 14 Aug 2023 14:01:15 +0200
Subject: [PATCH 4/7] md raid1: allow writebehind to work on any leg device set WriteMostly

As the WriteMostly flag can be set on any component device of a RAID1
array, remove the constraint that it only works if set on the first one.

Signed-off-by: Heinz Mauelshagen
Tested-by: Xiao Ni
Link: https://lore.kernel.org/r/2a9592bf3340f34bf588eec984b23ee219f3985e.1692013451.git.heinzm@redhat.com
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 56f2725a996f..4b30a1742162 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1523,8 +1523,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
                          * Not if there are too many, or cannot
                          * allocate memory, or a reader on WriteMostly
                          * is waiting for behind writes to flush */
-                        if (bitmap &&
-                            test_bit(WriteMostly, &rdev->flags) &&
+                        if (bitmap && write_behind &&
                             (atomic_read(&bitmap->behind_writes)
                              < mddev->bitmap_info.max_write_behind) &&
                             !waitqueue_active(&bitmap->behind_wait)) {

From af50e20afb401cc203bd2a9ff62ece0ae4976103 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 14 Aug 2023 11:27:07 +0200
Subject: [PATCH 5/7] md/raid0: Factor out helper for mapping and submitting a bio

Factor out helper function for mapping and submitting a bio out of
raid0_make_request(). We will use it later for submitting both parts of
a split bio.

Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-1-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 83 +++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 41 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 4106d943aae7..91b24c510b7f 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -545,14 +545,51 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
         bio_endio(bio);
 }
 
-static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
 {
         struct r0conf *conf = mddev->private;
         struct strip_zone *zone;
         struct md_rdev *tmp_dev;
-        sector_t bio_sector;
+        sector_t bio_sector = bio->bi_iter.bi_sector;
+        sector_t sector = bio_sector;
+
+        if (bio->bi_pool != &mddev->bio_set)
+                md_account_bio(mddev, &bio);
+
+        zone = find_zone(mddev->private, &sector);
+        switch (conf->layout) {
+        case RAID0_ORIG_LAYOUT:
+                tmp_dev = map_sector(mddev, zone, bio_sector, &sector);
+                break;
+        case RAID0_ALT_MULTIZONE_LAYOUT:
+                tmp_dev = map_sector(mddev, zone, sector, &sector);
+                break;
+        default:
+                WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
+                bio_io_error(bio);
+                return;
+        }
+
+        if (unlikely(is_rdev_broken(tmp_dev))) {
+                bio_io_error(bio);
+                md_error(mddev, tmp_dev);
+                return;
+        }
+
+        bio_set_dev(bio, tmp_dev->bdev);
+        bio->bi_iter.bi_sector = sector + zone->dev_start +
+                tmp_dev->data_offset;
+
+        if (mddev->gendisk)
+                trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
+                                      bio_sector);
+        mddev_check_write_zeroes(mddev, bio);
+        submit_bio_noacct(bio);
+}
+
+static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
+{
         sector_t sector;
-        sector_t orig_sector;
         unsigned chunk_sects;
         unsigned sectors;
 
@@ -565,8 +602,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
                 return true;
         }
 
-        bio_sector = bio->bi_iter.bi_sector;
-        sector = bio_sector;
+        sector = bio->bi_iter.bi_sector;
         chunk_sects = mddev->chunk_sectors;
 
         sectors = chunk_sects -
@@ -574,9 +610,6 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
                  ? (sector & (chunk_sects-1))
                  : sector_div(sector, chunk_sects));
 
-        /* Restore due to sector_div */
-        sector = bio_sector;
-
         if (sectors < bio_sectors(bio)) {
                 struct bio *split = bio_split(bio, sectors, GFP_NOIO,
                                               &mddev->bio_set);
                 bio_chain(split, bio);
                 submit_bio_noacct(bio);
                 bio = split;
         }
 
-        if (bio->bi_pool != &mddev->bio_set)
-                md_account_bio(mddev, &bio);
-
-        orig_sector = sector;
-        zone = find_zone(mddev->private, &sector);
-        switch (conf->layout) {
-        case RAID0_ORIG_LAYOUT:
-                tmp_dev = map_sector(mddev, zone, orig_sector, &sector);
-                break;
-        case RAID0_ALT_MULTIZONE_LAYOUT:
-                tmp_dev = map_sector(mddev, zone, sector, &sector);
-                break;
-        default:
-                WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
-                bio_io_error(bio);
-                return true;
-        }
-
-        if (unlikely(is_rdev_broken(tmp_dev))) {
-                bio_io_error(bio);
-                md_error(mddev, tmp_dev);
-                return true;
-        }
-
-        bio_set_dev(bio, tmp_dev->bdev);
-        bio->bi_iter.bi_sector = sector + zone->dev_start +
-                tmp_dev->data_offset;
-
-        if (mddev->gendisk)
-                trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
-                                      bio_sector);
-        mddev_check_write_zeroes(mddev, bio);
-        submit_bio_noacct(bio);
+        raid0_map_submit_bio(mddev, bio);
         return true;
 }
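
[Editor's note -- illustration only, not part of the series] The standalone
sketch below models, in plain userspace C, the flow that patch 5 prepares
and patch 6 (next) completes: split an incoming request once at the chunk
boundary and map/submit both pieces right away instead of leaving the
remainder unmapped. map_submit() and make_request() are hypothetical
stand-ins for raid0_map_submit_bio() and raid0_make_request(), a power-of-2
chunk size is assumed, and striping is reduced to a simple modulo.

  #include <assert.h>
  #include <stdio.h>

  struct req {
          unsigned long long sector;
          unsigned int sectors;
  };

  /* "map" the request onto a member device and submit it */
  static void map_submit(struct req r, unsigned int chunk_sects, int nr_legs)
  {
          unsigned long long chunk = r.sector / chunk_sects;

          printf("submit: leg %llu, %u sectors at %llu\n",
                 chunk % nr_legs, r.sectors, r.sector);
  }

  static void make_request(struct req r, unsigned int chunk_sects, int nr_legs)
  {
          /* callers hand us at most chunk-sized requests (see patch 6) */
          assert(r.sectors <= chunk_sects);

          /* sectors that still fit before the next chunk boundary */
          unsigned int fit = chunk_sects - (r.sector & (chunk_sects - 1));

          if (fit < r.sectors) {
                  struct req first = { r.sector, fit };

                  map_submit(first, chunk_sects, nr_legs); /* first piece    */
                  r.sector += fit;                         /* remainder gets */
                  r.sectors -= fit;                        /* mapped too     */
          }
          map_submit(r, chunk_sects, nr_legs);
  }

  int main(void)
  {
          struct req r = { .sector = 1000, .sectors = 600 };

          make_request(r, 1024, 4);  /* 1024-sector chunks, four legs */
          return 0;
  }

Submitting the remainder already mapped is what avoids the re-sorting churn
described in the next patch.
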
From 319ff40a542736d67e5bce18635de35d0e7a0bff Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 14 Aug 2023 11:27:08 +0200
Subject: [PATCH 6/7] md/raid0: Fix performance regression for large sequential writes

Commit f00d7c85be9e ("md/raid0: fix up bio splitting.") among other things
changed how a bio that needs to be split is submitted. Before this commit,
we split the bio, then mapped and submitted each part. After this commit,
we map only the first part of the split bio and submit the second part
unmapped. Due to bio sorting in __submit_bio_noacct() this results in the
following request ordering:

9,0   18     1181     0.525037895 15995  Q  WS 1479315464 + 63392

Split off chunk-sized (1024 sectors) request:

9,0   18     1182     0.629019647 15995  X  WS 1479315464 / 1479316488

Request is unaligned to the chunk so it's split in raid0_make_request().
This is the first part mapped and punted to bio_list:

8,0   18     7053     0.629020455 15995  A  WS 739921928 + 1016 <- (9,0) 1479315464

Now raid0_make_request() returns, second part is postponed on bio_list.
__submit_bio_noacct() resorts the bio_list, mapped request is submitted
to the underlying device:

8,0   18     7054     0.629022782 15995  G  WS 739921928 + 1016

Now we take another request from the bio_list which is the remainder of
the original huge request. Split off another chunk-sized bit from it and
the situation repeats:

9,0   18     1183     0.629024499 15995  X  WS 1479316488 / 1479317512
8,16  18     6998     0.629025110 15995  A  WS 739921928 + 1016 <- (9,0) 1479316488
8,16  18     6999     0.629026728 15995  G  WS 739921928 + 1016
...
9,0   18     1184     0.629032940 15995  X  WS 1479317512 / 1479318536 [libnetacq-write]
8,0   18     7059     0.629033294 15995  A  WS 739922952 + 1016 <- (9,0) 1479317512
8,0   18     7060     0.629033902 15995  G  WS 739922952 + 1016
...

This repeats until we consume the whole original huge request. Now we
finally get to processing the second parts of the split off requests
(in reverse order):

8,16  18     7181     0.629161384 15995  A  WS 739952640 + 8 <- (9,0) 1479377920
8,0   18     7239     0.629162140 15995  A  WS 739952640 + 8 <- (9,0) 1479376896
8,16  18     7186     0.629163881 15995  A  WS 739951616 + 8 <- (9,0) 1479375872
8,0   18     7242     0.629164421 15995  A  WS 739951616 + 8 <- (9,0) 1479374848
...

I guess it is obvious that this IO pattern is an extremely inefficient way
to perform sequential IO. It also makes the bio_list grow to rather long
lengths.

Change raid0_make_request() to map both parts of the split bio. Since we
know we are provided with at most chunk-sized bios, we will always need
to split the incoming bio at most once.

Fixes: f00d7c85be9e ("md/raid0: fix up bio splitting.")
Signed-off-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20230814092720.3931-2-jack@suse.cz
Signed-off-by: Song Liu
---
 drivers/md/raid0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 91b24c510b7f..abbd77977f98 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -614,7 +614,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
                 struct bio *split = bio_split(bio, sectors, GFP_NOIO,
                                               &mddev->bio_set);
                 bio_chain(split, bio);
-                submit_bio_noacct(bio);
+                raid0_map_submit_bio(mddev, bio);
                 bio = split;
         }
 
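
[Editor's note -- illustration only, not part of the series] The fragment
sizes in the trace quoted in patch 6 can be checked with a few lines of
standalone C: with 1024-sector chunks, a request starting at sector
1479315464 has room for only 1016 sectors before the first chunk boundary,
which is exactly the "+ 1016" size of the first mapped piece.

  #include <stdio.h>

  int main(void)
  {
          unsigned long long start = 1479315464ULL; /* Q event in the trace    */
          unsigned int chunk_sects = 1024;          /* chunk size in sectors   */
          unsigned int total = 63392;               /* request size in sectors */

          /* power-of-2 chunk: offset inside the chunk is start & (chunk - 1) */
          unsigned int first = chunk_sects - (start & (chunk_sects - 1));

          printf("first fragment: %u sectors\n", first);        /* 1016  */
          printf("still to split: %u sectors\n", total - first); /* 62376 */
          return 0;
  }
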
From cc22b5407e9ca76adb7efeed843146510b1b72a5 Mon Sep 17 00:00:00 2001
From: David Jeffery
Date: Wed, 16 Aug 2023 14:13:55 -0400
Subject: [PATCH 7/7] md: raid0: account for split bio in iostat accounting

When a bio is split by md raid0, the newly created bio will not be tracked
by md for I/O accounting. Only the portion of I/O still assigned to the
original bio, which was reduced by the split, will be accounted for. This
results in md iostat data sometimes showing I/O values far below the
actual amount of data being sent through md.

md_account_bio() needs to be called for all bios generated by the bio
split.

A simple example of the issue was generated using a raid0 device on
partitions of the same device. Since all raid0 I/O then goes to one
device, it is easy to see a gap between the md device and its sd storage.
Reading an lvm device on top of the md device, the iostat output (some 0
columns and extra devices removed to make the data more compact) was:

Device            tps    kB_read/s    kB_wrtn/s    kB_dscd/s    kB_read
md2              0.00         0.00         0.00         0.00          0
sde              0.00         0.00         0.00         0.00          0
md2           1364.00    411496.00         0.00         0.00     411496
sde           1734.00    646144.00         0.00         0.00     646144
md2           1699.00    510680.00         0.00         0.00     510680
sde           2155.00    802784.00         0.00         0.00     802784
md2            803.00    241480.00         0.00         0.00     241480
sde           1016.00    377888.00         0.00         0.00     377888
md2              0.00         0.00         0.00         0.00          0
sde              0.00         0.00         0.00         0.00          0

I/O was generated by doing large direct I/O reads (12M) with dd to a
linear lvm volume on top of the 4-leg raid0 device.

The md2 reads were showing as roughly 2/3 of the reads to the sde device
containing all of md2's raid partitions. The sum of reads to sde was
1826816 kB, which was the expected amount as it was the amount read by
dd. With the patch, the total reads from md will match the reads from
sde and be consistent with the amount of I/O generated.

Fixes: 10764815ff47 ("md: add io accounting for raid0 and raid5")
Signed-off-by: David Jeffery
Tested-by: Laurence Oberman
Reviewed-by: Laurence Oberman
Reviewed-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20230816181433.13289-1-djeffery@redhat.com
---
 drivers/md/raid0.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index abbd77977f98..c50a7abda744 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -553,8 +553,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
         sector_t bio_sector = bio->bi_iter.bi_sector;
         sector_t sector = bio_sector;
 
-        if (bio->bi_pool != &mddev->bio_set)
-                md_account_bio(mddev, &bio);
+        md_account_bio(mddev, &bio);
 
         zone = find_zone(mddev->private, &sector);
         switch (conf->layout) {
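
[Editor's note -- illustration only, not part of the series] The "roughly
2/3" gap described in patch 7 can be recomputed from the iostat sample
itself; the short standalone program below just sums the non-zero kB_read
rows quoted above.

  #include <stdio.h>

  int main(void)
  {
          /* kB_read totals from the non-zero iostat rows in patch 7 */
          unsigned long md2[] = { 411496, 510680, 241480 };
          unsigned long sde[] = { 646144, 802784, 377888 };
          unsigned long md2_sum = 0, sde_sum = 0;

          for (int i = 0; i < 3; i++) {
                  md2_sum += md2[i];
                  sde_sum += sde[i];
          }

          /* prints: md2 1163656 kB, sde 1826816 kB, ratio 0.64 */
          printf("md2 %lu kB, sde %lu kB, ratio %.2f\n",
                 md2_sum, sde_sum, (double)md2_sum / sde_sum);
          return 0;
  }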