From 786bb02458819df7a833361c6c7448a4925a89ce Mon Sep 17 00:00:00 2001
From: Pankaj Raghav
Date: Thu, 11 May 2023 14:15:44 +0200
Subject: [PATCH 001/266] brd: use XArray instead of radix-tree to index
backing pages
XArray was introduced to hold a large array of pointers with a simple API.
The XArray API also provides array semantics, which simplifies the way we
store and access the backing pages and makes the code significantly easier
to understand.
No performance difference was noticed between the two implementations
using fio with direct=1 [1].
[1] Performance in KIOPS:

           | radix-tree | XArray | Diff
 ----------+------------+--------+-------
 write     |    315     |  313   | -0.6%
 randwrite |    286     |  290   | +1.3%
 read      |    330     |  335   | +1.5%
 randread  |    309     |  312   | +0.9%
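For reference, a minimal stand-alone sketch of the XArray calling pattern the
patch relies on (the struct and function names below are illustrative only,
not the brd code itself):

    #include <linux/xarray.h>
    #include <linux/mm.h>

    struct demo_store {
            struct xarray pages;            /* index -> struct page * */
    };

    static void demo_init(struct demo_store *s)
    {
            xa_init(&s->pages);
    }

    /* Lockless lookup: returns NULL if nothing is stored at @idx. */
    static struct page *demo_lookup(struct demo_store *s, pgoff_t idx)
    {
            return xa_load(&s->pages, idx);
    }

    /* Store @page at @idx unless another thread already did. */
    static int demo_insert(struct demo_store *s, pgoff_t idx,
                           struct page *page, gfp_t gfp)
    {
            struct page *cur;

            xa_lock(&s->pages);
            cur = __xa_cmpxchg(&s->pages, idx, NULL, page, gfp);
            xa_unlock(&s->pages);

            if (xa_is_err(cur))
                    return xa_err(cur);     /* node allocation failed */
            return cur ? -EBUSY : 0;        /* slot was already occupied */
    }

Unlike the radix tree, no separate spinlock and no radix_tree_preload() step
are needed: the XArray carries its own lock, and __xa_cmpxchg() allocates its
internal nodes with the gfp flags passed to it.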
Signed-off-by: Pankaj Raghav
Reviewed-by: Hannes Reinecke
Link: https://lore.kernel.org/r/20230511121544.111648-1-p.raghav@samsung.com
Signed-off-by: Jens Axboe
---
drivers/block/brd.c | 93 ++++++++++++---------------------------------
1 file changed, 24 insertions(+), 69 deletions(-)
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index bcad9b926b0c..2f71376afc71 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -19,7 +19,7 @@
#include
#include
#include
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include
#include
#include
@@ -28,7 +28,7 @@
#include
/*
- * Each block ramdisk device has a radix_tree brd_pages of pages that stores
+ * Each block ramdisk device has a xarray brd_pages of pages that stores
* the pages containing the block device's contents. A brd page's ->index is
* its offset in PAGE_SIZE units. This is similar to, but in no way connected
* with, the kernel's pagecache or buffer cache (which sit above our block
@@ -40,11 +40,9 @@ struct brd_device {
struct list_head brd_list;
/*
- * Backing store of pages and lock to protect it. This is the contents
- * of the block device.
+ * Backing store of pages. This is the contents of the block device.
*/
- spinlock_t brd_lock;
- struct radix_tree_root brd_pages;
+ struct xarray brd_pages;
u64 brd_nr_pages;
};
@@ -56,21 +54,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
pgoff_t idx;
struct page *page;
- /*
- * The page lifetime is protected by the fact that we have opened the
- * device node -- brd pages will never be deleted under us, so we
- * don't need any further locking or refcounting.
- *
- * This is strictly true for the radix-tree nodes as well (ie. we
- * don't actually need the rcu_read_lock()), however that is not a
- * documented feature of the radix-tree API so it is better to be
- * safe here (we don't have total exclusion from radix tree updates
- * here, only deletes).
- */
- rcu_read_lock();
idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
- page = radix_tree_lookup(&brd->brd_pages, idx);
- rcu_read_unlock();
+ page = xa_load(&brd->brd_pages, idx);
BUG_ON(page && page->index != idx);
@@ -83,7 +68,7 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
{
pgoff_t idx;
- struct page *page;
+ struct page *page, *cur;
int ret = 0;
page = brd_lookup_page(brd, sector);
@@ -94,71 +79,42 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
if (!page)
return -ENOMEM;
- if (radix_tree_maybe_preload(gfp)) {
- __free_page(page);
- return -ENOMEM;
- }
+ xa_lock(&brd->brd_pages);
- spin_lock(&brd->brd_lock);
idx = sector >> PAGE_SECTORS_SHIFT;
page->index = idx;
- if (radix_tree_insert(&brd->brd_pages, idx, page)) {
+
+ cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp);
+
+ if (unlikely(cur)) {
__free_page(page);
- page = radix_tree_lookup(&brd->brd_pages, idx);
- if (!page)
- ret = -ENOMEM;
- else if (page->index != idx)
+ ret = xa_err(cur);
+ if (!ret && (cur->index != idx))
ret = -EIO;
} else {
brd->brd_nr_pages++;
}
- spin_unlock(&brd->brd_lock);
- radix_tree_preload_end();
+ xa_unlock(&brd->brd_pages);
+
return ret;
}
/*
- * Free all backing store pages and radix tree. This must only be called when
+ * Free all backing store pages and xarray. This must only be called when
* there are no other users of the device.
*/
-#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
- unsigned long pos = 0;
- struct page *pages[FREE_BATCH];
- int nr_pages;
+ struct page *page;
+ pgoff_t idx;
- do {
- int i;
+ xa_for_each(&brd->brd_pages, idx, page) {
+ __free_page(page);
+ cond_resched_rcu();
+ }
- nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
- (void **)pages, pos, FREE_BATCH);
-
- for (i = 0; i < nr_pages; i++) {
- void *ret;
-
- BUG_ON(pages[i]->index < pos);
- pos = pages[i]->index;
- ret = radix_tree_delete(&brd->brd_pages, pos);
- BUG_ON(!ret || ret != pages[i]);
- __free_page(pages[i]);
- }
-
- pos++;
-
- /*
- * It takes 3.4 seconds to remove 80GiB ramdisk.
- * So, we need cond_resched to avoid stalling the CPU.
- */
- cond_resched();
-
- /*
- * This assumes radix_tree_gang_lookup always returns as
- * many pages as possible. If the radix-tree code changes,
- * so will this have to.
- */
- } while (nr_pages == FREE_BATCH);
+ xa_destroy(&brd->brd_pages);
}
/*
@@ -372,8 +328,7 @@ static int brd_alloc(int i)
brd->brd_number = i;
list_add_tail(&brd->brd_list, &brd_devices);
- spin_lock_init(&brd->brd_lock);
- INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+ xa_init(&brd->brd_pages);
snprintf(buf, DISK_NAME_LEN, "ram%d", i);
if (!IS_ERR_OR_NULL(brd_debugfs_dir))
From d5fb8726f1dea70543a93ab1d7332857f157b7f3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Thu, 18 May 2023 15:27:08 -0700
Subject: [PATCH 002/266] block: Decode all flag names in the debugfs output
See also:
* Commit 4d337cebcb1c ("blk-mq: avoid to touch q->elevator without any protection").
* Commit 414dd48e882c ("blk-mq: add tagset quiesce interface").
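For context, the debugfs code walks q->queue_flags bit by bit and prints the
matching entry of blk_queue_flag_name[], falling back to the raw bit number
when no name is present; the hunks below only fill in missing table entries.
A rough, stand-alone sketch of that decoding idea (not the exact
blk-mq-debugfs implementation; demo_show_flags() is a hypothetical name):

    #include <linux/bitops.h>
    #include <linux/seq_file.h>

    /* Print every set bit of @flags as a name, or as its bit number. */
    static void demo_show_flags(struct seq_file *m, unsigned long flags,
                                const char *const *names, int count)
    {
            bool first = true;
            int bit;

            for_each_set_bit(bit, &flags, BITS_PER_LONG) {
                    if (!first)
                            seq_puts(m, "|");
                    first = false;
                    if (bit < count && names[bit])
                            seq_puts(m, names[bit]);
                    else
                            seq_printf(m, "%d", bit);
            }
    }

A flag without a table entry therefore shows up only as a number in the
queue's debugfs "state" attribute, which is what the added entries address.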
Cc: Christoph Hellwig
Cc: Damien Le Moal
Cc: Ming Lei
Cc: Chaitanya Kulkarni
Signed-off-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230518222708.1190867-1-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/blk-mq-debugfs.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index d23a8554ec4a..f89865a90dba 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -88,6 +88,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(ADD_RANDOM),
+ QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(STABLE_WRITES),
@@ -103,6 +104,8 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(NOWAIT),
+ QUEUE_FLAG_NAME(SQ_SCHED),
+ QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
};
#undef QUEUE_FLAG_NAME
From d97217e7f024bbe9aa62aea070771234c2879358 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 18 May 2023 07:30:59 +0200
Subject: [PATCH 003/266] blk-mq: don't queue plugged passthrough requests into
scheduler
Passthrough requests should never be queued to the I/O scheduler,
as scheduling these opaque requests doesn't make sense, and I/O
schedulers might require req->bio to be always valid.
Passthrough requests were never inserted into the scheduler before commit
1c2d2fff6dc0 ("block: wire-up support for passthrough plugging"); restore
this behavior even for passthrough requests issued under a plug.
[hch: use blk_mq_insert_requests for passthrough requests,
fix up the commit message and comments]
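A condensed sketch of the batching rule after this patch (demo_same_batch()
is a hypothetical helper, not part of the patch): plugged requests are
dispatched as one batch only while the hardware queue, software queue and
passthrough status all match, and a passthrough batch is inserted directly
rather than via the elevator's ->insert_requests():

    #include <linux/blk-mq.h>

    /* Two plugged requests may stay in the same dispatch batch only if: */
    static bool demo_same_batch(struct request *a, struct request *b)
    {
            return a->mq_hctx == b->mq_hctx &&
                   a->mq_ctx  == b->mq_ctx &&
                   blk_rq_is_passthrough(a) == blk_rq_is_passthrough(b);
    }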
Reported-by: Guangwu Zhang
Closes: https://lore.kernel.org/linux-block/CAGS2=YosaYaUTEMU3uaf+y=8MqSrhL7sYsJn8EwbaM=76p_4Qg@mail.gmail.com/
Investigated-by: Yu Kuai
Fixes: 1c2d2fff6dc0 ("block: wire-up support for passthrough plugging")
Signed-off-by: Ming Lei
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20230518053101.760632-2-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-mq.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f6dad0886a2f..8b7e4daaa5b7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2711,6 +2711,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
struct request *requeue_list = NULL;
struct request **requeue_lastp = &requeue_list;
unsigned int depth = 0;
+ bool is_passthrough = false;
LIST_HEAD(list);
do {
@@ -2719,7 +2720,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
if (!this_hctx) {
this_hctx = rq->mq_hctx;
this_ctx = rq->mq_ctx;
- } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ is_passthrough = blk_rq_is_passthrough(rq);
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
+ is_passthrough != blk_rq_is_passthrough(rq)) {
rq_list_add_tail(&requeue_lastp, rq);
continue;
}
@@ -2731,7 +2734,8 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
trace_block_unplug(this_hctx->queue, depth, !from_sched);
percpu_ref_get(&this_hctx->queue->q_usage_counter);
- if (this_hctx->queue->elevator) {
+ /* passthrough requests should never be issued to the I/O scheduler */
+ if (this_hctx->queue->elevator && !is_passthrough) {
this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
&list, 0);
blk_mq_run_hw_queue(this_hctx, from_sched);
From fdcab6cddef24a26b86d798814b3c25057e53c21 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 18 May 2023 07:31:00 +0200
Subject: [PATCH 004/266] blk-mq: remove RQF_ELVPRIV
RQF_ELVPRIV is set for all non-flush requests that have RQF_ELV set.
Expand this condition in the two users of the flag and remove it.
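The equivalence this relies on can be spelled out as a sketch (RQF_ELV still
exists at this point in the series; demo_has_elv_private() is a hypothetical
helper, not kernel code):

    #include <linux/blk-mq.h>

    /*
     * RQF_ELVPRIV was only ever set for non-flush requests that also had
     * RQF_ELV set, so its two users can test this expanded condition instead.
     */
    static inline bool demo_has_elv_private(struct request *rq)
    {
            return (rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags);
    }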
Signed-off-by: Christoph Hellwig
Reviewed-by: Ming Lei
Reviewed-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230518053101.760632-3-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-mq-debugfs.c | 1 -
block/blk-mq-sched.h | 4 ++--
block/blk-mq.c | 6 ++----
include/linux/blk-mq.h | 2 --
4 files changed, 4 insertions(+), 9 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f89865a90dba..ae1b3080b62b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -251,7 +251,6 @@ static const char *const rqf_name[] = {
RQF_NAME(DONTPREP),
RQF_NAME(FAILED),
RQF_NAME(QUIET),
- RQF_NAME(ELVPRIV),
RQF_NAME(IO_STAT),
RQF_NAME(PM),
RQF_NAME(HASHED),
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 7c3cbad17f30..4d8d2cd3b473 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -58,11 +58,11 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
- if (rq->rq_flags & RQF_ELV) {
+ if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags)) {
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
- if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
+ if (e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
}
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8b7e4daaa5b7..7470c6636dc4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -393,10 +393,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
RB_CLEAR_NODE(&rq->rb_node);
if (!op_is_flush(data->cmd_flags) &&
- e->type->ops.prepare_request) {
+ e->type->ops.prepare_request)
e->type->ops.prepare_request(rq);
- rq->rq_flags |= RQF_ELVPRIV;
- }
}
return rq;
@@ -696,7 +694,7 @@ void blk_mq_free_request(struct request *rq)
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- if ((rq->rq_flags & RQF_ELVPRIV) &&
+ if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags) &&
q->elevator->type->ops.finish_request)
q->elevator->type->ops.finish_request(rq);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 06caacd77ed6..5529e7d28ae6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -42,8 +42,6 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_FAILED ((__force req_flags_t)(1 << 10))
/* don't warn about errors */
#define RQF_QUIET ((__force req_flags_t)(1 << 11))
-/* elevator private data attached */
-#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12))
/* account into disk and partition IO statistics */
#define RQF_IO_STAT ((__force req_flags_t)(1 << 13))
/* runtime pm request */
From dd6216bb16e83e349d5d987227328031b0b0d30d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 18 May 2023 07:31:01 +0200
Subject: [PATCH 005/266] blk-mq: make sure elevator callbacks aren't called
for passthrough request
If q->elevator is set, passthrough requests can still be marked as
RQF_ELV, so some elevator callbacks are called for them.
Fix this by splitting RQF_ELV into RQF_SCHED_TAGS, which is set for all
requests that are issued on a queue that uses an I/O scheduler, and
RQF_USE_SCHED, which is set for non-flush, non-passthrough requests on
such a queue.
Roughly based on two different patches from Ming Lei.
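Condensed from the __blk_mq_alloc_requests() hunk below, the resulting flag
semantics at allocation time look roughly like this (a sketch, not the full
hunk):

    /*
     * RQF_SCHED_TAGS: the request was allocated from hctx->sched_tags,
     *                 i.e. the queue has an I/O scheduler attached.
     * RQF_USE_SCHED:  the elevator callbacks (prepare/finish/requeue/
     *                 completed_request, ...) apply to this request.
     */
    if (q->elevator) {
            data->rq_flags |= RQF_SCHED_TAGS;
            if (!op_is_flush(data->cmd_flags) &&
                !blk_op_is_passthrough(data->cmd_flags))
                    data->rq_flags |= RQF_USE_SCHED;
    }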
Signed-off-by: Christoph Hellwig
Reviewed-by: Ming Lei
Link: https://lore.kernel.org/r/20230518053101.760632-4-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-mq-debugfs.c | 3 ++-
block/blk-mq-sched.h | 6 ++---
block/blk-mq.c | 53 +++++++++++++++++++++++-------------------
block/blk-mq.h | 6 ++---
include/linux/blk-mq.h | 12 ++++++----
5 files changed, 44 insertions(+), 36 deletions(-)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ae1b3080b62b..22e39b9a77ec 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -249,6 +249,8 @@ static const char *const rqf_name[] = {
RQF_NAME(MIXED_MERGE),
RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
+ RQF_NAME(SCHED_TAGS),
+ RQF_NAME(USE_SCHED),
RQF_NAME(FAILED),
RQF_NAME(QUIET),
RQF_NAME(IO_STAT),
@@ -258,7 +260,6 @@ static const char *const rqf_name[] = {
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(TIMED_OUT),
- RQF_NAME(ELV),
RQF_NAME(RESV),
};
#undef RQF_NAME
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 4d8d2cd3b473..1326526bb733 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -37,7 +37,7 @@ static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = q->elevator;
if (e->type->ops.allow_merge)
@@ -48,7 +48,7 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = rq->q->elevator;
if (e->type->ops.completed_request)
@@ -58,7 +58,7 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
- if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags)) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7470c6636dc4..e021740154fe 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -354,12 +354,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
data->rq_flags |= RQF_IO_STAT;
rq->rq_flags = data->rq_flags;
- if (!(data->rq_flags & RQF_ELV)) {
- rq->tag = tag;
- rq->internal_tag = BLK_MQ_NO_TAG;
- } else {
+ if (data->rq_flags & RQF_SCHED_TAGS) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
+ } else {
+ rq->tag = tag;
+ rq->internal_tag = BLK_MQ_NO_TAG;
}
rq->timeout = 0;
@@ -386,14 +386,13 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
WRITE_ONCE(rq->deadline, 0);
req_ref_set(rq, 1);
- if (rq->rq_flags & RQF_ELV) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = data->q->elevator;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
- if (!op_is_flush(data->cmd_flags) &&
- e->type->ops.prepare_request)
+ if (e->type->ops.prepare_request)
e->type->ops.prepare_request(rq);
}
@@ -447,26 +446,32 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (q->elevator) {
- struct elevator_queue *e = q->elevator;
-
- data->rq_flags |= RQF_ELV;
+ /*
+ * All requests use scheduler tags when an I/O scheduler is
+ * enabled for the queue.
+ */
+ data->rq_flags |= RQF_SCHED_TAGS;
/*
* Flush/passthrough requests are special and go directly to the
- * dispatch list. Don't include reserved tags in the
- * limiting, as it isn't useful.
+ * dispatch list.
*/
if (!op_is_flush(data->cmd_flags) &&
- !blk_op_is_passthrough(data->cmd_flags) &&
- e->type->ops.limit_depth &&
- !(data->flags & BLK_MQ_REQ_RESERVED))
- e->type->ops.limit_depth(data->cmd_flags, data);
+ !blk_op_is_passthrough(data->cmd_flags)) {
+ struct elevator_mq_ops *ops = &q->elevator->type->ops;
+
+ WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
+
+ data->rq_flags |= RQF_USE_SCHED;
+ if (ops->limit_depth)
+ ops->limit_depth(data->cmd_flags, data);
+ }
}
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->rq_flags & RQF_ELV))
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
blk_mq_tag_busy(data->hctx);
if (data->flags & BLK_MQ_REQ_RESERVED)
@@ -646,10 +651,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
- if (!q->elevator)
- blk_mq_tag_busy(data.hctx);
+ if (q->elevator)
+ data.rq_flags |= RQF_SCHED_TAGS;
else
- data.rq_flags |= RQF_ELV;
+ blk_mq_tag_busy(data.hctx);
if (flags & BLK_MQ_REQ_RESERVED)
data.rq_flags |= RQF_RESV;
@@ -694,7 +699,7 @@ void blk_mq_free_request(struct request *rq)
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags) &&
+ if ((rq->rq_flags & RQF_USE_SCHED) &&
q->elevator->type->ops.finish_request)
q->elevator->type->ops.finish_request(rq);
@@ -1268,7 +1273,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
if (!plug->multiple_queues && last && last->q != rq->q)
plug->multiple_queues = true;
- if (!plug->has_elevator && (rq->rq_flags & RQF_ELV))
+ if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED))
plug->has_elevator = true;
rq->rq_next = NULL;
rq_list_add(&plug->mq_list, rq);
@@ -2620,7 +2625,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
return;
}
- if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) {
+ if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
blk_mq_insert_request(rq, 0);
blk_mq_run_hw_queue(hctx, false);
return;
@@ -2983,7 +2988,7 @@ void blk_mq_submit_bio(struct bio *bio)
}
hctx = rq->mq_hctx;
- if ((rq->rq_flags & RQF_ELV) ||
+ if ((rq->rq_flags & RQF_USE_SCHED) ||
(hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
blk_mq_insert_request(rq, 0);
blk_mq_run_hw_queue(hctx, true);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e876584d3516..d15981db34b9 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -226,9 +226,9 @@ static inline bool blk_mq_is_shared_tags(unsigned int flags)
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
- if (!(data->rq_flags & RQF_ELV))
- return data->hctx->tags;
- return data->hctx->sched_tags;
+ if (data->rq_flags & RQF_SCHED_TAGS)
+ return data->hctx->sched_tags;
+ return data->hctx->tags;
}
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5529e7d28ae6..e4a211957db6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -38,6 +38,10 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6))
/* don't call prep for this one */
#define RQF_DONTPREP ((__force req_flags_t)(1 << 7))
+/* use hctx->sched_tags */
+#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8))
+/* use an I/O scheduler for this request */
+#define RQF_USE_SCHED ((__force req_flags_t)(1 << 9))
/* vaguely specified driver internal error. Ignored by the block layer */
#define RQF_FAILED ((__force req_flags_t)(1 << 10))
/* don't warn about errors */
@@ -57,9 +61,7 @@ typedef __u32 __bitwise req_flags_t;
#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
/* ->timeout has been called, don't expire again */
#define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21))
-/* queue has elevator attached */
-#define RQF_ELV ((__force req_flags_t)(1 << 22))
-#define RQF_RESV ((__force req_flags_t)(1 << 23))
+#define RQF_RESV ((__force req_flags_t)(1 << 23))
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
@@ -842,7 +844,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib);
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
- return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV));
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}
static inline bool blk_mq_is_reserved_rq(struct request *rq)
@@ -858,7 +860,7 @@ static inline bool blk_mq_add_to_batch(struct request *req,
struct io_comp_batch *iob, int ioerror,
void (*complete)(struct io_comp_batch *))
{
- if (!iob || (req->rq_flags & RQF_ELV) || ioerror ||
+ if (!iob || (req->rq_flags & RQF_USE_SCHED) || ioerror ||
(req->end_io && !blk_rq_is_passthrough(req)))
return false;
From 45b46b6f157169b452772430566772506e25687a Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:19 -0700
Subject: [PATCH 006/266] block: mq-deadline: Add a word in a source code
comment
Add the missing word "and".
Cc: Damien Le Moal
Suggested-by: Damien Le Moal
Fixes: 945ffb60c11d ("mq-deadline: add blk-mq adaptation of the deadline IO scheduler")
Signed-off-by: Bart Van Assche
Tested-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230517174230.897144-2-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/mq-deadline.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 5839a027e0f0..cea1b084c69e 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -443,7 +443,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
*/
rq = deadline_next_request(dd, per_prio, dd->last_dir);
if (rq && dd->batching < dd->fifo_batch)
- /* we have a next request are still entitled to batch */
+ /* we have a next request and are still entitled to batch */
goto dispatch_request;
/*
From 4f51644ccff1e4bf159e86da3d9695a1a33ca231 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:20 -0700
Subject: [PATCH 007/266] block: Simplify blk_req_needs_zone_write_lock()
Remove the blk_rq_is_passthrough() check because it is redundant:
blk_req_needs_zone_write_lock() also calls bdev_op_is_zoned_write()
and the latter function returns false for pass-through requests.
Reviewed-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Hannes Reinecke
Reviewed-by: Johannes Thumshirn
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230517174230.897144-3-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/blk-zoned.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index fce9082384d6..835d9e937d4d 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -57,9 +57,6 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str);
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
- if (blk_rq_is_passthrough(rq))
- return false;
-
if (!rq->q->disk->seq_zones_wlock)
return false;
From 3ddbe2a7e0d4a155a805f69c906c9beed30d4cc4 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:21 -0700
Subject: [PATCH 008/266] block: Fix the type of the second
bdev_op_is_zoned_write() argument
Change the type of the second argument of bdev_op_is_zoned_write() from
blk_opf_t to enum req_op because this function expects an operation
without flags as its second argument.
Reviewed-by: Johannes Thumshirn
Reviewed-by: Pankaj Raghav
Reviewed-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Hannes Reinecke
Cc: Ming Lei
Fixes: 8cafdb5ab94c ("block: adapt blk_mq_plug() to not plug for writes that require a zone lock")
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230517174230.897144-4-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
include/linux/blkdev.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b441e633f4dd..db24cf98ccfb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1282,7 +1282,7 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
}
static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
- blk_opf_t op)
+ enum req_op op)
{
if (!bdev_is_zoned(bdev))
return false;
From a370798201b537f78288e4ef5e0f7fc70889e7ee Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:22 -0700
Subject: [PATCH 009/266] block: Introduce op_needs_zoned_write_locking()
Introduce a helper function for checking whether write serialization is
required if the operation will be sent to a zoned device. A second caller
for op_needs_zoned_write_locking() will be introduced in the next patch
in this series.
Suggested-by: Christoph Hellwig
Reviewed-by: Christoph Hellwig
Cc: Damien Le Moal
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230517174230.897144-5-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
include/linux/blkdev.h | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index db24cf98ccfb..3952c52d6cd1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1281,13 +1281,16 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
return disk_zone_no(bdev->bd_disk, sec);
}
+/* Whether write serialization is required for @op on zoned devices. */
+static inline bool op_needs_zoned_write_locking(enum req_op op)
+{
+ return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
+}
+
static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
enum req_op op)
{
- if (!bdev_is_zoned(bdev))
- return false;
-
- return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
+ return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
}
static inline sector_t bdev_zone_sectors(struct block_device *bdev)
From 19821fee3ed42e5b294e95814892d0ad6a9890c9 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:23 -0700
Subject: [PATCH 010/266] block: Introduce blk_rq_is_seq_zoned_write()
Introduce the function blk_rq_is_seq_zoned_write(). This function will
be used in later patches to preserve the order of zoned writes that
require write serialization.
This patch includes an optimization: instead of using
rq->q->disk->part0->bd_queue to check whether or not the queue is
associated with a zoned block device, use rq->q->disk->queue.
Cc: Christoph Hellwig
Cc: Damien Le Moal
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230517174230.897144-6-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/blk-zoned.c | 5 +----
include/linux/blk-mq.h | 17 +++++++++++++++++
2 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 835d9e937d4d..096b6b47561f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -60,10 +60,7 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
if (!rq->q->disk->seq_zones_wlock)
return false;
- if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq)))
- return blk_rq_zone_is_seq(rq);
-
- return false;
+ return blk_rq_is_seq_zoned_write(rq);
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e4a211957db6..49d14b1acfa5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1164,6 +1164,18 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
}
+/**
+ * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization.
+ * @rq: Request to examine.
+ *
+ * Note: REQ_OP_ZONE_APPEND requests do not require serialization.
+ */
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+ return op_needs_zoned_write_locking(req_op(rq)) &&
+ blk_rq_zone_is_seq(rq);
+}
+
bool blk_req_needs_zone_write_lock(struct request *rq);
bool blk_req_zone_write_trylock(struct request *rq);
void __blk_req_zone_write_lock(struct request *rq);
@@ -1194,6 +1206,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
return !blk_req_zone_is_write_locked(rq);
}
#else /* CONFIG_BLK_DEV_ZONED */
+static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
+{
+ return false;
+}
+
static inline bool blk_req_needs_zone_write_lock(struct request *rq)
{
return false;
From e0d85cde95bba7d40caa3bf9bc41ee810f0e96df Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:24 -0700
Subject: [PATCH 011/266] block: mq-deadline: Clean up deadline_check_fifo()
Change the return type of deadline_check_fifo() from 'int' to 'bool'.
Use time_is_before_eq_jiffies() instead of time_after_eq(). No
functionality has been changed.
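The two forms are equivalent: include/linux/jiffies.h defines (paraphrasing)

    #define time_is_before_eq_jiffies(a)  time_after_eq(jiffies, a)

so deadline_check_fifo() keeps testing whether jiffies has reached
rq->fifo_time; only the spelling becomes more readable.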
Reviewed-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Cc: Damien Le Moal
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230517174230.897144-7-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/mq-deadline.c | 16 +++++-----------
1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index cea1b084c69e..cea91ba4a6ea 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -272,21 +272,15 @@ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
}
/*
- * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
- * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ * deadline_check_fifo returns true if and only if there are expired requests
+ * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]).
*/
-static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
- enum dd_data_dir data_dir)
+static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
{
struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- /*
- * rq is expired!
- */
- if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
- return 1;
-
- return 0;
+ return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
}
/*
From 3b463cbea908a9c8d4b9eda09765070506864cbe Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:25 -0700
Subject: [PATCH 012/266] block: mq-deadline: Simplify
deadline_skip_seq_writes()
Make the deadline_skip_seq_writes() code shorter without changing its
functionality.
Reviewed-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230517174230.897144-8-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/mq-deadline.c | 9 +++------
1 file changed, 3 insertions(+), 6 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index cea91ba4a6ea..56782ee93522 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -304,14 +304,11 @@ static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
struct request *rq)
{
sector_t pos = blk_rq_pos(rq);
- sector_t skipped_sectors = 0;
- while (rq) {
- if (blk_rq_pos(rq) != pos + skipped_sectors)
- break;
- skipped_sectors += blk_rq_sectors(rq);
+ do {
+ pos += blk_rq_sectors(rq);
rq = deadline_latter_request(rq);
- }
+ } while (rq && blk_rq_pos(rq) == pos);
return rq;
}
From b2097bd24b438d49d82a5c317be4dc74b626236a Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:26 -0700
Subject: [PATCH 013/266] block: mq-deadline: Reduce lock contention
blk_mq_free_requests() calls dd_finish_request() indirectly. Prevent
nested locking of dd->lock and dd->zone_lock by moving the code for
freeing requests.
Reviewed-by: Damien Le Moal
Reviewed-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230517174230.897144-9-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/mq-deadline.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 56782ee93522..44222d18f6d4 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -757,7 +757,7 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
* add rq to rbtree and fifo
*/
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- blk_insert_t flags)
+ blk_insert_t flags, struct list_head *free)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
@@ -766,7 +766,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
struct dd_per_prio *per_prio;
enum dd_prio prio;
- LIST_HEAD(free);
lockdep_assert_held(&dd->lock);
@@ -783,10 +782,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
rq->elv.priv[0] = (void *)(uintptr_t)1;
}
- if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
- blk_mq_free_requests(&free);
+ if (blk_mq_sched_try_insert_merge(q, rq, free))
return;
- }
trace_block_rq_insert(rq);
@@ -819,6 +816,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
+ LIST_HEAD(free);
spin_lock(&dd->lock);
while (!list_empty(list)) {
@@ -826,9 +824,11 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
- dd_insert_request(hctx, rq, flags);
+ dd_insert_request(hctx, rq, flags, &free);
}
spin_unlock(&dd->lock);
+
+ blk_mq_free_requests(&free);
}
/* Callback from inside blk_mq_rq_ctx_init(). */
From 83c46ed675579fe84354bd07b0d81b525a2b1ebb Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:27 -0700
Subject: [PATCH 014/266] block: mq-deadline: Track the dispatch position
Track the position (sector_t) of the most recently dispatched request
instead of tracking a pointer to the next request to dispatch. This
patch is the basis for patch "Handle requeued requests correctly".
Without this patch it would be significantly more complicated to make
sure that zoned writes are dispatched in LBA order per zone.
Reviewed-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230517174230.897144-10-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/mq-deadline.c | 45 +++++++++++++++++++++++++++++++--------------
1 file changed, 31 insertions(+), 14 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 44222d18f6d4..91b689261d30 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -74,8 +74,8 @@ struct dd_per_prio {
struct list_head dispatch;
struct rb_root sort_list[DD_DIR_COUNT];
struct list_head fifo_list[DD_DIR_COUNT];
- /* Next request in FIFO order. Read, write or both are NULL. */
- struct request *next_rq[DD_DIR_COUNT];
+ /* Position of the most recently dispatched request. */
+ sector_t latest_pos[DD_DIR_COUNT];
struct io_stats_per_prio stats;
};
@@ -156,6 +156,25 @@ deadline_latter_request(struct request *rq)
return NULL;
}
+/* Return the first request for which blk_rq_pos() >= pos. */
+static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir, sector_t pos)
+{
+ struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
+ struct request *rq, *res = NULL;
+
+ while (node) {
+ rq = rb_entry_rq(node);
+ if (blk_rq_pos(rq) >= pos) {
+ res = rq;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+ return res;
+}
+
static void
deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
@@ -167,11 +186,6 @@ deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
static inline void
deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
- const enum dd_data_dir data_dir = rq_data_dir(rq);
-
- if (per_prio->next_rq[data_dir] == rq)
- per_prio->next_rq[data_dir] = deadline_latter_request(rq);
-
elv_rb_del(deadline_rb_root(per_prio, rq), rq);
}
@@ -251,10 +265,6 @@ static void
deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
struct request *rq)
{
- const enum dd_data_dir data_dir = rq_data_dir(rq);
-
- per_prio->next_rq[data_dir] = deadline_latter_request(rq);
-
/*
* take it off the sort and fifo list
*/
@@ -363,7 +373,8 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
struct request *rq;
unsigned long flags;
- rq = per_prio->next_rq[data_dir];
+ rq = deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
if (!rq)
return NULL;
@@ -426,6 +437,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (started_after(dd, rq, latest_start))
return NULL;
list_del_init(&rq->queuelist);
+ data_dir = rq_data_dir(rq);
goto done;
}
@@ -433,9 +445,11 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
* batches are currently reads XOR writes
*/
rq = deadline_next_request(dd, per_prio, dd->last_dir);
- if (rq && dd->batching < dd->fifo_batch)
+ if (rq && dd->batching < dd->fifo_batch) {
/* we have a next request and are still entitled to batch */
+ data_dir = rq_data_dir(rq);
goto dispatch_request;
+ }
/*
* at this point we are not running a batch. select the appropriate
@@ -513,6 +527,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
done:
ioprio_class = dd_rq_ioclass(rq);
prio = ioprio_class_to_prio[ioprio_class];
+ dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
dd->per_prio[prio].stats.dispatched++;
/*
* If the request needs its target zone locked, do it.
@@ -1026,8 +1041,10 @@ static int deadline_##name##_next_rq_show(void *data, \
struct request_queue *q = data; \
struct deadline_data *dd = q->elevator->elevator_data; \
struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
- struct request *rq = per_prio->next_rq[data_dir]; \
+ struct request *rq; \
\
+ rq = deadline_from_pos(per_prio, data_dir, \
+ per_prio->latest_pos[data_dir]); \
if (rq) \
__blk_mq_debugfs_rq_show(m, rq); \
return 0; \
From 0effb390c4bac1a484f0ca6ad3f1d183fcde882b Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:28 -0700
Subject: [PATCH 015/266] block: mq-deadline: Handle requeued requests
correctly
Start dispatching from the start of a zone instead of from the starting
position of the most recently dispatched request.
If a zoned write is requeued with an LBA that is lower than already
inserted zoned writes, make sure that it is submitted first.
Reviewed-by: Christoph Hellwig
Reviewed-by: Hannes Reinecke
Cc: Damien Le Moal
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230517174230.897144-11-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/mq-deadline.c | 34 ++++++++++++++++++++++++++++++++--
1 file changed, 32 insertions(+), 2 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 91b689261d30..e90879869c90 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -156,13 +156,28 @@ deadline_latter_request(struct request *rq)
return NULL;
}
-/* Return the first request for which blk_rq_pos() >= pos. */
+/*
+ * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
+ * return the first request after the start of the zone containing @pos.
+ */
static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
enum dd_data_dir data_dir, sector_t pos)
{
struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
struct request *rq, *res = NULL;
+ if (!node)
+ return NULL;
+
+ rq = rb_entry_rq(node);
+ /*
+ * A zoned write may have been requeued with a starting position that
+ * is below that of the most recently dispatched request. Hence, for
+ * zoned writes, start searching from the start of a zone.
+ */
+ if (blk_rq_is_seq_zoned_write(rq))
+ pos -= round_down(pos, rq->q->limits.chunk_sectors);
+
while (node) {
rq = rb_entry_rq(node);
if (blk_rq_pos(rq) >= pos) {
@@ -806,6 +821,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
list_add(&rq->queuelist, &per_prio->dispatch);
rq->fifo_time = jiffies;
} else {
+ struct list_head *insert_before;
+
deadline_add_rq_rb(per_prio, rq);
if (rq_mergeable(rq)) {
@@ -818,7 +835,20 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* set expire time and add to fifo list
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
- list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
+ insert_before = &per_prio->fifo_list[data_dir];
+#ifdef CONFIG_BLK_DEV_ZONED
+ /*
+ * Insert zoned writes such that requests are sorted by
+ * position per zone.
+ */
+ if (blk_rq_is_seq_zoned_write(rq)) {
+ struct request *rq2 = deadline_latter_request(rq);
+
+ if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
+ insert_before = &rq2->queuelist;
+ }
+#endif
+ list_add_tail(&rq->queuelist, insert_before);
}
}
From a036e698c231ba884daa37196be3ac6c6dce1d75 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Wed, 17 May 2023 10:42:29 -0700
Subject: [PATCH 016/266] block: mq-deadline: Fix handling of at-head zoned
writes
Before dispatching a zoned write from the FIFO list, check whether there
are any zoned writes in the RB-tree with a lower LBA for the same zone.
This patch ensures that zoned writes happen in order even if at_head is
set for some writes for a zone and not for others.
Reviewed-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Cc: Ming Lei
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230517174230.897144-12-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/mq-deadline.c | 9 +++++++--
1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index e90879869c90..6aa5daf7ae32 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -346,7 +346,7 @@ static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
enum dd_data_dir data_dir)
{
- struct request *rq;
+ struct request *rq, *rb_rq, *next;
unsigned long flags;
if (list_empty(&per_prio->fifo_list[data_dir]))
@@ -364,7 +364,12 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
* zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
+ list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
+ queuelist) {
+ /* Check whether a prior request exists for the same zone. */
+ rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
+ if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
+ rq = rb_rq;
if (blk_req_can_dispatch_to_zone(rq) &&
(blk_queue_nonrot(rq->q) ||
!deadline_is_seq_write(dd, rq)))
From 3e49c1e4a6152b6ad758a28ecce8fb470f46f6ed Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Tue, 16 May 2023 15:38:53 -0700
Subject: [PATCH 017/266] block: BFQ: Add several invariant checks
If anything goes wrong with the counters that track the number of
requests, I/O locks up. Make such scenarios easier to debug by adding
invariant checks for the request counters. Additionally, check that
BFQ queues are empty before they are freed.
Cc: Jan Kara
Cc: Yu Kuai
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230516223853.1385255-1-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/bfq-iosched.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3164e3177965..c5727afad159 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5403,6 +5403,9 @@ void bfq_put_queue(struct bfq_queue *bfqq)
if (bfqq->bfqd->last_completed_rq_bfqq == bfqq)
bfqq->bfqd->last_completed_rq_bfqq = NULL;
+ WARN_ON_ONCE(!list_empty(&bfqq->fifo));
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
kmem_cache_free(bfq_pool, bfqq);
bfqg_and_blkg_put(bfqg);
}
@@ -7135,6 +7138,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
{
struct bfq_data *bfqd = e->elevator_data;
struct bfq_queue *bfqq, *n;
+ unsigned int actuator;
hrtimer_cancel(&bfqd->idle_slice_timer);
@@ -7143,6 +7147,11 @@ static void bfq_exit_queue(struct elevator_queue *e)
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
spin_unlock_irq(&bfqd->lock);
+ for (actuator = 0; actuator < bfqd->num_actuators; actuator++)
+ WARN_ON_ONCE(bfqd->rq_in_driver[actuator]);
+ WARN_ON_ONCE(bfqd->tot_rq_in_driver);
+ WARN_ON_ONCE(bfqq->dispatched);
+
hrtimer_cancel(&bfqd->idle_slice_timer);
/* release oom-queue reference to root group */
From bda2795a630b2f6c417675bfbf4d90ef7503dfc7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 8 May 2023 07:44:05 -0700
Subject: [PATCH 018/266] fs: remove the special !CONFIG_BLOCK def_blk_fops
def_blk_fops always returns -ENODEV, which doesn't match the return value
for a non-existent block device with CONFIG_BLOCK, which is -ENXIO.
Just remove the extra implementation and fall back to the default
no_open_fops that always returns -ENXIO.
Fixes: 9361401eb761 ("[PATCH] BLOCK: Make it possible to disable the block layer [try #6]")
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20230508144405.41792-1-hch@lst.de
Signed-off-by: Jens Axboe
---
fs/Makefile | 10 ++--------
fs/inode.c | 3 ++-
fs/no-block.c | 19 -------------------
3 files changed, 4 insertions(+), 28 deletions(-)
delete mode 100644 fs/no-block.c
diff --git a/fs/Makefile b/fs/Makefile
index 834f1c3dba46..4709eba1303c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -17,14 +17,8 @@ obj-y := open.o read_write.o file_table.o super.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o
-ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o mpage.o
-else
-obj-y += no-block.o
-endif
-
-obj-$(CONFIG_PROC_FS) += proc_namespace.o
-
+obj-$(CONFIG_BLOCK) += buffer.o mpage.o
+obj-$(CONFIG_PROC_FS) += proc_namespace.o
obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o
obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/inode.c b/fs/inode.c
index 577799b7855f..4d6a1544e95b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2264,7 +2264,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
} else if (S_ISBLK(mode)) {
- inode->i_fop = &def_blk_fops;
+ if (IS_ENABLED(CONFIG_BLOCK))
+ inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
} else if (S_ISFIFO(mode))
inode->i_fop = &pipefifo_fops;
diff --git a/fs/no-block.c b/fs/no-block.c
deleted file mode 100644
index 481c0f0ab4bd..000000000000
--- a/fs/no-block.c
+++ /dev/null
@@ -1,19 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/* no-block.c: implementation of routines required for non-BLOCK configuration
- *
- * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#include
-#include
-
-static int no_blkdev_open(struct inode * inode, struct file * filp)
-{
- return -ENODEV;
-}
-
-const struct file_operations def_blk_fops = {
- .open = no_blkdev_open,
- .llseek = noop_llseek,
-};
From 0b573692f19501dfe2aeaf37b272ec07f60c70b9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 19 May 2023 06:40:44 +0200
Subject: [PATCH 019/266] blk-mq: factor out a blk_rq_init_flush helper
Factor out a helper from blk_insert_flush() that initializes the flush
machinery related fields in struct request, and don't bother with the
full memset as there are just a few fields to initialize, and all but
one already have explicit initializers.
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230519044050.107790-2-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-flush.c | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 04698ed9bcd4..ed37d272f787 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -376,6 +376,15 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
return RQ_END_IO_NONE;
}
+static void blk_rq_init_flush(struct request *rq)
+{
+ rq->flush.seq = 0;
+ INIT_LIST_HEAD(&rq->flush.list);
+ rq->rq_flags |= RQF_FLUSH_SEQ;
+ rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
+ rq->end_io = mq_flush_data_end_io;
+}
+
/**
* blk_insert_flush - insert a new PREFLUSH/FUA request
* @rq: request to insert
@@ -437,13 +446,7 @@ void blk_insert_flush(struct request *rq)
* @rq should go through flush machinery. Mark it part of flush
* sequence and submit for further processing.
*/
- memset(&rq->flush, 0, sizeof(rq->flush));
- INIT_LIST_HEAD(&rq->flush.list);
- rq->rq_flags |= RQF_FLUSH_SEQ;
- rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
-
- rq->end_io = mq_flush_data_end_io;
-
+ blk_rq_init_flush(rq);
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
From c1075e548ce6e6b5c7b71f2b05d344164ebc52bb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 19 May 2023 06:40:45 +0200
Subject: [PATCH 020/266] blk-mq: reflow blk_insert_flush
Use a switch statement to decide on the disposition of a flush request
instead of multiple if statements, one of which performs checks that are
more complex than required. Also warn on a malformed request early
on instead of doing a BUG_ON later.
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230519044050.107790-3-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-flush.c | 53 +++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 27 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index ed37d272f787..d8144f1f6fb1 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -402,6 +402,9 @@ void blk_insert_flush(struct request *rq)
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+ /* FLUSH/FUA request must never be merged */
+ WARN_ON_ONCE(rq->bio != rq->biotail);
+
/*
* @policy now records what operations need to be done. Adjust
* REQ_PREFLUSH and FUA for the driver.
@@ -417,39 +420,35 @@ void blk_insert_flush(struct request *rq)
*/
rq->cmd_flags |= REQ_SYNC;
- /*
- * An empty flush handed down from a stacking driver may
- * translate into nothing if the underlying device does not
- * advertise a write-back cache. In this case, simply
- * complete the request.
- */
- if (!policy) {
+ switch (policy) {
+ case 0:
+ /*
+ * An empty flush handed down from a stacking driver may
+ * translate into nothing if the underlying device does not
+ * advertise a write-back cache. In this case, simply
+ * complete the request.
+ */
blk_mq_end_request(rq, 0);
return;
- }
-
- BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
-
- /*
- * If there's data but flush is not necessary, the request can be
- * processed directly without going through flush machinery. Queue
- * for normal execution.
- */
- if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+ case REQ_FSEQ_DATA:
+ /*
+ * If there's data, but no flush is necessary, the request can
+ * be processed directly without going through flush machinery.
+ * Queue for normal execution.
+ */
blk_mq_request_bypass_insert(rq, 0);
blk_mq_run_hw_queue(hctx, false);
return;
+ default:
+ /*
+ * Mark the request as part of a flush sequence and submit it
+ * for further processing to the flush state machine.
+ */
+ blk_rq_init_flush(rq);
+ spin_lock_irq(&fq->mq_flush_lock);
+ blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
+ spin_unlock_irq(&fq->mq_flush_lock);
}
-
- /*
- * @rq should go through flush machinery. Mark it part of flush
- * sequence and submit for further processing.
- */
- blk_rq_init_flush(rq);
- spin_lock_irq(&fq->mq_flush_lock);
- blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
- spin_unlock_irq(&fq->mq_flush_lock);
}
/**
From 360f264834e34d08530c2fb9b67e3ffa65318761 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 19 May 2023 06:40:46 +0200
Subject: [PATCH 021/266] blk-mq: defer to the normal submission path for
non-flush flush commands
If blk_insert_flush decides that a command does not need to use the
flush state machine, return false and let blk_mq_submit_bio handle
it the normal way (including using an I/O scheduler) instead of doing
a bypass insert.
Signed-off-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230519044050.107790-4-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-flush.c | 22 ++++++++--------------
block/blk-mq.c | 8 ++++----
block/blk-mq.h | 4 ----
block/blk.h | 2 +-
4 files changed, 13 insertions(+), 23 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index d8144f1f6fb1..6fb9cf2d3818 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -385,22 +385,17 @@ static void blk_rq_init_flush(struct request *rq)
rq->end_io = mq_flush_data_end_io;
}
-/**
- * blk_insert_flush - insert a new PREFLUSH/FUA request
- * @rq: request to insert
- *
- * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
- * or __blk_mq_run_hw_queue() to dispatch request.
- * @rq is being submitted. Analyze what needs to be done and put it on the
- * right queue.
+/*
+ * Insert a PREFLUSH/FUA request into the flush state machine.
+ * Returns true if the request has been consumed by the flush state machine,
+ * or false if the caller should continue to process it.
*/
-void blk_insert_flush(struct request *rq)
+bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
/* FLUSH/FUA request must never be merged */
WARN_ON_ONCE(rq->bio != rq->biotail);
@@ -429,16 +424,14 @@ void blk_insert_flush(struct request *rq)
* complete the request.
*/
blk_mq_end_request(rq, 0);
- return;
+ return true;
case REQ_FSEQ_DATA:
/*
* If there's data, but no flush is necessary, the request can
* be processed directly without going through flush machinery.
* Queue for normal execution.
*/
- blk_mq_request_bypass_insert(rq, 0);
- blk_mq_run_hw_queue(hctx, false);
- return;
+ return false;
default:
/*
* Mark the request as part of a flush sequence and submit it
@@ -448,6 +441,7 @@ void blk_insert_flush(struct request *rq)
spin_lock_irq(&fq->mq_flush_lock);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
spin_unlock_irq(&fq->mq_flush_lock);
+ return true;
}
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e021740154fe..c0b394096b6b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -45,6 +45,8 @@
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
+static void blk_mq_request_bypass_insert(struct request *rq,
+ blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list);
@@ -2430,7 +2432,7 @@ static void blk_mq_run_work_fn(struct work_struct *work)
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
+static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -2977,10 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio)
return;
}
- if (op_is_flush(bio->bi_opf)) {
- blk_insert_flush(rq);
+ if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
return;
- }
if (plug) {
blk_add_rq_to_plug(plug, rq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d15981db34b9..ec7d2fb0b3c8 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -64,10 +64,6 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags,
unsigned int hctx_idx);
-/*
- * Internal helpers for request insertion into sw queues
- */
-void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags);
/*
* CPU -> queue mappings
diff --git a/block/blk.h b/block/blk.h
index 45547bcf1119..9f171b8f1e34 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -269,7 +269,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
*/
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
-void blk_insert_flush(struct request *rq);
+bool blk_insert_flush(struct request *rq);
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
void elevator_disable(struct request_queue *q);
From be4c427809b0a746aff54dbb8ef663f0184291d0 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Fri, 19 May 2023 06:40:47 +0200
Subject: [PATCH 022/266] blk-mq: use the I/O scheduler for writes from the
flush state machine
Send write requests issued by the flush state machine through the normal
I/O submission path including the I/O scheduler (if present) so that I/O
scheduler policies are applied to writes with the FUA flag set.
Separate the I/O scheduler members from the flush members in struct
request since now a request may pass through both an I/O scheduler
and the flush machinery.
Note that the actual flush requests, which have no bio attached to the
request, still bypass the I/O schedulers.
Signed-off-by: Bart Van Assche
[hch: rebased]
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230519044050.107790-5-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-mq.c | 4 ++--
include/linux/blk-mq.h | 25 ++++++++++---------------
2 files changed, 12 insertions(+), 17 deletions(-)
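For reference, a minimal userspace sketch of the check being changed in
__blk_mq_alloc_requests() below: op_is_flush() matches FUA writes as well as
pure flushes, while the new opcode comparison only matches pure flush
commands, so FUA writes now reach the I/O scheduler. The flag values here are
illustrative stand-ins, not the kernel's real bit layout.

/* Illustrative userspace sketch, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define REQ_OP_MASK   0xffu
#define REQ_OP_WRITE  1u
#define REQ_OP_FLUSH  2u
#define REQ_FUA       (1u << 8)
#define REQ_PREFLUSH  (1u << 9)

static bool op_is_flush(unsigned int flags)
{
        return flags & (REQ_FUA | REQ_PREFLUSH);
}

int main(void)
{
        unsigned int fua_write  = REQ_OP_WRITE | REQ_FUA;
        unsigned int pure_flush = REQ_OP_FLUSH | REQ_PREFLUSH;

        /* Old check: both bypass the elevator. */
        printf("old: fua_write bypass=%d, flush bypass=%d\n",
               op_is_flush(fua_write), op_is_flush(pure_flush));

        /* New check: only the pure flush bypasses; FUA writes are scheduled. */
        printf("new: fua_write bypass=%d, flush bypass=%d\n",
               (fua_write & REQ_OP_MASK) == REQ_OP_FLUSH,
               (pure_flush & REQ_OP_MASK) == REQ_OP_FLUSH);
        return 0;
}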
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c0b394096b6b..aac67bc3d368 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -458,7 +458,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
* Flush/passthrough requests are special and go directly to the
* dispatch list.
*/
- if (!op_is_flush(data->cmd_flags) &&
+ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
!blk_op_is_passthrough(data->cmd_flags)) {
struct elevator_mq_ops *ops = &q->elevator->type->ops;
@@ -2497,7 +2497,7 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
* dispatch it given we prioritize requests in hctx->dispatch.
*/
blk_mq_request_bypass_insert(rq, flags);
- } else if (rq->rq_flags & RQF_FLUSH_SEQ) {
+ } else if (req_op(rq) == REQ_OP_FLUSH) {
/*
* Firstly normal IO request is inserted to scheduler queue or
* sw queue, meantime we add flush request to dispatch queue(
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 49d14b1acfa5..935201c89743 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,25 +169,20 @@ struct request {
void *completion_data;
};
-
/*
* Three pointers are available for the IO schedulers, if they need
- * more they have to dynamically allocate it. Flush requests are
- * never put on the IO scheduler. So let the flush fields share
- * space with the elevator data.
+ * more they have to dynamically allocate it.
*/
- union {
- struct {
- struct io_cq *icq;
- void *priv[2];
- } elv;
+ struct {
+ struct io_cq *icq;
+ void *priv[2];
+ } elv;
- struct {
- unsigned int seq;
- struct list_head list;
- rq_end_io_fn *saved_end_io;
- } flush;
- };
+ struct {
+ unsigned int seq;
+ struct list_head list;
+ rq_end_io_fn *saved_end_io;
+ } flush;
union {
struct __call_single_data csd;
From 615939a2ae734e3e68c816d6749d1f5f79c62ab7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 19 May 2023 06:40:48 +0200
Subject: [PATCH 023/266] blk-mq: defer to the normal submission path for
post-flush requests
Requests with the FUA bit set on hardware without FUA support need a post
flush before returning to the caller, but they can still be sent using
the normal I/O path after initializing the flush-related fields and the
end I/O handler.
Signed-off-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230519044050.107790-6-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-flush.c | 11 +++++++++++
1 file changed, 11 insertions(+)
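A userspace sketch of the flush policy decision, mirroring a plausible reading
of blk_flush_policy(): the newly handled REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH
case is a FUA write on a device that has a volatile write cache but no native
FUA. The flag values are illustrative stand-ins.

#include <stdio.h>

#define REQ_FSEQ_PREFLUSH  (1u << 0)
#define REQ_FSEQ_DATA      (1u << 1)
#define REQ_FSEQ_POSTFLUSH (1u << 2)

struct req { unsigned int sectors; int preflush; int fua; };
struct dev { int has_wc; int has_fua; };

static unsigned int flush_policy(const struct dev *d, const struct req *rq)
{
        unsigned int policy = 0;

        if (rq->sectors)
                policy |= REQ_FSEQ_DATA;
        if (d->has_wc) {
                if (rq->preflush)
                        policy |= REQ_FSEQ_PREFLUSH;
                if (!d->has_fua && rq->fua)
                        policy |= REQ_FSEQ_POSTFLUSH;
        }
        return policy;
}

int main(void)
{
        struct dev wc_only = { .has_wc = 1, .has_fua = 0 };
        struct req fua_write = { .sectors = 8, .preflush = 0, .fua = 1 };

        /* Prints 0x6: REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH. */
        printf("policy = 0x%x\n", flush_policy(&wc_only, &fua_write));
        return 0;
}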
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 6fb9cf2d3818..7121f9ad0762 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -432,6 +432,17 @@ bool blk_insert_flush(struct request *rq)
* Queue for normal execution.
*/
return false;
+ case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH:
+ /*
+ * Initialize the flush fields and completion handler to trigger
+ * the post flush, and then just pass the command on.
+ */
+ blk_rq_init_flush(rq);
+ rq->flush.seq |= REQ_FSEQ_POSTFLUSH;
+ spin_lock_irq(&fq->mq_flush_lock);
+ list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+ spin_unlock_irq(&fq->mq_flush_lock);
+ return false;
default:
/*
* Mark the request as part of a flush sequence and submit it
From 1e82fadfc6b96ca79f69d0bcf938d31032bb43d2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 19 May 2023 06:40:49 +0200
Subject: [PATCH 024/266] blk-mq: do not do head insertions post-pre-flush
commands
blk_flush_complete_seq currently queues requests that write data after
a pre-flush from the flush state machine at the head of the queue.
This doesn't really make sense, as the original request bypassed all
queue lists by directly diverting to blk_insert_flush from
blk_mq_submit_bio.
Signed-off-by: Christoph Hellwig
Reviewed-by: Bart Van Assche
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230519044050.107790-7-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-flush.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 7121f9ad0762..f407a5950317 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -188,7 +188,7 @@ static void blk_flush_complete_seq(struct request *rq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
- blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD);
+ blk_mq_add_to_requeue_list(rq, 0);
blk_mq_kick_requeue_list(q);
break;
From 9a67aa52a42b31ad44220cc218df3b75a5cd5d05 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 19 May 2023 06:40:50 +0200
Subject: [PATCH 025/266] blk-mq: don't use the requeue list to queue flush
commands
Currently both requeues of commands that were already sent to the driver
and flush commands submitted from the flush state machine share the same
requeue_list in struct request_queue, even though requeues use head
insertion and flushes do not. Switch to using two separate lists instead.
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Link: https://lore.kernel.org/r/20230519044050.107790-8-hch@lst.de
Signed-off-by: Jens Axboe
---
block/blk-flush.c | 9 +++++++--
block/blk-mq-debugfs.c | 1 -
block/blk-mq.c | 42 +++++++++++++-----------------------------
block/blk-mq.h | 1 -
include/linux/blk-mq.h | 4 +---
include/linux/blkdev.h | 1 +
6 files changed, 22 insertions(+), 36 deletions(-)
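A userspace analogue (pthread mutex and a hand-rolled list instead of the
kernel's requeue_lock and list helpers) of the pattern blk_mq_requeue_work()
uses after this change: both lists are protected by the same lock, spliced
onto local lists while it is held, and processed with different insertion
policies after it is dropped.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct req {
        int id;
        struct req *next;
};

static pthread_mutex_t requeue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct req *requeue_list;   /* requeues of already-started commands */
static struct req *flush_list;     /* commands from the flush state machine */

static void queue_on(struct req **list, int id)
{
        struct req *r = malloc(sizeof(*r));

        r->id = id;
        pthread_mutex_lock(&requeue_lock);
        r->next = *list;
        *list = r;
        pthread_mutex_unlock(&requeue_lock);
}

static void requeue_work(void)
{
        struct req *rq_list, *fl_list, *r;

        /* Splice both lists while holding the single requeue lock... */
        pthread_mutex_lock(&requeue_lock);
        rq_list = requeue_list;
        fl_list = flush_list;
        requeue_list = flush_list = NULL;
        pthread_mutex_unlock(&requeue_lock);

        /* ...then insert with different policies after dropping the lock. */
        while (rq_list) {
                r = rq_list;
                rq_list = r->next;
                printf("requeue %d (head insertion)\n", r->id);
                free(r);
        }
        while (fl_list) {
                r = fl_list;
                fl_list = r->next;
                printf("flush %d (tail insertion)\n", r->id);
                free(r);
        }
}

int main(void)
{
        queue_on(&requeue_list, 1);
        queue_on(&flush_list, 2);
        requeue_work();
        return 0;
}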
diff --git a/block/blk-flush.c b/block/blk-flush.c
index f407a5950317..dba392cf22be 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -188,7 +188,9 @@ static void blk_flush_complete_seq(struct request *rq,
case REQ_FSEQ_DATA:
list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
- blk_mq_add_to_requeue_list(rq, 0);
+ spin_lock(&q->requeue_lock);
+ list_add_tail(&rq->queuelist, &q->flush_list);
+ spin_unlock(&q->requeue_lock);
blk_mq_kick_requeue_list(q);
break;
@@ -346,7 +348,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
smp_wmb();
req_ref_set(flush_rq, 1);
- blk_mq_add_to_requeue_list(flush_rq, 0);
+ spin_lock(&q->requeue_lock);
+ list_add_tail(&flush_rq->queuelist, &q->flush_list);
+ spin_unlock(&q->requeue_lock);
+
blk_mq_kick_requeue_list(q);
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 22e39b9a77ec..68165a50951b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -244,7 +244,6 @@ static const char *const cmd_flag_name[] = {
#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
static const char *const rqf_name[] = {
RQF_NAME(STARTED),
- RQF_NAME(SOFTBARRIER),
RQF_NAME(FLUSH_SEQ),
RQF_NAME(MIXED_MERGE),
RQF_NAME(MQ_INFLIGHT),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index aac67bc3d368..551e7760f45e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1416,13 +1416,16 @@ static void __blk_mq_requeue_request(struct request *rq)
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
struct request_queue *q = rq->q;
+ unsigned long flags;
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
- blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD);
+ spin_lock_irqsave(&q->requeue_lock, flags);
+ list_add_tail(&rq->queuelist, &q->requeue_list);
+ spin_unlock_irqrestore(&q->requeue_lock, flags);
if (kick_requeue_list)
blk_mq_kick_requeue_list(q);
@@ -1434,13 +1437,16 @@ static void blk_mq_requeue_work(struct work_struct *work)
struct request_queue *q =
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
- struct request *rq, *next;
+ LIST_HEAD(flush_list);
+ struct request *rq;
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
+ list_splice_init(&q->flush_list, &flush_list);
spin_unlock_irq(&q->requeue_lock);
- list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
+ while (!list_empty(&rq_list)) {
+ rq = list_entry(rq_list.next, struct request, queuelist);
/*
* If RQF_DONTPREP ist set, the request has been started by the
* driver already and might have driver-specific data allocated
@@ -1448,18 +1454,16 @@ static void blk_mq_requeue_work(struct work_struct *work)
* block layer merges for the request.
*/
if (rq->rq_flags & RQF_DONTPREP) {
- rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
blk_mq_request_bypass_insert(rq, 0);
- } else if (rq->rq_flags & RQF_SOFTBARRIER) {
- rq->rq_flags &= ~RQF_SOFTBARRIER;
+ } else {
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
}
}
- while (!list_empty(&rq_list)) {
- rq = list_entry(rq_list.next, struct request, queuelist);
+ while (!list_empty(&flush_list)) {
+ rq = list_entry(flush_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
blk_mq_insert_request(rq, 0);
}
@@ -1467,27 +1471,6 @@ static void blk_mq_requeue_work(struct work_struct *work)
blk_mq_run_hw_queues(q, false);
}
-void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags)
-{
- struct request_queue *q = rq->q;
- unsigned long flags;
-
- /*
- * We abuse this flag that is otherwise used by the I/O scheduler to
- * request head insertion from the workqueue.
- */
- BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
-
- spin_lock_irqsave(&q->requeue_lock, flags);
- if (insert_flags & BLK_MQ_INSERT_AT_HEAD) {
- rq->rq_flags |= RQF_SOFTBARRIER;
- list_add(&rq->queuelist, &q->requeue_list);
- } else {
- list_add_tail(&rq->queuelist, &q->requeue_list);
- }
- spin_unlock_irqrestore(&q->requeue_lock, flags);
-}
-
void blk_mq_kick_requeue_list(struct request_queue *q)
{
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
@@ -4239,6 +4222,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
+ INIT_LIST_HEAD(&q->flush_list);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ec7d2fb0b3c8..8c642e9f32f1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -47,7 +47,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
unsigned int);
-void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 935201c89743..d778cb6b2112 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -28,8 +28,6 @@ typedef __u32 __bitwise req_flags_t;
/* drive already may have started this one */
#define RQF_STARTED ((__force req_flags_t)(1 << 1))
-/* may not be passed by ioscheduler */
-#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3))
/* request for flush sequence */
#define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4))
/* merge of different types, fail separately */
@@ -65,7 +63,7 @@ typedef __u32 __bitwise req_flags_t;
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
- (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
+ (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
enum mq_rq_state {
MQ_RQ_IDLE = 0,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3952c52d6cd1..fe99948688df 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -487,6 +487,7 @@ struct request_queue {
* for flush operations
*/
struct blk_flush_queue *fq;
+ struct list_head flush_list;
struct list_head requeue_list;
spinlock_t requeue_lock;
From 29dc5d06613f2438ec20a4ba5e0a5a740584d346 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 19 May 2023 14:50:24 +0800
Subject: [PATCH 026/266] ublk: kill queuing request by task_work_add
task_work_add() has been used since the early ublk development stage for
handling requests in batch. However, since commit 7d4a93176e01 ("ublk_drv: don't
forward io commands in reserve order"), io_uring_cmd_complete_in_task()
provides similar batch processing, and similar performance data is observed
between task_work_add() and io_uring_cmd_complete_in_task().
Meanwhile this removes one fast code path that is seldom used in practice,
given that the ublk driver is commonly built as a module.
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230519065030.351216-2-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 40 ++--------------------------------------
1 file changed, 2 insertions(+), 38 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c7ed5d69e9ee..b00c5c210c7f 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -62,7 +62,6 @@
struct ublk_rq_data {
struct llist_node node;
- struct callback_head work;
};
struct ublk_uring_cmd_pdu {
@@ -290,14 +289,6 @@ static int ublk_apply_params(struct ublk_device *ub)
return 0;
}
-static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
-{
- if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
- !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
- return true;
- return false;
-}
-
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_NEED_GET_DATA;
@@ -852,17 +843,6 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
ublk_forward_io_cmds(ubq, issue_flags);
}
-static void ublk_rq_task_work_fn(struct callback_head *work)
-{
- struct ublk_rq_data *data = container_of(work,
- struct ublk_rq_data, work);
- struct request *req = blk_mq_rq_from_pdu(data);
- struct ublk_queue *ubq = req->mq_hctx->driver_data;
- unsigned issue_flags = IO_URING_F_UNLOCKED;
-
- ublk_forward_io_cmds(ubq, issue_flags);
-}
-
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
{
struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
@@ -886,10 +866,6 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
*/
if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
ublk_abort_io_cmds(ubq);
- } else if (ublk_can_use_task_work(ubq)) {
- if (task_work_add(ubq->ubq_daemon, &data->work,
- TWA_SIGNAL_NO_IPI))
- ublk_abort_io_cmds(ubq);
} else {
struct io_uring_cmd *cmd = io->cmd;
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
@@ -961,19 +937,9 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
return 0;
}
-static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
- unsigned int hctx_idx, unsigned int numa_node)
-{
- struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
-
- init_task_work(&data->work, ublk_rq_task_work_fn);
- return 0;
-}
-
static const struct blk_mq_ops ublk_mq_ops = {
.queue_rq = ublk_queue_rq,
.init_hctx = ublk_init_hctx,
- .init_request = ublk_init_rq,
.timeout = ublk_timeout,
};
@@ -1813,10 +1779,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
*/
ub->dev_info.flags &= UBLK_F_ALL;
- if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
- ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
-
- ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE;
+ ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
+ UBLK_F_URING_CMD_COMP_IN_TASK;
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
From f236a21459a5cdd828b93f363946e116773494f6 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 19 May 2023 14:50:25 +0800
Subject: [PATCH 027/266] ublk: cleanup io cmd code path by adding
ublk_fill_io_cmd()
Add a small helper to clean up the io command handling code path.
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230519065030.351216-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index b00c5c210c7f..0c8651c5ba4c 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1256,6 +1256,14 @@ static inline int ublk_check_cmd_op(u32 cmd_op)
return 0;
}
+static inline void ublk_fill_io_cmd(struct ublk_io *io,
+ struct io_uring_cmd *cmd, unsigned long buf_addr)
+{
+ io->cmd = cmd;
+ io->flags |= UBLK_IO_FLAG_ACTIVE;
+ io->addr = buf_addr;
+}
+
static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags,
const struct ublksrv_io_cmd *ub_cmd)
@@ -1322,10 +1330,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
/* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */
if (!ub_cmd->addr && !ublk_need_get_data(ubq))
goto out;
- io->cmd = cmd;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
- io->addr = ub_cmd->addr;
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_mark_io_ready(ub, ubq);
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
@@ -1338,17 +1344,13 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
goto out;
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
- io->addr = ub_cmd->addr;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
- io->cmd = cmd;
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_commit_completion(ub, ub_cmd);
break;
case UBLK_IO_NEED_GET_DATA:
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
- io->addr = ub_cmd->addr;
- io->cmd = cmd;
- io->flags |= UBLK_IO_FLAG_ACTIVE;
+ ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
break;
default:
From 981f95a571e3ca20a496c0b77dbf6b06039c6648 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 19 May 2023 14:50:26 +0800
Subject: [PATCH 028/266] ublk: cleanup ublk_copy_user_pages
Clean up ublk_copy_user_pages() by using iov_iter_get_pages2(); the code
becomes significantly simpler and much more readable than before.
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230519065030.351216-4-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 108 +++++++++++++++++----------------------
1 file changed, 47 insertions(+), 61 deletions(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 0c8651c5ba4c..afc07fa17040 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -412,49 +412,39 @@ static const struct block_device_operations ub_fops = {
#define UBLK_MAX_PIN_PAGES 32
-struct ublk_map_data {
- const struct request *rq;
- unsigned long ubuf;
- unsigned int len;
-};
-
struct ublk_io_iter {
struct page *pages[UBLK_MAX_PIN_PAGES];
- unsigned pg_off; /* offset in the 1st page in pages */
- int nr_pages; /* how many page pointers in pages */
struct bio *bio;
struct bvec_iter iter;
};
-static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
- unsigned max_bytes, bool to_vm)
+/* return how many pages are copied */
+static void ublk_copy_io_pages(struct ublk_io_iter *data,
+ size_t total, size_t pg_off, int dir)
{
- const unsigned total = min_t(unsigned, max_bytes,
- PAGE_SIZE - data->pg_off +
- ((data->nr_pages - 1) << PAGE_SHIFT));
unsigned done = 0;
unsigned pg_idx = 0;
while (done < total) {
struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
- const unsigned int bytes = min3(bv.bv_len, total - done,
- (unsigned)(PAGE_SIZE - data->pg_off));
+ unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
+ (unsigned)(PAGE_SIZE - pg_off));
void *bv_buf = bvec_kmap_local(&bv);
void *pg_buf = kmap_local_page(data->pages[pg_idx]);
- if (to_vm)
- memcpy(pg_buf + data->pg_off, bv_buf, bytes);
+ if (dir == ITER_DEST)
+ memcpy(pg_buf + pg_off, bv_buf, bytes);
else
- memcpy(bv_buf, pg_buf + data->pg_off, bytes);
+ memcpy(bv_buf, pg_buf + pg_off, bytes);
kunmap_local(pg_buf);
kunmap_local(bv_buf);
/* advance page array */
- data->pg_off += bytes;
- if (data->pg_off == PAGE_SIZE) {
+ pg_off += bytes;
+ if (pg_off == PAGE_SIZE) {
pg_idx += 1;
- data->pg_off = 0;
+ pg_off = 0;
}
done += bytes;
@@ -468,41 +458,40 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
data->iter = data->bio->bi_iter;
}
}
-
- return done;
}
-static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm)
+/*
+ * Copy data between request pages and io_iter, and 'offset'
+ * is the start point of linear offset of request.
+ */
+static size_t ublk_copy_user_pages(const struct request *req,
+ struct iov_iter *uiter, int dir)
{
- const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
- const unsigned long start_vm = data->ubuf;
- unsigned int done = 0;
struct ublk_io_iter iter = {
- .pg_off = start_vm & (PAGE_SIZE - 1),
- .bio = data->rq->bio,
- .iter = data->rq->bio->bi_iter,
+ .bio = req->bio,
+ .iter = req->bio->bi_iter,
};
- const unsigned int nr_pages = round_up(data->len +
- (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
+ size_t done = 0;
- while (done < nr_pages) {
- const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
- nr_pages - done);
- unsigned i, len;
+ while (iov_iter_count(uiter) && iter.bio) {
+ unsigned nr_pages;
+ size_t len, off;
+ int i;
- iter.nr_pages = get_user_pages_fast(start_vm +
- (done << PAGE_SHIFT), to_pin, gup_flags,
- iter.pages);
- if (iter.nr_pages <= 0)
- return done == 0 ? iter.nr_pages : done;
- len = ublk_copy_io_pages(&iter, data->len, to_vm);
- for (i = 0; i < iter.nr_pages; i++) {
- if (to_vm)
+ len = iov_iter_get_pages2(uiter, iter.pages,
+ iov_iter_count(uiter),
+ UBLK_MAX_PIN_PAGES, &off);
+ if (len <= 0)
+ return done;
+
+ ublk_copy_io_pages(&iter, len, off, dir);
+ nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
+ for (i = 0; i < nr_pages; i++) {
+ if (dir == ITER_DEST)
set_page_dirty(iter.pages[i]);
put_page(iter.pages[i]);
}
- data->len -= len;
- done += iter.nr_pages;
+ done += len;
}
return done;
@@ -529,15 +518,14 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
* context is pretty fast, see ublk_pin_user_pages
*/
if (ublk_need_map_req(req)) {
- struct ublk_map_data data = {
- .rq = req,
- .ubuf = io->addr,
- .len = rq_bytes,
- };
+ struct iov_iter iter;
+ struct iovec iov;
+ const int dir = ITER_DEST;
- ublk_copy_user_pages(&data, true);
+ import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
+ &iov, &iter);
- return rq_bytes - data.len;
+ return ublk_copy_user_pages(req, &iter, dir);
}
return rq_bytes;
}
@@ -549,17 +537,15 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
const unsigned int rq_bytes = blk_rq_bytes(req);
if (ublk_need_unmap_req(req)) {
- struct ublk_map_data data = {
- .rq = req,
- .ubuf = io->addr,
- .len = io->res,
- };
+ struct iov_iter iter;
+ struct iovec iov;
+ const int dir = ITER_SOURCE;
WARN_ON_ONCE(io->res > rq_bytes);
- ublk_copy_user_pages(&data, false);
-
- return io->res - data.len;
+ import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
+ &iov, &iter);
+ return ublk_copy_user_pages(req, &iter, dir);
}
return rq_bytes;
}
From 8284066946e6d9cc979566ce698fe24e7ca0b31e Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 19 May 2023 14:50:27 +0800
Subject: [PATCH 029/266] ublk: grab request reference when the request is
handled by userspace
Add a reference counter to the request pdu data, and hold this reference
for the request's lifetime.
This prepares for moving the request data copy into userspace, which
copies request data via read()/write() on /dev/ublkcN, so we have to
guarantee that read()/write() operates on a valid, active request; that is
ensured by holding the io request reference in read()/write().
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230519065030.351216-5-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 67 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 64 insertions(+), 3 deletions(-)
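A userspace sketch of the reference pattern this patch introduces, using C11
atomics rather than the kernel's kref API: the request starts with one
reference held for its normal lifetime, extra users take a get-unless-zero
reference, and completion runs only when the last reference is dropped.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct rq_ref_sketch {
        atomic_int ref;
};

static void ref_init(struct rq_ref_sketch *d)
{
        atomic_store(&d->ref, 1);               /* like kref_init() */
}

static bool ref_get_unless_zero(struct rq_ref_sketch *d)
{
        int v = atomic_load(&d->ref);

        /* like kref_get_unless_zero(): never resurrect a dead request */
        while (v != 0) {
                if (atomic_compare_exchange_weak(&d->ref, &v, v + 1))
                        return true;
        }
        return false;
}

static void ref_put(struct rq_ref_sketch *d, void (*release)(void))
{
        if (atomic_fetch_sub(&d->ref, 1) == 1)
                release();                      /* last ref -> complete */
}

static void complete_rq(void)
{
        printf("request completed\n");
}

int main(void)
{
        struct rq_ref_sketch d;

        ref_init(&d);                           /* request dispatched */
        if (ref_get_unless_zero(&d)) {          /* e.g. read()/write() path */
                /* ... copy data while the request is guaranteed alive ... */
                ref_put(&d, complete_rq);
        }
        ref_put(&d, complete_rq);               /* drops the last reference */
        return 0;
}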
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index afc07fa17040..353ccdb60729 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -43,6 +43,7 @@
#include
#include
#include
+#include
#include
#define UBLK_MINORS (1U << MINORBITS)
@@ -62,6 +63,8 @@
struct ublk_rq_data {
struct llist_node node;
+
+ struct kref ref;
};
struct ublk_uring_cmd_pdu {
@@ -181,6 +184,9 @@ struct ublk_params_header {
__u32 types;
};
+static inline void __ublk_complete_rq(struct request *req);
+static void ublk_complete_rq(struct kref *ref);
+
static dev_t ublk_chr_devt;
static struct class *ublk_chr_class;
@@ -289,6 +295,45 @@ static int ublk_apply_params(struct ublk_device *ub)
return 0;
}
+static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
+{
+ return false;
+}
+
+static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
+ struct request *req)
+{
+ if (ublk_need_req_ref(ubq)) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ kref_init(&data->ref);
+ }
+}
+
+static inline bool ublk_get_req_ref(const struct ublk_queue *ubq,
+ struct request *req)
+{
+ if (ublk_need_req_ref(ubq)) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ return kref_get_unless_zero(&data->ref);
+ }
+
+ return true;
+}
+
+static inline void ublk_put_req_ref(const struct ublk_queue *ubq,
+ struct request *req)
+{
+ if (ublk_need_req_ref(ubq)) {
+ struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+ kref_put(&data->ref, ublk_complete_rq);
+ } else {
+ __ublk_complete_rq(req);
+ }
+}
+
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_NEED_GET_DATA;
@@ -625,13 +670,19 @@ static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
}
/* todo: handle partial completion */
-static void ublk_complete_rq(struct request *req)
+static inline void __ublk_complete_rq(struct request *req)
{
struct ublk_queue *ubq = req->mq_hctx->driver_data;
struct ublk_io *io = &ubq->ios[req->tag];
unsigned int unmapped_bytes;
blk_status_t res = BLK_STS_OK;
+ /* called from ublk_abort_queue() code path */
+ if (io->flags & UBLK_IO_FLAG_ABORTED) {
+ res = BLK_STS_IOERR;
+ goto exit;
+ }
+
/* failed read IO if nothing is read */
if (!io->res && req_op(req) == REQ_OP_READ)
io->res = -EIO;
@@ -671,6 +722,15 @@ static void ublk_complete_rq(struct request *req)
blk_mq_end_request(req, res);
}
+static void ublk_complete_rq(struct kref *ref)
+{
+ struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data,
+ ref);
+ struct request *req = blk_mq_rq_from_pdu(data);
+
+ __ublk_complete_rq(req);
+}
+
/*
* Since __ublk_rq_task_work always fails requests immediately during
* exiting, __ublk_fail_req() is only called from abort context during
@@ -689,7 +749,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
if (ublk_queue_can_use_recovery_reissue(ubq))
blk_mq_requeue_request(req, false);
else
- blk_mq_end_request(req, BLK_STS_IOERR);
+ ublk_put_req_ref(ubq, req);
}
}
@@ -798,6 +858,7 @@ static inline void __ublk_rq_task_work(struct request *req,
mapped_bytes >> 9;
}
+ ublk_init_req_ref(ubq, req);
ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
}
@@ -1002,7 +1063,7 @@ static void ublk_commit_completion(struct ublk_device *ub,
req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
if (req && likely(!blk_should_fake_timeout(req->q)))
- ublk_complete_rq(req);
+ ublk_put_req_ref(ubq, req);
}
/*
From 38f2dd34410f9070b60969a07ff7d8743b4fd56c Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 19 May 2023 14:50:28 +0800
Subject: [PATCH 030/266] ublk: support to copy any part of request pages
Add an 'offset' parameter to ublk_copy_user_pages(), so that it can be used
to copy any linearly mapped sub-buffer of the request.
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230519065030.351216-6-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 31 ++++++++++++++++++++++++-------
1 file changed, 24 insertions(+), 7 deletions(-)
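A userspace sketch of the lookup ublk_advance_io_iter() performs: walk the
chain of bios (modelled here as an array of segment sizes), skip whole
segments until the remaining offset falls inside one, and return that segment
plus the intra-segment offset. The names are illustrative only.

#include <stdbool.h>
#include <stdio.h>

static bool advance_to_offset(const unsigned int *seg_bytes, int nr_segs,
                              unsigned int offset, int *seg, unsigned int *off)
{
        for (int i = 0; i < nr_segs; i++) {
                if (seg_bytes[i] > offset) {
                        *seg = i;       /* bio that contains 'offset' */
                        *off = offset;  /* offset within that bio */
                        return true;
                }
                offset -= seg_bytes[i];
        }
        return false;                   /* offset beyond the request */
}

int main(void)
{
        const unsigned int segs[] = { 4096, 8192, 4096 };
        int seg;
        unsigned int off;

        if (advance_to_offset(segs, 3, 10000, &seg, &off))
                printf("segment %d, offset %u\n", seg, off); /* 1, 5904 */
        return 0;
}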
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 353ccdb60729..13523c37a165 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -505,19 +505,36 @@ static void ublk_copy_io_pages(struct ublk_io_iter *data,
}
}
+static bool ublk_advance_io_iter(const struct request *req,
+ struct ublk_io_iter *iter, unsigned int offset)
+{
+ struct bio *bio = req->bio;
+
+ for_each_bio(bio) {
+ if (bio->bi_iter.bi_size > offset) {
+ iter->bio = bio;
+ iter->iter = bio->bi_iter;
+ bio_advance_iter(iter->bio, &iter->iter, offset);
+ return true;
+ }
+ offset -= bio->bi_iter.bi_size;
+ }
+ return false;
+}
+
/*
* Copy data between request pages and io_iter, and 'offset'
* is the start point of linear offset of request.
*/
static size_t ublk_copy_user_pages(const struct request *req,
- struct iov_iter *uiter, int dir)
+ unsigned offset, struct iov_iter *uiter, int dir)
{
- struct ublk_io_iter iter = {
- .bio = req->bio,
- .iter = req->bio->bi_iter,
- };
+ struct ublk_io_iter iter;
size_t done = 0;
+ if (!ublk_advance_io_iter(req, &iter, offset))
+ return 0;
+
while (iov_iter_count(uiter) && iter.bio) {
unsigned nr_pages;
size_t len, off;
@@ -570,7 +587,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes,
&iov, &iter);
- return ublk_copy_user_pages(req, &iter, dir);
+ return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
}
@@ -590,7 +607,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
import_single_range(dir, u64_to_user_ptr(io->addr), io->res,
&iov, &iter);
- return ublk_copy_user_pages(req, &iter, dir);
+ return ublk_copy_user_pages(req, 0, &iter, dir);
}
return rq_bytes;
}
From 62fe99cef94a5900cac3bf15fd03ee8baad1a99c Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 19 May 2023 14:50:29 +0800
Subject: [PATCH 031/266] ublk: add read()/write() support for ublk char device
Support pread()/pwrite() on the ublk char device for reading/writing the
request io buffer, so the data copy between the io request buffer and the
userspace buffer can be moved from the ublk driver to the ublk server.
UBLK_F_NEED_GET_DATA then becomes unnecessary, and the ublk server can
allocate buffers without one extra round of uring command communication for
userspace to provide the buffer.
The IO buffer can be located by iocb->ki_pos, which encodes the buffer
offset, io tag and queue id. Since iocb->ki_pos is a u64, it is big enough
to hold a reasonable queue depth, number of queues and max io buffer size.
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230519065030.351216-7-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 151 ++++++++++++++++++++++++++++++++++
include/uapi/linux/ublk_cmd.h | 22 ++++-
2 files changed, 172 insertions(+), 1 deletion(-)
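A sketch of how a ublk server might compute the pread()/pwrite() offset for a
given queue, tag and buffer offset. The UBLK_* constants are taken from the
ublk_cmd.h hunk below; the helper name is hypothetical.

#include <stdint.h>
#include <stdio.h>

#define UBLKSRV_IO_BUF_OFFSET   0x80000000ULL

#define UBLK_IO_BUF_OFF         0
#define UBLK_IO_BUF_BITS        25

#define UBLK_TAG_OFF            UBLK_IO_BUF_BITS
#define UBLK_TAG_BITS           16

#define UBLK_QID_OFF            (UBLK_TAG_OFF + UBLK_TAG_BITS)
#define UBLK_QID_BITS           12

/* Mirror of the driver-side ublk_pos_to_{hwq,tag,buf_off} decoding. */
static uint64_t ublk_io_buf_pos(uint16_t q_id, uint16_t tag, uint32_t buf_off)
{
        return UBLKSRV_IO_BUF_OFFSET +
                ((uint64_t)q_id << UBLK_QID_OFF) +
                ((uint64_t)tag << UBLK_TAG_OFF) +
                buf_off;
}

int main(void)
{
        /* Byte 512 of the request with tag 3 on hardware queue 1. */
        printf("pos = 0x%llx\n",
               (unsigned long long)ublk_io_buf_pos(1, 3, 512));
        return 0;
}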
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 13523c37a165..ec40ac4f9af3 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -207,6 +207,23 @@ static unsigned int ublks_added; /* protected by ublk_ctl_mutex */
static struct miscdevice ublk_misc;
+static inline unsigned ublk_pos_to_hwq(loff_t pos)
+{
+ return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
+ UBLK_QID_BITS_MASK;
+}
+
+static inline unsigned ublk_pos_to_buf_off(loff_t pos)
+{
+ return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
+}
+
+static inline unsigned ublk_pos_to_tag(loff_t pos)
+{
+ return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
+ UBLK_TAG_BITS_MASK;
+}
+
static void ublk_dev_param_basic_apply(struct ublk_device *ub)
{
struct request_queue *q = ub->ub_disk->queue;
@@ -1429,6 +1446,36 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
return -EIOCBQUEUED;
}
+static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
+ struct ublk_queue *ubq, int tag, size_t offset)
+{
+ struct request *req;
+
+ if (!ublk_need_req_ref(ubq))
+ return NULL;
+
+ req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
+ if (!req)
+ return NULL;
+
+ if (!ublk_get_req_ref(ubq, req))
+ return NULL;
+
+ if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
+ goto fail_put;
+
+ if (!ublk_rq_has_data(req))
+ goto fail_put;
+
+ if (offset > blk_rq_bytes(req))
+ goto fail_put;
+
+ return req;
+fail_put:
+ ublk_put_req_ref(ubq, req);
+ return NULL;
+}
+
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
/*
@@ -1446,11 +1493,112 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
}
+static inline bool ublk_check_ubuf_dir(const struct request *req,
+ int ubuf_dir)
+{
+ /* copy ubuf to request pages */
+ if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
+ return true;
+
+ /* copy request pages to ubuf */
+ if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
+ return true;
+
+ return false;
+}
+
+static struct request *ublk_check_and_get_req(struct kiocb *iocb,
+ struct iov_iter *iter, size_t *off, int dir)
+{
+ struct ublk_device *ub = iocb->ki_filp->private_data;
+ struct ublk_queue *ubq;
+ struct request *req;
+ size_t buf_off;
+ u16 tag, q_id;
+
+ if (!ub)
+ return ERR_PTR(-EACCES);
+
+ if (!user_backed_iter(iter))
+ return ERR_PTR(-EACCES);
+
+ if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+ return ERR_PTR(-EACCES);
+
+ tag = ublk_pos_to_tag(iocb->ki_pos);
+ q_id = ublk_pos_to_hwq(iocb->ki_pos);
+ buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
+
+ if (q_id >= ub->dev_info.nr_hw_queues)
+ return ERR_PTR(-EINVAL);
+
+ ubq = ublk_get_queue(ub, q_id);
+ if (!ubq)
+ return ERR_PTR(-EINVAL);
+
+ if (tag >= ubq->q_depth)
+ return ERR_PTR(-EINVAL);
+
+ req = __ublk_check_and_get_req(ub, ubq, tag, buf_off);
+ if (!req)
+ return ERR_PTR(-EINVAL);
+
+ if (!req->mq_hctx || !req->mq_hctx->driver_data)
+ goto fail;
+
+ if (!ublk_check_ubuf_dir(req, dir))
+ goto fail;
+
+ *off = buf_off;
+ return req;
+fail:
+ ublk_put_req_ref(ubq, req);
+ return ERR_PTR(-EACCES);
+}
+
+static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct ublk_queue *ubq;
+ struct request *req;
+ size_t buf_off;
+ size_t ret;
+
+ req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
+ ubq = req->mq_hctx->driver_data;
+ ublk_put_req_ref(ubq, req);
+
+ return ret;
+}
+
+static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct ublk_queue *ubq;
+ struct request *req;
+ size_t buf_off;
+ size_t ret;
+
+ req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
+ ubq = req->mq_hctx->driver_data;
+ ublk_put_req_ref(ubq, req);
+
+ return ret;
+}
+
static const struct file_operations ublk_ch_fops = {
.owner = THIS_MODULE,
.open = ublk_ch_open,
.release = ublk_ch_release,
.llseek = no_llseek,
+ .read_iter = ublk_ch_read_iter,
+ .write_iter = ublk_ch_write_iter,
.uring_cmd = ublk_ch_uring_cmd,
.mmap = ublk_ch_mmap,
};
@@ -2362,6 +2510,9 @@ static int __init ublk_init(void)
{
int ret;
+ BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
+ UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
+
init_waitqueue_head(&ublk_idr_wq);
ret = misc_register(&ublk_misc);
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index 640bf687b94a..c0c1632c671e 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -93,9 +93,29 @@
#define UBLKSRV_CMD_BUF_OFFSET 0
#define UBLKSRV_IO_BUF_OFFSET 0x80000000
-/* tag bit is 12bit, so at most 4096 IOs for each queue */
+/* tag bit is 16bit, so far limit at most 4096 IOs for each queue */
#define UBLK_MAX_QUEUE_DEPTH 4096
+/* single IO buffer max size is 32MB */
+#define UBLK_IO_BUF_OFF 0
+#define UBLK_IO_BUF_BITS 25
+#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1)
+
+/* so at most 64K IOs for each queue */
+#define UBLK_TAG_OFF UBLK_IO_BUF_BITS
+#define UBLK_TAG_BITS 16
+#define UBLK_TAG_BITS_MASK ((1ULL << UBLK_TAG_BITS) - 1)
+
+/* max 4096 queues */
+#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS)
+#define UBLK_QID_BITS 12
+#define UBLK_QID_BITS_MASK ((1ULL << UBLK_QID_BITS) - 1)
+
+#define UBLK_MAX_NR_QUEUES (1U << UBLK_QID_BITS)
+
+#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS)
+#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS)
+
/*
* zero copy requires 4k block size, and can remap ublk driver's io
* request into ublksrv's vm space
From 1172d5b8beca6b899deb9f7f2850e7e47ec16198 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 19 May 2023 14:50:30 +0800
Subject: [PATCH 032/266] ublk: support user copy
Currently the copy between the io request buffer (pages) and the userspace
buffer is done inside ublk_map_io() or ublk_unmap_io(). This works very
well with a pre-allocated userspace io buffer.
For dynamically allocated or external userspace backend io buffers,
UBLK_F_NEED_GET_DATA was added so that the ublk server can provide a buffer
via one extra command communication for WRITE requests. For READ, userspace
simply provides a buffer, but can't know when the kernel is done with it[1].
Add UBLK_F_USER_COPY, which moves the io data copy out of the kernel by
providing read()/write() on /dev/ublkcN and simply lets the ublk server do
the copy. This makes both sides cleaner; the cost is one extra syscall to
copy io data between the request and the backend buffer.
With UBLK_F_USER_COPY it also becomes possible to do per-io zero copy, for
example only for large IO, so this can be seen as a prep patch for zero-copy
support. Zero copy will still need to expose the read()/write() buffer for
some corner cases, such as passthrough IO.
[1] READ buffer in UBLK_F_NEED_GET_DATA
https://lore.kernel.org/linux-block/116d8a56-0881-56d3-9bcc-78ff3e1dc4e5@linux.alibaba.com/T/#m23bd4b8634c0a054e6797063167b469949a247bb
ublksrv loop usercopy code:
https://github.com/ming1/ubdsrv/commits/usercopy
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230519065030.351216-8-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 58 ++++++++++++++++++++++++++++-------
include/uapi/linux/ublk_cmd.h | 3 ++
2 files changed, 50 insertions(+), 11 deletions(-)
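A sketch of the server-side copy under UBLK_F_USER_COPY, assuming 'pos' is
computed as in the previous patch and 'devfd' is an open /dev/ublkcN
descriptor; error handling and buffer management are intentionally minimal.

#include <sys/types.h>
#include <unistd.h>

/* Fetch the payload of a WRITE request into the backend buffer. */
static ssize_t fetch_write_payload(int devfd, void *buf, size_t len, off_t pos)
{
        return pread(devfd, buf, len, pos);
}

/* Provide the payload for a READ request from the backend buffer. */
static ssize_t fill_read_payload(int devfd, const void *buf, size_t len,
                                 off_t pos)
{
        return pwrite(devfd, buf, len, pos);
}

int main(void)
{
        /*
         * Usage outline only: open /dev/ublkcN, receive an io command with
         * its queue id and tag, compute 'pos', then call one of the helpers
         * above before completing the command with COMMIT_AND_FETCH_REQ.
         */
        (void)fetch_write_payload;
        (void)fill_read_payload;
        return 0;
}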
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index ec40ac4f9af3..e00733b6fea8 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -55,7 +55,8 @@
| UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
| UBLK_F_UNPRIVILEGED_DEV \
- | UBLK_F_CMD_IOCTL_ENCODE)
+ | UBLK_F_CMD_IOCTL_ENCODE \
+ | UBLK_F_USER_COPY)
/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
@@ -312,9 +313,18 @@ static int ublk_apply_params(struct ublk_device *ub)
return 0;
}
+static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
+{
+ return ubq->flags & UBLK_F_USER_COPY;
+}
+
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
{
- return false;
+ /*
+ * read()/write() is involved in user copy, so request reference
+ * has to be grabbed
+ */
+ return ublk_support_user_copy(ubq);
}
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
@@ -591,6 +601,9 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ublk_support_user_copy(ubq))
+ return rq_bytes;
+
/*
* no zero copy, we delay copy WRITE request data into ublksrv
* context and the big benefit is that pinning pages in current
@@ -615,6 +628,9 @@ static int ublk_unmap_io(const struct ublk_queue *ubq,
{
const unsigned int rq_bytes = blk_rq_bytes(req);
+ if (ublk_support_user_copy(ubq))
+ return rq_bytes;
+
if (ublk_need_unmap_req(req)) {
struct iov_iter iter;
struct iovec iov;
@@ -1390,6 +1406,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
goto out;
+ if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = ublk_check_cmd_op(cmd_op);
if (ret)
goto out;
@@ -1408,23 +1429,34 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
*/
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
goto out;
- /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */
- if (!ub_cmd->addr && !ublk_need_get_data(ubq))
- goto out;
+
+ if (!ublk_support_user_copy(ubq)) {
+ /*
+ * FETCH_RQ has to provide IO buffer if NEED GET
+ * DATA is not enabled
+ */
+ if (!ub_cmd->addr && !ublk_need_get_data(ubq))
+ goto out;
+ }
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_mark_io_ready(ub, ubq);
break;
case UBLK_IO_COMMIT_AND_FETCH_REQ:
req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
- /*
- * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is
- * not enabled or it is Read IO.
- */
- if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ))
- goto out;
+
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
goto out;
+
+ if (!ublk_support_user_copy(ubq)) {
+ /*
+ * COMMIT_AND_FETCH_REQ has to provide IO buffer if
+ * NEED GET DATA is not enabled or it is Read IO.
+ */
+ if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
+ req_op(req) == REQ_OP_READ))
+ goto out;
+ }
ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
ublk_commit_completion(ub, ub_cmd);
break;
@@ -1996,6 +2028,10 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
UBLK_F_URING_CMD_COMP_IN_TASK;
+ /* GET_DATA isn't needed any more with USER_COPY */
+ if (ub->dev_info.flags & UBLK_F_USER_COPY)
+ ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
+
/* We are not ready to support zero copy */
ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index c0c1632c671e..54b5b0aeefca 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -165,6 +165,9 @@
/* use ioctl encoding for uring command */
#define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6)
+/* Copy between request and user buffer by pread()/pwrite() */
+#define UBLK_F_USER_COPY (1UL << 7)
+
/* device state */
#define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1
From f80dd11dd1d07ada04a9fcd57032fa82e136462d Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Fri, 19 May 2023 15:03:46 -0700
Subject: [PATCH 033/266] block: BFQ: Move an invariant check
Check bfqq->dispatched for each BFQ queue instead of checking it through a
potentially invalid bfqq pointer.
Fixes: 3e49c1e4a615 ("block: BFQ: Add several invariant checks")
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20230519220347.3643295-1-bvanassche@acm.org
Signed-off-by: Jens Axboe
---
block/bfq-iosched.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index c5727afad159..09bbbcf9e049 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5405,6 +5405,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
WARN_ON_ONCE(!list_empty(&bfqq->fifo));
WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list));
+ WARN_ON_ONCE(bfqq->dispatched);
kmem_cache_free(bfq_pool, bfqq);
bfqg_and_blkg_put(bfqg);
@@ -7150,7 +7151,6 @@ static void bfq_exit_queue(struct elevator_queue *e)
for (actuator = 0; actuator < bfqd->num_actuators; actuator++)
WARN_ON_ONCE(bfqd->rq_in_driver[actuator]);
WARN_ON_ONCE(bfqd->tot_rq_in_driver);
- WARN_ON_ONCE(bfqq->dispatched);
hrtimer_cancel(&bfqd->idle_slice_timer);
From 712c7364655f69827d0b96f69594886ecbfb412f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 20 May 2023 06:45:03 +0200
Subject: [PATCH 034/266] block: don't plug in blkdev_write_iter
For direct I/O writes that issues more than a single bio, the plugging
is already done in __blkdev_direct_IO.
For synchronous buffered writes the plugging is done deep down in
writeback_inodes_wb / wb_writeback.
For the other cases there is no point in plugging, as a single bio or no
bio at all is submitted.
Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20230520044503.334444-1-hch@lst.de
Signed-off-by: Jens Axboe
---
block/fops.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/block/fops.c b/block/fops.c
index d2e6be4e3d1c..102ee85fc6ee 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -520,7 +520,6 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *bd_inode = bdev->bd_inode;
loff_t size = bdev_nr_bytes(bdev);
- struct blk_plug plug;
size_t shorted = 0;
ssize_t ret;
@@ -545,12 +544,10 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
iov_iter_truncate(from, size);
}
- blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
iov_iter_reexpand(from, iov_iter_count(from) + shorted);
- blk_finish_plug(&plug);
return ret;
}
From b8b637d770ef7aa9bc3971670cc8532b1f0d757e Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sat, 20 May 2023 23:11:34 +0800
Subject: [PATCH 035/266] ublk: fix build warning on iov_iter_get_pages2
The return type of iov_iter_get_pages2() is ssize_t rather than size_t, so
use the correct type for the local variable.
Fixes: 981f95a571e3 ("ublk: cleanup ublk_copy_user_pages")
Reported-by: kernel test robot
Reported-by: Julia Lawall
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230520151134.459679-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
drivers/block/ublk_drv.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index e00733b6fea8..539eada32861 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -564,7 +564,8 @@ static size_t ublk_copy_user_pages(const struct request *req,
while (iov_iter_count(uiter) && iter.bio) {
unsigned nr_pages;
- size_t len, off;
+ ssize_t len;
+ size_t off;
int i;
len = iov_iter_get_pages2(uiter, iter.pages,
From 712fd23a90eed6a73ea5135a500e59d30356d4f1 Mon Sep 17 00:00:00 2001
From: Li Nan
Date: Mon, 22 May 2023 16:53:55 +0800
Subject: [PATCH 036/266] block: remove redundant req_op in
blk_rq_is_passthrough
The 'op &= REQ_OP_MASK' in blk_op_is_passthrough() is exactly what req_op()
does, so it is redundant to call req_op() before blk_op_is_passthrough().
Signed-off-by: Li Nan
Reviewed-by: Christoph Hellwig
Reviewed-by: Chaitanya Kulkarni
Link: https://lore.kernel.org/r/20230522085355.1740772-1-linan666@huaweicloud.com
Signed-off-by: Jens Axboe
---
include/linux/blk-mq.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
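A small illustration of the redundancy: blk_op_is_passthrough() masks with
REQ_OP_MASK itself, and masking is idempotent, so applying req_op() first
changes nothing. The mask value is a stand-in.

#include <assert.h>
#include <stdio.h>

#define REQ_OP_MASK 0xffu

int main(void)
{
        unsigned int cmd_flags = 0x12345678u;   /* arbitrary op + modifiers */

        assert(((cmd_flags & REQ_OP_MASK) & REQ_OP_MASK) ==
               (cmd_flags & REQ_OP_MASK));
        printf("op = 0x%x\n", cmd_flags & REQ_OP_MASK);
        return 0;
}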
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d778cb6b2112..59b52ec155b1 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -201,7 +201,7 @@ static inline enum req_op req_op(const struct request *req)
static inline bool blk_rq_is_passthrough(struct request *rq)
{
- return blk_op_is_passthrough(req_op(rq));
+ return blk_op_is_passthrough(rq->cmd_flags);
}
static inline unsigned short req_get_ioprio(struct request *req)
From a13bd91be22318768d55470cbc0b0f4488ef9edf Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Fri, 14 Apr 2023 16:40:08 +0800
Subject: [PATCH 037/266] block/rq_qos: protect rq_qos apis with a new lock
Commit 50e34d78815e ("block: disable the elevator int del_gendisk") moved
rq_qos_exit() from disk_release() to del_gendisk(), which introduces some
problems:
1) If rq_qos_add() is triggered by enabling iocost/iolatency through
cgroupfs, it can run concurrently with del_gendisk(), and it's not safe to
write 'q->rq_qos' concurrently.
2) Activating a cgroup policy that relies on rq_qos calls rq_qos_add() and
blkcg_activate_policy(); if rq_qos_exit() is called in the middle, a
null-ptr-dereference is triggered in blkcg_activate_policy().
3) blkg_conf_open_bdev() can call blkdev_get_no_open() first to find the
disk; if rq_qos_exit() from del_gendisk() completes before rq_qos_add(),
memory is leaked.
This patch adds a new disk-level mutex, 'rq_qos_mutex':
1) The lock protects rq_qos_exit() directly.
2) For wbt, which doesn't rely on blk-cgroup, rq_qos_add() is only called
from disk initialization for now, and wbt can't be destructed until
rq_qos_exit(), so it would be safe not to protect wbt for now. However, in
case dynamic rq_qos destruction is supported in the future, this patch also
protects rq_qos_add() from wbt_init() directly; this is enough because
blk-sysfs already synchronizes writers with disk removal.
3) For iocost and iolatency, in order to synchronize disk removal and
cgroup configuration, the lock is held after blkdev_get_no_open() from
blkg_conf_open_bdev(), and is released in blkg_conf_exit().
In order to fix the above memory leak, disk_live() is checked after
holding the new lock.
Fixes: 50e34d78815e ("block: disable the elevator int del_gendisk")
Signed-off-by: Yu Kuai
Acked-by: Tejun Heo
Link: https://lore.kernel.org/r/20230414084008.2085155-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
block/blk-cgroup.c | 9 +++++++++
block/blk-core.c | 1 +
block/blk-rq-qos.c | 20 ++++++--------------
block/blk-wbt.c | 2 ++
include/linux/blkdev.h | 1 +
5 files changed, 19 insertions(+), 14 deletions(-)
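A userspace analogue of the locking rule this patch establishes, with a
pthread mutex standing in for the new rq_qos_mutex and heavily simplified
types: additions to, removal from and teardown of the rq_qos chain are all
serialized by one per-queue mutex, so rq_qos_exit() can no longer race with
rq_qos_add() from cgroup configuration.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct rq_qos_sketch {
        const char *name;
        struct rq_qos_sketch *next;
};

struct queue_sketch {
        pthread_mutex_t rq_qos_mutex;
        struct rq_qos_sketch *rq_qos;
};

/* Caller must hold q->rq_qos_mutex (lockdep_assert_held() in the kernel). */
static int rq_qos_add_locked(struct queue_sketch *q, struct rq_qos_sketch *rqos)
{
        for (struct rq_qos_sketch *cur = q->rq_qos; cur; cur = cur->next)
                if (!strcmp(cur->name, rqos->name))
                        return -1;              /* already present: -EBUSY */
        rqos->next = q->rq_qos;
        q->rq_qos = rqos;
        return 0;
}

static void rq_qos_exit_sketch(struct queue_sketch *q)
{
        pthread_mutex_lock(&q->rq_qos_mutex);
        while (q->rq_qos) {
                struct rq_qos_sketch *rqos = q->rq_qos;

                q->rq_qos = rqos->next;
                printf("tearing down %s\n", rqos->name);
        }
        pthread_mutex_unlock(&q->rq_qos_mutex);
}

int main(void)
{
        struct queue_sketch q = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct rq_qos_sketch wbt = { "wbt", NULL };

        pthread_mutex_lock(&q.rq_qos_mutex);    /* as wbt_init() now does */
        rq_qos_add_locked(&q, &wbt);
        pthread_mutex_unlock(&q.rq_qos_mutex);

        rq_qos_exit_sketch(&q);                 /* as del_gendisk() does */
        return 0;
}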
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ce64dd73cfe..1c6716f51fff 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -748,6 +748,13 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
return -ENODEV;
}
+ mutex_lock(&bdev->bd_queue->rq_qos_mutex);
+ if (!disk_live(bdev->bd_disk)) {
+ blkdev_put_no_open(bdev);
+ mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
+ return -ENODEV;
+ }
+
ctx->body = input;
ctx->bdev = bdev;
return 0;
@@ -892,6 +899,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
*/
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_queue->queue_lock)
+ __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
if (ctx->blkg) {
spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
@@ -899,6 +907,7 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx)
}
if (ctx->bdev) {
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
blkdev_put_no_open(ctx->bdev);
ctx->body = NULL;
ctx->bdev = NULL;
diff --git a/block/blk-core.c b/block/blk-core.c
index 00c74330fa92..2ae22bebeb3e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -420,6 +420,7 @@ struct request_queue *blk_alloc_queue(int node_id)
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
+ mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index d8cc820a365e..167be74df4ee 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -288,11 +288,13 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
void rq_qos_exit(struct request_queue *q)
{
+ mutex_lock(&q->rq_qos_mutex);
while (q->rq_qos) {
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
}
+ mutex_unlock(&q->rq_qos_mutex);
}
int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
@@ -300,6 +302,8 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
{
struct request_queue *q = disk->queue;
+ lockdep_assert_held(&q->rq_qos_mutex);
+
rqos->disk = disk;
rqos->id = id;
rqos->ops = ops;
@@ -307,18 +311,13 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
/*
* No IO can be in-flight when adding rqos, so freeze queue, which
* is fine since we only support rq_qos for blk-mq queue.
- *
- * Reuse ->queue_lock for protecting against other concurrent
- * rq_qos adding/deleting
*/
blk_mq_freeze_queue(q);
- spin_lock_irq(&q->queue_lock);
if (rq_qos_id(q, rqos->id))
goto ebusy;
rqos->next = q->rq_qos;
q->rq_qos = rqos;
- spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
@@ -330,7 +329,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
return 0;
ebusy:
- spin_unlock_irq(&q->queue_lock);
blk_mq_unfreeze_queue(q);
return -EBUSY;
}
@@ -340,21 +338,15 @@ void rq_qos_del(struct rq_qos *rqos)
struct request_queue *q = rqos->disk->queue;
struct rq_qos **cur;
- /*
- * See comment in rq_qos_add() about freezing queue & using
- * ->queue_lock.
- */
- blk_mq_freeze_queue(q);
+ lockdep_assert_held(&q->rq_qos_mutex);
- spin_lock_irq(&q->queue_lock);
+ blk_mq_freeze_queue(q);
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
if (*cur == rqos) {
*cur = rqos->next;
break;
}
}
- spin_unlock_irq(&q->queue_lock);
-
blk_mq_unfreeze_queue(q);
mutex_lock(&q->debugfs_mutex);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index e49a48684532..53bf5aa6f9ad 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -942,7 +942,9 @@ int wbt_init(struct gendisk *disk)
/*
* Assign rwb and add the stats callback.
*/
+ mutex_lock(&q->rq_qos_mutex);
ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+ mutex_unlock(&q->rq_qos_mutex);
if (ret)
goto err_free;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fe99948688df..b2ac587e3402 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -392,6 +392,7 @@ struct request_queue {
struct blk_queue_stats *stats;
struct rq_qos *rq_qos;
+ struct mutex rq_qos_mutex;
const struct blk_mq_ops *mq_ops;
From 5a80bd075f3bce24793ae1aeb06066895ec5aef0 Mon Sep 17 00:00:00 2001
From: Hengqi Chen
Date: Sat, 20 May 2023 08:40:57 +0000
Subject: [PATCH 038/266] block: introduce block_io_start/block_io_done
tracepoints
Currently, several BCC ([0]) tools (biosnoop/biostacks/biotop) attach
kprobes to blk_account_io_start/blk_account_io_done to implement their
functionality. This is fragile because the target kernel functions may be
renamed ([1]) or inlined ([2]). So introduce two new tracepoints for such
use cases.
[0]: https://github.com/iovisor/bcc
[1]: https://github.com/iovisor/bcc/issues/3954
[2]: https://github.com/iovisor/bcc/issues/4261
Tested-by: Francis Laniel
Signed-off-by: Hengqi Chen
Tested-by: Yonghong Song
Link: https://lore.kernel.org/r/20230520084057.1467003-1-hengqi.chen@gmail.com
Signed-off-by: Jens Axboe
---
block/blk-mq.c | 4 ++++
include/trace/events/block.h | 26 ++++++++++++++++++++++++++
2 files changed, 30 insertions(+)
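A sketch of consuming the new tracepoints without kprobes, assuming tracefs
is mounted at /sys/kernel/tracing and the program runs with sufficient
privileges.

#include <stdio.h>
#include <stdlib.h>

static int write_file(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        char line[512];
        FILE *pipe;

        if (write_file("/sys/kernel/tracing/events/block/block_io_start/enable", "1") ||
            write_file("/sys/kernel/tracing/events/block/block_io_done/enable", "1")) {
                perror("enable tracepoints");
                return EXIT_FAILURE;
        }

        pipe = fopen("/sys/kernel/tracing/trace_pipe", "r");
        if (!pipe) {
                perror("trace_pipe");
                return EXIT_FAILURE;
        }
        while (fgets(line, sizeof(line), pipe)) /* blocks until events arrive */
                fputs(line, stdout);
        fclose(pipe);
        return 0;
}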
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 551e7760f45e..1749f5890606 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -962,6 +962,8 @@ EXPORT_SYMBOL_GPL(blk_update_request);
static inline void blk_account_io_done(struct request *req, u64 now)
{
+ trace_block_io_done(req);
+
/*
* Account IO completion. flush_rq isn't accounted as a
* normal IO on queueing nor completion. Accounting the
@@ -981,6 +983,8 @@ static inline void blk_account_io_done(struct request *req, u64 now)
static inline void blk_account_io_start(struct request *req)
{
+ trace_block_io_start(req);
+
if (blk_do_io_stat(req)) {
/*
* All non-passthrough requests are created from a bio with one
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 7f4dfbdf12a6..40e60c33cc6f 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -245,6 +245,32 @@ DEFINE_EVENT(block_rq, block_rq_merge,
TP_ARGS(rq)
);
+/**
+ * block_io_start - insert a request for execution
+ * @rq: block IO operation request
+ *
+ * Called when block operation request @rq is queued for execution
+ */
+DEFINE_EVENT(block_rq, block_io_start,
+
+ TP_PROTO(struct request *rq),
+
+ TP_ARGS(rq)
+);
+
+/**
+ * block_io_done - block IO operation request completed
+ * @rq: block IO operation request
+ *
+ * Called when block operation request @rq is completed
+ */
+DEFINE_EVENT(block_rq, block_io_done,
+
+ TP_PROTO(struct request *rq),
+
+ TP_ARGS(rq)
+);
+
/**
* block_bio_complete - completed all work on the block operation
* @q: queue holding the block operation
From a450f49708ea2ccabd1c5d2fe8a702ca5ef77941 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Mon, 22 May 2023 21:57:39 +0100
Subject: [PATCH 039/266] iomap: Don't get a reference on ZERO_PAGE for direct
I/O block zeroing
ZERO_PAGE can't go away, no need to hold an extra reference.
Signed-off-by: David Howells
Reviewed-by: David Hildenbrand
Reviewed-by: John Hubbard
Reviewed-by: Dave Chinner
Reviewed-by: Christoph Hellwig
cc: Al Viro
cc: linux-fsdevel@vger.kernel.org
Reviewed-by: Christian Brauner
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230522205744.2825689-2-dhowells@redhat.com
Signed-off-by: Jens Axboe
---
fs/iomap/direct-io.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 019cc87d0fb3..66a9f10e3207 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -203,7 +203,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- get_page(page);
+ bio_set_flag(bio, BIO_NO_PAGE_REF);
__bio_add_page(bio, page, len, 0);
iomap_dio_submit_bio(iter, dio, bio, pos);
}
From 09e8c253415b8eb9ca29a2131d2ebf17743534c5 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Mon, 22 May 2023 21:57:40 +0100
Subject: [PATCH 040/266] block: Fix bio_flagged() so that gcc can better
optimise it
Fix bio_flagged() so that multiple instances of it, such as:
if (bio_flagged(bio, BIO_PAGE_REFFED) ||
bio_flagged(bio, BIO_PAGE_PINNED))
can be combined by the gcc optimiser into a single test in assembly
(arguably, this is a compiler optimisation issue[1]).
The missed optimisation stems from bio_flagged() comparing the result of
the bitwise-AND to zero. This results in an out-of-line bio_release_page()
being compiled to something like:
<+0>: mov 0x14(%rdi),%eax
<+3>: test $0x1,%al
<+5>: jne 0xffffffff816dac53
<+7>: test $0x2,%al
<+9>: je 0xffffffff816dac5c
<+11>: movzbl %sil,%esi
<+15>: jmp 0xffffffff816daba1 <__bio_release_pages>
<+20>: jmp 0xffffffff81d0b800 <__x86_return_thunk>
However, the test is superfluous as the return type is bool. Removing it
results in:
<+0>: testb $0x3,0x14(%rdi)
<+4>: je 0xffffffff816e4af4
<+6>: movzbl %sil,%esi
<+10>: jmp 0xffffffff816dab7c <__bio_release_pages>
<+15>: jmp 0xffffffff81d0b7c0 <__x86_return_thunk>
instead.
Also, the MOVZBL instruction looks unnecessary[2] - I think it's just
're-booling' the mark_dirty parameter.
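A small stand-alone illustration of the pattern (the bio_like and flag_test
names are hypothetical, and whether the two tests are actually merged depends
on the compiler and optimisation level):

#include <stdbool.h>
#include <stdio.h>

struct bio_like {
	unsigned int flags;
};

/* Returning the masked value still converts to a strict 0/1 bool, but without
 * the explicit "!= 0" the compiler may keep the raw mask and merge adjacent
 * tests on the same word. */
static inline bool flag_test(const struct bio_like *b, unsigned int bit)
{
	return b->flags & (1U << bit);
}

int main(void)
{
	struct bio_like b = { .flags = 0x3 };

	if (flag_test(&b, 0) || flag_test(&b, 1))
		printf("flagged\n");
	return 0;
}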
Signed-off-by: David Howells
Reviewed-by: Christoph Hellwig
Reviewed-by: John Hubbard
cc: Jens Axboe
cc: linux-block@vger.kernel.org
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108370 [1]
Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108371 [2]
Link: https://lore.kernel.org/r/167391056756.2311931.356007731815807265.stgit@warthog.procyon.org.uk/ # v6
Reviewed-by: Christian Brauner
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230522205744.2825689-3-dhowells@redhat.com
Signed-off-by: Jens Axboe
---
include/linux/bio.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b3e7529ff55e..7f53be035cf0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -229,7 +229,7 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
static inline bool bio_flagged(struct bio *bio, unsigned int bit)
{
- return (bio->bi_flags & (1U << bit)) != 0;
+ return bio->bi_flags & (1U << bit);
}
static inline void bio_set_flag(struct bio *bio, unsigned int bit)
From e51bab4e20586fb3afc30536b776a97ed8ffb681 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 22 May 2023 21:57:41 +0100
Subject: [PATCH 041/266] block: Replace BIO_NO_PAGE_REF with BIO_PAGE_REFFED
with inverted logic
Replace BIO_NO_PAGE_REF with a BIO_PAGE_REFFED flag that has the inverted
meaning and is only set when a page reference has been acquired that needs
to be released by bio_release_pages().
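In practice every path that takes its own page references must now opt in,
roughly along these lines (illustrative sketch only; submit_user_pages() is a
hypothetical helper, not code from this series):

#include <linux/bio.h>
#include <linux/mm.h>

static int submit_user_pages(struct bio *bio, struct page **pages, int n)
{
	int i;

	/* We hold a reference on each page; let completion drop them. */
	bio_set_flag(bio, BIO_PAGE_REFFED);
	for (i = 0; i < n; i++) {
		get_page(pages[i]);
		if (!bio_add_page(bio, pages[i], PAGE_SIZE, 0)) {
			put_page(pages[i]);
			return -EINVAL;
		}
	}
	/* On release, bio_release_pages() sees BIO_PAGE_REFFED and puts them. */
	return 0;
}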
Signed-off-by: Christoph Hellwig
Signed-off-by: David Howells
Reviewed-by: John Hubbard
cc: Al Viro
cc: Jens Axboe
cc: Jan Kara
cc: Matthew Wilcox
cc: Logan Gunthorpe
cc: linux-block@vger.kernel.org
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230522205744.2825689-4-dhowells@redhat.com
Signed-off-by: Jens Axboe
---
block/bio.c | 2 +-
block/blk-map.c | 1 +
fs/direct-io.c | 2 ++
fs/iomap/direct-io.c | 1 -
include/linux/bio.h | 2 +-
include/linux/blk_types.h | 2 +-
6 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index 043944fd46eb..8516adeaea26 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1191,7 +1191,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset;
bio->bi_iter.bi_size = size;
- bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_CLONED);
}
@@ -1336,6 +1335,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return 0;
}
+ bio_set_flag(bio, BIO_PAGE_REFFED);
do {
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
diff --git a/block/blk-map.c b/block/blk-map.c
index 04c55f1c492e..33d9f6e89ba6 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -282,6 +282,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (blk_queue_pci_p2pdma(rq->q))
extraction_flags |= ITER_ALLOW_P2PDMA;
+ bio_set_flag(bio, BIO_PAGE_REFFED);
while (iov_iter_count(iter)) {
struct page **pages, *stack_pages[UIO_FASTIOV];
ssize_t bytes;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0b380bb8a81e..ad20f3428bab 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -402,6 +402,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_end_io = dio_bio_end_aio;
else
bio->bi_end_io = dio_bio_end_io;
+ /* for now require references for all pages */
+ bio_set_flag(bio, BIO_PAGE_REFFED);
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 66a9f10e3207..08873f0627dd 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -203,7 +203,6 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
- bio_set_flag(bio, BIO_NO_PAGE_REF);
__bio_add_page(bio, page, len, 0);
iomap_dio_submit_bio(iter, dio, bio, pos);
}
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7f53be035cf0..0922729acd26 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -488,7 +488,7 @@ void zero_fill_bio(struct bio *bio);
static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
{
- if (!bio_flagged(bio, BIO_NO_PAGE_REF))
+ if (bio_flagged(bio, BIO_PAGE_REFFED))
__bio_release_pages(bio, mark_dirty);
}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 740afe80f297..dfd2c2cb909d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -323,7 +323,7 @@ struct bio {
* bio flags
*/
enum {
- BIO_NO_PAGE_REF, /* don't put release vec pages */
+ BIO_PAGE_REFFED, /* put pages in bio_release_pages() */
BIO_CLONED, /* doesn't own data */
BIO_BOUNCED, /* bio is a bounce bio */
BIO_QUIET, /* Make BIO Quiet */
From fd363244e883323e1ac9412d96fd22b51e255b0c Mon Sep 17 00:00:00 2001
From: David Howells
Date: Mon, 22 May 2023 21:57:42 +0100
Subject: [PATCH 042/266] block: Add BIO_PAGE_PINNED and associated
infrastructure
Add BIO_PAGE_PINNED to indicate that the pages in a bio are pinned
(FOLL_PIN) and that the pin will need removing.
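The release side can then distinguish pinned pages from merely referenced
ones, roughly as in the sketch below (assumed helper shape; the actual
bio_release_page() conversion lands later in this series):

static void bio_release_page(struct bio *bio, struct page *page)
{
	if (bio_flagged(bio, BIO_PAGE_PINNED))
		unpin_user_page(page);		/* FOLL_PIN: drop the pin */
	else if (bio_flagged(bio, BIO_PAGE_REFFED))
		put_page(page);			/* plain page reference */
}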
Signed-off-by: David Howells
Reviewed-by: Christoph Hellwig
Reviewed-by: John Hubbard
cc: Al Viro
cc: Jens Axboe
cc: Jan Kara
cc: Matthew Wilcox
cc: Logan Gunthorpe
cc: linux-block@vger.kernel.org
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230522205744.2825689-5-dhowells@redhat.com
Signed-off-by: Jens Axboe