From 786bb02458819df7a833361c6c7448a4925a89ce Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 11 May 2023 14:15:44 +0200 Subject: [PATCH 001/266] brd: use XArray instead of radix-tree to index backing pages XArray was introduced to hold large array of pointers with a simple API. XArray API also provides array semantics which simplifies the way we store and access the backing pages, and the code becomes significantly easier to understand. No performance difference was noticed between the two implementation using fio with direct=1 [1]. [1] Performance in KIOPS: | radix-tree | XArray | Diff | | | write | 315 | 313 | -0.6% randwrite | 286 | 290 | +1.3% read | 330 | 335 | +1.5% randread | 309 | 312 | +0.9% Signed-off-by: Pankaj Raghav Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20230511121544.111648-1-p.raghav@samsung.com Signed-off-by: Jens Axboe --- drivers/block/brd.c | 93 ++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 69 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index bcad9b926b0c..2f71376afc71 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -28,7 +28,7 @@ #include /* - * Each block ramdisk device has a radix_tree brd_pages of pages that stores + * Each block ramdisk device has a xarray brd_pages of pages that stores * the pages containing the block device's contents. A brd page's ->index is * its offset in PAGE_SIZE units. This is similar to, but in no way connected * with, the kernel's pagecache or buffer cache (which sit above our block @@ -40,11 +40,9 @@ struct brd_device { struct list_head brd_list; /* - * Backing store of pages and lock to protect it. This is the contents - * of the block device. + * Backing store of pages. This is the contents of the block device. */ - spinlock_t brd_lock; - struct radix_tree_root brd_pages; + struct xarray brd_pages; u64 brd_nr_pages; }; @@ -56,21 +54,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) pgoff_t idx; struct page *page; - /* - * The page lifetime is protected by the fact that we have opened the - * device node -- brd pages will never be deleted under us, so we - * don't need any further locking or refcounting. - * - * This is strictly true for the radix-tree nodes as well (ie. we - * don't actually need the rcu_read_lock()), however that is not a - * documented feature of the radix-tree API so it is better to be - * safe here (we don't have total exclusion from radix tree updates - * here, only deletes). - */ - rcu_read_lock(); idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ - page = radix_tree_lookup(&brd->brd_pages, idx); - rcu_read_unlock(); + page = xa_load(&brd->brd_pages, idx); BUG_ON(page && page->index != idx); @@ -83,7 +68,7 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) { pgoff_t idx; - struct page *page; + struct page *page, *cur; int ret = 0; page = brd_lookup_page(brd, sector); @@ -94,71 +79,42 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) if (!page) return -ENOMEM; - if (radix_tree_maybe_preload(gfp)) { - __free_page(page); - return -ENOMEM; - } + xa_lock(&brd->brd_pages); - spin_lock(&brd->brd_lock); idx = sector >> PAGE_SECTORS_SHIFT; page->index = idx; - if (radix_tree_insert(&brd->brd_pages, idx, page)) { + + cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp); + + if (unlikely(cur)) { __free_page(page); - page = radix_tree_lookup(&brd->brd_pages, idx); - if (!page) - ret = -ENOMEM; - else if (page->index != idx) + ret = xa_err(cur); + if (!ret && (cur->index != idx)) ret = -EIO; } else { brd->brd_nr_pages++; } - spin_unlock(&brd->brd_lock); - radix_tree_preload_end(); + xa_unlock(&brd->brd_pages); + return ret; } /* - * Free all backing store pages and radix tree. This must only be called when + * Free all backing store pages and xarray. This must only be called when * there are no other users of the device. */ -#define FREE_BATCH 16 static void brd_free_pages(struct brd_device *brd) { - unsigned long pos = 0; - struct page *pages[FREE_BATCH]; - int nr_pages; + struct page *page; + pgoff_t idx; - do { - int i; + xa_for_each(&brd->brd_pages, idx, page) { + __free_page(page); + cond_resched_rcu(); + } - nr_pages = radix_tree_gang_lookup(&brd->brd_pages, - (void **)pages, pos, FREE_BATCH); - - for (i = 0; i < nr_pages; i++) { - void *ret; - - BUG_ON(pages[i]->index < pos); - pos = pages[i]->index; - ret = radix_tree_delete(&brd->brd_pages, pos); - BUG_ON(!ret || ret != pages[i]); - __free_page(pages[i]); - } - - pos++; - - /* - * It takes 3.4 seconds to remove 80GiB ramdisk. - * So, we need cond_resched to avoid stalling the CPU. - */ - cond_resched(); - - /* - * This assumes radix_tree_gang_lookup always returns as - * many pages as possible. If the radix-tree code changes, - * so will this have to. - */ - } while (nr_pages == FREE_BATCH); + xa_destroy(&brd->brd_pages); } /* @@ -372,8 +328,7 @@ static int brd_alloc(int i) brd->brd_number = i; list_add_tail(&brd->brd_list, &brd_devices); - spin_lock_init(&brd->brd_lock); - INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); + xa_init(&brd->brd_pages); snprintf(buf, DISK_NAME_LEN, "ram%d", i); if (!IS_ERR_OR_NULL(brd_debugfs_dir)) From d5fb8726f1dea70543a93ab1d7332857f157b7f3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 18 May 2023 15:27:08 -0700 Subject: [PATCH 002/266] block: Decode all flag names in the debugfs output See also: * Commit 4d337cebcb1c ("blk-mq: avoid to touch q->elevator without any protection"). * Commit 414dd48e882c ("blk-mq: add tagset quiesce interface"). Cc: Christoph Hellwig Cc: Damien Le Moal Cc: Ming Lei Cc: Chaitanya Kulkarni Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230518222708.1190867-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index d23a8554ec4a..f89865a90dba 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -88,6 +88,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), + QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(STABLE_WRITES), @@ -103,6 +104,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(NOWAIT), + QUEUE_FLAG_NAME(SQ_SCHED), + QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), }; #undef QUEUE_FLAG_NAME From d97217e7f024bbe9aa62aea070771234c2879358 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 18 May 2023 07:30:59 +0200 Subject: [PATCH 003/266] blk-mq: don't queue plugged passthrough requests into scheduler Passthrough requests should never be queued to the I/O scheduler, as scheduling these opaque requests doesn't make sense, and I/O schedulers might require req->bio to be always valid. We never let passthrough requests insert into the scheduler before commit 1c2d2fff6dc0 ("block: wire-up support for passthrough plugging"), restore this behavior even for passthrough requests issued under a plug. [hch: use blk_mq_insert_requests for passthrough requests, fix up the commit message and comments] Reported-by: Guangwu Zhang Closes: https://lore.kernel.org/linux-block/CAGS2=YosaYaUTEMU3uaf+y=8MqSrhL7sYsJn8EwbaM=76p_4Qg@mail.gmail.com/ Investigated-by: Yu Kuai Fixes: 1c2d2fff6dc0 ("block: wire-up support for passthrough plugging") Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230518053101.760632-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f6dad0886a2f..8b7e4daaa5b7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2711,6 +2711,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) struct request *requeue_list = NULL; struct request **requeue_lastp = &requeue_list; unsigned int depth = 0; + bool is_passthrough = false; LIST_HEAD(list); do { @@ -2719,7 +2720,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + is_passthrough = blk_rq_is_passthrough(rq); + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || + is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_lastp, rq); continue; } @@ -2731,7 +2734,8 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); - if (this_hctx->queue->elevator) { + /* passthrough requests should never be issued to the I/O scheduler */ + if (this_hctx->queue->elevator && !is_passthrough) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); From fdcab6cddef24a26b86d798814b3c25057e53c21 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 May 2023 07:31:00 +0200 Subject: [PATCH 004/266] blk-mq: remove RQF_ELVPRIV RQF_ELVPRIV is set for all non-flush requests that have RQF_ELV set. Expand this condition in the two users of the flag and remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230518053101.760632-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/blk-mq-sched.h | 4 ++-- block/blk-mq.c | 6 ++---- include/linux/blk-mq.h | 2 -- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index f89865a90dba..ae1b3080b62b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -251,7 +251,6 @@ static const char *const rqf_name[] = { RQF_NAME(DONTPREP), RQF_NAME(FAILED), RQF_NAME(QUIET), - RQF_NAME(ELVPRIV), RQF_NAME(IO_STAT), RQF_NAME(PM), RQF_NAME(HASHED), diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 7c3cbad17f30..4d8d2cd3b473 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -58,11 +58,11 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) static inline void blk_mq_sched_requeue_request(struct request *rq) { - if (rq->rq_flags & RQF_ELV) { + if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags)) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request) + if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } diff --git a/block/blk-mq.c b/block/blk-mq.c index 8b7e4daaa5b7..7470c6636dc4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -393,10 +393,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); if (!op_is_flush(data->cmd_flags) && - e->type->ops.prepare_request) { + e->type->ops.prepare_request) e->type->ops.prepare_request(rq); - rq->rq_flags |= RQF_ELVPRIV; - } } return rq; @@ -696,7 +694,7 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if ((rq->rq_flags & RQF_ELVPRIV) && + if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags) && q->elevator->type->ops.finish_request) q->elevator->type->ops.finish_request(rq); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 06caacd77ed6..5529e7d28ae6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -42,8 +42,6 @@ typedef __u32 __bitwise req_flags_t; #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ #define RQF_QUIET ((__force req_flags_t)(1 << 11)) -/* elevator private data attached */ -#define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) /* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) /* runtime pm request */ From dd6216bb16e83e349d5d987227328031b0b0d30d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 18 May 2023 07:31:01 +0200 Subject: [PATCH 005/266] blk-mq: make sure elevator callbacks aren't called for passthrough request In case of q->elevator, passthrough request can still be marked as RQF_ELV, so some elevator callbacks will be called for them. Fix this by splitting RQF_SCHED_TAGS, which is set for all requests that are issued on a queue that uses an I/O scheduler, and RQF_USE_SCHED for non-flush, non-passthrough requests on such a queue. Roughly based on two different patches from Ming Lei . Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230518053101.760632-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 3 ++- block/blk-mq-sched.h | 6 ++--- block/blk-mq.c | 53 +++++++++++++++++++++++------------------- block/blk-mq.h | 6 ++--- include/linux/blk-mq.h | 12 ++++++---- 5 files changed, 44 insertions(+), 36 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index ae1b3080b62b..22e39b9a77ec 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -249,6 +249,8 @@ static const char *const rqf_name[] = { RQF_NAME(MIXED_MERGE), RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), + RQF_NAME(SCHED_TAGS), + RQF_NAME(USE_SCHED), RQF_NAME(FAILED), RQF_NAME(QUIET), RQF_NAME(IO_STAT), @@ -258,7 +260,6 @@ static const char *const rqf_name[] = { RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(TIMED_OUT), - RQF_NAME(ELV), RQF_NAME(RESV), }; #undef RQF_NAME diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 4d8d2cd3b473..1326526bb733 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -37,7 +37,7 @@ static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (rq->rq_flags & RQF_ELV) { + if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) @@ -48,7 +48,7 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { - if (rq->rq_flags & RQF_ELV) { + if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request) @@ -58,7 +58,7 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) static inline void blk_mq_sched_requeue_request(struct request *rq) { - if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags)) { + if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; diff --git a/block/blk-mq.c b/block/blk-mq.c index 7470c6636dc4..e021740154fe 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -354,12 +354,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, data->rq_flags |= RQF_IO_STAT; rq->rq_flags = data->rq_flags; - if (!(data->rq_flags & RQF_ELV)) { - rq->tag = tag; - rq->internal_tag = BLK_MQ_NO_TAG; - } else { + if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; + } else { + rq->tag = tag; + rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; @@ -386,14 +386,13 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); - if (rq->rq_flags & RQF_ELV) { + if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - if (!op_is_flush(data->cmd_flags) && - e->type->ops.prepare_request) + if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); } @@ -447,26 +446,32 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) data->flags |= BLK_MQ_REQ_NOWAIT; if (q->elevator) { - struct elevator_queue *e = q->elevator; - - data->rq_flags |= RQF_ELV; + /* + * All requests use scheduler tags when an I/O scheduler is + * enabled for the queue. + */ + data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the - * dispatch list. Don't include reserved tags in the - * limiting, as it isn't useful. + * dispatch list. */ if (!op_is_flush(data->cmd_flags) && - !blk_op_is_passthrough(data->cmd_flags) && - e->type->ops.limit_depth && - !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.limit_depth(data->cmd_flags, data); + !blk_op_is_passthrough(data->cmd_flags)) { + struct elevator_mq_ops *ops = &q->elevator->type->ops; + + WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); + + data->rq_flags |= RQF_USE_SCHED; + if (ops->limit_depth) + ops->limit_depth(data->cmd_flags, data); + } } retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->rq_flags & RQF_ELV)) + if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_tag_busy(data->hctx); if (data->flags & BLK_MQ_REQ_RESERVED) @@ -646,10 +651,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); - if (!q->elevator) - blk_mq_tag_busy(data.hctx); + if (q->elevator) + data.rq_flags |= RQF_SCHED_TAGS; else - data.rq_flags |= RQF_ELV; + blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; @@ -694,7 +699,7 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if ((rq->rq_flags & RQF_ELV) && !op_is_flush(rq->cmd_flags) && + if ((rq->rq_flags & RQF_USE_SCHED) && q->elevator->type->ops.finish_request) q->elevator->type->ops.finish_request(rq); @@ -1268,7 +1273,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + if (!plug->has_elevator && (rq->rq_flags & RQF_USE_SCHED)) plug->has_elevator = true; rq->rq_next = NULL; rq_list_add(&plug->mq_list, rq); @@ -2620,7 +2625,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, return; } - if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) { + if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return; @@ -2983,7 +2988,7 @@ void blk_mq_submit_bio(struct bio *bio) } hctx = rq->mq_hctx; - if ((rq->rq_flags & RQF_ELV) || + if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); diff --git a/block/blk-mq.h b/block/blk-mq.h index e876584d3516..d15981db34b9 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -226,9 +226,9 @@ static inline bool blk_mq_is_shared_tags(unsigned int flags) static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (!(data->rq_flags & RQF_ELV)) - return data->hctx->tags; - return data->hctx->sched_tags; + if (data->rq_flags & RQF_SCHED_TAGS) + return data->hctx->sched_tags; + return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5529e7d28ae6..e4a211957db6 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -38,6 +38,10 @@ typedef __u32 __bitwise req_flags_t; #define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) /* don't call prep for this one */ #define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) +/* use hctx->sched_tags */ +#define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8)) +/* use an I/O scheduler for this request */ +#define RQF_USE_SCHED ((__force req_flags_t)(1 << 9)) /* vaguely specified driver internal error. Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ @@ -57,9 +61,7 @@ typedef __u32 __bitwise req_flags_t; #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) -/* queue has elevator attached */ -#define RQF_ELV ((__force req_flags_t)(1 << 22)) -#define RQF_RESV ((__force req_flags_t)(1 << 23)) +#define RQF_RESV ((__force req_flags_t)(1 << 23)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ @@ -842,7 +844,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib); */ static inline bool blk_mq_need_time_stamp(struct request *rq) { - return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_ELV)); + return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } static inline bool blk_mq_is_reserved_rq(struct request *rq) @@ -858,7 +860,7 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_ELV) || ioerror || + if (!iob || (req->rq_flags & RQF_USE_SCHED) || ioerror || (req->end_io && !blk_rq_is_passthrough(req))) return false; From 45b46b6f157169b452772430566772506e25687a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:19 -0700 Subject: [PATCH 006/266] block: mq-deadline: Add a word in a source code comment Add the missing word "and". Cc: Damien Le Moal Suggested-by: Damien Le Moal Fixes: 945ffb60c11d ("mq-deadline: add blk-mq adaptation of the deadline IO scheduler") Signed-off-by: Bart Van Assche Tested-by: Damien Le Moal Link: https://lore.kernel.org/r/20230517174230.897144-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 5839a027e0f0..cea1b084c69e 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -443,7 +443,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, */ rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) - /* we have a next request are still entitled to batch */ + /* we have a next request and are still entitled to batch */ goto dispatch_request; /* From 4f51644ccff1e4bf159e86da3d9695a1a33ca231 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:20 -0700 Subject: [PATCH 007/266] block: Simplify blk_req_needs_zone_write_lock() Remove the blk_rq_is_passthrough() check because it is redundant: blk_req_needs_zone_write_lock() also calls bdev_op_is_zoned_write() and the latter function returns false for pass-through requests. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230517174230.897144-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index fce9082384d6..835d9e937d4d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -57,9 +57,6 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str); */ bool blk_req_needs_zone_write_lock(struct request *rq) { - if (blk_rq_is_passthrough(rq)) - return false; - if (!rq->q->disk->seq_zones_wlock) return false; From 3ddbe2a7e0d4a155a805f69c906c9beed30d4cc4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:21 -0700 Subject: [PATCH 008/266] block: Fix the type of the second bdev_op_is_zoned_write() argument Change the type of the second argument of bdev_op_is_zoned_write() from blk_opf_t into enum req_op because this function expects an operation without flags as second argument. Reviewed-by: Johannes Thumshirn Reviewed-by: Pankaj Raghav Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Cc: Ming Lei Fixes: 8cafdb5ab94c ("block: adapt blk_mq_plug() to not plug for writes that require a zone lock") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230517174230.897144-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b441e633f4dd..db24cf98ccfb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1282,7 +1282,7 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) } static inline bool bdev_op_is_zoned_write(struct block_device *bdev, - blk_opf_t op) + enum req_op op) { if (!bdev_is_zoned(bdev)) return false; From a370798201b537f78288e4ef5e0f7fc70889e7ee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:22 -0700 Subject: [PATCH 009/266] block: Introduce op_needs_zoned_write_locking() Introduce a helper function for checking whether write serialization is required if the operation will be sent to a zoned device. A second caller for op_needs_zoned_write_locking() will be introduced in the next patch in this series. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Damien Le Moal Cc: Ming Lei Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230517174230.897144-5-bvanassche@acm.org Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index db24cf98ccfb..3952c52d6cd1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1281,13 +1281,16 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) return disk_zone_no(bdev->bd_disk, sec); } +/* Whether write serialization is required for @op on zoned devices. */ +static inline bool op_needs_zoned_write_locking(enum req_op op) +{ + return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; +} + static inline bool bdev_op_is_zoned_write(struct block_device *bdev, enum req_op op) { - if (!bdev_is_zoned(bdev)) - return false; - - return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; + return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op); } static inline sector_t bdev_zone_sectors(struct block_device *bdev) From 19821fee3ed42e5b294e95814892d0ad6a9890c9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:23 -0700 Subject: [PATCH 010/266] block: Introduce blk_rq_is_seq_zoned_write() Introduce the function blk_rq_is_seq_zoned_write(). This function will be used in later patches to preserve the order of zoned writes that require write serialization. This patch includes an optimization: instead of using rq->q->disk->part0->bd_queue to check whether or not the queue is associated with a zoned block device, use rq->q->disk->queue. Cc: Christoph Hellwig Cc: Damien Le Moal Cc: Ming Lei Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230517174230.897144-6-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 5 +---- include/linux/blk-mq.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 835d9e937d4d..096b6b47561f 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -60,10 +60,7 @@ bool blk_req_needs_zone_write_lock(struct request *rq) if (!rq->q->disk->seq_zones_wlock) return false; - if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq))) - return blk_rq_zone_is_seq(rq); - - return false; + return blk_rq_is_seq_zoned_write(rq); } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e4a211957db6..49d14b1acfa5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1164,6 +1164,18 @@ static inline unsigned int blk_rq_zone_is_seq(struct request *rq) return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq)); } +/** + * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization. + * @rq: Request to examine. + * + * Note: REQ_OP_ZONE_APPEND requests do not require serialization. + */ +static inline bool blk_rq_is_seq_zoned_write(struct request *rq) +{ + return op_needs_zoned_write_locking(req_op(rq)) && + blk_rq_zone_is_seq(rq); +} + bool blk_req_needs_zone_write_lock(struct request *rq); bool blk_req_zone_write_trylock(struct request *rq); void __blk_req_zone_write_lock(struct request *rq); @@ -1194,6 +1206,11 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq) return !blk_req_zone_is_write_locked(rq); } #else /* CONFIG_BLK_DEV_ZONED */ +static inline bool blk_rq_is_seq_zoned_write(struct request *rq) +{ + return false; +} + static inline bool blk_req_needs_zone_write_lock(struct request *rq) { return false; From e0d85cde95bba7d40caa3bf9bc41ee810f0e96df Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:24 -0700 Subject: [PATCH 011/266] block: mq-deadline: Clean up deadline_check_fifo() Change the return type of deadline_check_fifo() from 'int' into 'bool'. Use time_is_before_eq_jiffies() instead of time_after_eq(). No functionality has been changed. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Cc: Damien Le Moal Cc: Ming Lei Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230517174230.897144-7-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index cea1b084c69e..cea91ba4a6ea 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -272,21 +272,15 @@ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) } /* - * deadline_check_fifo returns 0 if there are no expired requests on the fifo, - * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) + * deadline_check_fifo returns true if and only if there are expired requests + * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]). */ -static inline int deadline_check_fifo(struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) +static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - /* - * rq is expired! - */ - if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) - return 1; - - return 0; + return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } /* From 3b463cbea908a9c8d4b9eda09765070506864cbe Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:25 -0700 Subject: [PATCH 012/266] block: mq-deadline: Simplify deadline_skip_seq_writes() Make the deadline_skip_seq_writes() code shorter without changing its functionality. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230517174230.897144-8-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index cea91ba4a6ea..56782ee93522 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -304,14 +304,11 @@ static struct request *deadline_skip_seq_writes(struct deadline_data *dd, struct request *rq) { sector_t pos = blk_rq_pos(rq); - sector_t skipped_sectors = 0; - while (rq) { - if (blk_rq_pos(rq) != pos + skipped_sectors) - break; - skipped_sectors += blk_rq_sectors(rq); + do { + pos += blk_rq_sectors(rq); rq = deadline_latter_request(rq); - } + } while (rq && blk_rq_pos(rq) == pos); return rq; } From b2097bd24b438d49d82a5c317be4dc74b626236a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:26 -0700 Subject: [PATCH 013/266] block: mq-deadline: Reduce lock contention blk_mq_free_requests() calls dd_finish_request() indirectly. Prevent nested locking of dd->lock and dd->zone_lock by moving the code for freeing requests. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230517174230.897144-9-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 56782ee93522..44222d18f6d4 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -757,7 +757,7 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_insert_t flags) + blk_insert_t flags, struct list_head *free) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; @@ -766,7 +766,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; - LIST_HEAD(free); lockdep_assert_held(&dd->lock); @@ -783,10 +782,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, rq->elv.priv[0] = (void *)(uintptr_t)1; } - if (blk_mq_sched_try_insert_merge(q, rq, &free)) { - blk_mq_free_requests(&free); + if (blk_mq_sched_try_insert_merge(q, rq, free)) return; - } trace_block_rq_insert(rq); @@ -819,6 +816,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; + LIST_HEAD(free); spin_lock(&dd->lock); while (!list_empty(list)) { @@ -826,9 +824,11 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - dd_insert_request(hctx, rq, flags); + dd_insert_request(hctx, rq, flags, &free); } spin_unlock(&dd->lock); + + blk_mq_free_requests(&free); } /* Callback from inside blk_mq_rq_ctx_init(). */ From 83c46ed675579fe84354bd07b0d81b525a2b1ebb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:27 -0700 Subject: [PATCH 014/266] block: mq-deadline: Track the dispatch position Track the position (sector_t) of the most recently dispatched request instead of tracking a pointer to the next request to dispatch. This patch is the basis for patch "Handle requeued requests correctly". Without this patch it would be significantly more complicated to make sure that zoned writes are dispatched in LBA order per zone. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230517174230.897144-10-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 44222d18f6d4..91b689261d30 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -74,8 +74,8 @@ struct dd_per_prio { struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; - /* Next request in FIFO order. Read, write or both are NULL. */ - struct request *next_rq[DD_DIR_COUNT]; + /* Position of the most recently dispatched request. */ + sector_t latest_pos[DD_DIR_COUNT]; struct io_stats_per_prio stats; }; @@ -156,6 +156,25 @@ deadline_latter_request(struct request *rq) return NULL; } +/* Return the first request for which blk_rq_pos() >= pos. */ +static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir, sector_t pos) +{ + struct rb_node *node = per_prio->sort_list[data_dir].rb_node; + struct request *rq, *res = NULL; + + while (node) { + rq = rb_entry_rq(node); + if (blk_rq_pos(rq) >= pos) { + res = rq; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + return res; +} + static void deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { @@ -167,11 +186,6 @@ deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) static inline void deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - const enum dd_data_dir data_dir = rq_data_dir(rq); - - if (per_prio->next_rq[data_dir] == rq) - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - elv_rb_del(deadline_rb_root(per_prio, rq), rq); } @@ -251,10 +265,6 @@ static void deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq) { - const enum dd_data_dir data_dir = rq_data_dir(rq); - - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - /* * take it off the sort and fifo list */ @@ -363,7 +373,8 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq; unsigned long flags; - rq = per_prio->next_rq[data_dir]; + rq = deadline_from_pos(per_prio, data_dir, + per_prio->latest_pos[data_dir]); if (!rq) return NULL; @@ -426,6 +437,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, if (started_after(dd, rq, latest_start)) return NULL; list_del_init(&rq->queuelist); + data_dir = rq_data_dir(rq); goto done; } @@ -433,9 +445,11 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, * batches are currently reads XOR writes */ rq = deadline_next_request(dd, per_prio, dd->last_dir); - if (rq && dd->batching < dd->fifo_batch) + if (rq && dd->batching < dd->fifo_batch) { /* we have a next request and are still entitled to batch */ + data_dir = rq_data_dir(rq); goto dispatch_request; + } /* * at this point we are not running a batch. select the appropriate @@ -513,6 +527,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; + dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; /* * If the request needs its target zone locked, do it. @@ -1026,8 +1041,10 @@ static int deadline_##name##_next_rq_show(void *data, \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - struct request *rq = per_prio->next_rq[data_dir]; \ + struct request *rq; \ \ + rq = deadline_from_pos(per_prio, data_dir, \ + per_prio->latest_pos[data_dir]); \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ From 0effb390c4bac1a484f0ca6ad3f1d183fcde882b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:28 -0700 Subject: [PATCH 015/266] block: mq-deadline: Handle requeued requests correctly Start dispatching from the start of a zone instead of from the starting position of the most recently dispatched request. If a zoned write is requeued with an LBA that is lower than already inserted zoned writes, make sure that it is submitted first. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Cc: Damien Le Moal Cc: Ming Lei Signed-off-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230517174230.897144-11-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 91b689261d30..e90879869c90 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -156,13 +156,28 @@ deadline_latter_request(struct request *rq) return NULL; } -/* Return the first request for which blk_rq_pos() >= pos. */ +/* + * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, + * return the first request after the start of the zone containing @pos. + */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) { struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; + if (!node) + return NULL; + + rq = rb_entry_rq(node); + /* + * A zoned write may have been requeued with a starting position that + * is below that of the most recently dispatched request. Hence, for + * zoned writes, start searching from the start of a zone. + */ + if (blk_rq_is_seq_zoned_write(rq)) + pos -= round_down(pos, rq->q->limits.chunk_sectors); + while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { @@ -806,6 +821,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { + struct list_head *insert_before; + deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { @@ -818,7 +835,20 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); + insert_before = &per_prio->fifo_list[data_dir]; +#ifdef CONFIG_BLK_DEV_ZONED + /* + * Insert zoned writes such that requests are sorted by + * position per zone. + */ + if (blk_rq_is_seq_zoned_write(rq)) { + struct request *rq2 = deadline_latter_request(rq); + + if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) + insert_before = &rq2->queuelist; + } +#endif + list_add_tail(&rq->queuelist, insert_before); } } From a036e698c231ba884daa37196be3ac6c6dce1d75 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 May 2023 10:42:29 -0700 Subject: [PATCH 016/266] block: mq-deadline: Fix handling of at-head zoned writes Before dispatching a zoned write from the FIFO list, check whether there are any zoned writes in the RB-tree with a lower LBA for the same zone. This patch ensures that zoned writes happen in order even if at_head is set for some writes for a zone and not for others. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230517174230.897144-12-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index e90879869c90..6aa5daf7ae32 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -346,7 +346,7 @@ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq; + struct request *rq, *rb_rq, *next; unsigned long flags; if (list_empty(&per_prio->fifo_list[data_dir])) @@ -364,7 +364,12 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { + list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], + queuelist) { + /* Check whether a prior request exists for the same zone. */ + rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); + if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) + rq = rb_rq; if (blk_req_can_dispatch_to_zone(rq) && (blk_queue_nonrot(rq->q) || !deadline_is_seq_write(dd, rq))) From 3e49c1e4a6152b6ad758a28ecce8fb470f46f6ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 May 2023 15:38:53 -0700 Subject: [PATCH 017/266] block: BFQ: Add several invariant checks If anything goes wrong with the counters that track the number of requests, I/O locks up. Make such scenarios easier to debug by adding invariant checks for the request counters. Additionally, check that BFQ queues are empty before these are freed. Cc: Jan Kara Cc: Yu Kuai Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230516223853.1385255-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3164e3177965..c5727afad159 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5403,6 +5403,9 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->bfqd->last_completed_rq_bfqq == bfqq) bfqq->bfqd->last_completed_rq_bfqq = NULL; + WARN_ON_ONCE(!list_empty(&bfqq->fifo)); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list)); + kmem_cache_free(bfq_pool, bfqq); bfqg_and_blkg_put(bfqg); } @@ -7135,6 +7138,7 @@ static void bfq_exit_queue(struct elevator_queue *e) { struct bfq_data *bfqd = e->elevator_data; struct bfq_queue *bfqq, *n; + unsigned int actuator; hrtimer_cancel(&bfqd->idle_slice_timer); @@ -7143,6 +7147,11 @@ static void bfq_exit_queue(struct elevator_queue *e) bfq_deactivate_bfqq(bfqd, bfqq, false, false); spin_unlock_irq(&bfqd->lock); + for (actuator = 0; actuator < bfqd->num_actuators; actuator++) + WARN_ON_ONCE(bfqd->rq_in_driver[actuator]); + WARN_ON_ONCE(bfqd->tot_rq_in_driver); + WARN_ON_ONCE(bfqq->dispatched); + hrtimer_cancel(&bfqd->idle_slice_timer); /* release oom-queue reference to root group */ From bda2795a630b2f6c417675bfbf4d90ef7503dfc7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 May 2023 07:44:05 -0700 Subject: [PATCH 018/266] fs: remove the special !CONFIG_BLOCK def_blk_fops def_blk_fops always returns -ENODEV, which dosn't match the return value of a non-existing block device with CONFIG_BLOCK, which is -ENXIO. Just remove the extra implementation and fall back to the default no_open_fops that always returns -ENXIO. Fixes: 9361401eb761 ("[PATCH] BLOCK: Make it possible to disable the block layer [try #6]") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230508144405.41792-1-hch@lst.de Signed-off-by: Jens Axboe --- fs/Makefile | 10 ++-------- fs/inode.c | 3 ++- fs/no-block.c | 19 ------------------- 3 files changed, 4 insertions(+), 28 deletions(-) delete mode 100644 fs/no-block.c diff --git a/fs/Makefile b/fs/Makefile index 834f1c3dba46..4709eba1303c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,14 +17,8 @@ obj-y := open.o read_write.o file_table.o super.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o -ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o mpage.o -else -obj-y += no-block.o -endif - -obj-$(CONFIG_PROC_FS) += proc_namespace.o - +obj-$(CONFIG_BLOCK) += buffer.o mpage.o +obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/inode.c b/fs/inode.c index 577799b7855f..4d6a1544e95b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2264,7 +2264,8 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; } else if (S_ISBLK(mode)) { - inode->i_fop = &def_blk_fops; + if (IS_ENABLED(CONFIG_BLOCK)) + inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; } else if (S_ISFIFO(mode)) inode->i_fop = &pipefifo_fops; diff --git a/fs/no-block.c b/fs/no-block.c deleted file mode 100644 index 481c0f0ab4bd..000000000000 --- a/fs/no-block.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* no-block.c: implementation of routines required for non-BLOCK configuration - * - * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include -#include - -static int no_blkdev_open(struct inode * inode, struct file * filp) -{ - return -ENODEV; -} - -const struct file_operations def_blk_fops = { - .open = no_blkdev_open, - .llseek = noop_llseek, -}; From 0b573692f19501dfe2aeaf37b272ec07f60c70b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 May 2023 06:40:44 +0200 Subject: [PATCH 019/266] blk-mq: factor out a blk_rq_init_flush helper Factor out a helper from blk_insert_flush that initializes the flush machine related fields in struct request, and don't bother with the full memset as there's just a few fields to initialize, and all but one already have explicit initializers. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230519044050.107790-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 04698ed9bcd4..ed37d272f787 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -376,6 +376,15 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, return RQ_END_IO_NONE; } +static void blk_rq_init_flush(struct request *rq) +{ + rq->flush.seq = 0; + INIT_LIST_HEAD(&rq->flush.list); + rq->rq_flags |= RQF_FLUSH_SEQ; + rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + rq->end_io = mq_flush_data_end_io; +} + /** * blk_insert_flush - insert a new PREFLUSH/FUA request * @rq: request to insert @@ -437,13 +446,7 @@ void blk_insert_flush(struct request *rq) * @rq should go through flush machinery. Mark it part of flush * sequence and submit for further processing. */ - memset(&rq->flush, 0, sizeof(rq->flush)); - INIT_LIST_HEAD(&rq->flush.list); - rq->rq_flags |= RQF_FLUSH_SEQ; - rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ - - rq->end_io = mq_flush_data_end_io; - + blk_rq_init_flush(rq); spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); From c1075e548ce6e6b5c7b71f2b05d344164ebc52bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 May 2023 06:40:45 +0200 Subject: [PATCH 020/266] blk-mq: reflow blk_insert_flush Use a switch statement to decide on the disposition of a flush request instead of multiple if statements, out of which one does checks that are more complex than required. Also warn on a malformed request early on instead of doing a BUG_ON later. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230519044050.107790-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 53 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index ed37d272f787..d8144f1f6fb1 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -402,6 +402,9 @@ void blk_insert_flush(struct request *rq) struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + /* FLUSH/FUA request must never be merged */ + WARN_ON_ONCE(rq->bio != rq->biotail); + /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. @@ -417,39 +420,35 @@ void blk_insert_flush(struct request *rq) */ rq->cmd_flags |= REQ_SYNC; - /* - * An empty flush handed down from a stacking driver may - * translate into nothing if the underlying device does not - * advertise a write-back cache. In this case, simply - * complete the request. - */ - if (!policy) { + switch (policy) { + case 0: + /* + * An empty flush handed down from a stacking driver may + * translate into nothing if the underlying device does not + * advertise a write-back cache. In this case, simply + * complete the request. + */ blk_mq_end_request(rq, 0); return; - } - - BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ - - /* - * If there's data but flush is not necessary, the request can be - * processed directly without going through flush machinery. Queue - * for normal execution. - */ - if ((policy & REQ_FSEQ_DATA) && - !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + case REQ_FSEQ_DATA: + /* + * If there's data, but no flush is necessary, the request can + * be processed directly without going through flush machinery. + * Queue for normal execution. + */ blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); return; + default: + /* + * Mark the request as part of a flush sequence and submit it + * for further processing to the flush state machine. + */ + blk_rq_init_flush(rq); + spin_lock_irq(&fq->mq_flush_lock); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&fq->mq_flush_lock); } - - /* - * @rq should go through flush machinery. Mark it part of flush - * sequence and submit for further processing. - */ - blk_rq_init_flush(rq); - spin_lock_irq(&fq->mq_flush_lock); - blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&fq->mq_flush_lock); } /** From 360f264834e34d08530c2fb9b67e3ffa65318761 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 May 2023 06:40:46 +0200 Subject: [PATCH 021/266] blk-mq: defer to the normal submission path for non-flush flush commands If blk_insert_flush decides that a command does not need to use the flush state machine, return false and let blk_mq_submit_bio handle it the normal way (including using an I/O scheduler) instead of doing a bypass insert. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230519044050.107790-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 22 ++++++++-------------- block/blk-mq.c | 8 ++++---- block/blk-mq.h | 4 ---- block/blk.h | 2 +- 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index d8144f1f6fb1..6fb9cf2d3818 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -385,22 +385,17 @@ static void blk_rq_init_flush(struct request *rq) rq->end_io = mq_flush_data_end_io; } -/** - * blk_insert_flush - insert a new PREFLUSH/FUA request - * @rq: request to insert - * - * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. - * or __blk_mq_run_hw_queue() to dispatch request. - * @rq is being submitted. Analyze what needs to be done and put it on the - * right queue. +/* + * Insert a PREFLUSH/FUA request into the flush state machine. + * Returns true if the request has been consumed by the flush state machine, + * or false if the caller should continue to process it. */ -void blk_insert_flush(struct request *rq) +bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); @@ -429,16 +424,14 @@ void blk_insert_flush(struct request *rq) * complete the request. */ blk_mq_end_request(rq, 0); - return; + return true; case REQ_FSEQ_DATA: /* * If there's data, but no flush is necessary, the request can * be processed directly without going through flush machinery. * Queue for normal execution. */ - blk_mq_request_bypass_insert(rq, 0); - blk_mq_run_hw_queue(hctx, false); - return; + return false; default: /* * Mark the request as part of a flush sequence and submit it @@ -448,6 +441,7 @@ void blk_insert_flush(struct request *rq) spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); + return true; } } diff --git a/block/blk-mq.c b/block/blk-mq.c index e021740154fe..c0b394096b6b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -45,6 +45,8 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); +static void blk_mq_request_bypass_insert(struct request *rq, + blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); @@ -2430,7 +2432,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) +static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; @@ -2977,10 +2979,8 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (op_is_flush(bio->bi_opf)) { - blk_insert_flush(rq); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; - } if (plug) { blk_add_rq_to_plug(plug, rq); diff --git a/block/blk-mq.h b/block/blk-mq.h index d15981db34b9..ec7d2fb0b3c8 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -64,10 +64,6 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -/* - * Internal helpers for request insertion into sw queues - */ -void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); /* * CPU -> queue mappings diff --git a/block/blk.h b/block/blk.h index 45547bcf1119..9f171b8f1e34 100644 --- a/block/blk.h +++ b/block/blk.h @@ -269,7 +269,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) -void blk_insert_flush(struct request *rq); +bool blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_disable(struct request_queue *q); From be4c427809b0a746aff54dbb8ef663f0184291d0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 May 2023 06:40:47 +0200 Subject: [PATCH 022/266] blk-mq: use the I/O scheduler for writes from the flush state machine Send write requests issued by the flush state machine through the normal I/O submission path including the I/O scheduler (if present) so that I/O scheduler policies are applied to writes with the FUA flag set. Separate the I/O scheduler members from the flush members in struct request since now a request may pass through both an I/O scheduler and the flush machinery. Note that the actual flush requests, which have no bio attached to the request still bypass the I/O schedulers. Signed-off-by: Bart Van Assche [hch: rebased] Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230519044050.107790-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- include/linux/blk-mq.h | 25 ++++++++++--------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index c0b394096b6b..aac67bc3d368 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -458,7 +458,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) * Flush/passthrough requests are special and go directly to the * dispatch list. */ - if (!op_is_flush(data->cmd_flags) && + if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && !blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops; @@ -2497,7 +2497,7 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); - } else if (rq->rq_flags & RQF_FLUSH_SEQ) { + } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 49d14b1acfa5..935201c89743 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -169,25 +169,20 @@ struct request { void *completion_data; }; - /* * Three pointers are available for the IO schedulers, if they need - * more they have to dynamically allocate it. Flush requests are - * never put on the IO scheduler. So let the flush fields share - * space with the elevator data. + * more they have to dynamically allocate it. */ - union { - struct { - struct io_cq *icq; - void *priv[2]; - } elv; + struct { + struct io_cq *icq; + void *priv[2]; + } elv; - struct { - unsigned int seq; - struct list_head list; - rq_end_io_fn *saved_end_io; - } flush; - }; + struct { + unsigned int seq; + struct list_head list; + rq_end_io_fn *saved_end_io; + } flush; union { struct __call_single_data csd; From 615939a2ae734e3e68c816d6749d1f5f79c62ab7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 May 2023 06:40:48 +0200 Subject: [PATCH 023/266] blk-mq: defer to the normal submission path for post-flush requests Requests with the FUA bit on hardware without FUA support need a post flush before returning to the caller, but they can still be sent using the normal I/O path after initializing the flush-related fields and end I/O handler. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230519044050.107790-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/blk-flush.c b/block/blk-flush.c index 6fb9cf2d3818..7121f9ad0762 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -432,6 +432,17 @@ bool blk_insert_flush(struct request *rq) * Queue for normal execution. */ return false; + case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: + /* + * Initialize the flush fields and completion handler to trigger + * the post flush, and then just pass the command on. + */ + blk_rq_init_flush(rq); + rq->flush.seq |= REQ_FSEQ_POSTFLUSH; + spin_lock_irq(&fq->mq_flush_lock); + list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + spin_unlock_irq(&fq->mq_flush_lock); + return false; default: /* * Mark the request as part of a flush sequence and submit it From 1e82fadfc6b96ca79f69d0bcf938d31032bb43d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 May 2023 06:40:49 +0200 Subject: [PATCH 024/266] blk-mq: do not do head insertions post-pre-flush commands blk_flush_complete_seq currently queues requests that write data after a pre-flush from the flush state machine at the head of the queue. This doesn't really make sense, as the original request bypassed all queue lists by directly diverting to blk_insert_flush from blk_mq_submit_bio. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230519044050.107790-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 7121f9ad0762..f407a5950317 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -188,7 +188,7 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); - blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD); + blk_mq_add_to_requeue_list(rq, 0); blk_mq_kick_requeue_list(q); break; From 9a67aa52a42b31ad44220cc218df3b75a5cd5d05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 May 2023 06:40:50 +0200 Subject: [PATCH 025/266] blk-mq: don't use the requeue list to queue flush commands Currently both requeues of commands that were already sent to the driver and flush commands submitted from the flush state machine share the same requeue_list struct request_queue, despite requeues doing head insertions and flushes not. Switch to using two separate lists instead. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230519044050.107790-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 +++++++-- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 42 +++++++++++++----------------------------- block/blk-mq.h | 1 - include/linux/blk-mq.h | 4 +--- include/linux/blkdev.h | 1 + 6 files changed, 22 insertions(+), 36 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index f407a5950317..dba392cf22be 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -188,7 +188,9 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); - blk_mq_add_to_requeue_list(rq, 0); + spin_lock(&q->requeue_lock); + list_add_tail(&rq->queuelist, &q->flush_list); + spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -346,7 +348,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, smp_wmb(); req_ref_set(flush_rq, 1); - blk_mq_add_to_requeue_list(flush_rq, 0); + spin_lock(&q->requeue_lock); + list_add_tail(&flush_rq->queuelist, &q->flush_list); + spin_unlock(&q->requeue_lock); + blk_mq_kick_requeue_list(q); } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 22e39b9a77ec..68165a50951b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -244,7 +244,6 @@ static const char *const cmd_flag_name[] = { #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { RQF_NAME(STARTED), - RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), RQF_NAME(MQ_INFLIGHT), diff --git a/block/blk-mq.c b/block/blk-mq.c index aac67bc3d368..551e7760f45e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1416,13 +1416,16 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; + unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD); + spin_lock_irqsave(&q->requeue_lock, flags); + list_add_tail(&rq->queuelist, &q->requeue_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); @@ -1434,13 +1437,16 @@ static void blk_mq_requeue_work(struct work_struct *work) struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); - struct request *rq, *next; + LIST_HEAD(flush_list); + struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); + list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); - list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); /* * If RQF_DONTPREP ist set, the request has been started by the * driver already and might have driver-specific data allocated @@ -1448,18 +1454,16 @@ static void blk_mq_requeue_work(struct work_struct *work) * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) { - rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); blk_mq_request_bypass_insert(rq, 0); - } else if (rq->rq_flags & RQF_SOFTBARRIER) { - rq->rq_flags &= ~RQF_SOFTBARRIER; + } else { list_del_init(&rq->queuelist); blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } } - while (!list_empty(&rq_list)) { - rq = list_entry(rq_list.next, struct request, queuelist); + while (!list_empty(&flush_list)) { + rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } @@ -1467,27 +1471,6 @@ static void blk_mq_requeue_work(struct work_struct *work) blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags) -{ - struct request_queue *q = rq->q; - unsigned long flags; - - /* - * We abuse this flag that is otherwise used by the I/O scheduler to - * request head insertion from the workqueue. - */ - BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); - - spin_lock_irqsave(&q->requeue_lock, flags); - if (insert_flags & BLK_MQ_INSERT_AT_HEAD) { - rq->rq_flags |= RQF_SOFTBARRIER; - list_add(&rq->queuelist, &q->requeue_list); - } else { - list_add_tail(&rq->queuelist, &q->requeue_list); - } - spin_unlock_irqrestore(&q->requeue_lock, flags); -} - void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); @@ -4239,6 +4222,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); diff --git a/block/blk-mq.h b/block/blk-mq.h index ec7d2fb0b3c8..8c642e9f32f1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -47,7 +47,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, unsigned int); -void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 935201c89743..d778cb6b2112 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -28,8 +28,6 @@ typedef __u32 __bitwise req_flags_t; /* drive already may have started this one */ #define RQF_STARTED ((__force req_flags_t)(1 << 1)) -/* may not be passed by ioscheduler */ -#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) /* request for flush sequence */ #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) /* merge of different types, fail separately */ @@ -65,7 +63,7 @@ typedef __u32 __bitwise req_flags_t; /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ - (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) + (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) enum mq_rq_state { MQ_RQ_IDLE = 0, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3952c52d6cd1..fe99948688df 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -487,6 +487,7 @@ struct request_queue { * for flush operations */ struct blk_flush_queue *fq; + struct list_head flush_list; struct list_head requeue_list; spinlock_t requeue_lock; From 29dc5d06613f2438ec20a4ba5e0a5a740584d346 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:24 +0800 Subject: [PATCH 026/266] ublk: kill queuing request by task_work_add task_work_add() is used from early ublk development stage for handling request in batch. However, since commit 7d4a93176e01 ("ublk_drv: don't forward io commands in reserve order"), we can get similar batch processing with io_uring_cmd_complete_in_task(), and similar performance data is observed between task_work_add() and io_uring_cmd_complete_in_task(). Meantime we can kill one fast code path, which is actually seldom used given it is common to build ublk driver as module. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c7ed5d69e9ee..b00c5c210c7f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -62,7 +62,6 @@ struct ublk_rq_data { struct llist_node node; - struct callback_head work; }; struct ublk_uring_cmd_pdu { @@ -290,14 +289,6 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } -static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) -{ - if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && - !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) - return true; - return false; -} - static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_NEED_GET_DATA; @@ -852,17 +843,6 @@ static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags) ublk_forward_io_cmds(ubq, issue_flags); } -static void ublk_rq_task_work_fn(struct callback_head *work) -{ - struct ublk_rq_data *data = container_of(work, - struct ublk_rq_data, work); - struct request *req = blk_mq_rq_from_pdu(data); - struct ublk_queue *ubq = req->mq_hctx->driver_data; - unsigned issue_flags = IO_URING_F_UNLOCKED; - - ublk_forward_io_cmds(ubq, issue_flags); -} - static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) { struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); @@ -886,10 +866,6 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) */ if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) { ublk_abort_io_cmds(ubq); - } else if (ublk_can_use_task_work(ubq)) { - if (task_work_add(ubq->ubq_daemon, &data->work, - TWA_SIGNAL_NO_IPI)) - ublk_abort_io_cmds(ubq); } else { struct io_uring_cmd *cmd = io->cmd; struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); @@ -961,19 +937,9 @@ static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, return 0; } -static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, - unsigned int hctx_idx, unsigned int numa_node) -{ - struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); - - init_task_work(&data->work, ublk_rq_task_work_fn); - return 0; -} - static const struct blk_mq_ops ublk_mq_ops = { .queue_rq = ublk_queue_rq, .init_hctx = ublk_init_hctx, - .init_request = ublk_init_rq, .timeout = ublk_timeout, }; @@ -1813,10 +1779,8 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) */ ub->dev_info.flags &= UBLK_F_ALL; - if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK)) - ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK; - - ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE; + ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | + UBLK_F_URING_CMD_COMP_IN_TASK; /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; From f236a21459a5cdd828b93f363946e116773494f6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:25 +0800 Subject: [PATCH 027/266] ublk: cleanup io cmd code path by adding ublk_fill_io_cmd() Add one small helper to cleanup io command hanlding code path. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b00c5c210c7f..0c8651c5ba4c 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1256,6 +1256,14 @@ static inline int ublk_check_cmd_op(u32 cmd_op) return 0; } +static inline void ublk_fill_io_cmd(struct ublk_io *io, + struct io_uring_cmd *cmd, unsigned long buf_addr) +{ + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->addr = buf_addr; +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) @@ -1322,10 +1330,8 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */ if (!ub_cmd->addr && !ublk_need_get_data(ubq)) goto out; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->addr = ub_cmd->addr; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_mark_io_ready(ub, ubq); break; case UBLK_IO_COMMIT_AND_FETCH_REQ: @@ -1338,17 +1344,13 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, goto out; if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; - io->addr = ub_cmd->addr; - io->flags |= UBLK_IO_FLAG_ACTIVE; - io->cmd = cmd; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; case UBLK_IO_NEED_GET_DATA: if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; - io->addr = ub_cmd->addr; - io->cmd = cmd; - io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); break; default: From 981f95a571e3ca20a496c0b77dbf6b06039c6648 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:26 +0800 Subject: [PATCH 028/266] ublk: cleanup ublk_copy_user_pages Clean up ublk_copy_user_pages() by using iov_iter_get_pages2, and code gets simplified a lot and becomes much more readable than before. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 108 +++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 61 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 0c8651c5ba4c..afc07fa17040 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -412,49 +412,39 @@ static const struct block_device_operations ub_fops = { #define UBLK_MAX_PIN_PAGES 32 -struct ublk_map_data { - const struct request *rq; - unsigned long ubuf; - unsigned int len; -}; - struct ublk_io_iter { struct page *pages[UBLK_MAX_PIN_PAGES]; - unsigned pg_off; /* offset in the 1st page in pages */ - int nr_pages; /* how many page pointers in pages */ struct bio *bio; struct bvec_iter iter; }; -static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, - unsigned max_bytes, bool to_vm) +/* return how many pages are copied */ +static void ublk_copy_io_pages(struct ublk_io_iter *data, + size_t total, size_t pg_off, int dir) { - const unsigned total = min_t(unsigned, max_bytes, - PAGE_SIZE - data->pg_off + - ((data->nr_pages - 1) << PAGE_SHIFT)); unsigned done = 0; unsigned pg_idx = 0; while (done < total) { struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); - const unsigned int bytes = min3(bv.bv_len, total - done, - (unsigned)(PAGE_SIZE - data->pg_off)); + unsigned int bytes = min3(bv.bv_len, (unsigned)total - done, + (unsigned)(PAGE_SIZE - pg_off)); void *bv_buf = bvec_kmap_local(&bv); void *pg_buf = kmap_local_page(data->pages[pg_idx]); - if (to_vm) - memcpy(pg_buf + data->pg_off, bv_buf, bytes); + if (dir == ITER_DEST) + memcpy(pg_buf + pg_off, bv_buf, bytes); else - memcpy(bv_buf, pg_buf + data->pg_off, bytes); + memcpy(bv_buf, pg_buf + pg_off, bytes); kunmap_local(pg_buf); kunmap_local(bv_buf); /* advance page array */ - data->pg_off += bytes; - if (data->pg_off == PAGE_SIZE) { + pg_off += bytes; + if (pg_off == PAGE_SIZE) { pg_idx += 1; - data->pg_off = 0; + pg_off = 0; } done += bytes; @@ -468,41 +458,40 @@ static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, data->iter = data->bio->bi_iter; } } - - return done; } -static int ublk_copy_user_pages(struct ublk_map_data *data, bool to_vm) +/* + * Copy data between request pages and io_iter, and 'offset' + * is the start point of linear offset of request. + */ +static size_t ublk_copy_user_pages(const struct request *req, + struct iov_iter *uiter, int dir) { - const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; - const unsigned long start_vm = data->ubuf; - unsigned int done = 0; struct ublk_io_iter iter = { - .pg_off = start_vm & (PAGE_SIZE - 1), - .bio = data->rq->bio, - .iter = data->rq->bio->bi_iter, + .bio = req->bio, + .iter = req->bio->bi_iter, }; - const unsigned int nr_pages = round_up(data->len + - (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; + size_t done = 0; - while (done < nr_pages) { - const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES, - nr_pages - done); - unsigned i, len; + while (iov_iter_count(uiter) && iter.bio) { + unsigned nr_pages; + size_t len, off; + int i; - iter.nr_pages = get_user_pages_fast(start_vm + - (done << PAGE_SHIFT), to_pin, gup_flags, - iter.pages); - if (iter.nr_pages <= 0) - return done == 0 ? iter.nr_pages : done; - len = ublk_copy_io_pages(&iter, data->len, to_vm); - for (i = 0; i < iter.nr_pages; i++) { - if (to_vm) + len = iov_iter_get_pages2(uiter, iter.pages, + iov_iter_count(uiter), + UBLK_MAX_PIN_PAGES, &off); + if (len <= 0) + return done; + + ublk_copy_io_pages(&iter, len, off, dir); + nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE); + for (i = 0; i < nr_pages; i++) { + if (dir == ITER_DEST) set_page_dirty(iter.pages[i]); put_page(iter.pages[i]); } - data->len -= len; - done += iter.nr_pages; + done += len; } return done; @@ -529,15 +518,14 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, * context is pretty fast, see ublk_pin_user_pages */ if (ublk_need_map_req(req)) { - struct ublk_map_data data = { - .rq = req, - .ubuf = io->addr, - .len = rq_bytes, - }; + struct iov_iter iter; + struct iovec iov; + const int dir = ITER_DEST; - ublk_copy_user_pages(&data, true); + import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, + &iov, &iter); - return rq_bytes - data.len; + return ublk_copy_user_pages(req, &iter, dir); } return rq_bytes; } @@ -549,17 +537,15 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, const unsigned int rq_bytes = blk_rq_bytes(req); if (ublk_need_unmap_req(req)) { - struct ublk_map_data data = { - .rq = req, - .ubuf = io->addr, - .len = io->res, - }; + struct iov_iter iter; + struct iovec iov; + const int dir = ITER_SOURCE; WARN_ON_ONCE(io->res > rq_bytes); - ublk_copy_user_pages(&data, false); - - return io->res - data.len; + import_single_range(dir, u64_to_user_ptr(io->addr), io->res, + &iov, &iter); + return ublk_copy_user_pages(req, &iter, dir); } return rq_bytes; } From 8284066946e6d9cc979566ce698fe24e7ca0b31e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:27 +0800 Subject: [PATCH 029/266] ublk: grab request reference when the request is handled by userspace Add one reference counter into request pdu data, and hold this reference in the request's lifetime. Prepare for supporting to move request data copy into userspace, which needs to copy request data by read()/write() on /dev/ublkcN, so we have to guarantee that read()/write() is done on one valid/active request, and that will be enhanced by holding the io request reference in read()/write(). Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 67 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index afc07fa17040..353ccdb60729 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #define UBLK_MINORS (1U << MINORBITS) @@ -62,6 +63,8 @@ struct ublk_rq_data { struct llist_node node; + + struct kref ref; }; struct ublk_uring_cmd_pdu { @@ -181,6 +184,9 @@ struct ublk_params_header { __u32 types; }; +static inline void __ublk_complete_rq(struct request *req); +static void ublk_complete_rq(struct kref *ref); + static dev_t ublk_chr_devt; static struct class *ublk_chr_class; @@ -289,6 +295,45 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } +static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) +{ + return false; +} + +static inline void ublk_init_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_init(&data->ref); + } +} + +static inline bool ublk_get_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + return kref_get_unless_zero(&data->ref); + } + + return true; +} + +static inline void ublk_put_req_ref(const struct ublk_queue *ubq, + struct request *req) +{ + if (ublk_need_req_ref(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + kref_put(&data->ref, ublk_complete_rq); + } else { + __ublk_complete_rq(req); + } +} + static inline bool ublk_need_get_data(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_NEED_GET_DATA; @@ -625,13 +670,19 @@ static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq) } /* todo: handle partial completion */ -static void ublk_complete_rq(struct request *req) +static inline void __ublk_complete_rq(struct request *req) { struct ublk_queue *ubq = req->mq_hctx->driver_data; struct ublk_io *io = &ubq->ios[req->tag]; unsigned int unmapped_bytes; blk_status_t res = BLK_STS_OK; + /* called from ublk_abort_queue() code path */ + if (io->flags & UBLK_IO_FLAG_ABORTED) { + res = BLK_STS_IOERR; + goto exit; + } + /* failed read IO if nothing is read */ if (!io->res && req_op(req) == REQ_OP_READ) io->res = -EIO; @@ -671,6 +722,15 @@ static void ublk_complete_rq(struct request *req) blk_mq_end_request(req, res); } +static void ublk_complete_rq(struct kref *ref) +{ + struct ublk_rq_data *data = container_of(ref, struct ublk_rq_data, + ref); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_complete_rq(req); +} + /* * Since __ublk_rq_task_work always fails requests immediately during * exiting, __ublk_fail_req() is only called from abort context during @@ -689,7 +749,7 @@ static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io, if (ublk_queue_can_use_recovery_reissue(ubq)) blk_mq_requeue_request(req, false); else - blk_mq_end_request(req, BLK_STS_IOERR); + ublk_put_req_ref(ubq, req); } } @@ -798,6 +858,7 @@ static inline void __ublk_rq_task_work(struct request *req, mapped_bytes >> 9; } + ublk_init_req_ref(ubq, req); ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags); } @@ -1002,7 +1063,7 @@ static void ublk_commit_completion(struct ublk_device *ub, req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); if (req && likely(!blk_should_fake_timeout(req->q))) - ublk_complete_rq(req); + ublk_put_req_ref(ubq, req); } /* From 38f2dd34410f9070b60969a07ff7d8743b4fd56c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:28 +0800 Subject: [PATCH 030/266] ublk: support to copy any part of request pages Add 'offset' to 'struct ublk_map_data', so that ublk_copy_user_pages() can be used to copy any sub-buffer(linear mapped) of the request. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 353ccdb60729..13523c37a165 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -505,19 +505,36 @@ static void ublk_copy_io_pages(struct ublk_io_iter *data, } } +static bool ublk_advance_io_iter(const struct request *req, + struct ublk_io_iter *iter, unsigned int offset) +{ + struct bio *bio = req->bio; + + for_each_bio(bio) { + if (bio->bi_iter.bi_size > offset) { + iter->bio = bio; + iter->iter = bio->bi_iter; + bio_advance_iter(iter->bio, &iter->iter, offset); + return true; + } + offset -= bio->bi_iter.bi_size; + } + return false; +} + /* * Copy data between request pages and io_iter, and 'offset' * is the start point of linear offset of request. */ static size_t ublk_copy_user_pages(const struct request *req, - struct iov_iter *uiter, int dir) + unsigned offset, struct iov_iter *uiter, int dir) { - struct ublk_io_iter iter = { - .bio = req->bio, - .iter = req->bio->bi_iter, - }; + struct ublk_io_iter iter; size_t done = 0; + if (!ublk_advance_io_iter(req, &iter, offset)) + return 0; + while (iov_iter_count(uiter) && iter.bio) { unsigned nr_pages; size_t len, off; @@ -570,7 +587,7 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, import_single_range(dir, u64_to_user_ptr(io->addr), rq_bytes, &iov, &iter); - return ublk_copy_user_pages(req, &iter, dir); + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } @@ -590,7 +607,7 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, import_single_range(dir, u64_to_user_ptr(io->addr), io->res, &iov, &iter); - return ublk_copy_user_pages(req, &iter, dir); + return ublk_copy_user_pages(req, 0, &iter, dir); } return rq_bytes; } From 62fe99cef94a5900cac3bf15fd03ee8baad1a99c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:29 +0800 Subject: [PATCH 031/266] ublk: add read()/write() support for ublk char device Support pread()/pwrite() on ublk char device for reading/writing request io buffer, so data copy between io request buffer and userspace buffer can be moved to ublk server from ublk driver. Then UBLK_F_NEED_GET_DATA becomes not necessary, so ublk server can allocate buffer without one extra round uring command communication for userspace to provide buffer. IO buffer can be located by iocb->ki_pos which encodes buffer offset, io tag and queue id info, and type of iocb->ki_pos is u64, so it is big enough for holding reasonable queue depth, nr_queues and max io buffer size. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 151 ++++++++++++++++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 22 ++++- 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 13523c37a165..ec40ac4f9af3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -207,6 +207,23 @@ static unsigned int ublks_added; /* protected by ublk_ctl_mutex */ static struct miscdevice ublk_misc; +static inline unsigned ublk_pos_to_hwq(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) & + UBLK_QID_BITS_MASK; +} + +static inline unsigned ublk_pos_to_buf_off(loff_t pos) +{ + return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK; +} + +static inline unsigned ublk_pos_to_tag(loff_t pos) +{ + return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) & + UBLK_TAG_BITS_MASK; +} + static void ublk_dev_param_basic_apply(struct ublk_device *ub) { struct request_queue *q = ub->ub_disk->queue; @@ -1429,6 +1446,36 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, return -EIOCBQUEUED; } +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset) +{ + struct request *req; + + if (!ublk_need_req_ref(ubq)) + return NULL; + + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (!req) + return NULL; + + if (!ublk_get_req_ref(ubq, req)) + return NULL; + + if (unlikely(!blk_mq_request_started(req) || req->tag != tag)) + goto fail_put; + + if (!ublk_rq_has_data(req)) + goto fail_put; + + if (offset > blk_rq_bytes(req)) + goto fail_put; + + return req; +fail_put: + ublk_put_req_ref(ubq, req); + return NULL; +} + static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { /* @@ -1446,11 +1493,112 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd); } +static inline bool ublk_check_ubuf_dir(const struct request *req, + int ubuf_dir) +{ + /* copy ubuf to request pages */ + if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE) + return true; + + /* copy request pages to ubuf */ + if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST) + return true; + + return false; +} + +static struct request *ublk_check_and_get_req(struct kiocb *iocb, + struct iov_iter *iter, size_t *off, int dir) +{ + struct ublk_device *ub = iocb->ki_filp->private_data; + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + u16 tag, q_id; + + if (!ub) + return ERR_PTR(-EACCES); + + if (!user_backed_iter(iter)) + return ERR_PTR(-EACCES); + + if (ub->dev_info.state == UBLK_S_DEV_DEAD) + return ERR_PTR(-EACCES); + + tag = ublk_pos_to_tag(iocb->ki_pos); + q_id = ublk_pos_to_hwq(iocb->ki_pos); + buf_off = ublk_pos_to_buf_off(iocb->ki_pos); + + if (q_id >= ub->dev_info.nr_hw_queues) + return ERR_PTR(-EINVAL); + + ubq = ublk_get_queue(ub, q_id); + if (!ubq) + return ERR_PTR(-EINVAL); + + if (tag >= ubq->q_depth) + return ERR_PTR(-EINVAL); + + req = __ublk_check_and_get_req(ub, ubq, tag, buf_off); + if (!req) + return ERR_PTR(-EINVAL); + + if (!req->mq_hctx || !req->mq_hctx->driver_data) + goto fail; + + if (!ublk_check_ubuf_dir(req, dir)) + goto fail; + + *off = buf_off; + return req; +fail: + ublk_put_req_ref(ubq, req); + return ERR_PTR(-EACCES); +} + +static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); + + return ret; +} + +static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct ublk_queue *ubq; + struct request *req; + size_t buf_off; + size_t ret; + + req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE); + ubq = req->mq_hctx->driver_data; + ublk_put_req_ref(ubq, req); + + return ret; +} + static const struct file_operations ublk_ch_fops = { .owner = THIS_MODULE, .open = ublk_ch_open, .release = ublk_ch_release, .llseek = no_llseek, + .read_iter = ublk_ch_read_iter, + .write_iter = ublk_ch_write_iter, .uring_cmd = ublk_ch_uring_cmd, .mmap = ublk_ch_mmap, }; @@ -2362,6 +2510,9 @@ static int __init ublk_init(void) { int ret; + BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET + + UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET); + init_waitqueue_head(&ublk_idr_wq); ret = misc_register(&ublk_misc); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 640bf687b94a..c0c1632c671e 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -93,9 +93,29 @@ #define UBLKSRV_CMD_BUF_OFFSET 0 #define UBLKSRV_IO_BUF_OFFSET 0x80000000 -/* tag bit is 12bit, so at most 4096 IOs for each queue */ +/* tag bit is 16bit, so far limit at most 4096 IOs for each queue */ #define UBLK_MAX_QUEUE_DEPTH 4096 +/* single IO buffer max size is 32MB */ +#define UBLK_IO_BUF_OFF 0 +#define UBLK_IO_BUF_BITS 25 +#define UBLK_IO_BUF_BITS_MASK ((1ULL << UBLK_IO_BUF_BITS) - 1) + +/* so at most 64K IOs for each queue */ +#define UBLK_TAG_OFF UBLK_IO_BUF_BITS +#define UBLK_TAG_BITS 16 +#define UBLK_TAG_BITS_MASK ((1ULL << UBLK_TAG_BITS) - 1) + +/* max 4096 queues */ +#define UBLK_QID_OFF (UBLK_TAG_OFF + UBLK_TAG_BITS) +#define UBLK_QID_BITS 12 +#define UBLK_QID_BITS_MASK ((1ULL << UBLK_QID_BITS) - 1) + +#define UBLK_MAX_NR_QUEUES (1U << UBLK_QID_BITS) + +#define UBLKSRV_IO_BUF_TOTAL_BITS (UBLK_QID_OFF + UBLK_QID_BITS) +#define UBLKSRV_IO_BUF_TOTAL_SIZE (1ULL << UBLKSRV_IO_BUF_TOTAL_BITS) + /* * zero copy requires 4k block size, and can remap ublk driver's io * request into ublksrv's vm space From 1172d5b8beca6b899deb9f7f2850e7e47ec16198 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 19 May 2023 14:50:30 +0800 Subject: [PATCH 032/266] ublk: support user copy Currently copy between io request buffer(pages) and userspace buffer is done inside ublk_map_io() or ublk_unmap_io(). This way performs very well in case of pre-allocated userspace io buffer. For dynamically allocated or external userspace backend io buffer, UBLK_F_NEED_GET_DATA is added for ublk server to provide buffer by one extra command communication for WRITE request. For READ, userspace simply provides buffer, but can't know when the buffer is done[1]. Add UBLK_F_USER_COPY by moving io data copy out of kernel by providing read()/write() on /dev/ublkcN, and simply let ublk server do the io data copy. This way makes both side cleaner, the cost is that one extra syscall for copy io data between request and backend buffer. With UBLK_F_USER_COPY, it actually becomes possible to run per-io zero copy now, such as, only do zero copy for big size IO, so it can be thought as one prep patch for supporting zero copy. Meantime zero copy still needs to expose read()/write() buffer for some corner case, such as passthrough IO. [1] READ buffer in UBLK_F_NEED_GET_DATA https://lore.kernel.org/linux-block/116d8a56-0881-56d3-9bcc-78ff3e1dc4e5@linux.alibaba.com/T/#m23bd4b8634c0a054e6797063167b469949a247bb ublksrv loop usercopy code: https://github.com/ming1/ubdsrv/commits/usercopy Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230519065030.351216-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 58 ++++++++++++++++++++++++++++------- include/uapi/linux/ublk_cmd.h | 3 ++ 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ec40ac4f9af3..e00733b6fea8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -55,7 +55,8 @@ | UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ | UBLK_F_UNPRIVILEGED_DEV \ - | UBLK_F_CMD_IOCTL_ENCODE) + | UBLK_F_CMD_IOCTL_ENCODE \ + | UBLK_F_USER_COPY) /* All UBLK_PARAM_TYPE_* should be included here */ #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \ @@ -312,9 +313,18 @@ static int ublk_apply_params(struct ublk_device *ub) return 0; } +static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) +{ + return ubq->flags & UBLK_F_USER_COPY; +} + static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) { - return false; + /* + * read()/write() is involved in user copy, so request reference + * has to be grabbed + */ + return ublk_support_user_copy(ubq); } static inline void ublk_init_req_ref(const struct ublk_queue *ubq, @@ -591,6 +601,9 @@ static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, { const unsigned int rq_bytes = blk_rq_bytes(req); + if (ublk_support_user_copy(ubq)) + return rq_bytes; + /* * no zero copy, we delay copy WRITE request data into ublksrv * context and the big benefit is that pinning pages in current @@ -615,6 +628,9 @@ static int ublk_unmap_io(const struct ublk_queue *ubq, { const unsigned int rq_bytes = blk_rq_bytes(req); + if (ublk_support_user_copy(ubq)) + return rq_bytes; + if (ublk_need_unmap_req(req)) { struct iov_iter iter; struct iovec iov; @@ -1390,6 +1406,11 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA)) goto out; + if (ublk_support_user_copy(ubq) && ub_cmd->addr) { + ret = -EINVAL; + goto out; + } + ret = ublk_check_cmd_op(cmd_op); if (ret) goto out; @@ -1408,23 +1429,34 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, */ if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) goto out; - /* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */ - if (!ub_cmd->addr && !ublk_need_get_data(ubq)) - goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * FETCH_RQ has to provide IO buffer if NEED GET + * DATA is not enabled + */ + if (!ub_cmd->addr && !ublk_need_get_data(ubq)) + goto out; + } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_mark_io_ready(ub, ubq); break; case UBLK_IO_COMMIT_AND_FETCH_REQ: req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); - /* - * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is - * not enabled or it is Read IO. - */ - if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ)) - goto out; + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; + + if (!ublk_support_user_copy(ubq)) { + /* + * COMMIT_AND_FETCH_REQ has to provide IO buffer if + * NEED GET DATA is not enabled or it is Read IO. + */ + if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || + req_op(req) == REQ_OP_READ)) + goto out; + } ublk_fill_io_cmd(io, cmd, ub_cmd->addr); ublk_commit_completion(ub, ub_cmd); break; @@ -1996,6 +2028,10 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE | UBLK_F_URING_CMD_COMP_IN_TASK; + /* GET_DATA isn't needed any more with USER_COPY */ + if (ub->dev_info.flags & UBLK_F_USER_COPY) + ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA; + /* We are not ready to support zero copy */ ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index c0c1632c671e..54b5b0aeefca 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -165,6 +165,9 @@ /* use ioctl encoding for uring command */ #define UBLK_F_CMD_IOCTL_ENCODE (1UL << 6) +/* Copy between request and user buffer by pread()/pwrite() */ +#define UBLK_F_USER_COPY (1UL << 7) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 From f80dd11dd1d07ada04a9fcd57032fa82e136462d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 19 May 2023 15:03:46 -0700 Subject: [PATCH 033/266] block: BFQ: Move an invariant check Check bfqq->dispatched for each BFQ queue instead of checking it for an invalid bfqq pointer. Fixes: 3e49c1e4a615 ("block: BFQ: Add several invariant checks") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230519220347.3643295-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c5727afad159..09bbbcf9e049 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5405,6 +5405,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) WARN_ON_ONCE(!list_empty(&bfqq->fifo)); WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list)); + WARN_ON_ONCE(bfqq->dispatched); kmem_cache_free(bfq_pool, bfqq); bfqg_and_blkg_put(bfqg); @@ -7150,7 +7151,6 @@ static void bfq_exit_queue(struct elevator_queue *e) for (actuator = 0; actuator < bfqd->num_actuators; actuator++) WARN_ON_ONCE(bfqd->rq_in_driver[actuator]); WARN_ON_ONCE(bfqd->tot_rq_in_driver); - WARN_ON_ONCE(bfqq->dispatched); hrtimer_cancel(&bfqd->idle_slice_timer); From 712c7364655f69827d0b96f69594886ecbfb412f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2023 06:45:03 +0200 Subject: [PATCH 034/266] block: don't plug in blkdev_write_iter For direct I/O writes that issues more than a single bio, the plugging is already done in __blkdev_direct_IO. For synchronous buffered writes the plugging is done deep down in writeback_inodes_wb / wb_writeback. For the other cases there is no point in plugging as as single bio or no bio at all is submitted. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230520044503.334444-1-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/fops.c b/block/fops.c index d2e6be4e3d1c..102ee85fc6ee 100644 --- a/block/fops.c +++ b/block/fops.c @@ -520,7 +520,6 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) struct block_device *bdev = iocb->ki_filp->private_data; struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); - struct blk_plug plug; size_t shorted = 0; ssize_t ret; @@ -545,12 +544,10 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); - blk_finish_plug(&plug); return ret; } From b8b637d770ef7aa9bc3971670cc8532b1f0d757e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 20 May 2023 23:11:34 +0800 Subject: [PATCH 035/266] ublk: fix build warning on iov_iter_get_pages2 Return type of iov_iter_get_pages2() is ssize_t instead of size_t, so fix it. Fixes: 981f95a571e3 ("ublk: cleanup ublk_copy_user_pages") Reported-by: kernel test robot Reported-by: Julia Lawall Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230520151134.459679-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e00733b6fea8..539eada32861 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -564,7 +564,8 @@ static size_t ublk_copy_user_pages(const struct request *req, while (iov_iter_count(uiter) && iter.bio) { unsigned nr_pages; - size_t len, off; + ssize_t len; + size_t off; int i; len = iov_iter_get_pages2(uiter, iter.pages, From 712fd23a90eed6a73ea5135a500e59d30356d4f1 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 22 May 2023 16:53:55 +0800 Subject: [PATCH 036/266] block: remove redundant req_op in blk_rq_is_passthrough op &= REQ_OP_MASK in blk_op_is_passthrough() is exactly what req_op() do. Therefore, it is redundant to call req_op() for blk_op_is_passthrough(). Signed-off-by: Li Nan Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20230522085355.1740772-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d778cb6b2112..59b52ec155b1 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -201,7 +201,7 @@ static inline enum req_op req_op(const struct request *req) static inline bool blk_rq_is_passthrough(struct request *rq) { - return blk_op_is_passthrough(req_op(rq)); + return blk_op_is_passthrough(rq->cmd_flags); } static inline unsigned short req_get_ioprio(struct request *req) From a13bd91be22318768d55470cbc0b0f4488ef9edf Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 14 Apr 2023 16:40:08 +0800 Subject: [PATCH 037/266] block/rq_qos: protect rq_qos apis with a new lock commit 50e34d78815e ("block: disable the elevator int del_gendisk") move rq_qos_exit() from disk_release() to del_gendisk(), this will introduce some problems: 1) If rq_qos_add() is triggered by enabling iocost/iolatency through cgroupfs, then it can concurrent with del_gendisk(), it's not safe to write 'q->rq_qos' concurrently. 2) Activate cgroup policy that is relied on rq_qos will call rq_qos_add() and blkcg_activate_policy(), and if rq_qos_exit() is called in the middle, null-ptr-dereference will be triggered in blkcg_activate_policy(). 3) blkg_conf_open_bdev() can call blkdev_get_no_open() first to find the disk, then if rq_qos_exit() from del_gendisk() is done before rq_qos_add(), then memory will be leaked. This patch add a new disk level mutex 'rq_qos_mutex': 1) The lock will protect rq_qos_exit() directly. 2) For wbt that doesn't relied on blk-cgroup, rq_qos_add() can only be called from disk initialization for now because wbt can't be destructed until rq_qos_exit(), so it's safe not to protect wbt for now. Hoever, in case that rq_qos dynamically destruction is supported in the furture, this patch also protect rq_qos_add() from wbt_init() directly, this is enough because blk-sysfs already synchronize writers with disk removal. 3) For iocost and iolatency, in order to synchronize disk removal and cgroup configuration, the lock is held after blkdev_get_no_open() from blkg_conf_open_bdev(), and is released in blkg_conf_exit(). In order to fix the above memory leak, disk_live() is checked after holding the new lock. Fixes: 50e34d78815e ("block: disable the elevator int del_gendisk") Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230414084008.2085155-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 +++++++++ block/blk-core.c | 1 + block/blk-rq-qos.c | 20 ++++++-------------- block/blk-wbt.c | 2 ++ include/linux/blkdev.h | 1 + 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ce64dd73cfe..1c6716f51fff 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -748,6 +748,13 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) return -ENODEV; } + mutex_lock(&bdev->bd_queue->rq_qos_mutex); + if (!disk_live(bdev->bd_disk)) { + blkdev_put_no_open(bdev); + mutex_unlock(&bdev->bd_queue->rq_qos_mutex); + return -ENODEV; + } + ctx->body = input; ctx->bdev = bdev; return 0; @@ -892,6 +899,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) + __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); @@ -899,6 +907,7 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx) } if (ctx->bdev) { + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); blkdev_put_no_open(ctx->bdev); ctx->body = NULL; ctx->bdev = NULL; diff --git a/block/blk-core.c b/block/blk-core.c index 00c74330fa92..2ae22bebeb3e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -420,6 +420,7 @@ struct request_queue *blk_alloc_queue(int node_id) mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d8cc820a365e..167be74df4ee 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -288,11 +288,13 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, void rq_qos_exit(struct request_queue *q) { + mutex_lock(&q->rq_qos_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); } + mutex_unlock(&q->rq_qos_mutex); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, @@ -300,6 +302,8 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, { struct request_queue *q = disk->queue; + lockdep_assert_held(&q->rq_qos_mutex); + rqos->disk = disk; rqos->id = id; rqos->ops = ops; @@ -307,18 +311,13 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, /* * No IO can be in-flight when adding rqos, so freeze queue, which * is fine since we only support rq_qos for blk-mq queue. - * - * Reuse ->queue_lock for protecting against other concurrent - * rq_qos adding/deleting */ blk_mq_freeze_queue(q); - spin_lock_irq(&q->queue_lock); if (rq_qos_id(q, rqos->id)) goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; - spin_unlock_irq(&q->queue_lock); blk_mq_unfreeze_queue(q); @@ -330,7 +329,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, return 0; ebusy: - spin_unlock_irq(&q->queue_lock); blk_mq_unfreeze_queue(q); return -EBUSY; } @@ -340,21 +338,15 @@ void rq_qos_del(struct rq_qos *rqos) struct request_queue *q = rqos->disk->queue; struct rq_qos **cur; - /* - * See comment in rq_qos_add() about freezing queue & using - * ->queue_lock. - */ - blk_mq_freeze_queue(q); + lockdep_assert_held(&q->rq_qos_mutex); - spin_lock_irq(&q->queue_lock); + blk_mq_freeze_queue(q); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } - spin_unlock_irq(&q->queue_lock); - blk_mq_unfreeze_queue(q); mutex_lock(&q->debugfs_mutex); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index e49a48684532..53bf5aa6f9ad 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -942,7 +942,9 @@ int wbt_init(struct gendisk *disk) /* * Assign rwb and add the stats callback. */ + mutex_lock(&q->rq_qos_mutex); ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); + mutex_unlock(&q->rq_qos_mutex); if (ret) goto err_free; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe99948688df..b2ac587e3402 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -392,6 +392,7 @@ struct request_queue { struct blk_queue_stats *stats; struct rq_qos *rq_qos; + struct mutex rq_qos_mutex; const struct blk_mq_ops *mq_ops; From 5a80bd075f3bce24793ae1aeb06066895ec5aef0 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Sat, 20 May 2023 08:40:57 +0000 Subject: [PATCH 038/266] block: introduce block_io_start/block_io_done tracepoints Currently, several BCC ([0]) tools (biosnoop/biostacks/biotop) use kprobes to blk_account_io_start/blk_account_io_done to implement their functionalities. This is fragile because the target kernel functions may be renamed ([1]) or inlined ([2]). So introduce two new tracepoints for such use cases. [0]: https://github.com/iovisor/bcc [1]: https://github.com/iovisor/bcc/issues/3954 [2]: https://github.com/iovisor/bcc/issues/4261 Tested-by: Francis Laniel Signed-off-by: Hengqi Chen Tested-by: Yonghong Song Link: https://lore.kernel.org/r/20230520084057.1467003-1-hengqi.chen@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++++ include/trace/events/block.h | 26 ++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 551e7760f45e..1749f5890606 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -962,6 +962,8 @@ EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { + trace_block_io_done(req); + /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the @@ -981,6 +983,8 @@ static inline void blk_account_io_done(struct request *req, u64 now) static inline void blk_account_io_start(struct request *req) { + trace_block_io_start(req); + if (blk_do_io_stat(req)) { /* * All non-passthrough requests are created from a bio with one diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 7f4dfbdf12a6..40e60c33cc6f 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -245,6 +245,32 @@ DEFINE_EVENT(block_rq, block_rq_merge, TP_ARGS(rq) ); +/** + * block_io_start - insert a request for execution + * @rq: block IO operation request + * + * Called when block operation request @rq is queued for execution + */ +DEFINE_EVENT(block_rq, block_io_start, + + TP_PROTO(struct request *rq), + + TP_ARGS(rq) +); + +/** + * block_io_done - block IO operation request completed + * @rq: block IO operation request + * + * Called when block operation request @rq is completed + */ +DEFINE_EVENT(block_rq, block_io_done, + + TP_PROTO(struct request *rq), + + TP_ARGS(rq) +); + /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation From a450f49708ea2ccabd1c5d2fe8a702ca5ef77941 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 21:57:39 +0100 Subject: [PATCH 039/266] iomap: Don't get an reference on ZERO_PAGE for direct I/O block zeroing ZERO_PAGE can't go away, no need to hold an extra reference. Signed-off-by: David Howells Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig cc: Al Viro cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christian Brauner Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230522205744.2825689-2-dhowells@redhat.com Signed-off-by: Jens Axboe --- fs/iomap/direct-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 019cc87d0fb3..66a9f10e3207 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -203,7 +203,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - get_page(page); + bio_set_flag(bio, BIO_NO_PAGE_REF); __bio_add_page(bio, page, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); } From 09e8c253415b8eb9ca29a2131d2ebf17743534c5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 21:57:40 +0100 Subject: [PATCH 040/266] block: Fix bio_flagged() so that gcc can better optimise it Fix bio_flagged() so that multiple instances of it, such as: if (bio_flagged(bio, BIO_PAGE_REFFED) || bio_flagged(bio, BIO_PAGE_PINNED)) can be combined by the gcc optimiser into a single test in assembly (arguably, this is a compiler optimisation issue[1]). The missed optimisation stems from bio_flagged() comparing the result of the bitwise-AND to zero. This results in an out-of-line bio_release_page() being compiled to something like: <+0>: mov 0x14(%rdi),%eax <+3>: test $0x1,%al <+5>: jne 0xffffffff816dac53 <+7>: test $0x2,%al <+9>: je 0xffffffff816dac5c <+11>: movzbl %sil,%esi <+15>: jmp 0xffffffff816daba1 <__bio_release_pages> <+20>: jmp 0xffffffff81d0b800 <__x86_return_thunk> However, the test is superfluous as the return type is bool. Removing it results in: <+0>: testb $0x3,0x14(%rdi) <+4>: je 0xffffffff816e4af4 <+6>: movzbl %sil,%esi <+10>: jmp 0xffffffff816dab7c <__bio_release_pages> <+15>: jmp 0xffffffff81d0b7c0 <__x86_return_thunk> instead. Also, the MOVZBL instruction looks unnecessary[2] - I think it's just 're-booling' the mark_dirty parameter. Signed-off-by: David Howells Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard cc: Jens Axboe cc: linux-block@vger.kernel.org Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108370 [1] Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108371 [2] Link: https://lore.kernel.org/r/167391056756.2311931.356007731815807265.stgit@warthog.procyon.org.uk/ # v6 Reviewed-by: Christian Brauner Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230522205744.2825689-3-dhowells@redhat.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index b3e7529ff55e..7f53be035cf0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -229,7 +229,7 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count) static inline bool bio_flagged(struct bio *bio, unsigned int bit) { - return (bio->bi_flags & (1U << bit)) != 0; + return bio->bi_flags & (1U << bit); } static inline void bio_set_flag(struct bio *bio, unsigned int bit) From e51bab4e20586fb3afc30536b776a97ed8ffb681 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 May 2023 21:57:41 +0100 Subject: [PATCH 041/266] block: Replace BIO_NO_PAGE_REF with BIO_PAGE_REFFED with inverted logic Replace BIO_NO_PAGE_REF with a BIO_PAGE_REFFED flag that has the inverted meaning is only set when a page reference has been acquired that needs to be released by bio_release_pages(). Signed-off-by: Christoph Hellwig Signed-off-by: David Howells Reviewed-by: John Hubbard cc: Al Viro cc: Jens Axboe cc: Jan Kara cc: Matthew Wilcox cc: Logan Gunthorpe cc: linux-block@vger.kernel.org Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230522205744.2825689-4-dhowells@redhat.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-map.c | 1 + fs/direct-io.c | 2 ++ fs/iomap/direct-io.c | 1 - include/linux/bio.h | 2 +- include/linux/blk_types.h | 2 +- 6 files changed, 6 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index 043944fd46eb..8516adeaea26 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1191,7 +1191,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = size; - bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); } @@ -1336,6 +1335,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } + bio_set_flag(bio, BIO_PAGE_REFFED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); diff --git a/block/blk-map.c b/block/blk-map.c index 04c55f1c492e..33d9f6e89ba6 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -282,6 +282,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (blk_queue_pci_p2pdma(rq->q)) extraction_flags |= ITER_ALLOW_P2PDMA; + bio_set_flag(bio, BIO_PAGE_REFFED); while (iov_iter_count(iter)) { struct page **pages, *stack_pages[UIO_FASTIOV]; ssize_t bytes; diff --git a/fs/direct-io.c b/fs/direct-io.c index 0b380bb8a81e..ad20f3428bab 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -402,6 +402,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_aio; else bio->bi_end_io = dio_bio_end_io; + /* for now require references for all pages */ + bio_set_flag(bio, BIO_PAGE_REFFED); sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 66a9f10e3207..08873f0627dd 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -203,7 +203,6 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; - bio_set_flag(bio, BIO_NO_PAGE_REF); __bio_add_page(bio, page, len, 0); iomap_dio_submit_bio(iter, dio, bio, pos); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 7f53be035cf0..0922729acd26 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -488,7 +488,7 @@ void zero_fill_bio(struct bio *bio); static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { - if (!bio_flagged(bio, BIO_NO_PAGE_REF)) + if (bio_flagged(bio, BIO_PAGE_REFFED)) __bio_release_pages(bio, mark_dirty); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 740afe80f297..dfd2c2cb909d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -323,7 +323,7 @@ struct bio { * bio flags */ enum { - BIO_NO_PAGE_REF, /* don't put release vec pages */ + BIO_PAGE_REFFED, /* put pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ BIO_QUIET, /* Make BIO Quiet */ From fd363244e883323e1ac9412d96fd22b51e255b0c Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 21:57:42 +0100 Subject: [PATCH 042/266] block: Add BIO_PAGE_PINNED and associated infrastructure Add BIO_PAGE_PINNED to indicate that the pages in a bio are pinned (FOLL_PIN) and that the pin will need removing. Signed-off-by: David Howells Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard cc: Al Viro cc: Jens Axboe cc: Jan Kara cc: Matthew Wilcox cc: Logan Gunthorpe cc: linux-block@vger.kernel.org Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230522205744.2825689-5-dhowells@redhat.com Signed-off-by: Jens Axboe --- block/bio.c | 6 +++--- block/blk.h | 12 ++++++++++++ include/linux/bio.h | 3 ++- include/linux/blk_types.h | 1 + 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index 8516adeaea26..17bd01ecde36 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1169,7 +1169,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) bio_for_each_segment_all(bvec, bio, iter_all) { if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + bio_release_page(bio, bvec->bv_page); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1489,8 +1489,8 @@ void bio_set_pages_dirty(struct bio *bio) * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one put_page() against each page and will run one - * bio_put() against the BIO. + * here on. It will unpin each page and will run one bio_put() against the + * BIO. */ static void bio_dirty_fn(struct work_struct *work); diff --git a/block/blk.h b/block/blk.h index 9f171b8f1e34..7ad7cb6ffa01 100644 --- a/block/blk.h +++ b/block/blk.h @@ -420,6 +420,18 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); +/* + * Clean up a page appropriately, where the page may be pinned, may have a + * ref taken on it or neither. + */ +static inline void bio_release_page(struct bio *bio, struct page *page) +{ + if (bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_page(page); + else if (bio_flagged(bio, BIO_PAGE_REFFED)) + put_page(page); +} + struct request_queue *blk_alloc_queue(int node_id); int disk_scan_partitions(struct gendisk *disk, fmode_t mode); diff --git a/include/linux/bio.h b/include/linux/bio.h index 0922729acd26..8588bcfbc6ef 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -488,7 +488,8 @@ void zero_fill_bio(struct bio *bio); static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { - if (bio_flagged(bio, BIO_PAGE_REFFED)) + if (bio_flagged(bio, BIO_PAGE_REFFED) || + bio_flagged(bio, BIO_PAGE_PINNED)) __bio_release_pages(bio, mark_dirty); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index dfd2c2cb909d..8ef209e3aa96 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -323,6 +323,7 @@ struct bio { * bio flags */ enum { + BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ BIO_PAGE_REFFED, /* put pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ From a7e689dd1c06e0cbd6d216a6868e33099d8fc8d8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 21:57:43 +0100 Subject: [PATCH 043/266] block: Convert bio_iov_iter_get_pages to use iov_iter_extract_pages This will pin pages or leave them unaltered rather than getting a ref on them as appropriate to the iterator. The pages need to be pinned for DIO rather than having refs taken on them to prevent VM copy-on-write from malfunctioning during a concurrent fork() (the result of the I/O could otherwise end up being affected by/visible to the child process). Signed-off-by: David Howells Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard cc: Al Viro cc: Jens Axboe cc: Jan Kara cc: Matthew Wilcox cc: Logan Gunthorpe cc: linux-block@vger.kernel.org Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230522205744.2825689-6-dhowells@redhat.com Signed-off-by: Jens Axboe --- block/bio.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/block/bio.c b/block/bio.c index 17bd01ecde36..798cc4cf3bd2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1205,7 +1205,7 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, } if (same_page) - put_page(page); + bio_release_page(bio, page); return 0; } @@ -1219,7 +1219,7 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, queue_max_zone_append_sectors(q), &same_page) != len) return -EINVAL; if (same_page) - put_page(page); + bio_release_page(bio, page); return 0; } @@ -1230,10 +1230,10 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. - * For multi-segment *iter, this function only adds pages from the - * next non-empty segment of the iov iterator. + * Extracts pages from *iter and appends them to @bio's bvec array. The pages + * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. + * For a multi-segment *iter, this function only adds pages from the next + * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1265,9 +1265,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. */ - size = iov_iter_get_pages(iter, pages, - UINT_MAX - bio->bi_iter.bi_size, - nr_pages, &offset, extraction_flags); + size = iov_iter_extract_pages(iter, &pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; @@ -1300,7 +1300,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) iov_iter_revert(iter, left); out: while (i < nr_pages) - put_page(pages[i++]); + bio_release_page(bio, pages[i++]); return ret; } @@ -1335,7 +1335,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } - bio_set_flag(bio, BIO_PAGE_REFFED); + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); From 403b6fb8dac1e9407c04652cedd92285c5ae9aa5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 22 May 2023 21:57:44 +0100 Subject: [PATCH 044/266] block: convert bio_map_user_iov to use iov_iter_extract_pages This will pin pages or leave them unaltered rather than getting a ref on them as appropriate to the iterator. The pages need to be pinned for DIO rather than having refs taken on them to prevent VM copy-on-write from malfunctioning during a concurrent fork() (the result of the I/O could otherwise end up being visible to/affected by the child process). Signed-off-by: David Howells Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard cc: Al Viro cc: Jens Axboe cc: Jan Kara cc: Matthew Wilcox cc: Logan Gunthorpe cc: linux-block@vger.kernel.org Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230522205744.2825689-7-dhowells@redhat.com Signed-off-by: Jens Axboe --- block/blk-map.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 33d9f6e89ba6..3551c3ff17cf 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -281,22 +281,21 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (blk_queue_pci_p2pdma(rq->q)) extraction_flags |= ITER_ALLOW_P2PDMA; + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); - bio_set_flag(bio, BIO_PAGE_REFFED); while (iov_iter_count(iter)) { - struct page **pages, *stack_pages[UIO_FASTIOV]; + struct page *stack_pages[UIO_FASTIOV]; + struct page **pages = stack_pages; ssize_t bytes; size_t offs; int npages; - if (nr_vecs <= ARRAY_SIZE(stack_pages)) { - pages = stack_pages; - bytes = iov_iter_get_pages(iter, pages, LONG_MAX, - nr_vecs, &offs, extraction_flags); - } else { - bytes = iov_iter_get_pages_alloc(iter, &pages, - LONG_MAX, &offs, extraction_flags); - } + if (nr_vecs > ARRAY_SIZE(stack_pages)) + pages = NULL; + + bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, + nr_vecs, extraction_flags, &offs); if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -318,7 +317,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (!bio_add_hw_page(rq->q, bio, page, n, offs, max_sectors, &same_page)) { if (same_page) - put_page(page); + bio_release_page(bio, page); break; } @@ -330,7 +329,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, * release the pages we didn't map into the bio, if any */ while (j < npages) - put_page(pages[j++]); + bio_release_page(bio, pages[j++]); if (pages != stack_pages) kvfree(pages); /* couldn't stuff something into bio? */ From 539050f92ea7666bca17c2c380d8071d2f93dcde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 30 May 2023 19:09:57 +0200 Subject: [PATCH 045/266] block: constify partition prober array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The array is never modified so it can be const. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202304191640.SkNk7kVN-lkp@intel.com/ Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20230419-const-partition-v3-1-4e14e48be367@weissschuh.net Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 49e0496ff23c..650603dbe557 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -12,7 +12,7 @@ #include #include "check.h" -static int (*check_part[])(struct parsed_partitions *) = { +static int (*const check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. From cdb37f73cf05631c4f7401f2cd99878733c0c3d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 30 May 2023 19:09:58 +0200 Subject: [PATCH 046/266] block: constify struct part_type part_type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct is never modified so it can be const. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20230419-const-partition-v3-2-4e14e48be367@weissschuh.net Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- include/linux/blkdev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 650603dbe557..2bc21063edef 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -256,7 +256,7 @@ static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) return 0; } -struct device_type part_type = { +const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b2ac587e3402..d89c2da14698 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -41,7 +41,7 @@ struct blk_stat_callback; struct blk_crypto_profile; extern const struct device_type disk_type; -extern struct device_type part_type; +extern const struct device_type part_type; extern struct class block_class; /* From 0bd478005cfc7f50ccb769744d952e9687ee75b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 30 May 2023 19:09:59 +0200 Subject: [PATCH 047/266] block: constify struct part_attr_group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct is never modified so it can be const. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20230419-const-partition-v3-3-4e14e48be367@weissschuh.net Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 2bc21063edef..d5f5633bf725 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -228,7 +228,7 @@ static struct attribute *part_attrs[] = { NULL }; -static struct attribute_group part_attr_group = { +static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; From a378f6a40fac4a2f1812adea7017613d2bd5dab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 30 May 2023 19:10:00 +0200 Subject: [PATCH 048/266] block: constify the whole_disk device_attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct is never modified so it can be const. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20230419-const-partition-v3-4-4e14e48be367@weissschuh.net Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index d5f5633bf725..82d26427deae 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -288,7 +288,7 @@ static ssize_t whole_disk_show(struct device *dev, { return 0; } -static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); +static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with open_mutex held, before a disk can be opened or From c8070b78751955e59b42457b974bea4a4fe00187 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 May 2023 22:41:40 +0100 Subject: [PATCH 049/266] mm: Don't pin ZERO_PAGE in pin_user_pages() Make pin_user_pages*() leave a ZERO_PAGE unpinned if it extracts a pointer to it from the page tables and make unpin_user_page*() correspondingly ignore a ZERO_PAGE when unpinning. We don't want to risk overrunning a zero page's refcount as we're only allowed ~2 million pins on it - something that userspace can conceivably trigger. Add a pair of functions to test whether a page or a folio is a ZERO_PAGE. Signed-off-by: David Howells cc: Christoph Hellwig cc: David Hildenbrand cc: Lorenzo Stoakes cc: Andrew Morton cc: Jens Axboe cc: Al Viro cc: Matthew Wilcox cc: Jan Kara cc: Jeff Layton cc: Jason Gunthorpe cc: Logan Gunthorpe cc: Hillf Danton cc: Christian Brauner cc: Linus Torvalds cc: linux-fsdevel@vger.kernel.org cc: linux-block@vger.kernel.org cc: linux-kernel@vger.kernel.org cc: linux-mm@kvack.org Reviewed-by: Lorenzo Stoakes Reviewed-by: Christoph Hellwig Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20230526214142.958751-2-dhowells@redhat.com Signed-off-by: Jens Axboe --- Documentation/core-api/pin_user_pages.rst | 6 +++++ include/linux/mm.h | 26 +++++++++++++++++-- mm/gup.c | 31 ++++++++++++++++++++++- 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 9fb0b1080d3b..d3c1f6d8c0e0 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -112,6 +112,12 @@ pages: This also leads to limitations: there are only 31-10==21 bits available for a counter that increments 10 bits at a time. +* Because of that limitation, special handling is applied to the zero pages + when using FOLL_PIN. We only pretend to pin a zero page - we don't alter its + refcount or pincount at all (it is permanent, so there's no need). The + unpinning functions also don't do anything to a zero page. This is + transparent to the caller. + * Callers must specifically request "dma-pinned tracking of pages". In other words, just calling get_user_pages() will not suffice; a new set of functions, pin_user_page() and related, must be used. diff --git a/include/linux/mm.h b/include/linux/mm.h index 27ce77080c79..3c2f6b452586 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1910,6 +1910,28 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, return page_maybe_dma_pinned(page); } +/** + * is_zero_page - Query if a page is a zero page + * @page: The page to query + * + * This returns true if @page is one of the permanent zero pages. + */ +static inline bool is_zero_page(const struct page *page) +{ + return is_zero_pfn(page_to_pfn(page)); +} + +/** + * is_zero_folio - Query if a folio is a zero page + * @folio: The folio to query + * + * This returns true if @folio is one of the permanent zero pages. + */ +static inline bool is_zero_folio(const struct folio *folio) +{ + return is_zero_page(&folio->page); +} + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION static inline bool is_longterm_pinnable_page(struct page *page) @@ -1920,8 +1942,8 @@ static inline bool is_longterm_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - /* The zero page may always be pinned */ - if (is_zero_pfn(page_to_pfn(page))) + /* The zero page can be "pinned" but gets special handling. */ + if (is_zero_page(page)) return true; /* Coherent device memory must always allow eviction. */ diff --git a/mm/gup.c b/mm/gup.c index bbe416236593..ad28261dcafd 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -51,7 +51,8 @@ static inline void sanity_check_pinned_pages(struct page **pages, struct page *page = *pages; struct folio *folio = page_folio(page); - if (!folio_test_anon(folio)) + if (is_zero_page(page) || + !folio_test_anon(folio)) continue; if (!folio_test_large(folio) || folio_test_hugetlb(folio)) VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page); @@ -131,6 +132,13 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) else if (flags & FOLL_PIN) { struct folio *folio; + /* + * Don't take a pin on the zero page - it's not going anywhere + * and it is used in a *lot* of places. + */ + if (is_zero_page(page)) + return page_folio(page); + /* * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a * right zone, so fail and let the caller fall back to the slow @@ -180,6 +188,8 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) { if (flags & FOLL_PIN) { + if (is_zero_folio(folio)) + return; node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs); if (folio_test_large(folio)) atomic_sub(refs, &folio->_pincount); @@ -224,6 +234,13 @@ int __must_check try_grab_page(struct page *page, unsigned int flags) if (flags & FOLL_GET) folio_ref_inc(folio); else if (flags & FOLL_PIN) { + /* + * Don't take a pin on the zero page - it's not going anywhere + * and it is used in a *lot* of places. + */ + if (is_zero_page(page)) + return 0; + /* * Similar to try_grab_folio(): be sure to *also* * increment the normal page refcount field at least once, @@ -3079,6 +3096,9 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page() will not remove pins from it. */ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -3110,6 +3130,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -3143,6 +3166,9 @@ EXPORT_SYMBOL(pin_user_pages_remote); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -3161,6 +3187,9 @@ EXPORT_SYMBOL(pin_user_pages); * pin_user_pages_unlocked() is the FOLL_PIN variant of * get_user_pages_unlocked(). Behavior is the same, except that this one sets * FOLL_PIN and rejects FOLL_GET. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) From 1101fb8f89e5fc548c4d0ad66750e98980291815 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 May 2023 22:41:41 +0100 Subject: [PATCH 050/266] mm: Provide a function to get an additional pin on a page Provide a function to get an additional pin on a page that we already have a pin on. This will be used in fs/direct-io.c when dispatching multiple bios to a page we've extracted from a user-backed iter rather than redoing the extraction. Signed-off-by: David Howells cc: Christoph Hellwig cc: David Hildenbrand cc: Lorenzo Stoakes cc: Andrew Morton cc: Jens Axboe cc: Al Viro cc: Matthew Wilcox cc: Jan Kara cc: Jeff Layton cc: Jason Gunthorpe cc: Logan Gunthorpe cc: Hillf Danton cc: Christian Brauner cc: Linus Torvalds cc: linux-fsdevel@vger.kernel.org cc: linux-block@vger.kernel.org cc: linux-kernel@vger.kernel.org cc: linux-mm@kvack.org Reviewed-by: Christoph Hellwig Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20230526214142.958751-3-dhowells@redhat.com Signed-off-by: Jens Axboe --- include/linux/mm.h | 1 + mm/gup.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c2f6b452586..200068d98686 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2405,6 +2405,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); +void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, diff --git a/mm/gup.c b/mm/gup.c index ad28261dcafd..0814576b7366 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -275,6 +275,33 @@ void unpin_user_page(struct page *page) } EXPORT_SYMBOL(unpin_user_page); +/** + * folio_add_pin - Try to get an additional pin on a pinned folio + * @folio: The folio to be pinned + * + * Get an additional pin on a folio we already have a pin on. Makes no change + * if the folio is a zero_page. + */ +void folio_add_pin(struct folio *folio) +{ + if (is_zero_folio(folio)) + return; + + /* + * Similar to try_grab_folio(): be sure to *also* increment the normal + * page refcount field at least once, so that the page really is + * pinned. + */ + if (folio_test_large(folio)) { + WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1); + folio_ref_inc(folio); + atomic_inc(&folio->_pincount); + } else { + WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS); + folio_ref_add(folio, GUP_PIN_COUNTING_BIAS); + } +} + static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { From 1ccf164ec866cb8575ab9b2e219fca875089c60e Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 26 May 2023 22:41:42 +0100 Subject: [PATCH 051/266] block: Use iov_iter_extract_pages() and page pinning in direct-io.c Change the old block-based direct-I/O code to use iov_iter_extract_pages() to pin user pages or leave kernel pages unpinned rather than taking refs when submitting bios. This makes use of the preceding patches to not take pins on the zero page (thereby allowing insertion of zero pages in with pinned pages) and to get additional pins on pages, allowing an extracted page to be used in multiple bios without having to re-extract it. Signed-off-by: David Howells cc: Christoph Hellwig cc: David Hildenbrand cc: Lorenzo Stoakes cc: Andrew Morton cc: Jens Axboe cc: Al Viro cc: Matthew Wilcox cc: Jan Kara cc: Jeff Layton cc: Jason Gunthorpe cc: Logan Gunthorpe cc: Hillf Danton cc: Christian Brauner cc: Linus Torvalds cc: linux-fsdevel@vger.kernel.org cc: linux-block@vger.kernel.org cc: linux-kernel@vger.kernel.org cc: linux-mm@kvack.org Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230526214142.958751-4-dhowells@redhat.com Signed-off-by: Jens Axboe --- fs/direct-io.c | 72 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index ad20f3428bab..0643f1bb4b59 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -42,8 +42,8 @@ #include "internal.h" /* - * How many user pages to map in one call to get_user_pages(). This determines - * the size of a structure in the slab cache + * How many user pages to map in one call to iov_iter_extract_pages(). This + * determines the size of a structure in the slab cache */ #define DIO_PAGES 64 @@ -121,12 +121,13 @@ struct dio { struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ + bool is_pinned; /* T if we have pins on the pages */ void *private; /* copy from map_bh.b_private */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ - int page_errors; /* errno from get_user_pages() */ + int page_errors; /* err from iov_iter_extract_pages() */ int is_async; /* is IO async ? */ bool defer_completion; /* defer AIO completion to workqueue? */ bool should_dirty; /* if pages should be dirtied */ @@ -165,14 +166,14 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio) */ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) { + struct page **pages = dio->pages; const enum req_op dio_op = dio->opf & REQ_OP_MASK; ssize_t ret; - ret = iov_iter_get_pages2(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES, - &sdio->from); + ret = iov_iter_extract_pages(sdio->iter, &pages, LONG_MAX, + DIO_PAGES, 0, &sdio->from); if (ret < 0 && sdio->blocks_available && dio_op == REQ_OP_WRITE) { - struct page *page = ZERO_PAGE(0); /* * A memory fault, but the filesystem has some outstanding * mapped blocks. We need to use those blocks up to avoid @@ -180,8 +181,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) */ if (dio->page_errors == 0) dio->page_errors = ret; - get_page(page); - dio->pages[0] = page; + dio->pages[0] = ZERO_PAGE(0); sdio->head = 0; sdio->tail = 1; sdio->from = 0; @@ -201,9 +201,9 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) /* * Get another userspace page. Returns an ERR_PTR on error. Pages are - * buffered inside the dio so that we can call get_user_pages() against a - * decent number of pages, less frequently. To provide nicer use of the - * L1 cache. + * buffered inside the dio so that we can call iov_iter_extract_pages() + * against a decent number of pages, less frequently. To provide nicer use of + * the L1 cache. */ static inline struct page *dio_get_page(struct dio *dio, struct dio_submit *sdio) @@ -219,6 +219,18 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head]; } +static void dio_pin_page(struct dio *dio, struct page *page) +{ + if (dio->is_pinned) + folio_add_pin(page_folio(page)); +} + +static void dio_unpin_page(struct dio *dio, struct page *page) +{ + if (dio->is_pinned) + unpin_user_page(page); +} + /* * dio_complete() - called when all DIO BIO I/O has been completed * @@ -402,8 +414,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_aio; else bio->bi_end_io = dio_bio_end_io; - /* for now require references for all pages */ - bio_set_flag(bio, BIO_PAGE_REFFED); + if (dio->is_pinned) + bio_set_flag(bio, BIO_PAGE_PINNED); sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } @@ -444,8 +456,9 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) */ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) { - while (sdio->head < sdio->tail) - put_page(dio->pages[sdio->head++]); + if (dio->is_pinned) + unpin_user_pages(dio->pages + sdio->head, + sdio->tail - sdio->head); } /* @@ -676,7 +689,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, * * Return zero on success. Non-zero means the caller needs to start a new BIO. */ -static inline int dio_bio_add_page(struct dio_submit *sdio) +static inline int dio_bio_add_page(struct dio *dio, struct dio_submit *sdio) { int ret; @@ -688,7 +701,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio) */ if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) sdio->pages_in_io--; - get_page(sdio->cur_page); + dio_pin_page(dio, sdio->cur_page); sdio->final_block_in_bio = sdio->cur_page_block + (sdio->cur_page_len >> sdio->blkbits); ret = 0; @@ -743,11 +756,11 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, goto out; } - if (dio_bio_add_page(sdio) != 0) { + if (dio_bio_add_page(dio, sdio) != 0) { dio_bio_submit(dio, sdio); ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); if (ret == 0) { - ret = dio_bio_add_page(sdio); + ret = dio_bio_add_page(dio, sdio); BUG_ON(ret != 0); } } @@ -804,13 +817,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, */ if (sdio->cur_page) { ret = dio_send_cur_page(dio, sdio, map_bh); - put_page(sdio->cur_page); + dio_unpin_page(dio, sdio->cur_page); sdio->cur_page = NULL; if (ret) return ret; } - get_page(page); /* It is in dio */ + dio_pin_page(dio, page); /* It is in dio */ sdio->cur_page = page; sdio->cur_page_offset = offset; sdio->cur_page_len = len; @@ -825,7 +838,7 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, ret = dio_send_cur_page(dio, sdio, map_bh); if (sdio->bio) dio_bio_submit(dio, sdio); - put_page(sdio->cur_page); + dio_unpin_page(dio, sdio->cur_page); sdio->cur_page = NULL; } return ret; @@ -926,7 +939,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, ret = get_more_blocks(dio, sdio, map_bh); if (ret) { - put_page(page); + dio_unpin_page(dio, page); goto out; } if (!buffer_mapped(map_bh)) @@ -971,7 +984,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, /* AKPM: eargh, -ENOTBLK is a hack */ if (dio_op == REQ_OP_WRITE) { - put_page(page); + dio_unpin_page(dio, page); return -ENOTBLK; } @@ -984,7 +997,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, if (sdio->block_in_file >= i_size_aligned >> blkbits) { /* We hit eof */ - put_page(page); + dio_unpin_page(dio, page); goto out; } zero_user(page, from, 1 << blkbits); @@ -1024,7 +1037,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, sdio->next_block_for_io, map_bh); if (ret) { - put_page(page); + dio_unpin_page(dio, page); goto out; } sdio->next_block_for_io += this_chunk_blocks; @@ -1039,8 +1052,8 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, break; } - /* Drop the ref which was taken in get_user_pages() */ - put_page(page); + /* Drop the pin which was taken in get_user_pages() */ + dio_unpin_page(dio, page); } out: return ret; @@ -1135,6 +1148,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, /* will be released by direct_io_worker */ inode_lock(inode); } + dio->is_pinned = iov_iter_extract_will_pin(iter); /* Once we sampled i_size check for reads beyond EOF */ dio->i_size = i_size_read(inode); @@ -1259,7 +1273,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, ret2 = dio_send_cur_page(dio, &sdio, &map_bh); if (retval == 0) retval = ret2; - put_page(sdio.cur_page); + dio_unpin_page(dio, sdio.cur_page); sdio.cur_page = NULL; } if (sdio.bio) From cb58bf91b138c1a8b18cca9503308789e26e3522 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:24 -0700 Subject: [PATCH 052/266] swap: use __bio_add_page to add page to bio The swap code only adds a single page to a newly created bio. So use __bio_add_page() to add the page which is guaranteed to succeed in this case. This brings us closer to marking bio_add_page() as __must_check. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/5bdafd9de806b2dab92302b30eb7a3a5f10c37d9.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- mm/page_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 87b682d18850..684cd3c7b59b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -338,7 +338,7 @@ static void swap_writepage_bdev_sync(struct page *page, bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); bio.bi_iter.bi_sector = swap_page_sector(page); - bio_add_page(&bio, page, thp_size(page), 0); + __bio_add_page(&bio, page, thp_size(page), 0); bio_associate_blkg_from_page(&bio, page); count_swpout_vm_event(page); @@ -360,7 +360,7 @@ static void swap_writepage_bdev_async(struct page *page, GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_write; - bio_add_page(bio, page, thp_size(page), 0); + __bio_add_page(bio, page, thp_size(page), 0); bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); @@ -468,7 +468,7 @@ static void swap_readpage_bdev_sync(struct page *page, bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ); bio.bi_iter.bi_sector = swap_page_sector(page); - bio_add_page(&bio, page, thp_size(page), 0); + __bio_add_page(&bio, page, thp_size(page), 0); /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. @@ -488,7 +488,7 @@ static void swap_readpage_bdev_async(struct page *page, bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_end_io = end_swap_bio_read; - bio_add_page(bio, page, thp_size(page), 0); + __bio_add_page(bio, page, thp_size(page), 0); count_vm_event(PSWPIN); submit_bio(bio); } From 8f11f79f193c935da617375ba5ea4e768a73a094 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:25 -0700 Subject: [PATCH 053/266] drbd: use __bio_add_page to add page to bio The drbd code only adds a single page to a newly created bio. So use __bio_add_page() to add the page which is guaranteed to succeed in this case. This brings us closer to marking bio_add_page() as __must_check. Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/435007afac14f3766455559059d21843771fae53.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_bitmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 6ac8c54b44c7..85ca000a0564 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1043,9 +1043,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO, &drbd_md_io_bio_set); bio->bi_iter.bi_sector = on_disk_sector; - /* bio_add_page of a single page to an empty bio will always succeed, - * according to api. Do we want to assert that? */ - bio_add_page(bio, page, len, 0); + __bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = drbd_bm_endio; From fc8ac3e539561aff1c0a255d701d9412d425373c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:26 -0700 Subject: [PATCH 054/266] dm: dm-zoned: use __bio_add_page for adding single metadata page dm-zoned uses bio_add_page() for adding a single page to a freshly created metadata bio. Use __bio_add_page() instead as adding a single page to a new bio is always guaranteed to succeed. This brings us a step closer to marking bio_add_page() __must_check Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/55a0c8dad7550379647873b579dc7cfbe0191f96.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/md/dm-zoned-metadata.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 8f0896a6990b..9d3cca8e3dc9 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -577,7 +577,7 @@ static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd, bio->bi_iter.bi_sector = dmz_blk2sect(block); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); + __bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); return mblk; @@ -728,7 +728,7 @@ static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk, bio->bi_iter.bi_sector = dmz_blk2sect(block); bio->bi_private = mblk; bio->bi_end_io = dmz_mblock_bio_end_io; - bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); + __bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0); submit_bio(bio); return 0; @@ -752,7 +752,7 @@ static int dmz_rdwr_block(struct dmz_dev *dev, enum req_op op, bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO, GFP_NOIO); bio->bi_iter.bi_sector = dmz_blk2sect(block); - bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); + __bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0); ret = submit_bio_wait(bio); bio_put(bio); From 741af75d4027b1229fc6e62f4e3c4378dfe04897 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:27 -0700 Subject: [PATCH 055/266] fs: buffer: use __bio_add_page to add single page to bio The buffer_head submission code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Gou Hao Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/84ff2dcbe81b258a73ad900adb5266e208b61a4d.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- fs/buffer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a7fc561758b1..63da30ce946a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2760,8 +2760,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); - BUG_ON(bio->bi_iter.bi_size != bh->b_size); + __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; From 3c383235c51dcd6198d37ac3ac06e2acad79f981 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:28 -0700 Subject: [PATCH 056/266] md: use __bio_add_page to add single page The md-raid superblock writing code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Signed-of_-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Acked-by: Song Liu Link: https://lore.kernel.org/r/ca196f5e650e318106dbb4496eb6cbac4bc800bd.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e344b4b3444..6a559a7e89c0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -938,7 +938,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector; - bio_add_page(bio, page, size, 0); + __bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; @@ -979,7 +979,7 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bio.bi_iter.bi_sector = sector + rdev->new_data_offset; else bio.bi_iter.bi_sector = sector + rdev->data_offset; - bio_add_page(&bio, page, size, 0); + __bio_add_page(&bio, page, size, 0); submit_bio_wait(&bio); From b0a2f17cad9d3fa564d67c543f5d19343401fefd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:29 -0700 Subject: [PATCH 057/266] md: raid5-log: use __bio_add_page to add single page The raid5 log metadata submission code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Acked-by: Song Liu Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/832a810d6c9e71f88b0a39cb076a8c70e8bcb821.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/md/raid5-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 46182b955aef..852b265c5db4 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -792,7 +792,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) io->current_bio = r5l_bio_alloc(log); io->current_bio->bi_end_io = r5l_log_endio; io->current_bio->bi_private = io; - bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + __bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); r5_reserve_log_entry(log, io); From 6eea4ff8528d6a5b9f0eeb47992e48a8f44b5b8f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:30 -0700 Subject: [PATCH 058/266] md: raid5: use __bio_add_page to add single page to new bio The raid5-ppl submission code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. For adding consecutive pages, the return is actually checked and a new bio is allocated if adding the page fails. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Acked-by: Song Liu Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/27e6bcd762354bff74602e89159cdd12ae3d1fa9.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/md/raid5-ppl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index e495939bb3e0..eaea57aee602 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -465,7 +465,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) bio->bi_end_io = ppl_log_endio; bio->bi_iter.bi_sector = log->next_io_sector; - bio_add_page(bio, io->header_page, PAGE_SIZE, 0); + __bio_add_page(bio, io->header_page, PAGE_SIZE, 0); pr_debug("%s: log->current_io_sector: %llu\n", __func__, (unsigned long long)log->next_io_sector); @@ -496,7 +496,7 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) prev->bi_opf, GFP_NOIO, &ppl_conf->bs); bio->bi_iter.bi_sector = bio_end_sector(prev); - bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); + __bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0); bio_chain(bio, prev); ppl_submit_iounit_bio(io, prev); From 2896db174ced7a800863223f9e74543b98271ba0 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:31 -0700 Subject: [PATCH 059/266] jfs: logmgr: use __bio_add_page to add single page to bio The JFS IO code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Damien Le Moal Acked-by: Dave Kleikamp Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/9fb5ed86d19f6e0b6f64dfc4109a48ff8ff24497.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- fs/jfs/jfs_logmgr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 695415cbfe98..15c645827dec 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1974,7 +1974,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; @@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); bio->bi_end_io = lbmIODone; From effa7ddeeba782406c81b572791a142fbdaf6b05 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:32 -0700 Subject: [PATCH 060/266] gfs2: use __bio_add_page for adding single page to bio The GFS2 superblock reading code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Damien Le Moal Reviewed-by: Andreas Gruenbacher Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/087c67d4e4973f949d3519c1e4822784ce583c5a.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- fs/gfs2/ops_fstype.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9af9ddb61ca0..cd962985b058 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -254,7 +254,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio_add_page(bio, page, PAGE_SIZE, 0); + __bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; bio->bi_private = page; From 0fa5b08cf6e17b0a64ffcc5894d8efe186691ab8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:33 -0700 Subject: [PATCH 061/266] zonefs: use __bio_add_page for adding single page to bio The zonefs superblock reading code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Acked-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/04c9978ccaa0fc9871cd4248356638d98daccf0c.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- fs/zonefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 23b8b299c64e..9350221abfc5 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1128,7 +1128,7 @@ static int zonefs_read_super(struct super_block *sb) bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = 0; - bio_add_page(&bio, page, PAGE_SIZE, 0); + __bio_add_page(&bio, page, PAGE_SIZE, 0); ret = submit_bio_wait(&bio); if (ret) From 34848c910b911838e1e83e1370cb988b578c8860 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:34 -0700 Subject: [PATCH 062/266] zram: use __bio_add_page for adding single page to bio The zram writeback code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Damien Le Moal Reviewed-by: Sergey Senozhatsky Reviewed-by: Christoph Hellwig Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/cfd141dd7773315879a126f2aa81b7f698bc0e10.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f6d90f1ba5cf..b86691d2133e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -700,7 +700,7 @@ static ssize_t writeback_store(struct device *dev, bio_init(&bio, zram->bdev, &bio_vec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9); - bio_add_page(&bio, page, PAGE_SIZE, 0); + __bio_add_page(&bio, page, PAGE_SIZE, 0); /* * XXX: A single page IO would be inefficient for write From 5225229b8fdfb3e65520c43547ecf9a737161c3f Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:35 -0700 Subject: [PATCH 063/266] floppy: use __bio_add_page for adding single page to bio The floppy code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/33c445a3b431270c72d9be03d5da1b08ae983920.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index cec2c20f5e59..28ec6b442e9c 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4147,7 +4147,7 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) cbdata.drive = drive; bio_init(&bio, bdev, &bio_vec, 1, REQ_OP_READ); - bio_add_page(&bio, page, block_size(bdev), 0); + __bio_add_page(&bio, page, block_size(bdev), 0); bio.bi_iter.bi_sector = 0; bio.bi_flags |= (1 << BIO_QUIET); From b42473cdbab7661535516ae76e0a871d98b5cb97 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:36 -0700 Subject: [PATCH 064/266] md: check for failure when adding pages in alloc_behind_master_bio alloc_behind_master_bio() can possibly add multiple pages to a bio, but it is not checking for the return value of bio_add_page() if adding really succeeded. Check if the page adding succeeded and if not bail out. Reviewed-by: Christoph Hellwig Acked-by: Song Liu Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/827aa12d44ebf3f50b41b47f5cedc0f80179f2c1.1685532726.git.johannes.thumshirn@wdc.com [axboe: fold in s/free_page/put_page fix] Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 68a9e2d9985b..9cfb25967f7b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1147,7 +1147,10 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio, if (unlikely(!page)) goto free_pages; - bio_add_page(behind_bio, page, len, 0); + if (!bio_add_page(behind_bio, page, len, 0)) { + put_page(page); + goto free_pages; + } size -= len; i++; From f83123223a8447a1369409568f698c68a230010e Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:37 -0700 Subject: [PATCH 065/266] md: raid1: use __bio_add_page for adding single page to bio The sync request code uses bio_add_page() to add a page to a newly created bio. bio_add_page() can fail, but the return value is never checked. Use __bio_add_page() as adding a single page to a newly created bio is guaranteed to succeed. This brings us a step closer to marking bio_add_page() as __must_check. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Acked-by: Song Liu Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/6cf7f66c6e646231200d025dfd5f2d3ae75c8fe5.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9cfb25967f7b..3570da63969b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2917,7 +2917,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, * won't fail because the vec table is big * enough to hold all these pages */ - bio_add_page(bio, page, len, 0); + __bio_add_page(bio, page, len, 0); } } nr_sectors += len>>9; From 0c67dd644176092d47b97215daca830c6ee0db18 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:38 -0700 Subject: [PATCH 066/266] md: raid1: check if adding pages to resync bio fails Check if adding pages to resync bio fails and if bail out. As the comment above suggests this cannot happen, WARN if it actually happens. Technically __bio_add_pages() would be sufficient here, but asserting the pages actually get added to the bio is preferred. This way we can mark bio_add_pages as __must_check. Reviewed-by: Damien Le Moal Acked-by: Song Liu Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/33aea4c271220dc9bcab58c4b7bec478c1511142.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/md/raid1-10.c | 11 ++++++----- drivers/md/raid10.c | 20 ++++++++++---------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index e61f6cad4e08..cd349e69ed77 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -101,11 +101,12 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, struct page *page = resync_fetch_page(rp, idx); int len = min_t(int, size, PAGE_SIZE); - /* - * won't fail because the vec table is big - * enough to hold all these pages - */ - bio_add_page(bio, page, len, 0); + if (WARN_ON(!bio_add_page(bio, page, len, 0))) { + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return; + } + size -= len; } while (idx++ < RESYNC_PAGES && size > 0); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4fcfcb350d2b..381c21f7fb06 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3819,11 +3819,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (bio= biolist ; bio ; bio=bio->bi_next) { struct resync_pages *rp = get_resync_pages(bio); page = resync_fetch_page(rp, page_idx); - /* - * won't fail because the vec table is big enough - * to hold all these pages - */ - bio_add_page(bio, page, len, 0); + if (WARN_ON(!bio_add_page(bio, page, len, 0))) { + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + goto giveup; + } } nr_sectors += len>>9; sector_nr += len>>9; @@ -4997,11 +4997,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, if (len > PAGE_SIZE) len = PAGE_SIZE; for (bio = blist; bio ; bio = bio->bi_next) { - /* - * won't fail because the vec table is big enough - * to hold all these pages - */ - bio_add_page(bio, page, len, 0); + if (WARN_ON(!bio_add_page(bio, page, len, 0))) { + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + return sectors_done; + } } sector_nr += len >> 9; nr_sectors += len >> 9; From 2c550517bc7e44c8d1151deb08f5b2b2cf63cf6c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:39 -0700 Subject: [PATCH 067/266] dm-crypt: use __bio_add_page to add single page to clone bio crypt_alloc_buffer() already allocates enough entries in the clone bio's vector, so adding a page to the bio can't fail. Use __bio_add_page() to reflect this. Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/f9a4dee5e81389fd70ffc442da01006538e55aca.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- drivers/md/dm-crypt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 8b47b913ee83..09e37ebf7cc8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1693,8 +1693,7 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned int size) len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; - bio_add_page(clone, page, len, 0); - + __bio_add_page(clone, page, len, 0); remaining_size -= len; } From 83f2caaaf9cb25fe74775a59bf2662f184bfaa08 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:40 -0700 Subject: [PATCH 068/266] block: mark bio_add_page as __must_check Now that all users of bio_add_page check for the return value, mark bio_add_page as __must_check. Reviewed-by: Damien Le Moal Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/7ae4a902e08fe2e90c012ee07aeb35d4aae28373.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 8588bcfbc6ef..d63f0bb47c65 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -465,7 +465,8 @@ extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); -int bio_add_page(struct bio *, struct page *, unsigned len, unsigned off); +int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, + unsigned off); bool bio_add_folio(struct bio *, struct folio *, size_t len, size_t off); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); From 7a150f1ed19b709837e98571f49ab1ff2625ca89 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:41 -0700 Subject: [PATCH 069/266] block: add bio_add_folio_nofail Just like for bio_add_pages() add a no-fail variant for bio_add_folio(). Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/924dff4077812804398ef84128fb920507fa4be1.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- block/bio.c | 8 ++++++++ include/linux/bio.h | 2 ++ 2 files changed, 10 insertions(+) diff --git a/block/bio.c b/block/bio.c index 798cc4cf3bd2..8672179213b9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1138,6 +1138,14 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); +void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, + size_t off) +{ + WARN_ON_ONCE(len > UINT_MAX); + WARN_ON_ONCE(off > UINT_MAX); + __bio_add_page(bio, &folio->page, len, off); +} + /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. diff --git a/include/linux/bio.h b/include/linux/bio.h index d63f0bb47c65..0002bd78e02d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -474,6 +474,8 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); +void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, + size_t off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); From c2478469f2bb821a268bd02cae5b2af1c119c9bd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:42 -0700 Subject: [PATCH 070/266] fs: iomap: use bio_add_folio_nofail where possible When the iomap buffered-io code can't add a folio to a bio, it allocates a new bio and adds the folio to that one. This is done using bio_add_folio(), but doesn't check for errors. As adding a folio to a newly created bio can't fail, use the newly introduced bio_add_folio_nofail() function. Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/58fa893c24c67340a63323f09a179fefdca07f2a.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- fs/iomap/buffered-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 063133ec77f4..0edab9deae2a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -312,7 +312,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, ctx->bio->bi_opf |= REQ_RAHEAD; ctx->bio->bi_iter.bi_sector = sector; ctx->bio->bi_end_io = iomap_read_end_io; - bio_add_folio(ctx->bio, folio, plen, poff); + bio_add_folio_nofail(ctx->bio, folio, plen, poff); } done: @@ -539,7 +539,7 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_add_folio(&bio, folio, plen, poff); + bio_add_folio_nofail(&bio, folio, plen, poff); return submit_bio_wait(&bio); } @@ -1582,7 +1582,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio(wpc->ioend->io_bio, folio, len, poff); + bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); } if (iop) From 6c500000af037f74b66dd01b565c8ee1b501cc1b Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 31 May 2023 04:50:43 -0700 Subject: [PATCH 071/266] block: mark bio_add_folio as __must_check Now that all callers of bio_add_folio() check the return value, mark it as __must_check. Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/381360a45ac3684120cfbe1e07685e9c36256e47.1685532726.git.johannes.thumshirn@wdc.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 0002bd78e02d..617522928964 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -467,7 +467,8 @@ void bio_chain(struct bio *, struct bio *); int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); -bool bio_add_folio(struct bio *, struct folio *, size_t len, size_t off); +bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, + size_t len, size_t off); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); int bio_add_zone_append_page(struct bio *bio, struct page *page, From 5a0ac57c48aa9380126bd9bf3ec82140aab84548 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 31 May 2023 15:34:35 +0800 Subject: [PATCH 072/266] blk-ioc: protect ioc_destroy_icq() by 'queue_lock' Currently, icq is tracked by both request_queue(icq->q_node) and task(icq->ioc_node), and ioc_clear_queue() from elevator exit is not safe because it can access the list without protection: ioc_clear_queue ioc_release_fn lock queue_lock list_splice /* move queue list to a local list */ unlock queue_lock /* * lock is released, the local list * can be accessed through task exit. */ lock ioc->lock while (!hlist_empty) icq = hlist_entry lock queue_lock ioc_destroy_icq delete icq->ioc_node while (!list_empty) icq = list_entry() list_del icq->q_node /* * This is not protected by any lock, * list_entry concurrent with list_del * is not safe. */ unlock queue_lock unlock ioc->lock Fix this problem by protecting list 'icq->q_node' by queue_lock from ioc_clear_queue(). Reported-and-tested-by: Pradeep Pragallapati Link: https://lore.kernel.org/lkml/20230517084434.18932-1-quic_pragalla@quicinc.com/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531073435.2923422-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-ioc.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 63fc02042408..d5db92e62c43 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -77,6 +77,10 @@ static void ioc_destroy_icq(struct io_cq *icq) struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); + lockdep_assert_held(&q->queue_lock); + + if (icq->flags & ICQ_DESTROYED) + return; radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); @@ -128,12 +132,7 @@ static void ioc_release_fn(struct work_struct *work) spin_lock(&q->queue_lock); spin_lock(&ioc->lock); - /* - * The icq may have been destroyed when the ioc lock - * was released. - */ - if (!(icq->flags & ICQ_DESTROYED)) - ioc_destroy_icq(icq); + ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); rcu_read_unlock(); @@ -171,23 +170,20 @@ static bool ioc_delay_free(struct io_context *ioc) */ void ioc_clear_queue(struct request_queue *q) { - LIST_HEAD(icq_list); - spin_lock_irq(&q->queue_lock); - list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); - - rcu_read_lock(); - while (!list_empty(&icq_list)) { + while (!list_empty(&q->icq_list)) { struct io_cq *icq = - list_entry(icq_list.next, struct io_cq, q_node); + list_first_entry(&q->icq_list, struct io_cq, q_node); + /* + * Other context won't hold ioc lock to wait for queue_lock, see + * details in ioc_release_fn(). + */ spin_lock_irq(&icq->ioc->lock); - if (!(icq->flags & ICQ_DESTROYED)) - ioc_destroy_icq(icq); + ioc_destroy_icq(icq); spin_unlock_irq(&icq->ioc->lock); } - rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); } #else /* CONFIG_BLK_ICQ */ static inline void ioc_exit_icqs(struct io_context *ioc) From 20d099756b98fa6b5b838448b1ffbce46f4f3283 Mon Sep 17 00:00:00 2001 From: Azeem Shaikh Date: Tue, 30 May 2023 15:56:08 +0000 Subject: [PATCH 073/266] block: Replace all non-returning strlcpy with strscpy strlcpy() reads the entire source buffer first. This read may exceed the destination size limit. This is both inefficient and can lead to linear read overflows if a source string is not NUL-terminated [1]. In an effort to remove strlcpy() completely [2], replace strlcpy() here with strscpy(). No return values were used, so direct replacement is safe. [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strlcpy [2] https://github.com/KSPP/linux/issues/89 Signed-off-by: Azeem Shaikh Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20230530155608.272266-1-azeemshaikh38@gmail.com Signed-off-by: Jens Axboe --- block/blk-cgroup-fc-appid.c | 2 +- block/elevator.c | 2 +- block/genhd.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup-fc-appid.c b/block/blk-cgroup-fc-appid.c index 842e5e1c0f3c..3ec21333f393 100644 --- a/block/blk-cgroup-fc-appid.c +++ b/block/blk-cgroup-fc-appid.c @@ -34,7 +34,7 @@ int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) * the vmid from the fabric. * Adding the overhead of a lock is not necessary. */ - strlcpy(blkcg->fc_app_id, app_id, app_id_len); + strscpy(blkcg->fc_app_id, app_id, app_id_len); css_put(css); out_cgrp_put: cgroup_put(cgrp); diff --git a/block/elevator.c b/block/elevator.c index 24909069f872..8400e303fbcb 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -751,7 +751,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *buf, if (!elv_support_iosched(q)) return count; - strlcpy(elevator_name, buf, sizeof(elevator_name)); + strscpy(elevator_name, buf, sizeof(elevator_name)); ret = elevator_change(q, strstrip(elevator_name)); if (!ret) return count; diff --git a/block/genhd.c b/block/genhd.c index 1cb489b927d5..3537b7d7c484 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -253,7 +253,7 @@ int __register_blkdev(unsigned int major, const char *name, #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; #endif - strlcpy(p->name, name, sizeof(p->name)); + strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); From b5bbc52fd01278642773818642288999a0236cb6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 3 Jun 2023 12:06:01 +0800 Subject: [PATCH 074/266] ublk: add control command of UBLK_U_CMD_GET_FEATURES Add control command of UBLK_U_CMD_GET_FEATURES for returning driver's feature set or capability. This way can simplify userspace for maintaining compatibility because userspace doesn't need to send command to one device for querying driver feature set any more. Such as, with the queried feature set, userspace can choose to use: - UBLK_CMD_GET_DEV_INFO2 or UBLK_CMD_GET_DEV_INFO, - UBLK_U_CMD_* or UBLK_CMD_* Userspace code: https://github.com/ming1/ubdsrv/commits/features-cmd Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230603040601.775227-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 21 +++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 8 ++++++++ 2 files changed, 29 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 539eada32861..222a0341913f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2343,6 +2343,21 @@ static int ublk_ctrl_end_recovery(struct ublk_device *ub, return ret; } +static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) +{ + const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); + void __user *argp = (void __user *)(unsigned long)header->addr; + u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + + if (header->len != UBLK_FEATURES_LEN || !header->addr) + return -EINVAL; + + if (copy_to_user(argp, &features, UBLK_FEATURES_LEN)) + return -EFAULT; + + return 0; +} + /* * All control commands are sent via /dev/ublk-control, so we have to check * the destination device's permission @@ -2423,6 +2438,7 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_GET_DEV_INFO2: case UBLK_CMD_GET_QUEUE_AFFINITY: case UBLK_CMD_GET_PARAMS: + case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)): mask = MAY_READ; break; case UBLK_CMD_START_DEV: @@ -2472,6 +2488,11 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, if (ret) goto out; + if (cmd_op == UBLK_U_CMD_GET_FEATURES) { + ret = ublk_ctrl_get_features(cmd); + goto out; + } + if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) { ret = -ENODEV; ub = ublk_get_device_from_id(header->dev_id); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 54b5b0aeefca..4b8558db90e1 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -47,6 +47,14 @@ _IOWR('u', UBLK_CMD_END_USER_RECOVERY, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_GET_DEV_INFO2 \ _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_GET_FEATURES \ + _IOR('u', 0x13, struct ublksrv_ctrl_cmd) + +/* + * 64bits are enough now, and it should be easy to extend in case of + * running out of feature flags + */ +#define UBLK_FEATURES_LEN 8 /* * IO commands, issued by ublk server, and handled by ublk driver. From d519df00938eed652fc041ff4e07b2b38a4ad3bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 17:16:46 +0200 Subject: [PATCH 075/266] drbd: stop defining __KERNEL_SYSCALLS__ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __KERNEL_SYSCALLS__ hasn't been needed since Linux 2.6.19 so stop defining it. Signed-off-by: Christoph Hellwig Reviewed-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20230601151646.1386867-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 1 - drivers/block/drbd/drbd_receiver.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 83987e7a5ef2..54223f64610a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -37,7 +37,6 @@ #include #include #include -#define __KERNEL_SYSCALLS__ #include #include #include diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 8c2bc47de473..0c9f54197768 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -27,7 +27,6 @@ #include #include #include -#define __KERNEL_SYSCALLS__ #include #include #include From 0783b1a7cbd9a02ddc35fe531b5966b674b304f0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:44 +0200 Subject: [PATCH 076/266] block: factor out a bd_end_claim helper from blkdev_put Move all the logic to release an exclusive claim into a helper. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Christian Brauner Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 63 +++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 21c63bfef323..317bfd9cba40 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -589,6 +589,37 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); +static void bd_end_claim(struct block_device *bdev) +{ + struct block_device *whole = bdev_whole(bdev); + bool unblock = false; + + /* + * Release a claim on the device. The holder fields are protected with + * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. + */ + spin_lock(&bdev_lock); + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--whole->bd_holders < 0); + if (!bdev->bd_holders) { + bdev->bd_holder = NULL; + if (bdev->bd_write_holder) + unblock = true; + } + if (!whole->bd_holders) + whole->bd_holder = NULL; + spin_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and unblock evpoll if + * it was a write holder. + */ + if (unblock) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } +} + static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); @@ -843,36 +874,8 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - if (mode & FMODE_EXCL) { - struct block_device *whole = bdev_whole(bdev); - bool bdev_free; - - /* - * Release a claim on the device. The holder fields - * are protected with bdev_lock. open_mutex is to - * synchronize disk_holder unlinking. - */ - spin_lock(&bdev_lock); - - WARN_ON_ONCE(--bdev->bd_holders < 0); - WARN_ON_ONCE(--whole->bd_holders < 0); - - if ((bdev_free = !bdev->bd_holders)) - bdev->bd_holder = NULL; - if (!whole->bd_holders) - whole->bd_holder = NULL; - - spin_unlock(&bdev_lock); - - /* - * If this was the last claim, remove holder link and - * unblock evpoll if it was a write holder. - */ - if (bdev_free && bdev->bd_write_holder) { - disk_unblock_events(disk); - bdev->bd_write_holder = false; - } - } + if (mode & FMODE_EXCL) + bd_end_claim(bdev); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE From ae5f855ead6b41422ca0c971ebda509c0414f8ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:45 +0200 Subject: [PATCH 077/266] block: refactor bd_may_claim The long if/else chain obsfucates the actual logic. Tidy it up to be more structured. Also drop the whole argument, as it can be trivially derived from bdev using bdev_whole, and having the bdev_whole in the function makes it easier to follow. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 317bfd9cba40..080b5c83bfbc 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -463,7 +463,6 @@ long nr_blockdev_pages(void) /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest - * @whole: whole block device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * * Test whether @bdev can be claimed by @holder. @@ -474,22 +473,27 @@ long nr_blockdev_pages(void) * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ -static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, - void *holder) +static bool bd_may_claim(struct block_device *bdev, void *holder) { - if (bdev->bd_holder == holder) - return true; /* already a holder */ - else if (bdev->bd_holder != NULL) - return false; /* held by someone else */ - else if (whole == bdev) - return true; /* is a whole device which isn't held */ + struct block_device *whole = bdev_whole(bdev); - else if (whole->bd_holder == bd_may_claim) - return true; /* is a partition of a device that is being partitioned */ - else if (whole->bd_holder != NULL) - return false; /* is a partition of a held device */ - else - return true; /* is a partition of an un-held device */ + if (bdev->bd_holder) { + /* + * The same holder can always re-claim. + */ + if (bdev->bd_holder == holder) + return true; + return false; + } + + /* + * If the whole devices holder is set to bd_may_claim, a partition on + * the device is claimed, but not the whole device. + */ + if (whole != bdev && + whole->bd_holder && whole->bd_holder != bd_may_claim) + return false; + return true; } /** @@ -513,7 +517,7 @@ int bd_prepare_to_claim(struct block_device *bdev, void *holder) retry: spin_lock(&bdev_lock); /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, whole, holder)) { + if (!bd_may_claim(bdev, holder)) { spin_unlock(&bdev_lock); return -EBUSY; } @@ -559,7 +563,7 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) struct block_device *whole = bdev_whole(bdev); spin_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, whole, holder)); + BUG_ON(!bd_may_claim(bdev, holder)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder From 74e6464a987b2572771ac19163e961777fd0252e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:46 +0200 Subject: [PATCH 078/266] block: turn bdev_lock into a mutex There is no reason for this lock to spin, and being able to sleep under it will come in handy soon. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Christian Brauner Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 080b5c83bfbc..f5ffcac762e0 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -308,7 +308,7 @@ EXPORT_SYMBOL(thaw_bdev); * pseudo-fs */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) @@ -467,9 +467,6 @@ long nr_blockdev_pages(void) * * Test whether @bdev can be claimed by @holder. * - * CONTEXT: - * spin_lock(&bdev_lock). - * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ @@ -477,6 +474,8 @@ static bool bd_may_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); + lockdep_assert_held(&bdev_lock); + if (bdev->bd_holder) { /* * The same holder can always re-claim. @@ -515,10 +514,10 @@ int bd_prepare_to_claim(struct block_device *bdev, void *holder) if (WARN_ON_ONCE(!holder)) return -EINVAL; retry: - spin_lock(&bdev_lock); + mutex_lock(&bdev_lock); /* if someone else claimed, fail */ if (!bd_may_claim(bdev, holder)) { - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); return -EBUSY; } @@ -528,7 +527,7 @@ int bd_prepare_to_claim(struct block_device *bdev, void *holder) DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; @@ -536,7 +535,7 @@ int bd_prepare_to_claim(struct block_device *bdev, void *holder) /* yay, all mine */ whole->bd_claiming = holder; - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ @@ -562,7 +561,7 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); - spin_lock(&bdev_lock); + mutex_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, holder)); /* * Note that for a whole device bd_holders will be incremented twice, @@ -573,7 +572,7 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) bdev->bd_holders++; bdev->bd_holder = holder; bd_clear_claiming(whole, holder); - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); } /** @@ -587,9 +586,9 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) */ void bd_abort_claiming(struct block_device *bdev, void *holder) { - spin_lock(&bdev_lock); + mutex_lock(&bdev_lock); bd_clear_claiming(bdev_whole(bdev), holder); - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); @@ -602,7 +601,7 @@ static void bd_end_claim(struct block_device *bdev) * Release a claim on the device. The holder fields are protected with * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. */ - spin_lock(&bdev_lock); + mutex_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--whole->bd_holders < 0); if (!bdev->bd_holders) { @@ -612,7 +611,7 @@ static void bd_end_claim(struct block_device *bdev) } if (!whole->bd_holders) whole->bd_holder = NULL; - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); /* * If this was the last claim, remove holder link and unblock evpoll if From 66fddc25fe182fd7d28b35f4173113f3eefc7fb5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:47 +0200 Subject: [PATCH 079/266] block: consolidate the shutdown logic in blk_mark_disk_dead and del_gendisk blk_mark_disk_dead does very similar work a a section of del_gendisk: - set the GD_DEAD flag - set the capacity to zero - start a queue drain but del_gendisk also sets QUEUE_FLAG_DYING on the queue if it is owned by the disk, sets the capacity to zero before starting the drain, and both with sending a uevent and kernel message for this fake capacity change. Move the exact logic from the more heavily used del_gendisk into blk_mark_disk_dead and then call blk_mark_disk_dead from del_gendisk. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 3537b7d7c484..aa327314905e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -572,13 +572,22 @@ EXPORT_SYMBOL(device_add_disk); */ void blk_mark_disk_dead(struct gendisk *disk) { + /* + * Fail any new I/O. + */ set_bit(GD_DEAD, &disk->state); - blk_queue_start_drain(disk->queue); + if (test_bit(GD_OWNS_QUEUE, &disk->state)) + blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* * Stop buffered writers from dirtying pages that can't be written out. */ - set_capacity_and_notify(disk, 0); + set_capacity(disk, 0); + + /* + * Prevent new I/O from crossing bio_queue_enter(). + */ + blk_queue_start_drain(disk->queue); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); @@ -620,18 +629,7 @@ void del_gendisk(struct gendisk *disk) fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); - /* - * Fail any new I/O. - */ - set_bit(GD_DEAD, &disk->state); - if (test_bit(GD_OWNS_QUEUE, &disk->state)) - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - set_capacity(disk, 0); - - /* - * Prevent new I/O from crossing bio_queue_enter(). - */ - blk_queue_start_drain(q); + blk_mark_disk_dead(disk); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); From a4f75764d16bed317276b05a9fe2c179ef61680d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:48 +0200 Subject: [PATCH 080/266] block: avoid repeated work in blk_mark_disk_dead Check if GD_DEAD is already set in blk_mark_disk_dead, and don't duplicate the work already done. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Christian Brauner Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-6-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index aa327314905e..6fa926a02d85 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -575,7 +575,9 @@ void blk_mark_disk_dead(struct gendisk *disk) /* * Fail any new I/O. */ - set_bit(GD_DEAD, &disk->state); + if (test_and_set_bit(GD_DEAD, &disk->state)) + return; + if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); From 69f90b70bdb62e1a930239d33579e04884cd0b9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:49 +0200 Subject: [PATCH 081/266] block: unhash the inode earlier in delete_partition Move the call to remove_inode_hash to the beginning of delete_partition, as we want to prevent opening a block_device that is about to be removed ASAP. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-7-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 82d26427deae..9d1debaa5caf 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -267,6 +267,12 @@ static void delete_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); + /* + * Remove the block device from the inode hash, so that it cannot be + * looked up any more even when openers still hold references. + */ + remove_inode_hash(part->bd_inode); + fsync_bdev(part); __invalidate_device(part, true); @@ -274,12 +280,6 @@ static void delete_partition(struct block_device *part) kobject_put(part->bd_holder_dir); device_del(&part->bd_device); - /* - * Remove the block device from the inode hash, so that it cannot be - * looked up any more even when openers still hold references. - */ - remove_inode_hash(part->bd_inode); - put_device(&part->bd_device); } From eec1be4c30df73238b936fa9f3653773a6f8b15c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:50 +0200 Subject: [PATCH 082/266] block: delete partitions later in del_gendisk Delay dropping the block_devices for partitions in del_gendisk until after the call to blk_mark_disk_dead, so that we can implementat notification of removed devices in blk_mark_disk_dead. This requires splitting a lower-level drop_partition helper out of delete_partition and using that from del_gendisk, while having a common loop for the whole device and partitions that calls remove_inode_hash, fsync_bdev and __invalidate_device before the call to blk_mark_disk_dead. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 24 +++++++++++++++++++----- block/partitions/core.c | 19 ++++++++++++------- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/block/blk.h b/block/blk.h index 7ad7cb6ffa01..9582fcd0df41 100644 --- a/block/blk.h +++ b/block/blk.h @@ -409,7 +409,7 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); -void blk_drop_partitions(struct gendisk *disk); +void drop_partition(struct block_device *part); void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); diff --git a/block/genhd.c b/block/genhd.c index 6fa926a02d85..a668d2f02087 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -615,6 +615,8 @@ EXPORT_SYMBOL_GPL(blk_mark_disk_dead); void del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; + struct block_device *part; + unsigned long idx; might_sleep(); @@ -623,16 +625,28 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); + /* + * Prevent new openers by unlinked the bdev inode, and write out + * dirty data before marking the disk dead and stopping all I/O. + */ mutex_lock(&disk->open_mutex); - remove_inode_hash(disk->part0->bd_inode); - blk_drop_partitions(disk); + xa_for_each(&disk->part_tbl, idx, part) { + remove_inode_hash(part->bd_inode); + fsync_bdev(part); + __invalidate_device(part, true); + } mutex_unlock(&disk->open_mutex); - fsync_bdev(disk->part0); - __invalidate_device(disk->part0, true); - blk_mark_disk_dead(disk); + /* + * Drop all partitions now that the disk is marked dead. + */ + mutex_lock(&disk->open_mutex); + xa_for_each_start(&disk->part_tbl, idx, part, 1) + drop_partition(part); + mutex_unlock(&disk->open_mutex); + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); diff --git a/block/partitions/core.c b/block/partitions/core.c index 9d1debaa5caf..c3c12671a949 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -263,10 +263,19 @@ const struct device_type part_type = { .uevent = part_uevent, }; -static void delete_partition(struct block_device *part) +void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); + xa_erase(&part->bd_disk->part_tbl, part->bd_partno); + kobject_put(part->bd_holder_dir); + + device_del(&part->bd_device); + put_device(&part->bd_device); +} + +static void delete_partition(struct block_device *part) +{ /* * Remove the block device from the inode hash, so that it cannot be * looked up any more even when openers still hold references. @@ -276,11 +285,7 @@ static void delete_partition(struct block_device *part) fsync_bdev(part); __invalidate_device(part, true); - xa_erase(&part->bd_disk->part_tbl, part->bd_partno); - kobject_put(part->bd_holder_dir); - device_del(&part->bd_device); - - put_device(&part->bd_device); + drop_partition(part); } static ssize_t whole_disk_show(struct device *dev, @@ -519,7 +524,7 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) return true; } -void blk_drop_partitions(struct gendisk *disk) +static void blk_drop_partitions(struct gendisk *disk) { struct block_device *part; unsigned long idx; From 00080f7fb7a599c26523037b202fb945f3141811 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:51 +0200 Subject: [PATCH 083/266] block: remove blk_drop_partitions There is only a single caller left, so fold the loop into that. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-9-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index c3c12671a949..87a21942d606 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -524,17 +524,6 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) return true; } -static void blk_drop_partitions(struct gendisk *disk) -{ - struct block_device *part; - unsigned long idx; - - lockdep_assert_held(&disk->open_mutex); - - xa_for_each_start(&disk->part_tbl, idx, part, 1) - delete_partition(part); -} - static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { @@ -651,6 +640,8 @@ static int blk_add_partitions(struct gendisk *disk) int bdev_disk_changed(struct gendisk *disk, bool invalidate) { + struct block_device *part; + unsigned long idx; int ret = 0; lockdep_assert_held(&disk->open_mutex); @@ -663,8 +654,9 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate) return -EBUSY; sync_blockdev(disk->part0); invalidate_bdev(disk->part0); - blk_drop_partitions(disk); + xa_for_each_start(&disk->part_tbl, idx, part, 1) + delete_partition(part); clear_bit(GD_NEED_PART_SCAN, &disk->state); /* From 0718afd47f70cf46877c39c25d06b786e1a3f36c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:52 +0200 Subject: [PATCH 084/266] block: introduce holder ops Add a new blk_holder_ops structure, which is passed to blkdev_get_by_* and installed in the block_device for exclusive claims. It will be used to allow the block layer to call back into the user of the block device for thing like notification of a removed device or a device resize. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-10-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 41 ++++++++++++++++++++--------- block/fops.c | 2 +- block/genhd.c | 6 +++-- block/ioctl.c | 3 ++- drivers/block/drbd/drbd_nl.c | 3 ++- drivers/block/loop.c | 2 +- drivers/block/pktcdvd.c | 5 ++-- drivers/block/rnbd/rnbd-srv.c | 2 +- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/zram/zram_drv.c | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/dm.c | 2 +- drivers/md/md.c | 2 +- drivers/mtd/devices/block2mtd.c | 4 +-- drivers/nvme/target/io-cmd-bdev.c | 2 +- drivers/s390/block/dasd_genhd.c | 2 +- drivers/target/target_core_iblock.c | 2 +- drivers/target/target_core_pscsi.c | 3 ++- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/volumes.c | 6 ++--- fs/erofs/super.c | 2 +- fs/ext4/super.c | 3 ++- fs/f2fs/super.c | 4 +-- fs/jfs/jfs_logmgr.c | 2 +- fs/nfs/blocklayout/dev.c | 5 ++-- fs/nilfs2/super.c | 2 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/reiserfs/journal.c | 5 ++-- fs/super.c | 4 +-- fs/xfs/xfs_super.c | 2 +- include/linux/blk_types.h | 2 ++ include/linux/blkdev.h | 11 +++++--- kernel/power/swap.c | 4 +-- mm/swapfile.c | 3 ++- 34 files changed, 90 insertions(+), 56 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index f5ffcac762e0..5c46ff107706 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -102,7 +102,7 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, * under live filesystem. */ if (!(mode & FMODE_EXCL)) { - int err = bd_prepare_to_claim(bdev, truncate_bdev_range); + int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); if (err) goto invalidate; } @@ -415,6 +415,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); + mutex_init(&bdev->bd_holder_lock); bdev->bd_partno = partno; bdev->bd_inode = inode; bdev->bd_queue = disk->queue; @@ -464,13 +465,15 @@ long nr_blockdev_pages(void) * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest * @holder: holder trying to claim @bdev + * @hops: holder ops * * Test whether @bdev can be claimed by @holder. * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ -static bool bd_may_claim(struct block_device *bdev, void *holder) +static bool bd_may_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); @@ -480,8 +483,11 @@ static bool bd_may_claim(struct block_device *bdev, void *holder) /* * The same holder can always re-claim. */ - if (bdev->bd_holder == holder) + if (bdev->bd_holder == holder) { + if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) + return false; return true; + } return false; } @@ -499,6 +505,7 @@ static bool bd_may_claim(struct block_device *bdev, void *holder) * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @holder: holder trying to claim @bdev + * @hops: holder ops. * * Claim @bdev. This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller @@ -507,7 +514,8 @@ static bool bd_may_claim(struct block_device *bdev, void *holder) * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ -int bd_prepare_to_claim(struct block_device *bdev, void *holder) +int bd_prepare_to_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); @@ -516,7 +524,7 @@ int bd_prepare_to_claim(struct block_device *bdev, void *holder) retry: mutex_lock(&bdev_lock); /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, holder)) { + if (!bd_may_claim(bdev, holder, hops)) { mutex_unlock(&bdev_lock); return -EBUSY; } @@ -557,12 +565,13 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ -static void bd_finish_claiming(struct block_device *bdev, void *holder) +static void bd_finish_claiming(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); mutex_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, holder)); + BUG_ON(!bd_may_claim(bdev, holder, hops)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder @@ -570,7 +579,10 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) whole->bd_holders++; whole->bd_holder = bd_may_claim; bdev->bd_holders++; + mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = holder; + bdev->bd_holder_ops = hops; + mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); mutex_unlock(&bdev_lock); } @@ -605,7 +617,10 @@ static void bd_end_claim(struct block_device *bdev) WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--whole->bd_holders < 0); if (!bdev->bd_holders) { + mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = NULL; + bdev->bd_holder_ops = NULL; + mutex_unlock(&bdev->bd_holder_lock); if (bdev->bd_write_holder) unblock = true; } @@ -735,6 +750,7 @@ void blkdev_put_no_open(struct block_device *bdev) * @dev: device number of block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier + * @hops: holder operations * * Open the block device described by device number @dev. If @mode includes * %FMODE_EXCL, the block device is opened with exclusive access. Specifying @@ -751,7 +767,8 @@ void blkdev_put_no_open(struct block_device *bdev) * RETURNS: * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, + const struct blk_holder_ops *hops) { bool unblock_events = true; struct block_device *bdev; @@ -771,7 +788,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk = bdev->bd_disk; if (mode & FMODE_EXCL) { - ret = bd_prepare_to_claim(bdev, holder); + ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) goto put_blkdev; } @@ -791,7 +808,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) goto put_module; if (mode & FMODE_EXCL) { - bd_finish_claiming(bdev, holder); + bd_finish_claiming(bdev, holder, hops); /* * Block event polling for write claims if requested. Any write @@ -842,7 +859,7 @@ EXPORT_SYMBOL(blkdev_get_by_dev); * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder) + void *holder, const struct blk_holder_ops *hops) { struct block_device *bdev; dev_t dev; @@ -852,7 +869,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, if (error) return ERR_PTR(error); - bdev = blkdev_get_by_dev(dev, mode, holder); + bdev = blkdev_get_by_dev(dev, mode, holder, hops); if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, mode); return ERR_PTR(-EACCES); diff --git a/block/fops.c b/block/fops.c index b12c4b2a3a69..6a3087b750a6 100644 --- a/block/fops.c +++ b/block/fops.c @@ -490,7 +490,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if ((filp->f_flags & O_ACCMODE) == 3) filp->f_mode |= FMODE_WRITE_IOCTL; - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); + bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/block/genhd.c b/block/genhd.c index a668d2f02087..b3bd58e9fbea 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -370,13 +370,15 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) * scanners. */ if (!(mode & FMODE_EXCL)) { - ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions); + ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, + NULL); if (ret) return ret; } set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXCL, NULL); + bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXCL, NULL, + NULL); if (IS_ERR(bdev)) ret = PTR_ERR(bdev); else diff --git a/block/ioctl.c b/block/ioctl.c index 9c5f637ff153..c7d7d4345edb 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -454,7 +454,8 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, if (mode & FMODE_EXCL) return set_blocksize(bdev, n); - if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev))) + if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev, + NULL))) return -EBUSY; ret = set_blocksize(bdev, n); blkdev_put(bdev, mode | FMODE_EXCL); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1a5d3d72d91d..cab59dab3410 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1641,7 +1641,8 @@ static struct block_device *open_backing_dev(struct drbd_device *device, int err = 0; bdev = blkdev_get_by_path(bdev_path, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + claim_ptr, NULL); if (IS_ERR(bdev)) { drbd_err(device, "open(\"%s\") failed with %ld\n", bdev_path, PTR_ERR(bdev)); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bc31bb7072a2..a73c857f5bfe 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1015,7 +1015,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * here to avoid changing device under exclusive owner. */ if (!(mode & FMODE_EXCL)) { - error = bd_prepare_to_claim(bdev, loop_configure); + error = bd_prepare_to_claim(bdev, loop_configure, NULL); if (error) goto out_putf; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d5d7884cedd4..377f8b345352 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2125,7 +2125,8 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * to read/write from/to it. It is already opened in O_NONBLOCK mode * so open should not fail. */ - bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd); + bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd, + NULL); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -2530,7 +2531,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } } - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL); + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); sdev = scsi_device_from_queue(bdev->bd_disk->queue); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 2cfed2e58d64..cec22bbae2f9 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -719,7 +719,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE); + bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE, NULL); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n", diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 4807af1d5805..43b36da9b354 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -492,7 +492,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->pdevice = MKDEV(major, minor); bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? - FMODE_READ : FMODE_WRITE, NULL); + FMODE_READ : FMODE_WRITE, NULL, NULL); if (IS_ERR(bdev)) { pr_warn("xen_vbd_create: device %08x could not be opened\n", diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b86691d2133e..0bc779446c6f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -508,7 +508,7 @@ static ssize_t backing_dev_store(struct device *dev, } bdev = blkdev_get_by_dev(inode->i_rdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram, NULL); if (IS_ERR(bdev)) { err = PTR_ERR(bdev); bdev = NULL; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7e9d19fd21dd..d84c09a73af8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2560,7 +2560,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, err = "failed to open device"; bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, - sb); + sb, NULL); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { dev_t dev; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3b694ba3a106..d759f8bdb3df 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -746,7 +746,7 @@ static struct table_device *open_table_device(struct mapped_device *md, return ERR_PTR(-ENOMEM); refcount_set(&td->count, 1); - bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr); + bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr, NULL); if (IS_ERR(bdev)) { r = PTR_ERR(bdev); goto out_free_td; diff --git a/drivers/md/md.c b/drivers/md/md.c index 6a559a7e89c0..fabf9c543735 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3642,7 +3642,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe rdev->bdev = blkdev_get_by_dev(newdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, - super_format == -2 ? &claim_rdev : rdev); + super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", MAJOR(newdev), MINOR(newdev)); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 4cd37ec45762..7ac82c6fe350 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -235,7 +235,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size, return NULL; /* Get a handle on the device */ - bdev = blkdev_get_by_path(devname, mode, dev); + bdev = blkdev_get_by_path(devname, mode, dev, NULL); #ifndef MODULE /* @@ -257,7 +257,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size, devt = name_to_dev_t(devname); if (!devt) continue; - bdev = blkdev_get_by_dev(devt, mode, dev); + bdev = blkdev_get_by_dev(devt, mode, dev, NULL); } #endif diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index c2d6cea0236b..9b6d6d85c725 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -85,7 +85,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) return -ENOTBLK; ns->bdev = blkdev_get_by_path(ns->device_path, - FMODE_READ | FMODE_WRITE, NULL); + FMODE_READ | FMODE_WRITE, NULL, NULL); if (IS_ERR(ns->bdev)) { ret = PTR_ERR(ns->bdev); if (ret != -ENOTBLK) { diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 998a961e1704..f21198bc483e 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -130,7 +130,7 @@ int dasd_scan_partitions(struct dasd_block *block) struct block_device *bdev; int rc; - bdev = blkdev_get_by_dev(disk_devt(block->gdp), FMODE_READ, NULL); + bdev = blkdev_get_by_dev(disk_devt(block->gdp), FMODE_READ, NULL, NULL); if (IS_ERR(bdev)) { DBF_DEV_EVENT(DBF_ERR, block->base, "scan partitions error, blkdev_get returned %ld", diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index cc838ffd1294..a5cbbefa78ee 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -114,7 +114,7 @@ static int iblock_configure_device(struct se_device *dev) else dev->dev_flags |= DF_READ_ONLY; - bd = blkdev_get_by_path(ib_dev->ibd_udev_path, mode, ib_dev); + bd = blkdev_get_by_path(ib_dev->ibd_udev_path, mode, ib_dev, NULL); if (IS_ERR(bd)) { ret = PTR_ERR(bd); goto out_free_bioset; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index e7425549e39c..e3494e036c6c 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -367,7 +367,8 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd) * for TYPE_DISK and TYPE_ZBC using supplied udev_path */ bd = blkdev_get_by_path(dev->udev_path, - FMODE_WRITE|FMODE_READ|FMODE_EXCL, pdv); + FMODE_WRITE|FMODE_READ|FMODE_EXCL, pdv, + NULL); if (IS_ERR(bd)) { pr_err("pSCSI: blkdev_get_by_path() failed\n"); scsi_device_put(sd); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 78696d331639..4de4984fa99b 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -258,7 +258,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - fs_info->bdev_holder); + fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 841e799dece5..784ccc8f6c69 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -496,7 +496,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, { int ret; - *bdev = blkdev_get_by_path(device_path, flags, holder); + *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); @@ -1377,7 +1377,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev = blkdev_get_by_path(path, flags, holder); + bdev = blkdev_get_by_path(path, flags, holder, NULL); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -2629,7 +2629,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - fs_info->bdev_holder); + fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 811ab66d805e..6c263e9cd38b 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -254,7 +254,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, dif->fscache = fscache; } else if (!sbi->devs->flatdev) { bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, - sb->s_type); + sb->s_type, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9680fe753e59..865625089ecc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1103,7 +1103,8 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb, + NULL); if (IS_ERR(bdev)) goto fail; return bdev; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 9f15b03037db..7c34ab082f13 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4025,7 +4025,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* Single zoned block device mount */ FDEV(0).bdev = blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, - sbi->sb->s_mode, sbi->sb->s_type); + sbi->sb->s_mode, sbi->sb->s_type, NULL); } else { /* Multi-device mount */ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); @@ -4044,7 +4044,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) sbi->log_blocks_per_seg) - 1; } FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, - sbi->sb->s_mode, sbi->sb->s_type); + sbi->sb->s_mode, sbi->sb->s_type, NULL); } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 15c645827dec..46d393c8088a 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1101,7 +1101,7 @@ int lmLogOpen(struct super_block *sb) */ bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - log); + log, NULL); if (IS_ERR(bdev)) { rc = PTR_ERR(bdev); goto free; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index fea5f8821da5..38b066ca699e 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -243,7 +243,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, if (!dev) return -EIO; - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL); + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL, NULL); if (IS_ERR(bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); @@ -312,7 +312,8 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); + bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL, + NULL); if (IS_ERR(bdev)) { pr_warn("pNFS: failed to open device %s (%ld)\n", devname, PTR_ERR(bdev)); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 77f1e5778d1c..91bfbd973d1d 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1285,7 +1285,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; - sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type, NULL); if (IS_ERR(sd.bdev)) return ERR_CAST(sd.bdev); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 60b97c92e2b2..6b13b8c3f2b8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1786,7 +1786,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, goto out2; reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, - FMODE_WRITE | FMODE_READ, NULL); + FMODE_WRITE | FMODE_READ, NULL, NULL); if (IS_ERR(reg->hr_bdev)) { ret = PTR_ERR(reg->hr_bdev); reg->hr_bdev = NULL; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4d11d60f493c..5e4db9a0c8e5 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2616,7 +2616,7 @@ static int journal_init_dev(struct super_block *super, if (jdev == super->s_dev) blkdev_mode &= ~FMODE_EXCL; journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, - journal); + journal, NULL); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2632,7 +2632,8 @@ static int journal_init_dev(struct super_block *super, } journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal, + NULL); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/super.c b/fs/super.c index 34afe411cf2b..012ce1400803 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1248,7 +1248,7 @@ int get_tree_bdev(struct fs_context *fc, if (!fc->source) return invalf(fc, "No source specified"); - bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type); + bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type, NULL); if (IS_ERR(bdev)) { errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev); @@ -1333,7 +1333,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; - bdev = blkdev_get_by_path(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type, NULL); if (IS_ERR(bdev)) return ERR_CAST(bdev); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7e706255f165..5684c538eb76 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -386,7 +386,7 @@ xfs_blkdev_get( int error = 0; *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - mp); + mp, NULL); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8ef209e3aa96..deb69eeab6bd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -55,6 +55,8 @@ struct block_device { struct super_block * bd_super; void * bd_claiming; void * bd_holder; + const struct blk_holder_ops *bd_holder_ops; + struct mutex bd_holder_lock; /* The counter of freeze processes */ int bd_fsfreeze_count; int bd_holders; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d89c2da14698..44f2a8bc57e8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1470,10 +1470,15 @@ void blkdev_show(struct seq_file *seqf, off_t offset); #define BLKDEV_MAJOR_MAX 0 #endif +struct blk_holder_ops { +}; + +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, + const struct blk_holder_ops *hops); struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder); -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); -int bd_prepare_to_claim(struct block_device *bdev, void *holder); + void *holder, const struct blk_holder_ops *hops); +int bd_prepare_to_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); void blkdev_put(struct block_device *bdev, fmode_t mode); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 92e41ed292ad..801c411530d1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -357,7 +357,7 @@ static int swsusp_swap_check(void) root_swap = res; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, - NULL); + NULL, NULL); if (IS_ERR(hib_resume_bdev)) return PTR_ERR(hib_resume_bdev); @@ -1524,7 +1524,7 @@ int swsusp_check(void) mode |= FMODE_EXCL; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - mode, &holder); + mode, &holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); diff --git a/mm/swapfile.c b/mm/swapfile.c index 274bbf797480..cfbcf7d5705f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2770,7 +2770,8 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = blkdev_get_by_dev(inode->i_rdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, p, + NULL); if (IS_ERR(p->bdev)) { error = PTR_ERR(p->bdev); p->bdev = NULL; From f55e017c642051ddc01d77a89ab18f5ee71d6276 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:53 +0200 Subject: [PATCH 085/266] block: add a mark_dead holder operation Add a mark_dead method to blk_holder_ops that is called from blk_mark_disk_dead to notify the holder that the block device it is using has been marked dead. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Christian Brauner Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-11-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 24 ++++++++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 25 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index b3bd58e9fbea..a07c4d6a1476 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -565,6 +565,28 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, } EXPORT_SYMBOL(device_add_disk); +static void blk_report_disk_dead(struct gendisk *disk) +{ + struct block_device *bdev; + unsigned long idx; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, bdev) { + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + continue; + rcu_read_unlock(); + + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) + bdev->bd_holder_ops->mark_dead(bdev); + mutex_unlock(&bdev->bd_holder_lock); + + put_device(&bdev->bd_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + /** * blk_mark_disk_dead - mark a disk as dead * @disk: disk to mark as dead @@ -592,6 +614,8 @@ void blk_mark_disk_dead(struct gendisk *disk) * Prevent new I/O from crossing bio_queue_enter(). */ blk_queue_start_drain(disk->queue); + + blk_report_disk_dead(disk); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 44f2a8bc57e8..9e9a9e4edee9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1471,6 +1471,7 @@ void blkdev_show(struct seq_file *seqf, off_t offset); #endif struct blk_holder_ops { + void (*mark_dead)(struct block_device *bdev); }; struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, From 87efb39075be6a288cd7f23858f15bd01c83028a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:54 +0200 Subject: [PATCH 086/266] fs: add a method to shut down the file system Add a new ->shutdown super operation that can be used to tell the file system to shut down, and call it from newly created holder ops when the block device under a file system shuts down. This only covers the main block device for "simple" file systems using get_tree_bdev / mount_bdev. File systems their own get_tree method or opening additional devices will need to set up their own blk_holder_ops. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-12-hch@lst.de Signed-off-by: Jens Axboe --- fs/super.c | 21 +++++++++++++++++++-- include/linux/fs.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/fs/super.c b/fs/super.c index 012ce1400803..f127589700ab 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1206,6 +1206,22 @@ int get_tree_keyed(struct fs_context *fc, EXPORT_SYMBOL(get_tree_keyed); #ifdef CONFIG_BLOCK +static void fs_mark_dead(struct block_device *bdev) +{ + struct super_block *sb; + + sb = get_super(bdev); + if (!sb) + return; + + if (sb->s_op->shutdown) + sb->s_op->shutdown(sb); + drop_super(sb); +} + +static const struct blk_holder_ops fs_holder_ops = { + .mark_dead = fs_mark_dead, +}; static int set_bdev_super(struct super_block *s, void *data) { @@ -1248,7 +1264,8 @@ int get_tree_bdev(struct fs_context *fc, if (!fc->source) return invalf(fc, "No source specified"); - bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type, NULL); + bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type, + &fs_holder_ops); if (IS_ERR(bdev)) { errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev); @@ -1333,7 +1350,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; - bdev = blkdev_get_by_path(dev_name, mode, fs_type, NULL); + bdev = blkdev_get_by_path(dev_name, mode, fs_type, &fs_holder_ops); if (IS_ERR(bdev)) return ERR_CAST(bdev); diff --git a/include/linux/fs.h b/include/linux/fs.h index 08ba2ae1d3ce..7b2053649820 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1932,6 +1932,7 @@ struct super_operations { struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); + void (*shutdown)(struct super_block *sb); }; /* From e7caa877e5ddac63886f4a8376cb3ffbd4dfe569 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:55 +0200 Subject: [PATCH 087/266] xfs: wire up sops->shutdown Wire up the shutdown method to shut down the file system when the underlying block device is marked dead. Add a new message to clearly distinguish this shutdown reason from other shutdowns. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-13-hch@lst.de Signed-off-by: Jens Axboe --- fs/xfs/xfs_fsops.c | 3 +++ fs/xfs/xfs_mount.h | 4 +++- fs/xfs/xfs_super.c | 8 ++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 13851c0d640b..9ebb8333a308 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -534,6 +534,9 @@ xfs_do_force_shutdown( } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of on-disk metadata"; + } else if (flags & SHUTDOWN_DEVICE_REMOVED) { + tag = XFS_PTAG_SHUTDOWN_IOERROR; + why = "Block device removal"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index aaaf5ec13492..429a5e12c103 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -457,12 +457,14 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, #define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ #define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ +#define SHUTDOWN_DEVICE_REMOVED (1u << 5) /* device removed underneath us */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \ { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \ - { SHUTDOWN_CORRUPT_INCORE, "corruption" } + { SHUTDOWN_CORRUPT_INCORE, "corruption" }, \ + { SHUTDOWN_DEVICE_REMOVED, "device_removed" } /* * Flags for xfs_mountfs diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 5684c538eb76..eb469b8f9a04 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1159,6 +1159,13 @@ xfs_fs_free_cached_objects( return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); } +static void +xfs_fs_shutdown( + struct super_block *sb) +{ + xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, @@ -1172,6 +1179,7 @@ static const struct super_operations xfs_super_operations = { .show_options = xfs_fs_show_options, .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, + .shutdown = xfs_fs_shutdown, }; static int From 8067ca1dcdfcc2a5e0a51bff3730ad3eef0623d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:56 +0200 Subject: [PATCH 088/266] xfs: wire up the ->mark_dead holder operation for log and RT devices Implement a set of holder_ops that shut down the file system when the block device used as log or RT device is removed undeneath the file system. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-14-hch@lst.de Signed-off-by: Jens Axboe --- fs/xfs/xfs_super.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index eb469b8f9a04..1b4bd5c88f4a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -377,6 +377,17 @@ xfs_setup_dax_always( return 0; } +static void +xfs_bdev_mark_dead( + struct block_device *bdev) +{ + xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); +} + +static const struct blk_holder_ops xfs_holder_ops = { + .mark_dead = xfs_bdev_mark_dead, +}; + STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -386,7 +397,7 @@ xfs_blkdev_get( int error = 0; *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - mp, NULL); + mp, &xfs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); From 97524b454bc562f4052751f0e635a61dad78f1b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:57 +0200 Subject: [PATCH 089/266] ext4: split ext4_shutdown Split ext4_shutdown into a low-level helper that will be reused for implementing the shutdown super operation and a wrapper for the ioctl handling. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-15-hch@lst.de Signed-off-by: Jens Axboe --- fs/ext4/ext4.h | 1 + fs/ext4/ioctl.c | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6948d673bba2..2d60bbe8d171 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2965,6 +2965,7 @@ int ext4_fileattr_set(struct mnt_idmap *idmap, int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); +int ext4_force_shutdown(struct super_block *sb, u32 flags); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f9a430152063..961284cc9b65 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -793,16 +793,9 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) } #endif -static int ext4_shutdown(struct super_block *sb, unsigned long arg) +int ext4_force_shutdown(struct super_block *sb, u32 flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); - __u32 flags; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(flags, (__u32 __user *)arg)) - return -EFAULT; if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; @@ -838,6 +831,19 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) return 0; } +static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg) +{ + u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + return ext4_force_shutdown(sb, flags); +} + struct getfsmap_info { struct super_block *gi_sb; struct fsmap_head __user *gi_data; @@ -1566,7 +1572,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return ext4_ioctl_get_es_cache(filp, arg); case EXT4_IOC_SHUTDOWN: - return ext4_shutdown(sb, arg); + return ext4_ioctl_shutdown(sb, arg); case FS_IOC_ENABLE_VERITY: if (!ext4_has_feature_verity(sb)) From f5db130d4443ddf63b49e195782038ebaab0bec9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:58 +0200 Subject: [PATCH 090/266] ext4: wire up sops->shutdown Wire up the shutdown method to shut down the file system when the underlying block device is marked dead. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-16-hch@lst.de Signed-off-by: Jens Axboe --- fs/ext4/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 865625089ecc..a177a16c4d2f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1450,6 +1450,11 @@ static void ext4_destroy_inode(struct inode *inode) EXT4_I(inode)->i_reserved_data_blocks); } +static void ext4_shutdown(struct super_block *sb) +{ + ext4_force_shutdown(sb, EXT4_GOING_FLAGS_NOLOGFLUSH); +} + static void init_once(void *foo) { struct ext4_inode_info *ei = foo; @@ -1610,6 +1615,7 @@ static const struct super_operations ext4_sops = { .unfreeze_fs = ext4_unfreeze, .statfs = ext4_statfs, .show_options = ext4_show_options, + .shutdown = ext4_shutdown, #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, From dd2e31afba9e3a3107aa202726b6199c55075f59 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 1 Jun 2023 11:44:59 +0200 Subject: [PATCH 091/266] ext4: wire up the ->mark_dead holder operation for log devices Implement a set of holder_ops that shut down the file system when the block device used as log device is removed undeneath the file system. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Acked-by: Dave Chinner Reviewed-by: Dave Chinner Link: https://lore.kernel.org/r/20230601094459.1350643-17-hch@lst.de Signed-off-by: Jens Axboe --- fs/ext4/super.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a177a16c4d2f..9070ea9154d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1096,6 +1096,15 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } +static void ext4_bdev_mark_dead(struct block_device *bdev) +{ + ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH); +} + +static const struct blk_holder_ops ext4_holder_ops = { + .mark_dead = ext4_bdev_mark_dead, +}; + /* * Open the external journal device */ @@ -1104,7 +1113,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb, - NULL); + &ext4_holder_ops); if (IS_ERR(bdev)) goto fail; return bdev; From aa5f6ed8c21ec1aa5fd688118d8d5cd87c5ffc1d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:12 +0200 Subject: [PATCH 092/266] driver core: return bool from driver_probe_done bool is the most sensible return value for a yes/no return. Also add __init as this funtion is only called from the early boot code. Signed-off-by: Christoph Hellwig Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230531125535.676098-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/base/dd.c | 6 ++---- include/linux/device/driver.h | 2 +- init/do_mounts.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 9c09ca5c4ab6..878aa7646b37 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -751,14 +751,12 @@ static int really_probe_debug(struct device *dev, struct device_driver *drv) * * Should somehow figure out how to use a semaphore, not an atomic variable... */ -int driver_probe_done(void) +bool __init driver_probe_done(void) { int local_probe_count = atomic_read(&probe_count); pr_debug("%s: probe_count = %d\n", __func__, local_probe_count); - if (local_probe_count) - return -EBUSY; - return 0; + return !local_probe_count; } /** diff --git a/include/linux/device/driver.h b/include/linux/device/driver.h index c244267a6744..7738f458995f 100644 --- a/include/linux/device/driver.h +++ b/include/linux/device/driver.h @@ -126,7 +126,7 @@ int __must_check driver_register(struct device_driver *drv); void driver_unregister(struct device_driver *drv); struct device_driver *driver_find(const char *name, const struct bus_type *bus); -int driver_probe_done(void); +bool __init driver_probe_done(void); void wait_for_device_probe(void); void __init wait_for_init_devices_probe(void); diff --git a/init/do_mounts.c b/init/do_mounts.c index 811e94daf0a8..2fe7901b5bcf 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -635,7 +635,7 @@ void __init prepare_namespace(void) if ((ROOT_DEV == 0) && root_wait) { printk(KERN_INFO "Waiting for root device %s...\n", saved_root_name); - while (driver_probe_done() != 0 || + while (!driver_probe_done() || (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) msleep(5); async_synchronize_full(); From 02b42d58f3898134b900ff3030561099e38adb32 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:13 +0200 Subject: [PATCH 093/266] PM: hibernate: factor out a helper to find the resume device Split the logic to find the resume device out software_resume and into a separate helper to start unwindig the convoluted goto logic. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-3-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 72 +++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 30d1274f03f6..072795063662 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -910,6 +910,41 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); +static int find_resume_device(void) +{ + if (!strlen(resume_file)) + return -ENOENT; + + pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); + + if (resume_delay) { + pr_info("Waiting %dsec before reading resume device ...\n", + resume_delay); + ssleep(resume_delay); + } + + /* Check if the device is there */ + swsusp_resume_device = name_to_dev_t(resume_file); + if (swsusp_resume_device) + return 0; + + /* + * Some device discovery might still be in progress; we need to wait for + * this to finish. + */ + wait_for_device_probe(); + if (resume_wait) { + while (!(swsusp_resume_device = name_to_dev_t(resume_file))) + msleep(10); + async_synchronize_full(); + } + + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) + return -ENODEV; + return 0; +} + /** * software_resume - Resume from a saved hibernation image. * @@ -949,45 +984,12 @@ static int software_resume(void) snapshot_test = false; - if (swsusp_resume_device) - goto Check_image; - - if (!strlen(resume_file)) { - error = -ENOENT; - goto Unlock; - } - - pm_pr_dbg("Checking hibernation image partition %s\n", resume_file); - - if (resume_delay) { - pr_info("Waiting %dsec before reading resume device ...\n", - resume_delay); - ssleep(resume_delay); - } - - /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { - /* - * Some device discovery might still be in progress; we need - * to wait for this to finish. - */ - wait_for_device_probe(); - - if (resume_wait) { - while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) - msleep(10); - async_synchronize_full(); - } - - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) { - error = -ENODEV; + error = find_resume_device(); + if (error) goto Unlock; - } } - Check_image: pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); From d6545e687271ab27472eebff770f2de6a5f1a464 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:14 +0200 Subject: [PATCH 094/266] PM: hibernate: remove the global snapshot_test variable Passing call dependent variable in global variables is a huge antipattern. Fix it up. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-4-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 17 ++++++----------- kernel/power/power.h | 3 +-- kernel/power/swap.c | 2 +- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 072795063662..78696aa04f5c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -64,7 +64,6 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; bool freezer_test_done; -bool snapshot_test; static const struct platform_hibernation_ops *hibernation_ops; @@ -684,7 +683,7 @@ static void power_down(void) cpu_relax(); } -static int load_image_and_restore(void) +static int load_image_and_restore(bool snapshot_test) { int error; unsigned int flags; @@ -721,6 +720,7 @@ static int load_image_and_restore(void) */ int hibernate(void) { + bool snapshot_test = false; unsigned int sleep_flags; int error; @@ -748,9 +748,6 @@ int hibernate(void) if (error) goto Exit; - /* protected by system_transition_mutex */ - snapshot_test = false; - lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -792,9 +789,9 @@ int hibernate(void) unlock_device_hotplug(); if (snapshot_test) { pm_pr_dbg("Checking hibernation image\n"); - error = swsusp_check(); + error = swsusp_check(snapshot_test); if (!error) - error = load_image_and_restore(); + error = load_image_and_restore(snapshot_test); } thaw_processes(); @@ -982,8 +979,6 @@ static int software_resume(void) */ mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); - snapshot_test = false; - if (!swsusp_resume_device) { error = find_resume_device(); if (error) @@ -994,7 +989,7 @@ static int software_resume(void) MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); - error = swsusp_check(); + error = swsusp_check(false); if (error) goto Unlock; @@ -1022,7 +1017,7 @@ static int software_resume(void) goto Close_Finish; } - error = load_image_and_restore(); + error = load_image_and_restore(false); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); diff --git a/kernel/power/power.h b/kernel/power/power.h index b83c8d5e188d..978189fcafd1 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -59,7 +59,6 @@ asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; -extern bool snapshot_test; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); @@ -174,7 +173,7 @@ extern int swsusp_swap_in_use(void); #define SF_HW_SIG 8 /* kernel/power/hibernate.c */ -extern int swsusp_check(void); +int swsusp_check(bool snapshot_test); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 801c411530d1..81aec3b2c605 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1514,7 +1514,7 @@ int swsusp_read(unsigned int *flags_p) * swsusp_check - Check for swsusp signature in the resume device */ -int swsusp_check(void) +int swsusp_check(bool snapshot_test) { int error; void *holder; From cc89c63e2fe37d476357c82390dfb12edcd41cdd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:15 +0200 Subject: [PATCH 095/266] PM: hibernate: move finding the resume device out of software_resume software_resume can be called either from an init call in the boot code, or from sysfs once the system has finished booting, and the two invocation methods this can't race with each other. For the latter case we did just parse the suspend device manually, while the former might not have one. Split software_resume so that the search only happens for the boot case, which also means the special lockdep nesting annotation can go away as the system transition mutex can be taken a little later and doesn't have the sysfs locking nest inside it. Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-5-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 80 ++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 41 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 78696aa04f5c..45e24b02cd50 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -907,7 +907,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) } EXPORT_SYMBOL_GPL(hibernate_quiet_exec); -static int find_resume_device(void) +static int __init find_resume_device(void) { if (!strlen(resume_file)) return -ENOENT; @@ -942,53 +942,16 @@ static int find_resume_device(void) return 0; } -/** - * software_resume - Resume from a saved hibernation image. - * - * This routine is called as a late initcall, when all devices have been - * discovered and initialized already. - * - * The image reading code is called to see if there is a hibernation image - * available for reading. If that is the case, devices are quiesced and the - * contents of memory is restored from the saved image. - * - * If this is successful, control reappears in the restored target kernel in - * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine - * attempts to recover gracefully and make the kernel return to the normal mode - * of operation. - */ static int software_resume(void) { int error; - /* - * If the user said "noresume".. bail out early. - */ - if (noresume || !hibernation_available()) - return 0; - - /* - * name_to_dev_t() below takes a sysfs buffer mutex when sysfs - * is configured into the kernel. Since the regular hibernate - * trigger path is via sysfs which takes a buffer mutex before - * calling hibernate functions (which take system_transition_mutex) - * this can cause lockdep to complain about a possible ABBA deadlock - * which cannot happen since we're in the boot code here and - * sysfs can't be invoked yet. Therefore, we use a subclass - * here to avoid lockdep complaining. - */ - mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); - - if (!swsusp_resume_device) { - error = find_resume_device(); - if (error) - goto Unlock; - } - pm_pr_dbg("Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); pm_pr_dbg("Looking for hibernation image.\n"); + + mutex_lock(&system_transition_mutex); error = swsusp_check(false); if (error) goto Unlock; @@ -1035,7 +998,39 @@ static int software_resume(void) goto Finish; } -late_initcall_sync(software_resume); +/** + * software_resume_initcall - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int __init software_resume_initcall(void) +{ + /* + * If the user said "noresume".. bail out early. + */ + if (noresume || !hibernation_available()) + return 0; + + if (!swsusp_resume_device) { + int error = find_resume_device(); + + if (error) + return error; + } + + return software_resume(); +} +late_initcall_sync(software_resume_initcall); static const char * const hibernation_modes[] = { @@ -1176,6 +1171,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, char *name; dev_t res; + if (!hibernation_available()) + return 0; + if (len && buf[len-1] == '\n') len--; name = kstrndup(buf, len, GFP_KERNEL); From f5524c3fadba35c075a5131bad74e3041507a694 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:16 +0200 Subject: [PATCH 096/266] init: remove pointless Root_* values Remove all unused defines, and just use the expanded versions for the SCSI disk majors. I've decided to keep Root_RAM0 even if it could be expanded as there is a lot of special casing for it in the init code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-6-hch@lst.de Signed-off-by: Jens Axboe --- arch/alpha/kernel/setup.c | 2 +- arch/ia64/kernel/setup.c | 2 +- arch/powerpc/platforms/powermac/setup.c | 3 ++- include/linux/root_dev.h | 8 -------- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c index 33bf3a627002..b650ff1cb022 100644 --- a/arch/alpha/kernel/setup.c +++ b/arch/alpha/kernel/setup.c @@ -658,7 +658,7 @@ setup_arch(char **cmdline_p) #endif /* Default root filesystem to sda2. */ - ROOT_DEV = Root_SDA2; + ROOT_DEV = MKDEV(SCSI_DISK0_MAJOR, 2); #ifdef CONFIG_EISA /* FIXME: only set this when we actually have EISA in this box? */ diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index c05728044272..becdb4f33c21 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -627,7 +627,7 @@ setup_arch (char **cmdline_p) * is physical disk 1 partition 1 and the Linux root disk is * physical disk 1 partition 2. */ - ROOT_DEV = Root_SDA2; /* default to second partition on first drive */ + ROOT_DEV = MKDEV(SCSI_DISK0_MAJOR, 2); if (is_uv_system()) uv_setup(cmdline_p); diff --git a/arch/powerpc/platforms/powermac/setup.c b/arch/powerpc/platforms/powermac/setup.c index 193cc9c39422..0c41f4b005bc 100644 --- a/arch/powerpc/platforms/powermac/setup.c +++ b/arch/powerpc/platforms/powermac/setup.c @@ -76,7 +76,8 @@ int pmac_newworld; static int current_root_goodness = -1; -#define DEFAULT_ROOT_DEVICE Root_SDA1 /* sda1 - slightly silly choice */ +/* sda1 - slightly silly choice */ +#define DEFAULT_ROOT_DEVICE MKDEV(SCSI_DISK0_MAJOR, 1) sys_ctrler_t sys_ctrler = SYS_CTRLER_UNKNOWN; EXPORT_SYMBOL(sys_ctrler); diff --git a/include/linux/root_dev.h b/include/linux/root_dev.h index 4e78651371ba..ed3ea8da6429 100644 --- a/include/linux/root_dev.h +++ b/include/linux/root_dev.h @@ -10,14 +10,6 @@ enum { Root_NFS = MKDEV(UNNAMED_MAJOR, 255), Root_CIFS = MKDEV(UNNAMED_MAJOR, 254), Root_RAM0 = MKDEV(RAMDISK_MAJOR, 0), - Root_RAM1 = MKDEV(RAMDISK_MAJOR, 1), - Root_FD0 = MKDEV(FLOPPY_MAJOR, 0), - Root_HDA1 = MKDEV(IDE0_MAJOR, 1), - Root_HDA2 = MKDEV(IDE0_MAJOR, 2), - Root_SDA1 = MKDEV(SCSI_DISK0_MAJOR, 1), - Root_SDA2 = MKDEV(SCSI_DISK0_MAJOR, 2), - Root_HDC1 = MKDEV(IDE1_MAJOR, 1), - Root_SR0 = MKDEV(SCSI_CDROM_MAJOR, 0), }; extern dev_t ROOT_DEV; From e3102722ffe77094ba9e7e46380792b3dd8a7abd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:17 +0200 Subject: [PATCH 097/266] init: rename mount_block_root to mount_root_generic mount_block_root is also used to mount non-block file systems, so give it a better name. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-7-hch@lst.de Signed-off-by: Jens Axboe --- init/do_mounts.c | 6 +++--- init/do_mounts.h | 2 +- init/do_mounts_initrd.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 2fe7901b5bcf..a2c0baace099 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -391,7 +391,7 @@ static int __init do_mount_root(const char *name, const char *fs, return ret; } -void __init mount_block_root(char *name, int flags) +void __init mount_root_generic(char *name, int flags) { struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); @@ -589,7 +589,7 @@ void __init mount_root(void) if (err < 0) pr_emerg("Failed to create /dev/root: %d\n", err); - mount_block_root("/dev/root", root_mountflags); + mount_root_generic("/dev/root", root_mountflags); } #endif } @@ -620,7 +620,7 @@ void __init prepare_namespace(void) root_device_name = saved_root_name; if (!strncmp(root_device_name, "mtd", 3) || !strncmp(root_device_name, "ubi", 3)) { - mount_block_root(root_device_name, root_mountflags); + mount_root_generic(root_device_name, root_mountflags); goto out; } ROOT_DEV = name_to_dev_t(root_device_name); diff --git a/init/do_mounts.h b/init/do_mounts.h index 7a29ac3e427b..33623025f695 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -10,7 +10,7 @@ #include #include -void mount_block_root(char *name, int flags); +void mount_root_generic(char *name, int flags); void mount_root(void); extern int root_mountflags; diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 34731241377d..686d1ff3af4b 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -95,7 +95,7 @@ static void __init handle_initrd(void) real_root_dev = new_encode_dev(ROOT_DEV); create_dev("/dev/root.old", Root_RAM0); /* mount initrd on rootfs' /root */ - mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); + mount_root_generic("/dev/root.old", root_mountflags & ~MS_RDONLY); init_mkdir("/old", 0700); init_chdir("/old"); From a6a41d39c2d91ff2543d31b6cc6070f3957e3aea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:18 +0200 Subject: [PATCH 098/266] init: refactor mount_root Provide stubs for all the lower level mount helpers, and just switch on ROOT_DEV in the main function. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-8-hch@lst.de Signed-off-by: Jens Axboe --- init/do_mounts.c | 104 +++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 48 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index a2c0baace099..e708b02d9d65 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -453,15 +453,14 @@ void __init mount_root_generic(char *name, int flags) #define NFSROOT_TIMEOUT_MAX 30 #define NFSROOT_RETRY_MAX 5 -static int __init mount_nfs_root(void) +static void __init mount_nfs_root(void) { char *root_dev, *root_data; unsigned int timeout; - int try, err; + int try; - err = nfs_root_data(&root_dev, &root_data); - if (err != 0) - return 0; + if (nfs_root_data(&root_dev, &root_data)) + goto fail; /* * The server or network may not be ready, so try several @@ -470,10 +469,8 @@ static int __init mount_nfs_root(void) */ timeout = NFSROOT_TIMEOUT_MIN; for (try = 1; ; try++) { - err = do_mount_root(root_dev, "nfs", - root_mountflags, root_data); - if (err == 0) - return 1; + if (!do_mount_root(root_dev, "nfs", root_mountflags, root_data)) + return; if (try > NFSROOT_RETRY_MAX) break; @@ -483,9 +480,14 @@ static int __init mount_nfs_root(void) if (timeout > NFSROOT_TIMEOUT_MAX) timeout = NFSROOT_TIMEOUT_MAX; } - return 0; +fail: + pr_err("VFS: Unable to mount root fs via NFS.\n"); } -#endif +#else +static inline void mount_nfs_root(void) +{ +} +#endif /* CONFIG_ROOT_NFS */ #ifdef CONFIG_CIFS_ROOT @@ -495,22 +497,20 @@ extern int cifs_root_data(char **dev, char **opts); #define CIFSROOT_TIMEOUT_MAX 30 #define CIFSROOT_RETRY_MAX 5 -static int __init mount_cifs_root(void) +static void __init mount_cifs_root(void) { char *root_dev, *root_data; unsigned int timeout; - int try, err; + int try; - err = cifs_root_data(&root_dev, &root_data); - if (err != 0) - return 0; + if (cifs_root_data(&root_dev, &root_data)) + goto fail; timeout = CIFSROOT_TIMEOUT_MIN; for (try = 1; ; try++) { - err = do_mount_root(root_dev, "cifs", root_mountflags, - root_data); - if (err == 0) - return 1; + if (!do_mount_root(root_dev, "cifs", root_mountflags, + root_data)) + return; if (try > CIFSROOT_RETRY_MAX) break; @@ -519,9 +519,14 @@ static int __init mount_cifs_root(void) if (timeout > CIFSROOT_TIMEOUT_MAX) timeout = CIFSROOT_TIMEOUT_MAX; } - return 0; +fail: + pr_err("VFS: Unable to mount root fs via SMB.\n"); } -#endif +#else +static inline void mount_cifs_root(void) +{ +} +#endif /* CONFIG_CIFS_ROOT */ static bool __init fs_is_nodev(char *fstype) { @@ -563,35 +568,38 @@ static int __init mount_nodev_root(void) return err; } +#ifdef CONFIG_BLOCK +static void __init mount_block_root(void) +{ + int err = create_dev("/dev/root", ROOT_DEV); + + if (err < 0) + pr_emerg("Failed to create /dev/root: %d\n", err); + mount_root_generic("/dev/root", root_mountflags); +} +#else +static inline void mount_block_root(void) +{ +} +#endif /* CONFIG_BLOCK */ + void __init mount_root(void) { -#ifdef CONFIG_ROOT_NFS - if (ROOT_DEV == Root_NFS) { - if (!mount_nfs_root()) - printk(KERN_ERR "VFS: Unable to mount root fs via NFS.\n"); - return; + switch (ROOT_DEV) { + case Root_NFS: + mount_nfs_root(); + break; + case Root_CIFS: + mount_cifs_root(); + break; + case 0: + if (root_device_name && root_fs_names && mount_nodev_root() == 0) + break; + fallthrough; + default: + mount_block_root(); + break; } -#endif -#ifdef CONFIG_CIFS_ROOT - if (ROOT_DEV == Root_CIFS) { - if (!mount_cifs_root()) - printk(KERN_ERR "VFS: Unable to mount root fs via SMB.\n"); - return; - } -#endif - if (ROOT_DEV == 0 && root_device_name && root_fs_names) { - if (mount_nodev_root() == 0) - return; - } -#ifdef CONFIG_BLOCK - { - int err = create_dev("/dev/root", ROOT_DEV); - - if (err < 0) - pr_emerg("Failed to create /dev/root: %d\n", err); - mount_root_generic("/dev/root", root_mountflags); - } -#endif } /* From c8643c72bc42781fc169c6498a3902bec447099e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:19 +0200 Subject: [PATCH 099/266] init: pass root_device_name explicitly Instead of declaring root_device_name as a global variable pass it as an argument to the functions using it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-9-hch@lst.de Signed-off-by: Jens Axboe --- init/do_mounts.c | 29 ++++++++++++++++------------- init/do_mounts.h | 14 +++++++------- init/do_mounts_initrd.c | 11 ++++++----- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index e708b02d9d65..1405ee7218bf 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -28,7 +28,6 @@ #include "do_mounts.h" int root_mountflags = MS_RDONLY | MS_SILENT; -static char * __initdata root_device_name; static char __initdata saved_root_name[64]; static int root_wait; @@ -391,7 +390,7 @@ static int __init do_mount_root(const char *name, const char *fs, return ret; } -void __init mount_root_generic(char *name, int flags) +void __init mount_root_generic(char *name, char *pretty_name, int flags) { struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); @@ -425,7 +424,7 @@ void __init mount_root_generic(char *name, int flags) * and give them a list of the available devices */ printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", - root_device_name, b, err); + pretty_name, b, err); printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); printk_all_partitions(); @@ -541,7 +540,7 @@ static bool __init fs_is_nodev(char *fstype) return ret; } -static int __init mount_nodev_root(void) +static int __init mount_nodev_root(char *root_device_name) { char *fs_names, *fstype; int err = -EINVAL; @@ -569,21 +568,21 @@ static int __init mount_nodev_root(void) } #ifdef CONFIG_BLOCK -static void __init mount_block_root(void) +static void __init mount_block_root(char *root_device_name) { int err = create_dev("/dev/root", ROOT_DEV); if (err < 0) pr_emerg("Failed to create /dev/root: %d\n", err); - mount_root_generic("/dev/root", root_mountflags); + mount_root_generic("/dev/root", root_device_name, root_mountflags); } #else -static inline void mount_block_root(void) +static inline void mount_block_root(char *root_device_name) { } #endif /* CONFIG_BLOCK */ -void __init mount_root(void) +void __init mount_root(char *root_device_name) { switch (ROOT_DEV) { case Root_NFS: @@ -593,11 +592,12 @@ void __init mount_root(void) mount_cifs_root(); break; case 0: - if (root_device_name && root_fs_names && mount_nodev_root() == 0) + if (root_device_name && root_fs_names && + mount_nodev_root(root_device_name) == 0) break; fallthrough; default: - mount_block_root(); + mount_block_root(root_device_name); break; } } @@ -607,6 +607,8 @@ void __init mount_root(void) */ void __init prepare_namespace(void) { + char *root_device_name; + if (root_delay) { printk(KERN_INFO "Waiting %d sec before mounting root device...\n", root_delay); @@ -628,7 +630,8 @@ void __init prepare_namespace(void) root_device_name = saved_root_name; if (!strncmp(root_device_name, "mtd", 3) || !strncmp(root_device_name, "ubi", 3)) { - mount_root_generic(root_device_name, root_mountflags); + mount_root_generic(root_device_name, root_device_name, + root_mountflags); goto out; } ROOT_DEV = name_to_dev_t(root_device_name); @@ -636,7 +639,7 @@ void __init prepare_namespace(void) root_device_name += 5; } - if (initrd_load()) + if (initrd_load(root_device_name)) goto out; /* wait for any asynchronous scanning to complete */ @@ -649,7 +652,7 @@ void __init prepare_namespace(void) async_synchronize_full(); } - mount_root(); + mount_root(root_device_name); out: devtmpfs_mount(); init_mount(".", "/", NULL, MS_MOVE, NULL); diff --git a/init/do_mounts.h b/init/do_mounts.h index 33623025f695..15e372b00ce7 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -10,8 +10,8 @@ #include #include -void mount_root_generic(char *name, int flags); -void mount_root(void); +void mount_root_generic(char *name, char *pretty_name, int flags); +void mount_root(char *root_device_name); extern int root_mountflags; static inline __init int create_dev(char *name, dev_t dev) @@ -33,11 +33,11 @@ static inline int rd_load_image(char *from) { return 0; } #endif #ifdef CONFIG_BLK_DEV_INITRD - -bool __init initrd_load(void); - +bool __init initrd_load(char *root_device_name); #else - -static inline bool initrd_load(void) { return false; } +static inline bool initrd_load(char *root_device_name) +{ + return false; + } #endif diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 686d1ff3af4b..425f4bcf4b77 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -83,7 +83,7 @@ static int __init init_linuxrc(struct subprocess_info *info, struct cred *new) return 0; } -static void __init handle_initrd(void) +static void __init handle_initrd(char *root_device_name) { struct subprocess_info *info; static char *argv[] = { "linuxrc", NULL, }; @@ -95,7 +95,8 @@ static void __init handle_initrd(void) real_root_dev = new_encode_dev(ROOT_DEV); create_dev("/dev/root.old", Root_RAM0); /* mount initrd on rootfs' /root */ - mount_root_generic("/dev/root.old", root_mountflags & ~MS_RDONLY); + mount_root_generic("/dev/root.old", root_device_name, + root_mountflags & ~MS_RDONLY); init_mkdir("/old", 0700); init_chdir("/old"); @@ -117,7 +118,7 @@ static void __init handle_initrd(void) init_chdir("/"); ROOT_DEV = new_decode_dev(real_root_dev); - mount_root(); + mount_root(root_device_name); printk(KERN_NOTICE "Trying to move old root to /initrd ... "); error = init_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); @@ -133,7 +134,7 @@ static void __init handle_initrd(void) } } -bool __init initrd_load(void) +bool __init initrd_load(char *root_device_name) { if (mount_initrd) { create_dev("/dev/ram", Root_RAM0); @@ -145,7 +146,7 @@ bool __init initrd_load(void) */ if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { init_unlink("/initrd.image"); - handle_initrd(); + handle_initrd(root_device_name); return true; } } From 73231b58b1b496d631fa0ecf9fa7f64f5a07c6e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:20 +0200 Subject: [PATCH 100/266] init: don't remove the /dev/ prefix from error messages Remove the code that drops the /dev/ prefix from root_device_name, which is only used for error messages when mounting the root device fails. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-10-hch@lst.de Signed-off-by: Jens Axboe --- init/do_mounts.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 1405ee7218bf..74cc96bffbdd 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -607,8 +607,6 @@ void __init mount_root(char *root_device_name) */ void __init prepare_namespace(void) { - char *root_device_name; - if (root_delay) { printk(KERN_INFO "Waiting %d sec before mounting root device...\n", root_delay); @@ -627,19 +625,16 @@ void __init prepare_namespace(void) md_run_setup(); if (saved_root_name[0]) { - root_device_name = saved_root_name; - if (!strncmp(root_device_name, "mtd", 3) || - !strncmp(root_device_name, "ubi", 3)) { - mount_root_generic(root_device_name, root_device_name, + if (!strncmp(saved_root_name, "mtd", 3) || + !strncmp(saved_root_name, "ubi", 3)) { + mount_root_generic(saved_root_name, saved_root_name, root_mountflags); goto out; } - ROOT_DEV = name_to_dev_t(root_device_name); - if (strncmp(root_device_name, "/dev/", 5) == 0) - root_device_name += 5; + ROOT_DEV = name_to_dev_t(saved_root_name); } - if (initrd_load(root_device_name)) + if (initrd_load(saved_root_name)) goto out; /* wait for any asynchronous scanning to complete */ @@ -652,7 +647,7 @@ void __init prepare_namespace(void) async_synchronize_full(); } - mount_root(root_device_name); + mount_root(saved_root_name); out: devtmpfs_mount(); init_mount(".", "/", NULL, MS_MOVE, NULL); From 07d63cbb67cdb5e2a7720fdd8579b3be979c2d66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:21 +0200 Subject: [PATCH 101/266] init: handle ubi/mtd root mounting like all other root types Assign a Root_Generic magic value for UBI/MTD root and handle the root mounting in mount_root like all other root types. Besides making the code more clear this also means that UBI/MTD root can be used together with an initrd (not that anyone should care). Also factor parsing of the root name into a helper now that it can be easily done and will get more complicated with subsequent patches. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-11-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/root_dev.h | 1 + init/do_mounts.c | 23 ++++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/root_dev.h b/include/linux/root_dev.h index ed3ea8da6429..847c9a06101b 100644 --- a/include/linux/root_dev.h +++ b/include/linux/root_dev.h @@ -9,6 +9,7 @@ enum { Root_NFS = MKDEV(UNNAMED_MAJOR, 255), Root_CIFS = MKDEV(UNNAMED_MAJOR, 254), + Root_Generic = MKDEV(UNNAMED_MAJOR, 253), Root_RAM0 = MKDEV(RAMDISK_MAJOR, 0), }; diff --git a/init/do_mounts.c b/init/do_mounts.c index 74cc96bffbdd..be6d14733ba0 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -591,6 +591,10 @@ void __init mount_root(char *root_device_name) case Root_CIFS: mount_cifs_root(); break; + case Root_Generic: + mount_root_generic(root_device_name, root_device_name, + root_mountflags); + break; case 0: if (root_device_name && root_fs_names && mount_nodev_root(root_device_name) == 0) @@ -602,6 +606,14 @@ void __init mount_root(char *root_device_name) } } +static dev_t __init parse_root_device(char *root_device_name) +{ + if (!strncmp(root_device_name, "mtd", 3) || + !strncmp(root_device_name, "ubi", 3)) + return Root_Generic; + return name_to_dev_t(root_device_name); +} + /* * Prepare the namespace - decide what/where to mount, load ramdisks, etc. */ @@ -624,15 +636,8 @@ void __init prepare_namespace(void) md_run_setup(); - if (saved_root_name[0]) { - if (!strncmp(saved_root_name, "mtd", 3) || - !strncmp(saved_root_name, "ubi", 3)) { - mount_root_generic(saved_root_name, saved_root_name, - root_mountflags); - goto out; - } - ROOT_DEV = name_to_dev_t(saved_root_name); - } + if (saved_root_name[0]) + ROOT_DEV = parse_root_device(saved_root_name); if (initrd_load(saved_root_name)) goto out; From 3701c600a3e735b9fbac6f7a73e4c086090c97ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:22 +0200 Subject: [PATCH 102/266] init: factor the root_wait logic in prepare_namespace into a helper The root_wait logic is a bit obsfucated right now. Expand it and move it into a helper. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-12-hch@lst.de Signed-off-by: Jens Axboe --- init/do_mounts.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index be6d14733ba0..d5c06c1546e8 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -606,6 +606,26 @@ void __init mount_root(char *root_device_name) } } +/* wait for any asynchronous scanning to complete */ +static void __init wait_for_root(char *root_device_name) +{ + if (ROOT_DEV != 0) + return; + + pr_info("Waiting for root device %s...\n", root_device_name); + + for (;;) { + if (driver_probe_done()) { + ROOT_DEV = name_to_dev_t(root_device_name); + if (ROOT_DEV) + break; + } + msleep(5); + } + async_synchronize_full(); + +} + static dev_t __init parse_root_device(char *root_device_name) { if (!strncmp(root_device_name, "mtd", 3) || @@ -642,16 +662,8 @@ void __init prepare_namespace(void) if (initrd_load(saved_root_name)) goto out; - /* wait for any asynchronous scanning to complete */ - if ((ROOT_DEV == 0) && root_wait) { - printk(KERN_INFO "Waiting for root device %s...\n", - saved_root_name); - while (!driver_probe_done() || - (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) - msleep(5); - async_synchronize_full(); - } - + if (root_wait) + wait_for_root(saved_root_name); mount_root(saved_root_name); out: devtmpfs_mount(); From c0c1a7dcb6f5db4500e6574294674213bc24940c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:23 +0200 Subject: [PATCH 103/266] init: move the nfs/cifs/ram special cases out of name_to_dev_t The nfs/cifs/ram special case only needs to be parsed once, and only in the boot code. Move them out of name_to_dev_t and into prepare_namespace. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-13-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/admin-guide/kernel-parameters.txt | 7 ++++++- init/do_mounts.c | 14 ++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9e5bab29685f..457c342d1597 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5452,7 +5452,12 @@ port and the regular usb controller gets disabled. root= [KNL] Root filesystem - See name_to_dev_t comment in init/do_mounts.c. + Usually this a a block device specifier of some kind, + see the name_to_dev_t comment in init/do_mounts.c for + details. + Alternatively this can be "ram" for the legacy initial + ramdisk, "nfs" and "cifs" for root on a network file + system, or "mtd" and "ubi" for mounting from raw flash. rootdelay= [KNL] Delay (in seconds) to pause before attempting to mount the root filesystem diff --git a/init/do_mounts.c b/init/do_mounts.c index d5c06c1546e8..86599faf2bf8 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -248,7 +248,6 @@ static dev_t devt_from_devnum(const char *name) * * 1) device number in hexadecimal represents itself * no leading 0x, for example b302. - * 2) /dev/nfs represents Root_NFS (0xff) * 3) /dev/ represents the device number of disk * 4) /dev/ represents the device number * of partition - device number of disk plus the partition number @@ -266,7 +265,6 @@ static dev_t devt_from_devnum(const char *name) * a colon. * 9) PARTLABEL= with name being the GPT partition label. * MSDOS partitions do not support labels! - * 10) /dev/cifs represents Root_CIFS (0xfe) * * If name doesn't have fall into the categories above, we return (0,0). * block_class is used to check if something is a disk name. If the disk @@ -275,12 +273,6 @@ static dev_t devt_from_devnum(const char *name) */ dev_t name_to_dev_t(const char *name) { - if (strcmp(name, "/dev/nfs") == 0) - return Root_NFS; - if (strcmp(name, "/dev/cifs") == 0) - return Root_CIFS; - if (strcmp(name, "/dev/ram") == 0) - return Root_RAM0; #ifdef CONFIG_BLOCK if (strncmp(name, "PARTUUID=", 9) == 0) return devt_from_partuuid(name + 9); @@ -631,6 +623,12 @@ static dev_t __init parse_root_device(char *root_device_name) if (!strncmp(root_device_name, "mtd", 3) || !strncmp(root_device_name, "ubi", 3)) return Root_Generic; + if (strcmp(root_device_name, "/dev/nfs") == 0) + return Root_NFS; + if (strcmp(root_device_name, "/dev/cifs") == 0) + return Root_CIFS; + if (strcmp(root_device_name, "/dev/ram") == 0) + return Root_RAM0; return name_to_dev_t(root_device_name); } From cf056a43121559d3642419917d405c3237ded90a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:24 +0200 Subject: [PATCH 104/266] init: improve the name_to_dev_t interface name_to_dev_t has a very misleading name, that doesn't make clear it should only be used by the early init code, and also has a bad calling convention that doesn't allow returning different kinds of errors. Rename it to early_lookup_bdev to make the use case clear, and return an errno, where -EINVAL means the string could not be parsed, and -ENODEV means it the string was valid, but there was no device found for it. Also stub out the whole call for !CONFIG_BLOCK as all the non-block root cases are always covered in the caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-14-hch@lst.de Signed-off-by: Jens Axboe --- .../admin-guide/kernel-parameters.txt | 4 +- drivers/md/dm-table.c | 5 +- drivers/md/md-autodetect.c | 3 +- drivers/mtd/devices/block2mtd.c | 3 +- fs/pstore/blk.c | 4 +- include/linux/blkdev.h | 5 + include/linux/mount.h | 1 - init/do_mounts.c | 102 +++++++++--------- kernel/power/hibernate.c | 22 ++-- 9 files changed, 74 insertions(+), 75 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 457c342d1597..a6bc31349cbb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5453,8 +5453,8 @@ root= [KNL] Root filesystem Usually this a a block device specifier of some kind, - see the name_to_dev_t comment in init/do_mounts.c for - details. + see the early_lookup_bdev comment in init/do_mounts.c + for details. Alternatively this can be "ram" for the legacy initial ramdisk, "nfs" and "cifs" for root on a network file system, or "mtd" and "ubi" for mounting from raw flash. diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1398f1d6e83e..05aa16da43b0 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -330,8 +330,9 @@ dev_t dm_get_dev_t(const char *path) { dev_t dev; - if (lookup_bdev(path, &dev)) - dev = name_to_dev_t(path); + if (lookup_bdev(path, &dev) && + early_lookup_bdev(path, &dev)) + return 0; return dev; } EXPORT_SYMBOL_GPL(dm_get_dev_t); diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 91836e6de326..6eaa0eab40f9 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -147,7 +147,8 @@ static void __init md_setup_drive(struct md_setup_args *args) if (p) *p++ = 0; - dev = name_to_dev_t(devname); + if (early_lookup_bdev(devname, &dev)) + dev = 0; if (strncmp(devname, "/dev/", 5) == 0) devname += 5; snprintf(comp_name, 63, "/dev/%s", devname); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 7ac82c6fe350..a127cdde03b7 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -254,8 +254,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size, msleep(1000); wait_for_device_probe(); - devt = name_to_dev_t(devname); - if (!devt) + if (early_lookup_bdev(devname, &devt)) continue; bdev = blkdev_get_by_dev(devt, mode, dev, NULL); } diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 4ae0cfcd15f2..de8cf5d75f34 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -263,9 +263,9 @@ static __init const char *early_boot_devpath(const char *initial_devname) * same scheme to find the device that we use for mounting * the root file system. */ - dev_t dev = name_to_dev_t(initial_devname); + dev_t dev; - if (!dev) { + if (early_lookup_bdev(initial_devname, &dev)) { pr_err("failed to resolve '%s'!\n", initial_devname); return initial_devname; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9e9a9e4edee9..d682e233fd66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1501,6 +1501,7 @@ int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); void printk_all_partitions(void); +int early_lookup_bdev(const char *pathname, dev_t *dev); #else static inline void invalidate_bdev(struct block_device *bdev) { @@ -1522,6 +1523,10 @@ static inline void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) static inline void printk_all_partitions(void) { } +static inline int early_lookup_bdev(const char *pathname, dev_t *dev) +{ + return -EINVAL; +} #endif /* CONFIG_BLOCK */ int fsync_bdev(struct block_device *bdev); diff --git a/include/linux/mount.h b/include/linux/mount.h index 1ea326c368f7..4b81ea90440e 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -107,7 +107,6 @@ extern struct vfsmount *vfs_submount(const struct dentry *mountpoint, extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); -extern dev_t name_to_dev_t(const char *name); extern bool path_is_mountpoint(const struct path *path); extern bool our_mnt(struct vfsmount *mnt); diff --git a/init/do_mounts.c b/init/do_mounts.c index 86599faf2bf8..f1953aeb3219 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -96,11 +96,10 @@ static int match_dev_by_uuid(struct device *dev, const void *data) * * Returns the matching dev_t on success or 0 on failure. */ -static dev_t devt_from_partuuid(const char *uuid_str) +static int devt_from_partuuid(const char *uuid_str, dev_t *devt) { struct uuidcmp cmp; struct device *dev = NULL; - dev_t devt = 0; int offset = 0; char *slash; @@ -124,21 +123,21 @@ static dev_t devt_from_partuuid(const char *uuid_str) dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); if (!dev) - return 0; + return -ENODEV; if (offset) { /* * Attempt to find the requested partition by adding an offset * to the partition number found by UUID. */ - devt = part_devt(dev_to_disk(dev), - dev_to_bdev(dev)->bd_partno + offset); + *devt = part_devt(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); } else { - devt = dev->devt; + *devt = dev->devt; } put_device(dev); - return devt; + return 0; clear_root_wait: pr_err("VFS: PARTUUID= is invalid.\n" @@ -146,7 +145,7 @@ static dev_t devt_from_partuuid(const char *uuid_str) if (root_wait) pr_err("Disabling rootwait; root= is invalid.\n"); root_wait = 0; - return 0; + return -EINVAL; } /** @@ -166,38 +165,35 @@ static int match_dev_by_label(struct device *dev, const void *data) return 1; } -static dev_t devt_from_partlabel(const char *label) +static int devt_from_partlabel(const char *label, dev_t *devt) { struct device *dev; - dev_t devt = 0; dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); - if (dev) { - devt = dev->devt; - put_device(dev); - } - - return devt; + if (!dev) + return -ENODEV; + *devt = dev->devt; + put_device(dev); + return 0; } -static dev_t devt_from_devname(const char *name) +static int devt_from_devname(const char *name, dev_t *devt) { - dev_t devt = 0; int part; char s[32]; char *p; if (strlen(name) > 31) - return 0; + return -EINVAL; strcpy(s, name); for (p = s; *p; p++) { if (*p == '/') *p = '!'; } - devt = blk_lookup_devt(s, 0); - if (devt) - return devt; + *devt = blk_lookup_devt(s, 0); + if (*devt) + return 0; /* * Try non-existent, but valid partition, which may only exist after @@ -206,41 +202,42 @@ static dev_t devt_from_devname(const char *name) while (p > s && isdigit(p[-1])) p--; if (p == s || !*p || *p == '0') - return 0; + return -EINVAL; /* try disk name without */ part = simple_strtoul(p, NULL, 10); *p = '\0'; - devt = blk_lookup_devt(s, part); - if (devt) - return devt; + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; /* try disk name without p */ if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') - return 0; + return -EINVAL; p[-1] = '\0'; - return blk_lookup_devt(s, part); + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; + return -EINVAL; } -#endif /* CONFIG_BLOCK */ -static dev_t devt_from_devnum(const char *name) +static int devt_from_devnum(const char *name, dev_t *devt) { unsigned maj, min, offset; - dev_t devt = 0; char *p, dummy; if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { - devt = MKDEV(maj, min); - if (maj != MAJOR(devt) || min != MINOR(devt)) - return 0; + *devt = MKDEV(maj, min); + if (maj != MAJOR(*devt) || min != MINOR(*devt)) + return -EINVAL; } else { - devt = new_decode_dev(simple_strtoul(name, &p, 16)); + *devt = new_decode_dev(simple_strtoul(name, &p, 16)); if (*p) - return 0; + return -EINVAL; } - return devt; + return 0; } /* @@ -271,19 +268,18 @@ static dev_t devt_from_devnum(const char *name) * name contains slashes, the device name has them replaced with * bangs. */ -dev_t name_to_dev_t(const char *name) +int early_lookup_bdev(const char *name, dev_t *devt) { -#ifdef CONFIG_BLOCK if (strncmp(name, "PARTUUID=", 9) == 0) - return devt_from_partuuid(name + 9); + return devt_from_partuuid(name + 9, devt); if (strncmp(name, "PARTLABEL=", 10) == 0) - return devt_from_partlabel(name + 10); + return devt_from_partlabel(name + 10, devt); if (strncmp(name, "/dev/", 5) == 0) - return devt_from_devname(name + 5); -#endif - return devt_from_devnum(name); + return devt_from_devname(name + 5, devt); + return devt_from_devnum(name, devt); } -EXPORT_SYMBOL_GPL(name_to_dev_t); +EXPORT_SYMBOL_GPL(early_lookup_bdev); +#endif static int __init root_dev_setup(char *line) { @@ -606,20 +602,17 @@ static void __init wait_for_root(char *root_device_name) pr_info("Waiting for root device %s...\n", root_device_name); - for (;;) { - if (driver_probe_done()) { - ROOT_DEV = name_to_dev_t(root_device_name); - if (ROOT_DEV) - break; - } + while (!driver_probe_done() || + early_lookup_bdev(root_device_name, &ROOT_DEV) < 0) msleep(5); - } async_synchronize_full(); } static dev_t __init parse_root_device(char *root_device_name) { + dev_t dev; + if (!strncmp(root_device_name, "mtd", 3) || !strncmp(root_device_name, "ubi", 3)) return Root_Generic; @@ -629,7 +622,10 @@ static dev_t __init parse_root_device(char *root_device_name) return Root_CIFS; if (strcmp(root_device_name, "/dev/ram") == 0) return Root_RAM0; - return name_to_dev_t(root_device_name); + + if (early_lookup_bdev(root_device_name, &dev)) + return 0; + return dev; } /* diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 45e24b02cd50..c52dedb9f7c8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "PM: hibernation: " fmt +#include #include #include #include @@ -921,8 +922,7 @@ static int __init find_resume_device(void) } /* Check if the device is there */ - swsusp_resume_device = name_to_dev_t(resume_file); - if (swsusp_resume_device) + if (!early_lookup_bdev(resume_file, &swsusp_resume_device)) return 0; /* @@ -931,15 +931,12 @@ static int __init find_resume_device(void) */ wait_for_device_probe(); if (resume_wait) { - while (!(swsusp_resume_device = name_to_dev_t(resume_file))) + while (early_lookup_bdev(resume_file, &swsusp_resume_device)) msleep(10); async_synchronize_full(); } - swsusp_resume_device = name_to_dev_t(resume_file); - if (!swsusp_resume_device) - return -ENODEV; - return 0; + return early_lookup_bdev(resume_file, &swsusp_resume_device); } static int software_resume(void) @@ -1169,7 +1166,8 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, unsigned int sleep_flags; int len = n; char *name; - dev_t res; + dev_t dev; + int error; if (!hibernation_available()) return 0; @@ -1180,13 +1178,13 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!name) return -ENOMEM; - res = name_to_dev_t(name); + error = early_lookup_bdev(name, &dev); kfree(name); - if (!res) - return -EINVAL; + if (error) + return error; sleep_flags = lock_system_sleep(); - swsusp_resume_device = res; + swsusp_resume_device = dev; unlock_system_sleep(sleep_flags); pm_pr_dbg("Configured hibernation resume from disk to %u\n", From 079caa35f7863cd9958b4555ae873ea4d352a502 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:25 +0200 Subject: [PATCH 105/266] init: clear root_wait on all invalid root= strings Instead of only clearing root_wait in devt_from_partuuid when the UUID format was invalid, do that in parse_root_device for all strings that failed to parse. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-15-hch@lst.de Signed-off-by: Jens Axboe --- init/do_mounts.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index f1953aeb3219..0b36a5f39ee8 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -112,14 +112,14 @@ static int devt_from_partuuid(const char *uuid_str, dev_t *devt) /* Explicitly fail on poor PARTUUID syntax. */ if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1) - goto clear_root_wait; + goto out_invalid; cmp.len = slash - uuid_str; } else { cmp.len = strlen(uuid_str); } if (!cmp.len) - goto clear_root_wait; + goto out_invalid; dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); if (!dev) @@ -139,12 +139,9 @@ static int devt_from_partuuid(const char *uuid_str, dev_t *devt) put_device(dev); return 0; -clear_root_wait: +out_invalid: pr_err("VFS: PARTUUID= is invalid.\n" "Expected PARTUUID=[/PARTNROFF=%%d]\n"); - if (root_wait) - pr_err("Disabling rootwait; root= is invalid.\n"); - root_wait = 0; return -EINVAL; } @@ -611,6 +608,7 @@ static void __init wait_for_root(char *root_device_name) static dev_t __init parse_root_device(char *root_device_name) { + int error; dev_t dev; if (!strncmp(root_device_name, "mtd", 3) || @@ -623,8 +621,14 @@ static dev_t __init parse_root_device(char *root_device_name) if (strcmp(root_device_name, "/dev/ram") == 0) return Root_RAM0; - if (early_lookup_bdev(root_device_name, &dev)) + error = early_lookup_bdev(root_device_name, &dev); + if (error) { + if (error == -EINVAL && root_wait) { + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + } return 0; + } return dev; } From 702f3189e454b3c3c2f3c99dbf30acf41aab707c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:26 +0200 Subject: [PATCH 106/266] block: move the code to do early boot lookup of block devices to block/ Create a new block/early-lookup.c to keep the early block device lookup code instead of having this code sit with the early mount code. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-16-hch@lst.de Signed-off-by: Jens Axboe --- .../admin-guide/kernel-parameters.txt | 4 +- block/Makefile | 2 +- block/early-lookup.c | 224 ++++++++++++++++++ init/do_mounts.c | 219 ----------------- 4 files changed, 227 insertions(+), 222 deletions(-) create mode 100644 block/early-lookup.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a6bc31349cbb..911c54829c7c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5453,8 +5453,8 @@ root= [KNL] Root filesystem Usually this a a block device specifier of some kind, - see the early_lookup_bdev comment in init/do_mounts.c - for details. + see the early_lookup_bdev comment in + block/early-lookup.c for details. Alternatively this can be "ram" for the legacy initial ramdisk, "nfs" and "cifs" for root on a network file system, or "mtd" and "ubi" for mounting from raw flash. diff --git a/block/Makefile b/block/Makefile index b31b05390749..46ada9dc8bbf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,7 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ - disk-events.o blk-ia-ranges.o + disk-events.o blk-ia-ranges.o early-lookup.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o diff --git a/block/early-lookup.c b/block/early-lookup.c new file mode 100644 index 000000000000..9fc30d039508 --- /dev/null +++ b/block/early-lookup.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Code for looking up block devices in the early boot code before mounting the + * root file system. Unfortunately currently also abused in a few other places. + */ +#include +#include + +struct uuidcmp { + const char *uuid; + int len; +}; + +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to the desired struct uuidcmp to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_uuid(struct device *dev, const void *data) +{ + struct block_device *bdev = dev_to_bdev(dev); + const struct uuidcmp *cmp = data; + + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) + return 0; + return 1; +} + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid_str: char array containing ascii UUID + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * + * Returns the matching dev_t on success or 0 on failure. + */ +static int devt_from_partuuid(const char *uuid_str, dev_t *devt) +{ + struct uuidcmp cmp; + struct device *dev = NULL; + int offset = 0; + char *slash; + + cmp.uuid = uuid_str; + + slash = strchr(uuid_str, '/'); + /* Check for optional partition number offset attributes. */ + if (slash) { + char c = 0; + + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1) + goto out_invalid; + cmp.len = slash - uuid_str; + } else { + cmp.len = strlen(uuid_str); + } + + if (!cmp.len) + goto out_invalid; + + dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); + if (!dev) + return -ENODEV; + + if (offset) { + /* + * Attempt to find the requested partition by adding an offset + * to the partition number found by UUID. + */ + *devt = part_devt(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); + } else { + *devt = dev->devt; + } + + put_device(dev); + return 0; + +out_invalid: + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=[/PARTNROFF=%%d]\n"); + return -EINVAL; +} + +/** + * match_dev_by_label - callback for finding a partition using its label + * @dev: device passed in by the caller + * @data: opaque pointer to the label to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int match_dev_by_label(struct device *dev, const void *data) +{ + struct block_device *bdev = dev_to_bdev(dev); + const char *label = data; + + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) + return 0; + return 1; +} + +static int devt_from_partlabel(const char *label, dev_t *devt) +{ + struct device *dev; + + dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); + if (!dev) + return -ENODEV; + *devt = dev->devt; + put_device(dev); + return 0; +} + +static int devt_from_devname(const char *name, dev_t *devt) +{ + int part; + char s[32]; + char *p; + + if (strlen(name) > 31) + return -EINVAL; + strcpy(s, name); + for (p = s; *p; p++) { + if (*p == '/') + *p = '!'; + } + + *devt = blk_lookup_devt(s, 0); + if (*devt) + return 0; + + /* + * Try non-existent, but valid partition, which may only exist after + * opening the device, like partitioned md devices. + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + return -EINVAL; + + /* try disk name without */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; + + /* try disk name without p */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + return -EINVAL; + p[-1] = '\0'; + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; + return -EINVAL; +} + +static int devt_from_devnum(const char *name, dev_t *devt) +{ + unsigned maj, min, offset; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { + *devt = MKDEV(maj, min); + if (maj != MAJOR(*devt) || min != MINOR(*devt)) + return -EINVAL; + } else { + *devt = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + return -EINVAL; + } + + return 0; +} + +/* + * Convert a name into device number. We accept the following variants: + * + * 1) device number in hexadecimal represents itself + * no leading 0x, for example b302. + * 3) /dev/ represents the device number of disk + * 4) /dev/ represents the device number + * of partition - device number of disk plus the partition number + * 5) /dev/p - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 7) PARTUUID=/PARTNROFF= to select a partition in relation to + * a partition with a known unique id. + * 8) : major and minor number of the device separated by + * a colon. + * 9) PARTLABEL= with name being the GPT partition label. + * MSDOS partitions do not support labels! + * + * If name doesn't have fall into the categories above, we return (0,0). + * block_class is used to check if something is a disk name. If the disk + * name contains slashes, the device name has them replaced with + * bangs. + */ +int early_lookup_bdev(const char *name, dev_t *devt) +{ + if (strncmp(name, "PARTUUID=", 9) == 0) + return devt_from_partuuid(name + 9, devt); + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10, devt); + if (strncmp(name, "/dev/", 5) == 0) + return devt_from_devname(name + 5, devt); + return devt_from_devnum(name, devt); +} +EXPORT_SYMBOL_GPL(early_lookup_bdev); diff --git a/init/do_mounts.c b/init/do_mounts.c index 0b36a5f39ee8..780546a6cbfb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -59,225 +59,6 @@ static int __init readwrite(char *str) __setup("ro", readonly); __setup("rw", readwrite); -#ifdef CONFIG_BLOCK -struct uuidcmp { - const char *uuid; - int len; -}; - -/** - * match_dev_by_uuid - callback for finding a partition using its uuid - * @dev: device passed in by the caller - * @data: opaque pointer to the desired struct uuidcmp to match - * - * Returns 1 if the device matches, and 0 otherwise. - */ -static int match_dev_by_uuid(struct device *dev, const void *data) -{ - struct block_device *bdev = dev_to_bdev(dev); - const struct uuidcmp *cmp = data; - - if (!bdev->bd_meta_info || - strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) - return 0; - return 1; -} - -/** - * devt_from_partuuid - looks up the dev_t of a partition by its UUID - * @uuid_str: char array containing ascii UUID - * - * The function will return the first partition which contains a matching - * UUID value in its partition_meta_info struct. This does not search - * by filesystem UUIDs. - * - * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be - * extracted and used as an offset from the partition identified by the UUID. - * - * Returns the matching dev_t on success or 0 on failure. - */ -static int devt_from_partuuid(const char *uuid_str, dev_t *devt) -{ - struct uuidcmp cmp; - struct device *dev = NULL; - int offset = 0; - char *slash; - - cmp.uuid = uuid_str; - - slash = strchr(uuid_str, '/'); - /* Check for optional partition number offset attributes. */ - if (slash) { - char c = 0; - - /* Explicitly fail on poor PARTUUID syntax. */ - if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1) - goto out_invalid; - cmp.len = slash - uuid_str; - } else { - cmp.len = strlen(uuid_str); - } - - if (!cmp.len) - goto out_invalid; - - dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); - if (!dev) - return -ENODEV; - - if (offset) { - /* - * Attempt to find the requested partition by adding an offset - * to the partition number found by UUID. - */ - *devt = part_devt(dev_to_disk(dev), - dev_to_bdev(dev)->bd_partno + offset); - } else { - *devt = dev->devt; - } - - put_device(dev); - return 0; - -out_invalid: - pr_err("VFS: PARTUUID= is invalid.\n" - "Expected PARTUUID=[/PARTNROFF=%%d]\n"); - return -EINVAL; -} - -/** - * match_dev_by_label - callback for finding a partition using its label - * @dev: device passed in by the caller - * @data: opaque pointer to the label to match - * - * Returns 1 if the device matches, and 0 otherwise. - */ -static int match_dev_by_label(struct device *dev, const void *data) -{ - struct block_device *bdev = dev_to_bdev(dev); - const char *label = data; - - if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) - return 0; - return 1; -} - -static int devt_from_partlabel(const char *label, dev_t *devt) -{ - struct device *dev; - - dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); - if (!dev) - return -ENODEV; - *devt = dev->devt; - put_device(dev); - return 0; -} - -static int devt_from_devname(const char *name, dev_t *devt) -{ - int part; - char s[32]; - char *p; - - if (strlen(name) > 31) - return -EINVAL; - strcpy(s, name); - for (p = s; *p; p++) { - if (*p == '/') - *p = '!'; - } - - *devt = blk_lookup_devt(s, 0); - if (*devt) - return 0; - - /* - * Try non-existent, but valid partition, which may only exist after - * opening the device, like partitioned md devices. - */ - while (p > s && isdigit(p[-1])) - p--; - if (p == s || !*p || *p == '0') - return -EINVAL; - - /* try disk name without */ - part = simple_strtoul(p, NULL, 10); - *p = '\0'; - *devt = blk_lookup_devt(s, part); - if (*devt) - return 0; - - /* try disk name without p */ - if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') - return -EINVAL; - p[-1] = '\0'; - *devt = blk_lookup_devt(s, part); - if (*devt) - return 0; - return -EINVAL; -} - -static int devt_from_devnum(const char *name, dev_t *devt) -{ - unsigned maj, min, offset; - char *p, dummy; - - if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || - sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { - *devt = MKDEV(maj, min); - if (maj != MAJOR(*devt) || min != MINOR(*devt)) - return -EINVAL; - } else { - *devt = new_decode_dev(simple_strtoul(name, &p, 16)); - if (*p) - return -EINVAL; - } - - return 0; -} - -/* - * Convert a name into device number. We accept the following variants: - * - * 1) device number in hexadecimal represents itself - * no leading 0x, for example b302. - * 3) /dev/ represents the device number of disk - * 4) /dev/ represents the device number - * of partition - device number of disk plus the partition number - * 5) /dev/p - same as the above, that form is - * used when disk name of partitioned disk ends on a digit. - * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the - * unique id of a partition if the partition table provides it. - * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS - * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- - * filled hex representation of the 32-bit "NT disk signature", and PP - * is a zero-filled hex representation of the 1-based partition number. - * 7) PARTUUID=/PARTNROFF= to select a partition in relation to - * a partition with a known unique id. - * 8) : major and minor number of the device separated by - * a colon. - * 9) PARTLABEL= with name being the GPT partition label. - * MSDOS partitions do not support labels! - * - * If name doesn't have fall into the categories above, we return (0,0). - * block_class is used to check if something is a disk name. If the disk - * name contains slashes, the device name has them replaced with - * bangs. - */ -int early_lookup_bdev(const char *name, dev_t *devt) -{ - if (strncmp(name, "PARTUUID=", 9) == 0) - return devt_from_partuuid(name + 9, devt); - if (strncmp(name, "PARTLABEL=", 10) == 0) - return devt_from_partlabel(name + 10, devt); - if (strncmp(name, "/dev/", 5) == 0) - return devt_from_devname(name + 5, devt); - return devt_from_devnum(name, devt); -} -EXPORT_SYMBOL_GPL(early_lookup_bdev); -#endif - static int __init root_dev_setup(char *line) { strscpy(saved_root_name, line, sizeof(saved_root_name)); From 7cadcaf1d82618852745e7206fffa2c72c17ce4b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:27 +0200 Subject: [PATCH 107/266] block: move more code to early-lookup.c blk_lookup_devt is only used by code in early-lookup.c, so move it there. printk_all_partitions and it's helper bdevt_str are only used by the early init code in init/do_mounts.c, so they should go there as well. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-17-hch@lst.de Signed-off-by: Jens Axboe --- block/early-lookup.c | 92 ++++++++++++++++++++++++++++++++++++++++++ block/genhd.c | 92 ------------------------------------------ include/linux/blkdev.h | 1 - 3 files changed, 92 insertions(+), 93 deletions(-) diff --git a/block/early-lookup.c b/block/early-lookup.c index 9fc30d039508..6016e781b6a0 100644 --- a/block/early-lookup.c +++ b/block/early-lookup.c @@ -120,6 +120,35 @@ static int devt_from_partlabel(const char *label, dev_t *devt) return 0; } +static dev_t blk_lookup_devt(const char *name, int partno) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + + if (strcmp(dev_name(dev), name)) + continue; + + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. + */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + } else { + devt = part_devt(disk, partno); + if (devt) + break; + } + } + class_dev_iter_exit(&iter); + return devt; +} + static int devt_from_devname(const char *name, dev_t *devt) { int part; @@ -222,3 +251,66 @@ int early_lookup_bdev(const char *name, dev_t *devt) return devt_from_devnum(name, devt); } EXPORT_SYMBOL_GPL(early_lookup_bdev); + +static char __init *bdevt_str(dev_t devt, char *buf) +{ + if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { + char tbuf[BDEVT_SIZE]; + snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); + snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); + } else + snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); + + return buf; +} + +/* + * print a full list of all partitions - intended for places where the root + * filesystem can't be mounted and thus to give the victim some idea of what + * went wrong + */ +void __init printk_all_partitions(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct block_device *part; + char devt_buf[BDEVT_SIZE]; + unsigned long idx; + + /* + * Don't show empty devices or things that have been + * suppressed + */ + if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) + continue; + + /* + * Note, unlike /proc/partitions, I am showing the numbers in + * hex - the same format as the root= option takes. + */ + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + printk("%s%s %10llu %pg %s", + bdev_is_partition(part) ? " " : "", + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, part, + part->bd_meta_info ? + part->bd_meta_info->uuid : ""); + if (bdev_is_partition(part)) + printk("\n"); + else if (dev->parent && dev->parent->driver) + printk(" driver: %s\n", + dev->parent->driver->name); + else + printk(" (driver?)\n"); + } + rcu_read_unlock(); + } + class_dev_iter_exit(&iter); +} diff --git a/block/genhd.c b/block/genhd.c index a07c4d6a1476..4e5fd6aaa883 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -318,18 +318,6 @@ void blk_free_ext_minor(unsigned int minor) ida_free(&ext_devt_ida, minor); } -static char *bdevt_str(dev_t devt, char *buf) -{ - if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { - char tbuf[BDEVT_SIZE]; - snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); - snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); - } else - snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); - - return buf; -} - void disk_uevent(struct gendisk *disk, enum kobject_action action) { struct block_device *part; @@ -795,57 +783,6 @@ void blk_request_module(dev_t devt) } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ -/* - * print a full list of all partitions - intended for places where the root - * filesystem can't be mounted and thus to give the victim some idea of what - * went wrong - */ -void __init printk_all_partitions(void) -{ - struct class_dev_iter iter; - struct device *dev; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct block_device *part; - char devt_buf[BDEVT_SIZE]; - unsigned long idx; - - /* - * Don't show empty devices or things that have been - * suppressed - */ - if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) - continue; - - /* - * Note, unlike /proc/partitions, I am showing the numbers in - * hex - the same format as the root= option takes. - */ - rcu_read_lock(); - xa_for_each(&disk->part_tbl, idx, part) { - if (!bdev_nr_sectors(part)) - continue; - printk("%s%s %10llu %pg %s", - bdev_is_partition(part) ? " " : "", - bdevt_str(part->bd_dev, devt_buf), - bdev_nr_sectors(part) >> 1, part, - part->bd_meta_info ? - part->bd_meta_info->uuid : ""); - if (bdev_is_partition(part)) - printk("\n"); - else if (dev->parent && dev->parent->driver) - printk(" driver: %s\n", - dev->parent->driver->name); - else - printk(" (driver?)\n"); - } - rcu_read_unlock(); - } - class_dev_iter_exit(&iter); -} - #ifdef CONFIG_PROC_FS /* iterator */ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) @@ -1379,35 +1316,6 @@ dev_t part_devt(struct gendisk *disk, u8 partno) return devt; } -dev_t blk_lookup_devt(const char *name, int partno) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - - if (strcmp(dev_name(dev), name)) - continue; - - if (partno < disk->minors) { - /* We need to return the right devno, even - * if the partition doesn't exist yet. - */ - devt = MKDEV(MAJOR(dev->devt), - MINOR(dev->devt) + partno); - } else { - devt = part_devt(disk, partno); - if (devt) - break; - } - } - class_dev_iter_exit(&iter); - return devt; -} - struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d682e233fd66..52718176d1b4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -838,7 +838,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, dev_t part_devt(struct gendisk *disk, u8 partno); void inc_diskseq(struct gendisk *disk); -dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); extern int blk_register_queue(struct gendisk *disk); From 26110d5afe8117d1b505fe735ac709bdf063f4da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:28 +0200 Subject: [PATCH 108/266] dm-snap: simplify the origin_dev == cow_dev check in snapshot_ctr Use the block_device acquired in dm_get_device for the check instead of doing an extra lookup. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20230531125535.676098-18-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-snap.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 9c49f53760d0..7832974b73eb 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1241,7 +1241,6 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int i; int r = -EINVAL; char *origin_path, *cow_path; - dev_t origin_dev, cow_dev; unsigned int args_used, num_flush_bios = 1; fmode_t origin_mode = FMODE_READ; @@ -1279,24 +1278,21 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot get origin device"; goto bad_origin; } - origin_dev = s->origin->bdev->bd_dev; cow_path = argv[0]; argv++; argc--; - cow_dev = dm_get_dev_t(cow_path); - if (cow_dev && cow_dev == origin_dev) { - ti->error = "COW device cannot be the same as origin device"; - r = -EINVAL; - goto bad_cow; - } - r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); if (r) { ti->error = "Cannot get COW device"; goto bad_cow; } + if (s->cow->bdev && s->cow->bdev == s->origin->bdev) { + ti->error = "COW device cannot be the same as origin device"; + r = -EINVAL; + goto bad_store; + } r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store); if (r) { From 49177377e910a8fd5cd1388c966d8fbb51075c3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:29 +0200 Subject: [PATCH 109/266] dm: open code dm_get_dev_t in dm_init_init dm_init_init is called from early boot code, and thus lookup_bdev will never succeed. Just open code that call to early_lookup_bdev instead. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20230531125535.676098-19-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-init.c b/drivers/md/dm-init.c index d369457dbed0..2a71bcdba92d 100644 --- a/drivers/md/dm-init.c +++ b/drivers/md/dm-init.c @@ -293,8 +293,10 @@ static int __init dm_init_init(void) for (i = 0; i < ARRAY_SIZE(waitfor); i++) { if (waitfor[i]) { + dev_t dev; + DMINFO("waiting for device %s ...", waitfor[i]); - while (!dm_get_dev_t(waitfor[i])) + while (early_lookup_bdev(waitfor[i], &dev)) fsleep(5000); } } From d4a28d7defe79006e59293a4b43d518ba8483fb0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:30 +0200 Subject: [PATCH 110/266] dm: remove dm_get_dev_t Open code dm_get_dev_t in the only remaining caller, and propagate the exact error code from lookup_bdev and early_lookup_bdev. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-20-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 22 +++++----------------- include/linux/device-mapper.h | 2 -- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 05aa16da43b0..1576b408768d 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -323,20 +323,6 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, return 0; } -/* - * Convert the path to a device - */ -dev_t dm_get_dev_t(const char *path) -{ - dev_t dev; - - if (lookup_bdev(path, &dev) && - early_lookup_bdev(path, &dev)) - return 0; - return dev; -} -EXPORT_SYMBOL_GPL(dm_get_dev_t); - /* * Add a device to the list, or just increment the usage count if * it's already present. @@ -359,9 +345,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, if (MAJOR(dev) != major || MINOR(dev) != minor) return -EOVERFLOW; } else { - dev = dm_get_dev_t(path); - if (!dev) - return -ENODEV; + r = lookup_bdev(path, &dev); + if (r) + r = early_lookup_bdev(path, &dev); + if (r) + return r; } if (dev == disk_devt(t->md->disk)) return -EINVAL; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a52d2b9a6846..c27b84002d83 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -170,8 +170,6 @@ struct dm_dev { char name[16]; }; -dev_t dm_get_dev_t(const char *path); - /* * Constructors should call these functions to ensure destination devices * are opened/closed correctly. From 7a126d5bf975f082281fb9b45d110cd49b7c3ee4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:31 +0200 Subject: [PATCH 111/266] dm: only call early_lookup_bdev from early boot context early_lookup_bdev is supposed to only be called from the early boot code, but dm_get_device calls it as a general fallback when lookup_bdev fails, which is problematic because early_lookup_bdev bypasses all normal path based permission checking, and might cause problems with certain container environments renaming devices. Switch to only call early_lookup_bdev when dm is built-in and the system state in not running yet. This means it is still available when tables are constructed by dm-init.c from the kernel command line, but not otherwise. Note that this strictly speaking changes the kernel ABI as the PARTUUID= and PARTLABEL= style syntax is now not available during a running systems. They never were intended for that, but this breaks things we'll have to figure out a way to make them available again. But if avoidable in any way I'd rather avoid that. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20230531125535.676098-21-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 1576b408768d..2fd5826bfce1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -326,8 +326,11 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, /* * Add a device to the list, or just increment the usage count if * it's already present. + * + * Note: the __ref annotation is because this function can call the __init + * marked early_lookup_bdev when called during early boot code from dm-init.c. */ -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, +int __ref dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, struct dm_dev **result) { int r; @@ -346,8 +349,10 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, return -EOVERFLOW; } else { r = lookup_bdev(path, &dev); - if (r) +#ifndef MODULE + if (r && system_state < SYSTEM_RUNNING) r = early_lookup_bdev(path, &dev); +#endif if (r) return r; } From 1e8c813b083c4122dfeaa5c3b11028331026e85d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:32 +0200 Subject: [PATCH 112/266] PM: hibernate: don't use early_lookup_bdev in resume_store resume_store is a sysfs attribute written during normal kernel runtime, and it should not use the early_lookup_bdev API that bypasses all normal path based permission checking, and might cause problems with certain container environments renaming devices. Switch to lookup_bdev, which does a normal path lookup instead, and fall back to trying to parse a numeric dev_t just like early_lookup_bdev did. Note that this strictly speaking changes the kernel ABI as the PARTUUID= and PARTLABEL= style syntax is now not available during a running systems. They never were intended for that, but this breaks things we'll have to figure out a way to make them available again. But if avoidable in any way I'd rather avoid that. Fixes: 421a5fa1a6cf ("PM / hibernate: use name_to_dev_t to parse resume") Signed-off-by: Christoph Hellwig Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20230531125535.676098-22-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/hibernate.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c52dedb9f7c8..7ae95ec72f99 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1178,7 +1178,23 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!name) return -ENOMEM; - error = early_lookup_bdev(name, &dev); + error = lookup_bdev(name, &dev); + if (error) { + unsigned maj, min, offset; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, + &dummy) == 3) { + dev = MKDEV(maj, min); + if (maj != MAJOR(dev) || min != MINOR(dev)) + error = -EINVAL; + } else { + dev = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + error = -EINVAL; + } + } kfree(name); if (error) return error; From b2baa57475e3a24bb9ad27bb9047ea3be94627f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:33 +0200 Subject: [PATCH 113/266] mtd: block2mtd: factor the early block device open logic into a helper Simplify add_device a bit by splitting out the cumbersome early boot logic into a separate helper. Signed-off-by: Christoph Hellwig Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20230531125535.676098-23-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mtd/devices/block2mtd.c | 61 ++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index a127cdde03b7..457774eef4d0 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -215,13 +215,42 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) kfree(dev); } +static struct block_device *mdtblock_early_get_bdev(const char *devname, + fmode_t mode, int timeout, struct block2mtd_dev *dev) +{ + struct block_device *bdev = ERR_PTR(-ENODEV); +#ifndef MODULE + int i; + + /* + * We might not have the root device mounted at this point. + * Try to resolve the device name by other means. + */ + for (i = 0; i <= timeout; i++) { + dev_t devt; + + if (i) + /* + * Calling wait_for_device_probe in the first loop + * was not enough, sleep for a bit in subsequent + * go-arounds. + */ + msleep(1000); + wait_for_device_probe(); + + if (!early_lookup_bdev(devname, &devt)) { + bdev = blkdev_get_by_dev(devt, mode, dev, NULL); + if (!IS_ERR(bdev)) + break; + } + } +#endif + return bdev; +} static struct block2mtd_dev *add_device(char *devname, int erase_size, char *label, int timeout) { -#ifndef MODULE - int i; -#endif const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; struct block_device *bdev; struct block2mtd_dev *dev; @@ -236,30 +265,8 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size, /* Get a handle on the device */ bdev = blkdev_get_by_path(devname, mode, dev, NULL); - -#ifndef MODULE - /* - * We might not have the root device mounted at this point. - * Try to resolve the device name by other means. - */ - for (i = 0; IS_ERR(bdev) && i <= timeout; i++) { - dev_t devt; - - if (i) - /* - * Calling wait_for_device_probe in the first loop - * was not enough, sleep for a bit in subsequent - * go-arounds. - */ - msleep(1000); - wait_for_device_probe(); - - if (early_lookup_bdev(devname, &devt)) - continue; - bdev = blkdev_get_by_dev(devt, mode, dev, NULL); - } -#endif - + if (IS_ERR(bdev)) + bdev = mdtblock_early_get_bdev(devname, mode, timeout, dev); if (IS_ERR(bdev)) { pr_err("error: cannot open device %s\n", devname); goto err_free_block2mtd; From 8d03187ee7328af8e18ef1782289e0b034e75485 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:34 +0200 Subject: [PATCH 114/266] mtd: block2mtd: don't call early_lookup_bdev after the system is running early_lookup_bdev is supposed to only be called from the early boot code, but mdtblock_early_get_bdev is called as a general fallback when lookup_bdev fails, which is problematic because early_lookup_bdev bypasses all normal path based permission checking, and might cause problems with certain container environments renaming devices. Switch to only call early_lookup_bdev when block2mtd is built-in and the system state in not running yet. Note that this strictly speaking changes the kernel ABI as the PARTUUID= and PARTLABEL= style syntax is now not available during a running systems. They never were intended for that, but this breaks things we'll have to figure out a way to make them available again. But if avoidable in any way I'd rather avoid that. Signed-off-by: Christoph Hellwig Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20230531125535.676098-24-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mtd/devices/block2mtd.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 457774eef4d0..218eb2af564a 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -215,13 +215,23 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) kfree(dev); } -static struct block_device *mdtblock_early_get_bdev(const char *devname, +/* + * This function is marked __ref because it calls the __init marked + * early_lookup_bdev when called from the early boot code. + */ +static struct block_device __ref *mdtblock_early_get_bdev(const char *devname, fmode_t mode, int timeout, struct block2mtd_dev *dev) { struct block_device *bdev = ERR_PTR(-ENODEV); #ifndef MODULE int i; + /* + * We can't use early_lookup_bdev from a running system. + */ + if (system_state >= SYSTEM_RUNNING) + return bdev; + /* * We might not have the root device mounted at this point. * Try to resolve the device name by other means. From 2577f53f42947d8ca01666e3444bb7307319ea38 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 31 May 2023 14:55:35 +0200 Subject: [PATCH 115/266] block: mark early_lookup_bdev as __init early_lookup_bdev is now only used during the early boot code as it should, so mark it __init to not waste run time memory on it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230531125535.676098-25-hch@lst.de Signed-off-by: Jens Axboe --- block/early-lookup.c | 19 +++++++++---------- include/linux/blkdev.h | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/block/early-lookup.c b/block/early-lookup.c index 6016e781b6a0..3ff0d2e4dcbf 100644 --- a/block/early-lookup.c +++ b/block/early-lookup.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Code for looking up block devices in the early boot code before mounting the - * root file system. Unfortunately currently also abused in a few other places. + * root file system. */ #include #include @@ -18,7 +18,7 @@ struct uuidcmp { * * Returns 1 if the device matches, and 0 otherwise. */ -static int match_dev_by_uuid(struct device *dev, const void *data) +static int __init match_dev_by_uuid(struct device *dev, const void *data) { struct block_device *bdev = dev_to_bdev(dev); const struct uuidcmp *cmp = data; @@ -42,7 +42,7 @@ static int match_dev_by_uuid(struct device *dev, const void *data) * * Returns the matching dev_t on success or 0 on failure. */ -static int devt_from_partuuid(const char *uuid_str, dev_t *devt) +static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt) { struct uuidcmp cmp; struct device *dev = NULL; @@ -98,7 +98,7 @@ static int devt_from_partuuid(const char *uuid_str, dev_t *devt) * * Returns 1 if the device matches, and 0 otherwise. */ -static int match_dev_by_label(struct device *dev, const void *data) +static int __init match_dev_by_label(struct device *dev, const void *data) { struct block_device *bdev = dev_to_bdev(dev); const char *label = data; @@ -108,7 +108,7 @@ static int match_dev_by_label(struct device *dev, const void *data) return 1; } -static int devt_from_partlabel(const char *label, dev_t *devt) +static int __init devt_from_partlabel(const char *label, dev_t *devt) { struct device *dev; @@ -120,7 +120,7 @@ static int devt_from_partlabel(const char *label, dev_t *devt) return 0; } -static dev_t blk_lookup_devt(const char *name, int partno) +static dev_t __init blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); struct class_dev_iter iter; @@ -149,7 +149,7 @@ static dev_t blk_lookup_devt(const char *name, int partno) return devt; } -static int devt_from_devname(const char *name, dev_t *devt) +static int __init devt_from_devname(const char *name, dev_t *devt) { int part; char s[32]; @@ -193,7 +193,7 @@ static int devt_from_devname(const char *name, dev_t *devt) return -EINVAL; } -static int devt_from_devnum(const char *name, dev_t *devt) +static int __init devt_from_devnum(const char *name, dev_t *devt) { unsigned maj, min, offset; char *p, dummy; @@ -240,7 +240,7 @@ static int devt_from_devnum(const char *name, dev_t *devt) * name contains slashes, the device name has them replaced with * bangs. */ -int early_lookup_bdev(const char *name, dev_t *devt) +int __init early_lookup_bdev(const char *name, dev_t *devt) { if (strncmp(name, "PARTUUID=", 9) == 0) return devt_from_partuuid(name + 9, devt); @@ -250,7 +250,6 @@ int early_lookup_bdev(const char *name, dev_t *devt) return devt_from_devname(name + 5, devt); return devt_from_devnum(name, devt); } -EXPORT_SYMBOL_GPL(early_lookup_bdev); static char __init *bdevt_str(dev_t devt, char *buf) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 52718176d1b4..f4c339d9dd03 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1500,7 +1500,7 @@ int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); void bdev_statx_dioalign(struct inode *inode, struct kstat *stat); void printk_all_partitions(void); -int early_lookup_bdev(const char *pathname, dev_t *dev); +int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else static inline void invalidate_bdev(struct block_device *bdev) { From 8d211554679d0b23702bd32ba04aeac0c1c4f660 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 27 May 2023 17:19:04 +0800 Subject: [PATCH 116/266] blk-iocost: use spin_lock_irqsave in adjust_inuse_and_calc_cost adjust_inuse_and_calc_cost() use spin_lock_irq() and IRQ will be enabled when unlock. DEADLOCK might happen if we have held other locks and disabled IRQ before invoking it. Fix it by using spin_lock_irqsave() instead, which can keep IRQ state consistent with before when unlock. ================================ WARNING: inconsistent lock state 5.10.0-02758-g8e5f91fd772f #26 Not tainted -------------------------------- inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. kworker/2:3/388 [HC0[0]:SC0[0]:HE0:SE1] takes: ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: spin_lock_irq ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: bfq_bio_merge+0x141/0x390 {IN-HARDIRQ-W} state was registered at: __lock_acquire+0x3d7/0x1070 lock_acquire+0x197/0x4a0 __raw_spin_lock_irqsave _raw_spin_lock_irqsave+0x3b/0x60 bfq_idle_slice_timer_body bfq_idle_slice_timer+0x53/0x1d0 __run_hrtimer+0x477/0xa70 __hrtimer_run_queues+0x1c6/0x2d0 hrtimer_interrupt+0x302/0x9e0 local_apic_timer_interrupt __sysvec_apic_timer_interrupt+0xfd/0x420 run_sysvec_on_irqstack_cond sysvec_apic_timer_interrupt+0x46/0xa0 asm_sysvec_apic_timer_interrupt+0x12/0x20 irq event stamp: 837522 hardirqs last enabled at (837521): [] __raw_spin_unlock_irqrestore hardirqs last enabled at (837521): [] _raw_spin_unlock_irqrestore+0x3d/0x40 hardirqs last disabled at (837522): [] __raw_spin_lock_irq hardirqs last disabled at (837522): [] _raw_spin_lock_irq+0x43/0x50 softirqs last enabled at (835852): [] __do_softirq+0x558/0x8ec softirqs last disabled at (835845): [] asm_call_irq_on_stack+0xf/0x20 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&bfqd->lock); lock(&bfqd->lock); *** DEADLOCK *** 3 locks held by kworker/2:3/388: #0: ffff888107af0f38 ((wq_completion)kthrotld){+.+.}-{0:0}, at: process_one_work+0x742/0x13f0 #1: ffff8881176bfdd8 ((work_completion)(&td->dispatch_work)){+.+.}-{0:0}, at: process_one_work+0x777/0x13f0 #2: ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: spin_lock_irq #2: ffff888118c00c28 (&bfqd->lock){?.-.}-{2:2}, at: bfq_bio_merge+0x141/0x390 stack backtrace: CPU: 2 PID: 388 Comm: kworker/2:3 Not tainted 5.10.0-02758-g8e5f91fd772f #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Workqueue: kthrotld blk_throtl_dispatch_work_fn Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x167 print_usage_bug valid_state mark_lock_irq.cold+0x32/0x3a mark_lock+0x693/0xbc0 mark_held_locks+0x9e/0xe0 __trace_hardirqs_on_caller lockdep_hardirqs_on_prepare.part.0+0x151/0x360 trace_hardirqs_on+0x5b/0x180 __raw_spin_unlock_irq _raw_spin_unlock_irq+0x24/0x40 spin_unlock_irq adjust_inuse_and_calc_cost+0x4fb/0x970 ioc_rqos_merge+0x277/0x740 __rq_qos_merge+0x62/0xb0 rq_qos_merge bio_attempt_back_merge+0x12c/0x4a0 blk_mq_sched_try_merge+0x1b6/0x4d0 bfq_bio_merge+0x24a/0x390 __blk_mq_sched_bio_merge+0xa6/0x460 blk_mq_sched_bio_merge blk_mq_submit_bio+0x2e7/0x1ee0 __submit_bio_noacct_mq+0x175/0x3b0 submit_bio_noacct+0x1fb/0x270 blk_throtl_dispatch_work_fn+0x1ef/0x2b0 process_one_work+0x83e/0x13f0 process_scheduled_works worker_thread+0x7e3/0xd80 kthread+0x353/0x470 ret_from_fork+0x1f/0x30 Fixes: b0853ab4a238 ("blk-iocost: revamp in-period donation snapbacks") Signed-off-by: Li Nan Acked-by: Tejun Heo Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20230527091904.3001833-1-linan666@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 285ced3467ab..6084a9519883 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2455,6 +2455,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, u32 hwi, adj_step; s64 margin; u64 cost, new_inuse; + unsigned long flags; current_hweight(iocg, NULL, &hwi); old_hwi = hwi; @@ -2473,11 +2474,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, iocg->inuse == iocg->active) return cost; - spin_lock_irq(&ioc->lock); + spin_lock_irqsave(&ioc->lock, flags); /* we own inuse only when @iocg is in the normal active state */ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); return cost; } @@ -2498,7 +2499,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, } while (time_after64(vtime + cost, now->vnow) && iocg->inuse != iocg->active); - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); TRACE_IOCG_PATH(inuse_adjust, iocg, now, old_inuse, iocg->inuse, old_hwi, hwi); From ddf63516d8d37528dc6834c7f19b55084e956068 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 28 Apr 2023 15:44:04 +0800 Subject: [PATCH 117/266] blk-ioprio: Introduce promote-to-rt policy Since commit a78418e6a04c ("block: Always initialize bio IO priority on submit"), bio->bi_ioprio will never be IOPRIO_CLASS_NONE when calling blkcg_set_ioprio(), so there will be no way to promote the io-priority of one cgroup to IOPRIO_CLASS_RT, because bi_ioprio will always be greater than or equals to IOPRIO_CLASS_RT. It seems possible to call blkcg_set_ioprio() first then try to initialize bi_ioprio later in bio_set_ioprio(), but this doesn't work for bio in which bi_ioprio is already initialized (e.g., direct-io), so introduce a new promote-to-rt policy to promote the iopriority of bio to IOPRIO_CLASS_RT if the ioprio is not already RT. For none-to-rt policy, although it doesn't work now, but considering that its purpose was also to override the io-priority to RT and allowing for a smoother transition, just keep it and treat it as an alias of the promote-to-rt policy. Acked-by: Tejun Heo Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jan Kara Signed-off-by: Hou Tao Reviewed-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20230428074404.280532-1-houtao@huaweicloud.com Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 42 ++++++++++++++----------- block/blk-ioprio.c | 23 ++++++++++++-- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index f67c0829350b..7544ce00e0cb 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2024,31 +2024,33 @@ that attribute: no-change Do not modify the I/O priority class. - none-to-rt - For requests that do not have an I/O priority class (NONE), - change the I/O priority class into RT. Do not modify - the I/O priority class of other requests. + promote-to-rt + For requests that have a non-RT I/O priority class, change it into RT. + Also change the priority level of these requests to 4. Do not modify + the I/O priority of requests that have priority class RT. restrict-to-be For requests that do not have an I/O priority class or that have I/O - priority class RT, change it into BE. Do not modify the I/O priority - class of requests that have priority class IDLE. + priority class RT, change it into BE. Also change the priority level + of these requests to 0. Do not modify the I/O priority class of + requests that have priority class IDLE. idle Change the I/O priority class of all requests into IDLE, the lowest I/O priority class. + none-to-rt + Deprecated. Just an alias for promote-to-rt. + The following numerical values are associated with the I/O priority policies: -+-------------+---+ -| no-change | 0 | -+-------------+---+ -| none-to-rt | 1 | -+-------------+---+ -| rt-to-be | 2 | -+-------------+---+ -| all-to-idle | 3 | -+-------------+---+ ++----------------+---+ +| no-change | 0 | ++----------------+---+ +| rt-to-be | 2 | ++----------------+---+ +| all-to-idle | 3 | ++----------------+---+ The numerical value that corresponds to each I/O priority class is as follows: @@ -2064,9 +2066,13 @@ The numerical value that corresponds to each I/O priority class is as follows: The algorithm to set the I/O priority class for a request is as follows: -- Translate the I/O priority class policy into a number. -- Change the request I/O priority class into the maximum of the I/O priority - class policy number and the numerical I/O priority class. +- If I/O priority class policy is promote-to-rt, change the request I/O + priority class to IOPRIO_CLASS_RT and change the request I/O priority + level to 4. +- If I/O priorityt class is not promote-to-rt, translate the I/O priority + class policy into a number, then change the request I/O priority class + into the maximum of the I/O priority class policy number and the numerical + I/O priority class. PID --- diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 055529b9b92b..4051fada01f1 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -23,25 +23,28 @@ /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. - * @POLICY_NONE_TO_RT: modify IOPRIO_CLASS_NONE into IOPRIO_CLASS_RT. + * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. + * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also . */ enum prio_policy { POLICY_NO_CHANGE = 0, - POLICY_NONE_TO_RT = 1, + POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, + POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", - [POLICY_NONE_TO_RT] = "none-to-rt", + [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", + [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; @@ -189,6 +192,20 @@ void blkcg_set_ioprio(struct bio *bio) if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; + if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || + blkcg->prio_policy == POLICY_NONE_TO_RT) { + /* + * For RT threads, the default priority level is 4 because + * task_nice is 0. By promoting non-RT io-priority to RT-class + * and default level 4, those requests that are already + * RT-class but need a higher io-priority can use ioprio_set() + * to achieve this. + */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); + return; + } + /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects From f12bc113ce904777fd6ca003b473b427782b3dde Mon Sep 17 00:00:00 2001 From: Zhong Jinghua Date: Mon, 5 Jun 2023 20:21:59 +0800 Subject: [PATCH 118/266] nbd: Add the maximum limit of allocated index in nbd_dev_add If the index allocated by idr_alloc greater than MINORMASK >> part_shift, the device number will overflow, resulting in failure to create a block device. Fix it by imiting the size of the max allocation. Signed-off-by: Zhong Jinghua Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230605122159.2134384-1-zhongjinghua@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 65ecde3e2a5b..6457a094abcc 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1776,7 +1776,8 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) if (err == -ENOSPC) err = -EEXIST; } else { - err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL); + err = idr_alloc(&nbd_index_idr, nbd, 0, + (MINORMASK >> part_shift) + 1, GFP_KERNEL); if (err >= 0) index = err; } From a7cfa0af0c88353b4eb59db5a2a0fbe35329b3f9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 6 Jun 2023 09:14:38 +0800 Subject: [PATCH 119/266] blk-ioc: fix recursive spin_lock/unlock_irq() in ioc_clear_queue() Recursive spin_lock/unlock_irq() is not safe, because spin_unlock_irq() will enable irq unconditionally: spin_lock_irq queue_lock -> disable irq spin_lock_irq ioc->lock spin_unlock_irq ioc->lock -> enable irq /* * AA dead lock will be triggered if current context is preempted by irq, * and irq try to hold queue_lock again. */ spin_unlock_irq queue_lock Fix this problem by using spin_lock/unlock() directly for 'ioc->lock'. Fixes: 5a0ac57c48aa ("blk-ioc: protect ioc_destroy_icq() by 'queue_lock'") Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230606011438.3743440-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-ioc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d5db92e62c43..25dd4db11121 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -179,9 +179,9 @@ void ioc_clear_queue(struct request_queue *q) * Other context won't hold ioc lock to wait for queue_lock, see * details in ioc_release_fn(). */ - spin_lock_irq(&icq->ioc->lock); + spin_lock(&icq->ioc->lock); ioc_destroy_icq(icq); - spin_unlock_irq(&icq->ioc->lock); + spin_unlock(&icq->ioc->lock); } spin_unlock_irq(&q->queue_lock); } From 3d2af77e31ade05ff7ccc3658c3635ec1bea0979 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 6 Jun 2023 14:07:24 -0400 Subject: [PATCH 120/266] blk-cgroup: Reinit blkg_iostat_set after clearing in blkcg_reset_stats() When blkg_alloc() is called to allocate a blkcg_gq structure with the associated blkg_iostat_set's, there are 2 fields within blkg_iostat_set that requires proper initialization - blkg & sync. The former field was introduced by commit 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()") while the later one was introduced by commit f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat"). Unfortunately those fields in the blkg_iostat_set's are not properly re-initialized when they are cleared in v1's blkcg_reset_stats(). This can lead to a kernel panic due to NULL pointer access of the blkg pointer. The missing initialization of sync is less problematic and can be a problem in a debug kernel due to missing lockdep initialization. Fix these problems by re-initializing them after memory clearing. Fixes: 3b8cc6298724 ("blk-cgroup: Optimize blkcg_rstat_flush()") Fixes: f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat") Signed-off-by: Waiman Long Reviewed-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230606180724.2455066-1-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1c6716f51fff..cab33bd4f252 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -610,8 +610,13 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkg_iostat_set *bis = per_cpu_ptr(blkg->iostat_cpu, cpu); memset(bis, 0, sizeof(*bis)); + + /* Re-initialize the cleared blkg_iostat_set */ + u64_stats_init(&bis->sync); + bis->blkg = blkg; } memset(&blkg->iostat, 0, sizeof(blkg->iostat)); + u64_stats_init(&blkg->iostat.sync); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; From 1341c7d2ccf42ed91aea80b8579d35bc1ea381e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 7 Jun 2023 15:57:46 +0200 Subject: [PATCH 121/266] block: fix rootwait= Failures to look up the gendisk must return -ENODEV so that rootwait retries the lookup instead of -EINVAL which exits early. Fixes: cf056a431215 ("init: improve the name_to_dev_t interface") Reported-by: Fabio Estevam Signed-off-by: Christoph Hellwig Tested-by: Fabio Estevam Link: https://lore.kernel.org/r/20230607135746.92995-1-hch@lst.de Signed-off-by: Jens Axboe --- block/early-lookup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/early-lookup.c b/block/early-lookup.c index 3ff0d2e4dcbf..48ea3e982419 100644 --- a/block/early-lookup.c +++ b/block/early-lookup.c @@ -181,7 +181,7 @@ static int __init devt_from_devname(const char *name, dev_t *devt) *p = '\0'; *devt = blk_lookup_devt(s, part); if (*devt) - return 0; + return -ENODEV; /* try disk name without p */ if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') @@ -190,7 +190,7 @@ static int __init devt_from_devname(const char *name, dev_t *devt) *devt = blk_lookup_devt(s, part); if (*devt) return 0; - return -EINVAL; + return -ENODEV; } static int __init devt_from_devnum(const char *name, dev_t *devt) From 3a41db531e5124adaa3a9ab9ca0c724aee85b10c Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:41 +0200 Subject: [PATCH 122/266] pktcdvd: Get rid of custom printing macros We may use traditional dev_*() macros instead of custom ones provided by the driver. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-2-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 247 ++++++++++++++++++----------------- include/linux/pktcdvd.h | 1 - include/uapi/linux/pktcdvd.h | 1 + 3 files changed, 130 insertions(+), 119 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 377f8b345352..a327cce67768 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -72,22 +72,6 @@ #define DRIVER_NAME "pktcdvd" -#define pkt_err(pd, fmt, ...) \ - pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) -#define pkt_notice(pd, fmt, ...) \ - pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) -#define pkt_info(pd, fmt, ...) \ - pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) - -#define pkt_dbg(level, pd, fmt, ...) \ -do { \ - if (level == 2 && PACKET_DEBUG >= 2) \ - pr_notice("%s: %s():" fmt, \ - pd->name, __func__, ##__VA_ARGS__); \ - else if (level == 1 && PACKET_DEBUG >= 1) \ - pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ -} while (0) - #define MAX_SPEED 0xffff static DEFINE_MUTEX(pktcdvd_mutex); @@ -319,7 +303,7 @@ static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) if (class_is_registered(&class_pktcdvd)) { pd->dev = device_create_with_groups(&class_pktcdvd, NULL, MKDEV(0, 0), pd, pkt_groups, - "%s", pd->name); + "%s", pd->disk->disk_name); if (IS_ERR(pd->dev)) pd->dev = NULL; } @@ -350,7 +334,7 @@ static ssize_t device_map_show(const struct class *c, const struct class_attribu if (!pd) continue; n += sprintf(data+n, "%s %u:%u %u:%u\n", - pd->name, + pd->disk->disk_name, MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), MAJOR(pd->bdev->bd_dev), MINOR(pd->bdev->bd_dev)); @@ -450,7 +434,7 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) { if (!pkt_debugfs_root) return; - pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); + pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); if (!pd->dfs_d_root) return; @@ -484,9 +468,11 @@ static void pkt_debugfs_cleanup(void) static void pkt_bio_finished(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); + BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - pkt_dbg(2, pd, "queue empty\n"); + dev_dbg(ddev, "queue empty\n"); atomic_set(&pd->iosched.attention, 1); wake_up(&pd->wqueue); } @@ -717,15 +703,16 @@ static const char *sense_key_string(__u8 index) static void pkt_dump_sense(struct pktcdvd_device *pd, struct packet_command *cgc) { + struct device *ddev = disk_to_dev(pd->disk); struct scsi_sense_hdr *sshdr = cgc->sshdr; if (sshdr) - pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", + dev_err(ddev, "%*ph - sense %02x.%02x.%02x (%s)\n", CDROM_PACKET_SIZE, cgc->cmd, sshdr->sense_key, sshdr->asc, sshdr->ascq, sense_key_string(sshdr->sense_key)); else - pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); + dev_err(ddev, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } /* @@ -809,6 +796,7 @@ static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) */ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); if (atomic_read(&pd->iosched.attention) == 0) return; @@ -836,7 +824,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, pd, "write, waiting\n"); + dev_dbg(ddev, "write, waiting\n"); break; } pkt_flush_cache(pd); @@ -845,7 +833,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } else { if (!reads_queued && writes_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, pd, "read, waiting\n"); + dev_dbg(ddev, "read, waiting\n"); break; } pd->iosched.writing = 1; @@ -892,6 +880,8 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) */ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) { + struct device *ddev = disk_to_dev(pd->disk); + if ((pd->settings.size << 9) / CD_FRAMESIZE <= queue_max_segments(q)) { /* @@ -908,7 +898,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else { - pkt_err(pd, "cdrom max_phys_segments too small\n"); + dev_err(ddev, "cdrom max_phys_segments too small\n"); return -EIO; } } @@ -919,7 +909,7 @@ static void pkt_end_io_read(struct bio *bio) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", + dev_dbg(disk_to_dev(pd->disk), "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); @@ -939,7 +929,7 @@ static void pkt_end_io_packet_write(struct bio *bio) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status); + dev_dbg(disk_to_dev(pd->disk), "id=%d, err=%d\n", pkt->id, bio->bi_status); pd->stats.pkt_ended++; @@ -955,6 +945,7 @@ static void pkt_end_io_packet_write(struct bio *bio) */ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) { + struct device *ddev = disk_to_dev(pd->disk); int frames_read = 0; struct bio *bio; int f; @@ -983,8 +974,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - pkt_dbg(2, pd, "zone %llx cached\n", - (unsigned long long)pkt->sector); + dev_dbg(ddev, "zone %llx cached\n", (unsigned long long)pkt->sector); goto out_account; } @@ -1005,8 +995,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", - f, pkt->pages[p], offset); + dev_dbg(ddev, "Adding frame %d, page:%p offs:%d\n", f, + pkt->pages[p], offset); if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) BUG(); @@ -1016,8 +1006,8 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - pkt_dbg(2, pd, "need %d frames for zone %llx\n", - frames_read, (unsigned long long)pkt->sector); + dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, + (unsigned long long)pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); } @@ -1051,17 +1041,18 @@ static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *p } } -static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) +static inline void pkt_set_state(struct device *ddev, struct packet_data *pkt, + enum packet_data_state state) { -#if PACKET_DEBUG > 1 static const char *state_name[] = { "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" }; enum packet_data_state old_state = pkt->state; - pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", + + dev_dbg(ddev, "pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, state_name[old_state], state_name[state]); -#endif + pkt->state = state; } @@ -1071,6 +1062,7 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state */ static int pkt_handle_queue(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_data *pkt, *p; struct bio *bio = NULL; sector_t zone = 0; /* Suppress gcc warning */ @@ -1080,7 +1072,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) atomic_set(&pd->scan_queue, 0); if (list_empty(&pd->cdrw.pkt_free_list)) { - pkt_dbg(2, pd, "no pkt\n"); + dev_dbg(ddev, "no pkt\n"); return 0; } @@ -1117,7 +1109,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) } spin_unlock(&pd->lock); if (!bio) { - pkt_dbg(2, pd, "no bio\n"); + dev_dbg(ddev, "no bio\n"); return 0; } @@ -1133,12 +1125,13 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) * to this packet. */ spin_lock(&pd->lock); - pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); + dev_dbg(ddev, "looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { + sector_t tmp = get_zone(node->bio->bi_iter.bi_sector, pd); + bio = node->bio; - pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long) - get_zone(bio->bi_iter.bi_sector, pd)); - if (get_zone(bio->bi_iter.bi_sector, pd) != zone) + dev_dbg(ddev, "found zone=%llx\n", (unsigned long long)tmp); + if (tmp != zone) break; pkt_rbtree_erase(pd, node); spin_lock(&pkt->lock); @@ -1157,7 +1150,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) spin_unlock(&pd->lock); pkt->sleep_time = max(PACKET_WAIT_TIME, 1); - pkt_set_state(pkt, PACKET_WAITING_STATE); + pkt_set_state(ddev, pkt, PACKET_WAITING_STATE); atomic_set(&pkt->run_sm, 1); spin_lock(&pd->cdrw.active_list_lock); @@ -1209,6 +1202,7 @@ static void bio_list_copy_data(struct bio *dst, struct bio *src) */ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) { + struct device *ddev = disk_to_dev(pd->disk); int f; bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames, @@ -1225,7 +1219,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset)) BUG(); } - pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); + dev_dbg(ddev, "vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. @@ -1233,11 +1227,11 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) spin_lock(&pkt->lock); bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head); - pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); + pkt_set_state(ddev, pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", - pkt->write_size, (unsigned long long)pkt->sector); + dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, + (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) pkt->cache_valid = 1; @@ -1265,7 +1259,9 @@ static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status) static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) { - pkt_dbg(2, pd, "pkt %d\n", pkt->id); + struct device *ddev = disk_to_dev(pd->disk); + + dev_dbg(ddev, "pkt %d\n", pkt->id); for (;;) { switch (pkt->state) { @@ -1275,7 +1271,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data pkt->sleep_time = 0; pkt_gather_data(pd, pkt); - pkt_set_state(pkt, PACKET_READ_WAIT_STATE); + pkt_set_state(ddev, pkt, PACKET_READ_WAIT_STATE); break; case PACKET_READ_WAIT_STATE: @@ -1283,7 +1279,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data return; if (atomic_read(&pkt->io_errors) > 0) { - pkt_set_state(pkt, PACKET_RECOVERY_STATE); + pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); } else { pkt_start_write(pd, pkt); } @@ -1294,15 +1290,15 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data return; if (!pkt->w_bio->bi_status) { - pkt_set_state(pkt, PACKET_FINISHED_STATE); + pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); } else { - pkt_set_state(pkt, PACKET_RECOVERY_STATE); + pkt_set_state(ddev, pkt, PACKET_RECOVERY_STATE); } break; case PACKET_RECOVERY_STATE: - pkt_dbg(2, pd, "No recovery possible\n"); - pkt_set_state(pkt, PACKET_FINISHED_STATE); + dev_dbg(ddev, "No recovery possible\n"); + pkt_set_state(ddev, pkt, PACKET_FINISHED_STATE); break; case PACKET_FINISHED_STATE: @@ -1318,6 +1314,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data static void pkt_handle_packets(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_data *pkt, *next; /* @@ -1338,7 +1335,7 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) if (pkt->state == PACKET_FINISHED_STATE) { list_del(&pkt->list); pkt_put_packet_data(pd, pkt); - pkt_set_state(pkt, PACKET_IDLE_STATE); + pkt_set_state(ddev, pkt, PACKET_IDLE_STATE); atomic_set(&pd->scan_queue, 1); } } @@ -1367,7 +1364,9 @@ static void pkt_count_states(struct pktcdvd_device *pd, int *states) static int kcdrwd(void *foobar) { struct pktcdvd_device *pd = foobar; + struct device *ddev = disk_to_dev(pd->disk); struct packet_data *pkt; + int states[PACKET_NUM_STATES]; long min_sleep_time, residue; set_user_nice(current, MIN_NICE); @@ -1398,13 +1397,9 @@ static int kcdrwd(void *foobar) goto work_to_do; /* Otherwise, go to sleep */ - if (PACKET_DEBUG > 1) { - int states[PACKET_NUM_STATES]; - pkt_count_states(pd, states); - pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], - states[3], states[4], states[5]); - } + pkt_count_states(pd, states); + dev_dbg(ddev, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], states[4], states[5]); min_sleep_time = MAX_SCHEDULE_TIMEOUT; list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { @@ -1412,9 +1407,9 @@ static int kcdrwd(void *foobar) min_sleep_time = pkt->sleep_time; } - pkt_dbg(2, pd, "sleeping\n"); + dev_dbg(ddev, "sleeping\n"); residue = schedule_timeout(min_sleep_time); - pkt_dbg(2, pd, "wake up\n"); + dev_dbg(ddev, "wake up\n"); /* make swsusp happy with our thread */ try_to_freeze(); @@ -1462,7 +1457,7 @@ static int kcdrwd(void *foobar) static void pkt_print_settings(struct pktcdvd_device *pd) { - pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", + dev_info(disk_to_dev(pd->disk), "%s packets, %u blocks, Mode-%c disc\n", pd->settings.fp ? "Fixed" : "Variable", pd->settings.size >> 2, pd->settings.block_mode == 8 ? '1' : '2'); @@ -1590,6 +1585,7 @@ static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, */ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; write_param_page *wp; @@ -1656,7 +1652,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) /* * paranoia */ - pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); + dev_err(ddev, "write mode wrong %d\n", wp->data_block_type); return 1; } wp->packet_size = cpu_to_be32(pd->settings.size >> 2); @@ -1677,6 +1673,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) */ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) { + struct device *ddev = disk_to_dev(pd->disk); + switch (pd->mmc3_profile) { case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ @@ -1701,7 +1699,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) if (ti->rt == 1 && ti->blank == 0) return 1; - pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + dev_err(ddev, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); return 0; } @@ -1710,6 +1708,8 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) */ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) { + struct device *ddev = disk_to_dev(pd->disk); + switch (pd->mmc3_profile) { case 0x0a: /* CD-RW */ case 0xffff: /* MMC3 not supported */ @@ -1719,8 +1719,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) case 0x12: /* DVD-RAM */ return 1; default: - pkt_dbg(2, pd, "Wrong disc profile (%x)\n", - pd->mmc3_profile); + dev_dbg(ddev, "Wrong disc profile (%x)\n", pd->mmc3_profile); return 0; } @@ -1729,22 +1728,22 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) * but i'm not sure, should we leave this to user apps? probably. */ if (di->disc_type == 0xff) { - pkt_notice(pd, "unknown disc - no track?\n"); + dev_notice(ddev, "unknown disc - no track?\n"); return 0; } if (di->disc_type != 0x20 && di->disc_type != 0) { - pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); + dev_err(ddev, "wrong disc type (%x)\n", di->disc_type); return 0; } if (di->erasable == 0) { - pkt_notice(pd, "disc not erasable\n"); + dev_err(ddev, "disc not erasable\n"); return 0; } if (di->border_status == PACKET_SESSION_RESERVED) { - pkt_err(pd, "can't write to last track (reserved)\n"); + dev_err(ddev, "can't write to last track (reserved)\n"); return 0; } @@ -1753,6 +1752,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; unsigned char buf[12]; disc_information di; @@ -1770,7 +1770,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) ret = pkt_get_disc_info(pd, &di); if (ret) { - pkt_err(pd, "failed get_disc\n"); + dev_err(ddev, "failed get_disc\n"); return ret; } @@ -1782,12 +1782,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ ret = pkt_get_track_info(pd, track, 1, &ti); if (ret) { - pkt_err(pd, "failed get_track\n"); + dev_err(ddev, "failed get_track\n"); return ret; } if (!pkt_writable_track(pd, &ti)) { - pkt_err(pd, "can't write to this track\n"); + dev_err(ddev, "can't write to this track\n"); return -EROFS; } @@ -1797,11 +1797,11 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { - pkt_notice(pd, "detected zero packet size!\n"); + dev_notice(ddev, "detected zero packet size!\n"); return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { - pkt_err(pd, "packet size is too big\n"); + dev_err(ddev, "packet size is too big\n"); return -EROFS; } pd->settings.fp = ti.fp; @@ -1843,7 +1843,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.block_mode = PACKET_BLOCK_MODE2; break; default: - pkt_err(pd, "unknown data mode\n"); + dev_err(ddev, "unknown data mode\n"); return -EROFS; } return 0; @@ -1854,6 +1854,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; unsigned char buf[64]; @@ -1883,10 +1884,10 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); if (ret) { - pkt_err(pd, "write caching control failed\n"); + dev_err(ddev, "write caching control failed\n"); pkt_dump_sense(pd, &cgc); } else if (!ret && set) - pkt_notice(pd, "enabled write caching\n"); + dev_notice(ddev, "enabled write caching\n"); return ret; } @@ -1967,6 +1968,7 @@ static char us_clv_to_speed[16] = { static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, unsigned *speed) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; unsigned char buf[64]; @@ -2001,11 +2003,11 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, } if (!(buf[6] & 0x40)) { - pkt_notice(pd, "disc type is not CD-RW\n"); + dev_notice(ddev, "disc type is not CD-RW\n"); return 1; } if (!(buf[6] & 0x4)) { - pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); + dev_notice(ddev, "A1 values on media are not valid, maybe not CDRW?\n"); return 1; } @@ -2025,25 +2027,26 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, *speed = us_clv_to_speed[sp]; break; default: - pkt_notice(pd, "unknown disc sub-type %d\n", st); + dev_notice(ddev, "unknown disc sub-type %d\n", st); return 1; } if (*speed) { - pkt_info(pd, "maximum media speed: %d\n", *speed); + dev_info(ddev, "maximum media speed: %d\n", *speed); return 0; } else { - pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); + dev_notice(ddev, "unknown speed %d for sub-type %d\n", sp, st); return 1; } } static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); struct packet_command cgc; struct scsi_sense_hdr sshdr; int ret; - pkt_dbg(2, pd, "Performing OPC\n"); + dev_dbg(ddev, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sshdr = &sshdr; @@ -2058,18 +2061,19 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) static int pkt_open_write(struct pktcdvd_device *pd) { + struct device *ddev = disk_to_dev(pd->disk); int ret; unsigned int write_speed, media_write_speed, read_speed; ret = pkt_probe_settings(pd); if (ret) { - pkt_dbg(2, pd, "failed probe\n"); + dev_dbg(ddev, "failed probe\n"); return ret; } ret = pkt_set_write_settings(pd); if (ret) { - pkt_dbg(1, pd, "failed saving write settings\n"); + dev_notice(ddev, "failed saving write settings\n"); return -EIO; } @@ -2082,30 +2086,29 @@ static int pkt_open_write(struct pktcdvd_device *pd) case 0x13: /* DVD-RW */ case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ - pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); + dev_notice(ddev, "write speed %ukB/s\n", write_speed); break; default: ret = pkt_media_speed(pd, &media_write_speed); if (ret) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); - pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); + dev_notice(ddev, "write speed %ux\n", write_speed / 176); break; } read_speed = write_speed; ret = pkt_set_speed(pd, write_speed, read_speed); if (ret) { - pkt_dbg(1, pd, "couldn't set write speed\n"); + dev_notice(ddev, "couldn't set write speed\n"); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; ret = pkt_perform_opc(pd); - if (ret) { - pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); - } + if (ret) + dev_notice(ddev, "Optimum Power Calibration failed\n"); return 0; } @@ -2115,6 +2118,7 @@ static int pkt_open_write(struct pktcdvd_device *pd) */ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) { + struct device *ddev = disk_to_dev(pd->disk); int ret; long lba; struct request_queue *q; @@ -2134,7 +2138,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) ret = pkt_get_last_written(pd, &lba); if (ret) { - pkt_err(pd, "pkt_get_last_written failed\n"); + dev_err(ddev, "pkt_get_last_written failed\n"); goto out_putdev; } @@ -2163,11 +2167,11 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - pkt_err(pd, "not enough memory for buffers\n"); + dev_err(ddev, "not enough memory for buffers\n"); ret = -ENOMEM; goto out_putdev; } - pkt_info(pd, "%lukB available on disc\n", lba << 1); + dev_info(ddev, "%lukB available on disc\n", lba << 1); } return 0; @@ -2184,8 +2188,10 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) */ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) { + struct device *ddev = disk_to_dev(pd->disk); + if (flush && pkt_flush_cache(pd)) - pkt_dbg(1, pd, "not flushing cache\n"); + dev_notice(ddev, "not flushing cache\n"); pkt_lock_door(pd, 0); @@ -2386,13 +2392,14 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) static void pkt_submit_bio(struct bio *bio) { struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; + struct device *ddev = disk_to_dev(pd->disk); struct bio *split; bio = bio_split_to_limits(bio); if (!bio) return; - pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", + dev_dbg(ddev, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_iter.bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2405,13 +2412,13 @@ static void pkt_submit_bio(struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - pkt_notice(pd, "WRITE for ro device (%llu)\n", + dev_notice(ddev, "WRITE for ro device (%llu)\n", (unsigned long long)bio->bi_iter.bi_sector); goto end_io; } if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { - pkt_err(pd, "wrong bio size\n"); + dev_err(ddev, "wrong bio size\n"); goto end_io; } @@ -2453,7 +2460,7 @@ static int pkt_seq_show(struct seq_file *m, void *p) char *msg; int states[PACKET_NUM_STATES]; - seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev); + seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, pd->bdev); seq_printf(m, "\nSettings:\n"); seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); @@ -2509,12 +2516,13 @@ static int pkt_seq_show(struct seq_file *m, void *p) static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { + struct device *ddev = disk_to_dev(pd->disk); int i; struct block_device *bdev; struct scsi_device *sdev; if (pd->pkt_dev == dev) { - pkt_err(pd, "recursive setup not allowed\n"); + dev_err(ddev, "recursive setup not allowed\n"); return -EBUSY; } for (i = 0; i < MAX_WRITERS; i++) { @@ -2522,11 +2530,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - pkt_err(pd, "%pg already setup\n", pd2->bdev); + dev_err(ddev, "%pg already setup\n", pd2->bdev); return -EBUSY; } if (pd2->pkt_dev == dev) { - pkt_err(pd, "can't chain pktcdvd devices\n"); + dev_err(ddev, "can't chain pktcdvd devices\n"); return -EBUSY; } } @@ -2550,14 +2558,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) pkt_init_queue(pd); atomic_set(&pd->cdrw.pending_bios, 0); - pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); + pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); if (IS_ERR(pd->cdrw.thread)) { - pkt_err(pd, "can't start kernel thread\n"); + dev_err(ddev, "can't start kernel thread\n"); goto out_mem; } - proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd); - pkt_dbg(1, pd, "writer mapped to %pg\n", bdev); + proc_create_single_data(pd->disk->disk_name, 0, pkt_proc, pkt_seq_show, pd); + dev_notice(ddev, "writer mapped to %pg\n", bdev); return 0; out_mem: @@ -2570,10 +2578,10 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct pktcdvd_device *pd = bdev->bd_disk->private_data; + struct device *ddev = disk_to_dev(pd->disk); int ret; - pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", - cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + dev_dbg(ddev, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); mutex_lock(&pktcdvd_mutex); switch (cmd) { @@ -2599,7 +2607,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); break; default: - pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); + dev_dbg(ddev, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; } mutex_unlock(&pktcdvd_mutex); @@ -2677,7 +2685,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) spin_lock_init(&pd->iosched.lock); bio_list_init(&pd->iosched.read_queue); bio_list_init(&pd->iosched.write_queue); - sprintf(pd->name, DRIVER_NAME"%d", idx); init_waitqueue_head(&pd->wqueue); pd->bio_queue = RB_ROOT; @@ -2694,7 +2701,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->minors = 1; disk->fops = &pktcdvd_ops; disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART; - strcpy(disk->disk_name, pd->name); + snprintf(disk->disk_name, sizeof(disk->disk_name), DRIVER_NAME"%d", idx); disk->private_data = pd; pd->pkt_dev = MKDEV(pktdev_major, idx); @@ -2736,6 +2743,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) static int pkt_remove_dev(dev_t pkt_dev) { struct pktcdvd_device *pd; + struct device *ddev; int idx; int ret = 0; @@ -2756,6 +2764,9 @@ static int pkt_remove_dev(dev_t pkt_dev) ret = -EBUSY; goto out; } + + ddev = disk_to_dev(pd->disk); + if (!IS_ERR(pd->cdrw.thread)) kthread_stop(pd->cdrw.thread); @@ -2766,8 +2777,8 @@ static int pkt_remove_dev(dev_t pkt_dev) blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); - remove_proc_entry(pd->name, pkt_proc); - pkt_dbg(1, pd, "writer unmapped\n"); + remove_proc_entry(pd->disk->disk_name, pkt_proc); + dev_notice(ddev, "writer unmapped\n"); del_gendisk(pd->disk); put_disk(pd->disk); diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h index f9c5ac80d59b..80cb00db42a4 100644 --- a/include/linux/pktcdvd.h +++ b/include/linux/pktcdvd.h @@ -156,7 +156,6 @@ struct pktcdvd_device { struct block_device *bdev; /* dev attached */ dev_t pkt_dev; /* our dev */ - char name[20]; struct packet_settings settings; struct packet_stats stats; int refcnt; /* Open count */ diff --git a/include/uapi/linux/pktcdvd.h b/include/uapi/linux/pktcdvd.h index 6a5552dfd6af..987a3022dc5f 100644 --- a/include/uapi/linux/pktcdvd.h +++ b/include/uapi/linux/pktcdvd.h @@ -16,6 +16,7 @@ #include /* + * UNUSED: * 1 for normal debug messages, 2 is very verbose. 0 to turn it off. */ #define PACKET_DEBUG 1 From 1a0ddd56e545b743af510b5a1b8dbdfe7d35cd3b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:42 +0200 Subject: [PATCH 123/266] pktcdvd: replace sscanf() by kstrtoul() The checkpatch.pl warns: "Prefer kstrto to single variable sscanf". Fix the code accordingly. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230310164549.22133-3-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index a327cce67768..488d03dc5152 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -237,15 +237,16 @@ static ssize_t congestion_off_store(struct device *dev, const char *buf, size_t len) { struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val; + int val, ret; - if (sscanf(buf, "%d", &val) == 1) { - spin_lock(&pd->lock); - pd->write_congestion_off = val; - init_write_congestion_marks(&pd->write_congestion_off, - &pd->write_congestion_on); - spin_unlock(&pd->lock); - } + ret = kstrtoint(buf, 10, &val); + if (ret) + return ret; + + spin_lock(&pd->lock); + pd->write_congestion_off = val; + init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); + spin_unlock(&pd->lock); return len; } static DEVICE_ATTR_RW(congestion_off); @@ -267,15 +268,16 @@ static ssize_t congestion_on_store(struct device *dev, const char *buf, size_t len) { struct pktcdvd_device *pd = dev_get_drvdata(dev); - int val; + int val, ret; - if (sscanf(buf, "%d", &val) == 1) { - spin_lock(&pd->lock); - pd->write_congestion_on = val; - init_write_congestion_marks(&pd->write_congestion_off, - &pd->write_congestion_on); - spin_unlock(&pd->lock); - } + ret = kstrtoint(buf, 10, &val); + if (ret) + return ret; + + spin_lock(&pd->lock); + pd->write_congestion_on = val; + init_write_congestion_marks(&pd->write_congestion_off, &pd->write_congestion_on); + spin_unlock(&pd->lock); return len; } static DEVICE_ATTR_RW(congestion_on); From 3bb5746c26cdfcc354af4867c02f1e0ec1131a62 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:43 +0200 Subject: [PATCH 124/266] pktcdvd: use sysfs_emit() to instead of scnprintf() Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-4-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 488d03dc5152..d90913fa2ac5 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -335,7 +335,7 @@ static ssize_t device_map_show(const struct class *c, const struct class_attribu struct pktcdvd_device *pd = pkt_devs[idx]; if (!pd) continue; - n += sprintf(data+n, "%s %u:%u %u:%u\n", + n += sysfs_emit_at(data, n, "%s %u:%u %u:%u\n", pd->disk->disk_name, MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), MAJOR(pd->bdev->bd_dev), From f023faaa988671f0f1d1a965dc5d7e61d0f5e65e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:44 +0200 Subject: [PATCH 125/266] pktcdvd: Get rid of pkt_seq_show() forward declaration The code can be neater without forward declarations. Get rid of pkt_seq_show() forward declaration. This will also allow futher cleanups to be cleaner. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-5-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 151 ++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 76 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d90913fa2ac5..6ea18ab9a2cb 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -91,7 +91,6 @@ static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ /* forward declaration */ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); static int pkt_remove_dev(dev_t pkt_dev); -static int pkt_seq_show(struct seq_file *m, void *p); static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) { @@ -414,6 +413,81 @@ static void pkt_sysfs_cleanup(void) *******************************************************************/ +static void pkt_count_states(struct pktcdvd_device *pd, int *states) +{ + struct packet_data *pkt; + int i; + + for (i = 0; i < PACKET_NUM_STATES; i++) + states[i] = 0; + + spin_lock(&pd->cdrw.active_list_lock); + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + states[pkt->state]++; + } + spin_unlock(&pd->cdrw.active_list_lock); +} + +static int pkt_seq_show(struct seq_file *m, void *p) +{ + struct pktcdvd_device *pd = m->private; + char *msg; + int states[PACKET_NUM_STATES]; + + seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, pd->bdev); + + seq_printf(m, "\nSettings:\n"); + seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); + + if (pd->settings.write_type == 0) + msg = "Packet"; + else + msg = "Unknown"; + seq_printf(m, "\twrite type:\t\t%s\n", msg); + + seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); + seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); + + seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); + + if (pd->settings.block_mode == PACKET_BLOCK_MODE1) + msg = "Mode 1"; + else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) + msg = "Mode 2"; + else + msg = "Unknown"; + seq_printf(m, "\tblock mode:\t\t%s\n", msg); + + seq_printf(m, "\nStatistics:\n"); + seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); + seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); + seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); + seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); + seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); + + seq_printf(m, "\nMisc:\n"); + seq_printf(m, "\treference count:\t%d\n", pd->refcnt); + seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); + seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); + seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); + seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); + seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); + + seq_printf(m, "\nQueue state:\n"); + seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); + seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); + seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); + + pkt_count_states(pd, states); + seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], states[4], states[5]); + + seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n", + pd->write_congestion_off, + pd->write_congestion_on); + return 0; +} + static int pkt_debugfs_seq_show(struct seq_file *m, void *p) { return pkt_seq_show(m, p); @@ -1344,21 +1418,6 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) spin_unlock(&pd->cdrw.active_list_lock); } -static void pkt_count_states(struct pktcdvd_device *pd, int *states) -{ - struct packet_data *pkt; - int i; - - for (i = 0; i < PACKET_NUM_STATES; i++) - states[i] = 0; - - spin_lock(&pd->cdrw.active_list_lock); - list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { - states[pkt->state]++; - } - spin_unlock(&pd->cdrw.active_list_lock); -} - /* * kcdrwd is woken up when writes have been queued for one of our * registered devices @@ -2456,66 +2515,6 @@ static void pkt_init_queue(struct pktcdvd_device *pd) q->queuedata = pd; } -static int pkt_seq_show(struct seq_file *m, void *p) -{ - struct pktcdvd_device *pd = m->private; - char *msg; - int states[PACKET_NUM_STATES]; - - seq_printf(m, "Writer %s mapped to %pg:\n", pd->disk->disk_name, pd->bdev); - - seq_printf(m, "\nSettings:\n"); - seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); - - if (pd->settings.write_type == 0) - msg = "Packet"; - else - msg = "Unknown"; - seq_printf(m, "\twrite type:\t\t%s\n", msg); - - seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); - seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); - - seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); - - if (pd->settings.block_mode == PACKET_BLOCK_MODE1) - msg = "Mode 1"; - else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) - msg = "Mode 2"; - else - msg = "Unknown"; - seq_printf(m, "\tblock mode:\t\t%s\n", msg); - - seq_printf(m, "\nStatistics:\n"); - seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); - seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); - seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); - seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); - seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); - - seq_printf(m, "\nMisc:\n"); - seq_printf(m, "\treference count:\t%d\n", pd->refcnt); - seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); - seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); - seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); - seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); - seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); - - seq_printf(m, "\nQueue state:\n"); - seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); - seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); - seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); - - pkt_count_states(pd, states); - seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], states[4], states[5]); - - seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n", - pd->write_congestion_off, - pd->write_congestion_on); - return 0; -} - static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { struct device *ddev = disk_to_dev(pd->disk); From 93c8f6f38be67e30adf8d8eb5e7e9ccb89326119 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:45 +0200 Subject: [PATCH 126/266] pktcdvd: Drop redundant castings for sector_t Since the commit 72deb455b5ec ("block: remove CONFIG_LBDAF") the sector_t is always 64-bit type, no need to cast anymore. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-6-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 6ea18ab9a2cb..67887d2b30a0 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -476,7 +476,7 @@ static int pkt_seq_show(struct seq_file *m, void *p) seq_printf(m, "\nQueue state:\n"); seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); - seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); + seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", pd->current_sector); pkt_count_states(pd, states); seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", @@ -986,8 +986,7 @@ static void pkt_end_io_read(struct bio *bio) BUG_ON(!pd); dev_dbg(disk_to_dev(pd->disk), "bio=%p sec0=%llx sec=%llx err=%d\n", - bio, (unsigned long long)pkt->sector, - (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status); + bio, pkt->sector, bio->bi_iter.bi_sector, bio->bi_status); if (bio->bi_status) atomic_inc(&pkt->io_errors); @@ -1050,7 +1049,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - dev_dbg(ddev, "zone %llx cached\n", (unsigned long long)pkt->sector); + dev_dbg(ddev, "zone %llx cached\n", pkt->sector); goto out_account; } @@ -1082,8 +1081,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, - (unsigned long long)pkt->sector); + dev_dbg(ddev, "need %d frames for zone %llx\n", frames_read, pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); } @@ -1126,8 +1124,7 @@ static inline void pkt_set_state(struct device *ddev, struct packet_data *pkt, enum packet_data_state old_state = pkt->state; dev_dbg(ddev, "pkt %2d : s=%6llx %s -> %s\n", - pkt->id, (unsigned long long)pkt->sector, - state_name[old_state], state_name[state]); + pkt->id, pkt->sector, state_name[old_state], state_name[state]); pkt->state = state; } @@ -1201,12 +1198,12 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) * to this packet. */ spin_lock(&pd->lock); - dev_dbg(ddev, "looking for zone %llx\n", (unsigned long long)zone); + dev_dbg(ddev, "looking for zone %llx\n", zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { sector_t tmp = get_zone(node->bio->bi_iter.bi_sector, pd); bio = node->bio; - dev_dbg(ddev, "found zone=%llx\n", (unsigned long long)tmp); + dev_dbg(ddev, "found zone=%llx\n", tmp); if (tmp != zone) break; pkt_rbtree_erase(pd, node); @@ -1306,8 +1303,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_set_state(ddev, pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, - (unsigned long long)pkt->sector); + dev_dbg(ddev, "Writing %d frames for zone %llx\n", pkt->write_size, pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) pkt->cache_valid = 1; @@ -2461,8 +2457,7 @@ static void pkt_submit_bio(struct bio *bio) return; dev_dbg(ddev, "start = %6llx stop = %6llx\n", - (unsigned long long)bio->bi_iter.bi_sector, - (unsigned long long)bio_end_sector(bio)); + bio->bi_iter.bi_sector, bio_end_sector(bio)); /* * Clone READ bios so we can have our own bi_end_io callback. @@ -2473,8 +2468,7 @@ static void pkt_submit_bio(struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - dev_notice(ddev, "WRITE for ro device (%llu)\n", - (unsigned long long)bio->bi_iter.bi_sector); + dev_notice(ddev, "WRITE for ro device (%llu)\n", bio->bi_iter.bi_sector); goto end_io; } From 80d994d2a71f88e0809dcaccef2259c791d2e3ef Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:46 +0200 Subject: [PATCH 127/266] pktcdvd: Use DEFINE_SHOW_ATTRIBUTE() to simplify code Use DEFINE_SHOW_ATTRIBUTE() helper macro to simplify the code. No functional change. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-7-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 67887d2b30a0..88ba1a8c6e72 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -487,24 +487,7 @@ static int pkt_seq_show(struct seq_file *m, void *p) pd->write_congestion_on); return 0; } - -static int pkt_debugfs_seq_show(struct seq_file *m, void *p) -{ - return pkt_seq_show(m, p); -} - -static int pkt_debugfs_fops_open(struct inode *inode, struct file *file) -{ - return single_open(file, pkt_debugfs_seq_show, inode->i_private); -} - -static const struct file_operations debug_fops = { - .open = pkt_debugfs_fops_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .owner = THIS_MODULE, -}; +DEFINE_SHOW_ATTRIBUTE(pkt_seq); static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) { @@ -514,8 +497,8 @@ static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) if (!pd->dfs_d_root) return; - pd->dfs_f_info = debugfs_create_file("info", 0444, - pd->dfs_d_root, pd, &debug_fops); + pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, + pd, &pkt_seq_fops); } static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) From 046636a4bac575aff78e44c7e1cff84c83a345a9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:47 +0200 Subject: [PATCH 128/266] pktcdvd: Use put_unaligned_be16() and get_unaligned_be16() This makes the driver code slightly better to understand. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230310164549.22133-8-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 88ba1a8c6e72..d352f7a369ef 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -70,6 +70,8 @@ #include #include +#include + #define DRIVER_NAME "pktcdvd" #define MAX_SPEED 0xffff @@ -808,10 +810,8 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sshdr = &sshdr; cgc.cmd[0] = GPCMD_SET_SPEED; - cgc.cmd[2] = (read_speed >> 8) & 0xff; - cgc.cmd[3] = read_speed & 0xff; - cgc.cmd[4] = (write_speed >> 8) & 0xff; - cgc.cmd[5] = write_speed & 0xff; + put_unaligned_be16(read_speed, &cgc.cmd[2]); + put_unaligned_be16(write_speed, &cgc.cmd[4]); ret = pkt_generic_packet(pd, &cgc); if (ret) @@ -1509,8 +1509,7 @@ static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, cgc->cmd[0] = GPCMD_MODE_SENSE_10; cgc->cmd[2] = page_code | (page_control << 6); - cgc->cmd[7] = cgc->buflen >> 8; - cgc->cmd[8] = cgc->buflen & 0xff; + put_unaligned_be16(cgc->buflen, &cgc->cmd[7]); cgc->data_direction = CGC_DATA_READ; return pkt_generic_packet(pd, cgc); } @@ -1521,8 +1520,7 @@ static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc memset(cgc->buffer, 0, 2); cgc->cmd[0] = GPCMD_MODE_SELECT_10; cgc->cmd[1] = 0x10; /* PF */ - cgc->cmd[7] = cgc->buflen >> 8; - cgc->cmd[8] = cgc->buflen & 0xff; + put_unaligned_be16(cgc->buflen, &cgc->cmd[7]); cgc->data_direction = CGC_DATA_WRITE; return pkt_generic_packet(pd, cgc); } @@ -1563,8 +1561,7 @@ static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; cgc.cmd[1] = type & 3; - cgc.cmd[4] = (track & 0xff00) >> 8; - cgc.cmd[5] = track & 0xff; + put_unaligned_be16(track, &cgc.cmd[4]); cgc.cmd[8] = 8; cgc.quiet = 1; @@ -1645,8 +1642,8 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) return ret; } - size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff)); - pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff); + size = 2 + get_unaligned_be16(&buffer[0]); + pd->mode_offset = get_unaligned_be16(&buffer[6]); if (size > sizeof(buffer)) size = sizeof(buffer); @@ -1803,7 +1800,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) cgc.cmd[0] = GPCMD_GET_CONFIGURATION; cgc.cmd[8] = 8; ret = pkt_generic_packet(pd, &cgc); - pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7]; + pd->mmc3_profile = ret ? 0xffff : get_unaligned_be16(&buf[6]); memset(&di, 0, sizeof(disc_information)); memset(&ti, 0, sizeof(track_information)); @@ -1921,7 +1918,7 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd) */ buf[pd->mode_offset + 10] |= (set << 2); - cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); + cgc.buflen = cgc.cmd[8] = 2 + get_unaligned_be16(&buf[0]); ret = pkt_mode_select(pd, &cgc); if (ret) { dev_err(ddev, "write caching control failed\n"); @@ -1976,12 +1973,12 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, * Speed Performance Descriptor Block", use the information * in the first block. (contains the highest speed) */ - int num_spdb = (cap_buf[30] << 8) + cap_buf[31]; + int num_spdb = get_unaligned_be16(&cap_buf[30]); if (num_spdb > 0) offset = 34; } - *write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1]; + *write_speed = get_unaligned_be16(&cap_buf[offset]); return 0; } @@ -2026,7 +2023,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, pkt_dump_sense(pd, &cgc); return ret; } - size = ((unsigned int) buf[0]<<8) + buf[1] + 2; + size = 2 + get_unaligned_be16(&buf[0]); if (size > sizeof(buf)) size = sizeof(buf); From 6a5945a8eb5a626afe6feb341824e7e1d007c8ff Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:48 +0200 Subject: [PATCH 129/266] pktcdvd: Get rid of redundant 'else' In the snippets like the following if (...) return / goto / break / continue ...; else ... the 'else' is redundant. Get rid of it. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-9-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d352f7a369ef..4b675ec8bde5 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -941,25 +941,25 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que { struct device *ddev = disk_to_dev(pd->disk); - if ((pd->settings.size << 9) / CD_FRAMESIZE - <= queue_max_segments(q)) { + if ((pd->settings.size << 9) / CD_FRAMESIZE <= queue_max_segments(q)) { /* * The cdrom device can handle one segment/frame */ clear_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; - } else if ((pd->settings.size << 9) / PAGE_SIZE - <= queue_max_segments(q)) { + } + + if ((pd->settings.size << 9) / PAGE_SIZE <= queue_max_segments(q)) { /* * We can handle this case at the expense of some extra memory * copies during write operations */ set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; - } else { - dev_err(ddev, "cdrom max_phys_segments too small\n"); - return -EIO; } + + dev_err(ddev, "cdrom max_phys_segments too small\n"); + return -EIO; } static void pkt_end_io_read(struct bio *bio) From 7da15fb0318f18398feea2848d099a8d0d7b5965 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 10 Mar 2023 18:45:49 +0200 Subject: [PATCH 130/266] pktcdvd: Sort headers Sort the headers in alphabetic order in order to ease the maintenance for this part. Signed-off-by: Andy Shevchenko Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230310164549.22133-10-andriy.shevchenko@linux.intel.com Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 4b675ec8bde5..af1140548adb 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -46,30 +46,31 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include +#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include + #include #define DRIVER_NAME "pktcdvd" From bb91a7d96a5c9662f41a08024f405bf9ad333e86 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 9 Jun 2023 07:17:37 +0200 Subject: [PATCH 131/266] block: fix rootwait= again The previous rootwait fix added an -EINVAL return to a completely bogus superflous branch, fix this. Fixes: 1341c7d2ccf4 ("block: fix rootwait=") Reported-by: Mark Brown Signed-off-by: Christoph Hellwig Tested-by: Fabio Estevam Tested-by: Marek Szyprowski Tested-by: Mark Brown Link: https://lore.kernel.org/r/20230609051737.328930-1-hch@lst.de Signed-off-by: Jens Axboe --- block/early-lookup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/early-lookup.c b/block/early-lookup.c index 48ea3e982419..a5be3c68ed07 100644 --- a/block/early-lookup.c +++ b/block/early-lookup.c @@ -181,7 +181,7 @@ static int __init devt_from_devname(const char *name, dev_t *devt) *p = '\0'; *devt = blk_lookup_devt(s, part); if (*devt) - return -ENODEV; + return 0; /* try disk name without p */ if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') From b0488411e919368014907850f74191d03e25f031 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:19 +0800 Subject: [PATCH 132/266] block/rnbd: kill rnbd_flags_supported This routine is not called since added. Then the two flags (RNBD_OP_LAST and RNBD_F_ALL) can be removed too after kill rnbd_flags_supported. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20230524070026.2932-2-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-proto.h | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index da1d0542d7e2..84fd69844b7d 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -185,7 +185,6 @@ struct rnbd_msg_io { enum rnbd_io_flags { /* Operations */ - RNBD_OP_READ = 0, RNBD_OP_WRITE = 1, RNBD_OP_FLUSH = 2, @@ -193,15 +192,9 @@ enum rnbd_io_flags { RNBD_OP_SECURE_ERASE = 4, RNBD_OP_WRITE_SAME = 5, - RNBD_OP_LAST, - /* Flags */ - RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0), RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1), - - RNBD_F_ALL = (RNBD_F_SYNC | RNBD_F_FUA) - }; static inline u32 rnbd_op(u32 flags) @@ -214,21 +207,6 @@ static inline u32 rnbd_flags(u32 flags) return flags & ~RNBD_OP_MASK; } -static inline bool rnbd_flags_supported(u32 flags) -{ - u32 op; - - op = rnbd_op(flags); - flags = rnbd_flags(flags); - - if (op >= RNBD_OP_LAST) - return false; - if (flags & ~RNBD_F_ALL) - return false; - - return true; -} - static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) { blk_opf_t bio_opf; From 5783153ac67e20f65a402ef42237cd1a6d7fa320 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:20 +0800 Subject: [PATCH 133/266] block/rnbd-srv: remove unused header No need to include it since none of macros in limits.h are used by rnbd-srv. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20230524070026.2932-3-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-sysfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index d5d9267e1fa5..9fe7d9e0ab63 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -9,7 +9,6 @@ #undef pr_fmt #define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt -#include #include #include #include From d6e94913cb1cb4b4d1d737f72b5cef10b13395ff Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:21 +0800 Subject: [PATCH 134/266] block/rnbd: introduce rnbd_access_modes Add one new array (marked with __maybe_unused to prevent gcc warning about "defined but not used" with W=1), then we can remove rnbd_access_mode_str and rnbd-common.c accordingly. Signed-off-by: Guoqing Jiang Acked-by: Jack Wang Link: https://lore.kernel.org/r/20230524070026.2932-4-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/Makefile | 6 ++---- drivers/block/rnbd/rnbd-clt-sysfs.c | 4 ++-- drivers/block/rnbd/rnbd-common.c | 23 ----------------------- drivers/block/rnbd/rnbd-proto.h | 9 +++++++++ drivers/block/rnbd/rnbd-srv-sysfs.c | 2 +- drivers/block/rnbd/rnbd-srv.c | 4 ++-- 6 files changed, 16 insertions(+), 32 deletions(-) delete mode 100644 drivers/block/rnbd/rnbd-common.c diff --git a/drivers/block/rnbd/Makefile b/drivers/block/rnbd/Makefile index 40b31630822c..208e5f865497 100644 --- a/drivers/block/rnbd/Makefile +++ b/drivers/block/rnbd/Makefile @@ -3,13 +3,11 @@ ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs rnbd-client-y := rnbd-clt.o \ - rnbd-clt-sysfs.o \ - rnbd-common.o + rnbd-clt-sysfs.o CFLAGS_rnbd-srv-trace.o = -I$(src) -rnbd-server-y := rnbd-common.o \ - rnbd-srv.o \ +rnbd-server-y := rnbd-srv.o \ rnbd-srv-sysfs.o \ rnbd-srv-trace.o diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 8c6087949794..a0b49a0c0bdd 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -278,7 +278,7 @@ static ssize_t access_mode_show(struct kobject *kobj, dev = container_of(kobj, struct rnbd_clt_dev, kobj); - return sysfs_emit(page, "%s\n", rnbd_access_mode_str(dev->access_mode)); + return sysfs_emit(page, "%s\n", rnbd_access_modes[dev->access_mode].str); } static struct kobj_attribute rnbd_clt_access_mode = @@ -596,7 +596,7 @@ static ssize_t rnbd_clt_map_device_store(struct kobject *kobj, pr_info("Mapping device %s on session %s, (access_mode: %s, nr_poll_queues: %d)\n", pathname, sessname, - rnbd_access_mode_str(access_mode), + rnbd_access_modes[access_mode].str, nr_poll_queues); dev = rnbd_clt_map_device(sessname, paths, path_cnt, port_nr, pathname, diff --git a/drivers/block/rnbd/rnbd-common.c b/drivers/block/rnbd/rnbd-common.c deleted file mode 100644 index 596c3f732403..000000000000 --- a/drivers/block/rnbd/rnbd-common.c +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * RDMA Network Block Driver - * - * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. - * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. - * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. - */ -#include "rnbd-proto.h" - -const char *rnbd_access_mode_str(enum rnbd_access_mode mode) -{ - switch (mode) { - case RNBD_ACCESS_RO: - return "ro"; - case RNBD_ACCESS_RW: - return "rw"; - case RNBD_ACCESS_MIGRATION: - return "migration"; - default: - return "unknown"; - } -} diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index 84fd69844b7d..e32f8f2c868a 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -61,6 +61,15 @@ enum rnbd_access_mode { RNBD_ACCESS_MIGRATION, }; +static const __maybe_unused struct { + enum rnbd_access_mode mode; + const char *str; +} rnbd_access_modes[] = { + [RNBD_ACCESS_RO] = {RNBD_ACCESS_RO, "ro"}, + [RNBD_ACCESS_RW] = {RNBD_ACCESS_RW, "rw"}, + [RNBD_ACCESS_MIGRATION] = {RNBD_ACCESS_MIGRATION, "migration"}, +}; + /** * struct rnbd_msg_sess_info - initial session info from client to server * @hdr: message header diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 9fe7d9e0ab63..4962826e9639 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -103,7 +103,7 @@ static ssize_t access_mode_show(struct kobject *kobj, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); return sysfs_emit(page, "%s\n", - rnbd_access_mode_str(sess_dev->access_mode)); + rnbd_access_modes[sess_dev->access_mode].str); } static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr = diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index cec22bbae2f9..1eb3aecea69e 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -483,7 +483,7 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, pr_err("Mapping device '%s' for session %s with RW permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", srv_dev->id, srv_sess->sessname, srv_dev->open_write_cnt, - rnbd_access_mode_str(access_mode)); + rnbd_access_modes[access_mode].str); } break; case RNBD_ACCESS_MIGRATION: @@ -494,7 +494,7 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, pr_err("Mapping device '%s' for session %s with migration permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", srv_dev->id, srv_sess->sessname, srv_dev->open_write_cnt, - rnbd_access_mode_str(access_mode)); + rnbd_access_modes[access_mode].str); } break; default: From ba2eed1cf8f08f1e5b1ba009ac22554f14d05342 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:22 +0800 Subject: [PATCH 135/266] block/rnbd-srv: no need to check sess_dev Check ret is enough since if sess_dev is NULL which also implies ret should be 0. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20230524070026.2932-5-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 1eb3aecea69e..f610dfca1ebc 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -96,7 +96,7 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) ret = kref_get_unless_zero(&sess_dev->kref); rcu_read_unlock(); - if (!sess_dev || !ret) + if (!ret) return ERR_PTR(-ENXIO); return sess_dev; From 3ecdbf91513511fae49eb0cfa9f39f690eb4fe11 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:23 +0800 Subject: [PATCH 136/266] block/rnbd-srv: rename one member in rnbd_srv_dev It actually represents the name of rnbd_srv_dev. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20230524070026.2932-6-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 14 +++++++------- drivers/block/rnbd/rnbd-srv.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index f610dfca1ebc..300beb312e6e 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -180,7 +180,7 @@ static void destroy_device(struct kref *kref) WARN_ONCE(!list_empty(&dev->sess_dev_list), "Device %s is being destroyed but still in use!\n", - dev->id); + dev->name); spin_lock(&dev_lock); list_del(&dev->list); @@ -431,7 +431,7 @@ static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(struct block_device *bdev) if (!dev) return ERR_PTR(-ENOMEM); - snprintf(dev->id, sizeof(dev->id), "%pg", bdev); + snprintf(dev->name, sizeof(dev->name), "%pg", bdev); kref_init(&dev->kref); INIT_LIST_HEAD(&dev->sess_dev_list); mutex_init(&dev->lock); @@ -446,7 +446,7 @@ rnbd_srv_find_or_add_srv_dev(struct rnbd_srv_dev *new_dev) spin_lock(&dev_lock); list_for_each_entry(dev, &dev_list, list) { - if (!strncmp(dev->id, new_dev->id, sizeof(dev->id))) { + if (!strncmp(dev->name, new_dev->name, sizeof(dev->name))) { if (!kref_get_unless_zero(&dev->kref)) /* * We lost the race, device is almost dead. @@ -481,7 +481,7 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, ret = 0; } else { pr_err("Mapping device '%s' for session %s with RW permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", - srv_dev->id, srv_sess->sessname, + srv_dev->name, srv_sess->sessname, srv_dev->open_write_cnt, rnbd_access_modes[access_mode].str); } @@ -492,14 +492,14 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, ret = 0; } else { pr_err("Mapping device '%s' for session %s with migration permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", - srv_dev->id, srv_sess->sessname, + srv_dev->name, srv_sess->sessname, srv_dev->open_write_cnt, rnbd_access_modes[access_mode].str); } break; default: pr_err("Received mapping request for device '%s' on session %s with invalid access mode: %d\n", - srv_dev->id, srv_sess->sessname, access_mode); + srv_dev->name, srv_sess->sessname, access_mode); ret = -EINVAL; } @@ -774,7 +774,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list); mutex_unlock(&srv_dev->lock); - rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id); + rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->name); kfree(full_path); diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index f5962fd31d62..6b5e5ade18ae 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -35,7 +35,7 @@ struct rnbd_srv_dev { struct kobject dev_kobj; struct kobject *dev_sessions_kobj; struct kref kref; - char id[NAME_MAX]; + char name[NAME_MAX]; /* List of rnbd_srv_sess_dev structs */ struct list_head sess_dev_list; struct mutex lock; From 6a12d5379508d530a73140fc7d5502551558ced5 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:24 +0800 Subject: [PATCH 137/266] block/rnbd-srv: init ret with 0 instead of -EPERM Let's always set errno after pr_err which is consistent with default case. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20230524070026.2932-7-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 300beb312e6e..ac66a9bbd3d8 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -467,34 +467,33 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, struct rnbd_srv_session *srv_sess, enum rnbd_access_mode access_mode) { - int ret = -EPERM; + int ret = 0; mutex_lock(&srv_dev->lock); switch (access_mode) { case RNBD_ACCESS_RO: - ret = 0; break; case RNBD_ACCESS_RW: if (srv_dev->open_write_cnt == 0) { srv_dev->open_write_cnt++; - ret = 0; } else { pr_err("Mapping device '%s' for session %s with RW permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", srv_dev->name, srv_sess->sessname, srv_dev->open_write_cnt, rnbd_access_modes[access_mode].str); + ret = -EPERM; } break; case RNBD_ACCESS_MIGRATION: if (srv_dev->open_write_cnt < 2) { srv_dev->open_write_cnt++; - ret = 0; } else { pr_err("Mapping device '%s' for session %s with migration permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", srv_dev->name, srv_sess->sessname, srv_dev->open_write_cnt, rnbd_access_modes[access_mode].str); + ret = -EPERM; } break; default: From d3fc0b46642524bc8e38aed3c7f5e99742436495 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:25 +0800 Subject: [PATCH 138/266] block/rnbd-srv: init err earlier in rnbd_srv_init_module With this, we can remove several lines of code. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20230524070026.2932-8-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index ac66a9bbd3d8..0933e25ea9ab 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -807,7 +807,7 @@ static struct rtrs_srv_ctx *rtrs_ctx; static struct rtrs_srv_ops rtrs_ops; static int __init rnbd_srv_init_module(void) { - int err; + int err = 0; BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4); BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36); @@ -821,19 +821,17 @@ static int __init rnbd_srv_init_module(void) }; rtrs_ctx = rtrs_srv_open(&rtrs_ops, port_nr); if (IS_ERR(rtrs_ctx)) { - err = PTR_ERR(rtrs_ctx); pr_err("rtrs_srv_open(), err: %d\n", err); - return err; + return PTR_ERR(rtrs_ctx); } err = rnbd_srv_create_sysfs_files(); if (err) { pr_err("rnbd_srv_create_sysfs_files(), err: %d\n", err); rtrs_srv_close(rtrs_ctx); - return err; } - return 0; + return err; } static void __exit rnbd_srv_cleanup_module(void) From fece685cc7bbb5e1af89f891223c31c3bcc969f7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 24 May 2023 15:00:26 +0800 Subject: [PATCH 139/266] block/rnbd-srv: make process_msg_sess_info returns void Change the return type to void given it always returns 0. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20230524070026.2932-9-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 0933e25ea9ab..0352c2585fb0 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -356,7 +356,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, const void *msg, size_t len, void *data, size_t datalen); -static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, +static void process_msg_sess_info(struct rnbd_srv_session *srv_sess, const void *msg, size_t len, void *data, size_t datalen); @@ -384,8 +384,7 @@ static int rnbd_srv_rdma_ev(void *priv, struct rtrs_srv_op *id, ret = process_msg_open(srv_sess, usr, usrlen, data, datalen); break; case RNBD_MSG_SESS_INFO: - ret = process_msg_sess_info(srv_sess, usr, usrlen, data, - datalen); + process_msg_sess_info(srv_sess, usr, usrlen, data, datalen); break; default: pr_warn("Received unexpected message type %d from session %s\n", @@ -630,7 +629,7 @@ static char *rnbd_srv_get_full_path(struct rnbd_srv_session *srv_sess, return full_path; } -static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, +static void process_msg_sess_info(struct rnbd_srv_session *srv_sess, const void *msg, size_t len, void *data, size_t datalen) { @@ -643,8 +642,6 @@ static int process_msg_sess_info(struct rnbd_srv_session *srv_sess, rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->ver = srv_sess->ver; - - return 0; } /** From 9d1c92872e7082f100f629a58b32fa0214aa1aec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:29 +0200 Subject: [PATCH 140/266] block: also call ->open for incremental partition opens For whole devices ->open is called for each open, but for partitions it is only called on the first open of a partition, e.g.: open("/dev/vdb", ...) open("/dev/vdb", ...) - 2 call to ->open open("/dev/vdb1", ...) open("/dev/vdb", ...) - 2 call to ->open open("/dev/vdb", ...) open("/dev/vdb", ...) - just open call to ->open This is problematic as various block drivers look at open flags and might not do all the required setup if the earlier open was with an odd flag like O_NDELAY or the magic 3 ioctl-only open mode. Signed-off-by: Christoph Hellwig Reviewed-by: Phillip Potter Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 5c46ff107706..981f61357951 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -683,9 +683,6 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) struct gendisk *disk = part->bd_disk; int ret; - if (atomic_read(&part->bd_openers)) - goto done; - ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) return ret; @@ -694,9 +691,10 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) if (!bdev_nr_sectors(part)) goto out_blkdev_put; - disk->open_partitions++; - set_init_blocksize(part); -done: + if (!atomic_read(&part->bd_openers)) { + disk->open_partitions++; + set_init_blocksize(part); + } atomic_inc(&part->bd_openers); return 0; @@ -709,10 +707,10 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) { struct block_device *whole = bdev_whole(part); - if (!atomic_dec_and_test(&part->bd_openers)) - return; - blkdev_flush_mapping(part); - whole->bd_disk->open_partitions--; + if (atomic_dec_and_test(&part->bd_openers)) { + blkdev_flush_mapping(part); + whole->bd_disk->open_partitions--; + } blkdev_put_whole(whole, mode); } From 764b83100b9aff52f950e408539c22a37cdedae8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:30 +0200 Subject: [PATCH 141/266] cdrom: remove the unused bdev argument to cdrom_open Signed-off-by: Christoph Hellwig Reviewed-by: Phillip Potter Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 3 +-- drivers/cdrom/gdrom.c | 2 +- drivers/scsi/sr.c | 2 +- include/linux/cdrom.h | 3 +-- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 416f723a2dbb..e3eab319cb04 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1155,8 +1155,7 @@ int open_for_data(struct cdrom_device_info *cdi) * is in their own interest: device control becomes a lot easier * this way. */ -int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, - fmode_t mode) +int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode) { int ret; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index ceded5772aac..eaa2d5a90bc8 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -481,7 +481,7 @@ static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode) bdev_check_media_change(bdev); mutex_lock(&gdrom_mutex); - ret = cdrom_open(gd.cd_info, bdev, mode); + ret = cdrom_open(gd.cd_info, mode); mutex_unlock(&gdrom_mutex); return ret; } diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 12869e6d4ebd..61b83880e395 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -498,7 +498,7 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode) sr_revalidate_disk(cd); mutex_lock(&cd->lock); - ret = cdrom_open(&cd->cdi, bdev, mode); + ret = cdrom_open(&cd->cdi, mode); mutex_unlock(&cd->lock); scsi_autopm_put_device(sdev); diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 67caa909e3e6..cc5717cb0fa8 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -101,8 +101,7 @@ int cdrom_read_tocentry(struct cdrom_device_info *cdi, struct cdrom_tocentry *entry); /* the general block_device operations structure: */ -extern int cdrom_open(struct cdrom_device_info *cdi, struct block_device *bdev, - fmode_t mode); +int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode); extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode); extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); From 473399b50de1fdc12606254351273c71d1786251 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:31 +0200 Subject: [PATCH 142/266] cdrom: remove the unused mode argument to cdrom_ioctl Signed-off-by: Christoph Hellwig Reviewed-by: Phillip Potter Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/scsi/sr.c | 2 +- include/linux/cdrom.h | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index e3eab319cb04..245e5bbb05d4 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3336,7 +3336,7 @@ static int mmc_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, * ATAPI / SCSI specific code now mainly resides in mmc_ioctl(). */ int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, - fmode_t mode, unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; int ret; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index eaa2d5a90bc8..14922403983e 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -505,7 +505,7 @@ static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode, int ret; mutex_lock(&gdrom_mutex); - ret = cdrom_ioctl(gd.cd_info, bdev, mode, cmd, arg); + ret = cdrom_ioctl(gd.cd_info, bdev, cmd, arg); mutex_unlock(&gdrom_mutex); return ret; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 61b83880e395..444c7efc14cb 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -539,7 +539,7 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, scsi_autopm_get_device(sdev); if (cmd != CDROMCLOSETRAY && cmd != CDROMEJECT) { - ret = cdrom_ioctl(&cd->cdi, bdev, mode, cmd, arg); + ret = cdrom_ioctl(&cd->cdi, bdev, cmd, arg); if (ret != -ENOSYS) goto put; } diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index cc5717cb0fa8..4aea8c82d169 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -103,8 +103,8 @@ int cdrom_read_tocentry(struct cdrom_device_info *cdi, /* the general block_device operations structure: */ int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode); extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode); -extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, - fmode_t mode, unsigned int cmd, unsigned long arg); +int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, + unsigned int cmd, unsigned long arg); extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, unsigned int clearing); From a4cec8bc14c02e15006a71f02b0e1bbc72b9f796 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:32 +0200 Subject: [PATCH 143/266] cdrom: remove the unused cdrom_close_write release code cdrom_close_write is empty, and the for_data flag it is keyed off is never set. Remove all this clutter. Signed-off-by: Christoph Hellwig Reviewed-by: Phillip Potter Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 15 --------------- include/linux/cdrom.h | 1 - 2 files changed, 16 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 245e5bbb05d4..08abf1ffede0 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -978,15 +978,6 @@ static void cdrom_dvd_rw_close_write(struct cdrom_device_info *cdi) cdi->media_written = 0; } -static int cdrom_close_write(struct cdrom_device_info *cdi) -{ -#if 0 - return cdrom_flush_cache(cdi); -#else - return 0; -#endif -} - /* badly broken, I know. Is due for a fixup anytime. */ static void cdrom_count_tracks(struct cdrom_device_info *cdi, tracktype *tracks) { @@ -1282,12 +1273,6 @@ void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) opened_for_data = !(cdi->options & CDO_USE_FFLAGS) || !(mode & FMODE_NDELAY); - /* - * flush cache on last write release - */ - if (CDROM_CAN(CDC_RAM) && !cdi->use_count && cdi->for_data) - cdrom_close_write(cdi); - cdo->release(cdi); if (cdi->use_count == 0) { /* last process that closes dev*/ if (opened_for_data && diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 4aea8c82d169..0a5db0b0c958 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -61,7 +61,6 @@ struct cdrom_device_info { __u8 last_sense; __u8 media_written; /* dirty flag, DVD+RW bookkeeping */ unsigned short mmc3_profile; /* current MMC3 profile */ - int for_data; int (*exit)(struct cdrom_device_info *); int mrw_mode_page; __s64 last_media_change_ms; From 8cdf433e2b8e4fc6c7b4393deb93fb258175d537 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:33 +0200 Subject: [PATCH 144/266] cdrom: track if a cdrom_device_info was opened for data Set a flag when a cdrom_device_info is opened for writing, instead of trying to figure out this at release time. This will allow to eventually remove the mode argument to the ->release block_device_operation as nothing but the CDROM drivers uses that argument. Signed-off-by: Christoph Hellwig Reviewed-by: Phillip Potter Acked-by: Christian Brauner Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20230608110258.189493-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 12 +++++------- include/linux/cdrom.h | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 08abf1ffede0..adebac1bd210 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1172,6 +1172,7 @@ int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode) ret = 0; cdi->media_written = 0; } + cdi->opened_for_data = true; } if (ret) @@ -1252,7 +1253,6 @@ static int check_for_audio_disc(struct cdrom_device_info *cdi, void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) { const struct cdrom_device_ops *cdo = cdi->ops; - int opened_for_data; cd_dbg(CD_CLOSE, "entering cdrom_release\n"); @@ -1270,14 +1270,12 @@ void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) } } - opened_for_data = !(cdi->options & CDO_USE_FFLAGS) || - !(mode & FMODE_NDELAY); - cdo->release(cdi); - if (cdi->use_count == 0) { /* last process that closes dev*/ - if (opened_for_data && - cdi->options & CDO_AUTO_EJECT && CDROM_CAN(CDC_OPEN_TRAY)) + + if (cdi->use_count == 0 && cdi->opened_for_data) { + if (cdi->options & CDO_AUTO_EJECT && CDROM_CAN(CDC_OPEN_TRAY)) cdo->tray_move(cdi, 1); + cdi->opened_for_data = false; } } EXPORT_SYMBOL(cdrom_release); diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 0a5db0b0c958..adcc9f2beb26 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -63,6 +63,7 @@ struct cdrom_device_info { unsigned short mmc3_profile; /* current MMC3 profile */ int (*exit)(struct cdrom_device_info *); int mrw_mode_page; + bool opened_for_data; __s64 last_media_change_ms; }; From 7ae24fcee9929f9002b84d8121144b2b3590b58c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:34 +0200 Subject: [PATCH 145/266] cdrom: remove the unused mode argument to cdrom_release Signed-off-by: Christoph Hellwig Reviewed-by: Phillip Potter Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/scsi/sr.c | 2 +- include/linux/cdrom.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index adebac1bd210..998b03fe976e 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1250,7 +1250,7 @@ static int check_for_audio_disc(struct cdrom_device_info *cdi, return 0; } -void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode) +void cdrom_release(struct cdrom_device_info *cdi) { const struct cdrom_device_ops *cdo = cdi->ops; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 14922403983e..a401dc4218a9 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -481,7 +481,7 @@ static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode) bdev_check_media_change(bdev); mutex_lock(&gdrom_mutex); - ret = cdrom_open(gd.cd_info, mode); + ret = cdrom_open(gd.cd_info); mutex_unlock(&gdrom_mutex); return ret; } diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 444c7efc14cb..6d33120ee5ba 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -512,7 +512,7 @@ static void sr_block_release(struct gendisk *disk, fmode_t mode) struct scsi_cd *cd = scsi_cd(disk); mutex_lock(&cd->lock); - cdrom_release(&cd->cdi, mode); + cdrom_release(&cd->cdi); mutex_unlock(&cd->lock); scsi_device_put(cd->device); diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index adcc9f2beb26..3c253b29f4aa 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -102,7 +102,7 @@ int cdrom_read_tocentry(struct cdrom_device_info *cdi, /* the general block_device operations structure: */ int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode); -extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode); +void cdrom_release(struct cdrom_device_info *cdi); int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, unsigned int cmd, unsigned long arg); extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi, From 444aa2c58cb3b6cfe3b7cc7db6c294d73393a894 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:35 +0200 Subject: [PATCH 146/266] block: pass a gendisk on bdev_check_media_change bdev_check_media_change should only ever be called for the whole device. Pass a gendisk to make that explicit and rename the function to disk_check_media_change. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-8-hch@lst.de Signed-off-by: Jens Axboe --- block/disk-events.c | 18 +++++++++--------- drivers/block/amiflop.c | 2 +- drivers/block/ataflop.c | 6 +++--- drivers/block/floppy.c | 16 ++++++++-------- drivers/block/swim.c | 2 +- drivers/block/swim3.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/md/md.c | 2 +- drivers/scsi/sd.c | 9 ++++----- drivers/scsi/sr.c | 2 +- include/linux/blkdev.h | 2 +- 11 files changed, 31 insertions(+), 32 deletions(-) diff --git a/block/disk-events.c b/block/disk-events.c index aee25a7e1ab7..8b1b63225738 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -263,31 +263,31 @@ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) } /** - * bdev_check_media_change - check if a removable media has been changed - * @bdev: block device to check + * disk_check_media_change - check if a removable media has been changed + * @disk: gendisk to check * * Check whether a removable media has been changed, and attempt to free all * dentries and inodes and invalidates all block device page cache entries in * that case. * - * Returns %true if the block device changed, or %false if not. + * Returns %true if the media has changed, or %false if not. */ -bool bdev_check_media_change(struct block_device *bdev) +bool disk_check_media_change(struct gendisk *disk) { unsigned int events; - events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST); if (!(events & DISK_EVENT_MEDIA_CHANGE)) return false; - if (__invalidate_device(bdev, true)) + if (__invalidate_device(disk->part0, true)) pr_warn("VFS: busy inodes on changed media %s\n", - bdev->bd_disk->disk_name); - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } -EXPORT_SYMBOL(bdev_check_media_change); +EXPORT_SYMBOL(disk_check_media_change); /** * disk_force_media_change - force a media change event diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 4c8b2ba579ee..6de12b311749 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1675,7 +1675,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) } if (mode & (FMODE_READ|FMODE_WRITE)) { - bdev_check_media_change(bdev); + disk_check_media_change(bdev->bd_disk); if (mode & FMODE_WRITE) { int wrprot; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 9deb4df6bdb8..da481ddbca90 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1760,8 +1760,8 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, /* invalidate the buffer track to force a reread */ BufferDrive = -1; set_bit(drive, &fake_change); - if (bdev_check_media_change(bdev)) - floppy_revalidate(bdev->bd_disk); + if (disk_check_media_change(disk)) + floppy_revalidate(disk); return 0; default: return -EINVAL; @@ -1938,7 +1938,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return 0; if (mode & (FMODE_READ|FMODE_WRITE)) { - if (bdev_check_media_change(bdev)) + if (disk_check_media_change(bdev->bd_disk)) floppy_revalidate(bdev->bd_disk); if (mode & FMODE_WRITE) { if (p->wpstat) { diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 28ec6b442e9c..3accafcbc95c 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3210,13 +3210,13 @@ static int floppy_raw_cmd_ioctl(int type, int drive, int cmd, #endif -static int invalidate_drive(struct block_device *bdev) +static int invalidate_drive(struct gendisk *disk) { /* invalidate the buffer track to force a reread */ - set_bit((long)bdev->bd_disk->private_data, &fake_change); + set_bit((long)disk->private_data, &fake_change); process_fd_request(); - if (bdev_check_media_change(bdev)) - floppy_revalidate(bdev->bd_disk); + if (disk_check_media_change(disk)) + floppy_revalidate(disk); return 0; } @@ -3287,7 +3287,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g, drive_state[current_drive].maxtrack || ((user_params[drive].sect ^ oldStretch) & (FD_SWAPSIDES | FD_SECTBASEMASK))) - invalidate_drive(bdev); + invalidate_drive(bdev->bd_disk); else process_fd_request(); } @@ -3464,7 +3464,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int current_type[drive] = NULL; floppy_sizes[drive] = MAX_DISK_SIZE << 1; drive_state[drive].keep_data = 0; - return invalidate_drive(bdev); + return invalidate_drive(bdev->bd_disk); case FDSETPRM: case FDDEFPRM: return set_geometry(cmd, &inparam.g, drive, type, bdev); @@ -3503,7 +3503,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int case FDFLUSH: if (lock_fdc(drive)) return -EINTR; - return invalidate_drive(bdev); + return invalidate_drive(bdev->bd_disk); case FDSETEMSGTRESH: drive_params[drive].max_errors.reporting = (unsigned short)(param & 0x0f); return 0; @@ -4054,7 +4054,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) drive_state[drive].last_checked = 0; clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags); - if (bdev_check_media_change(bdev)) + if (disk_check_media_change(bdev->bd_disk)) floppy_revalidate(bdev->bd_disk); if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) goto out; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 42b4b6828690..105bc5fd1b8c 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -640,7 +640,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return 0; if (mode & (FMODE_READ|FMODE_WRITE)) { - if (bdev_check_media_change(bdev) && fs->disk_in) + if (disk_check_media_change(bdev->bd_disk) && fs->disk_in) fs->ejected = 0; if ((mode & FMODE_WRITE) && fs->write_protected) { err = -EROFS; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index da811a7da03f..3d689ba312f5 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -963,7 +963,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (err == 0 && (mode & FMODE_NDELAY) == 0 && (mode & (FMODE_READ|FMODE_WRITE))) { - if (bdev_check_media_change(bdev)) + if (disk_check_media_change(bdev->bd_disk)) floppy_revalidate(bdev->bd_disk); if (fs->ejected) err = -ENXIO; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index a401dc4218a9..3cb92df38ebe 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -478,7 +478,7 @@ static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode) { int ret; - bdev_check_media_change(bdev); + disk_check_media_change(bdev->bd_disk); mutex_lock(&gdrom_mutex); ret = cdrom_open(gd.cd_info); diff --git a/drivers/md/md.c b/drivers/md/md.c index fabf9c543735..77046c91bea4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7789,7 +7789,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); - bdev_check_media_change(bdev); + disk_check_media_change(bdev->bd_disk); return 0; out_unlock: diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1624d528aa1f..aab649d5bad3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1280,11 +1280,10 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) mempool_free(rq->special_vec.bv_page, sd_page_pool); } -static bool sd_need_revalidate(struct block_device *bdev, - struct scsi_disk *sdkp) +static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) { if (sdkp->device->removable || sdkp->write_prot) { - if (bdev_check_media_change(bdev)) + if (disk_check_media_change(disk)) return true; } @@ -1293,7 +1292,7 @@ static bool sd_need_revalidate(struct block_device *bdev, * nothing to do with partitions, BLKRRPART is used to force a full * revalidate after things like a format for historical reasons. */ - return test_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + return test_bit(GD_NEED_PART_SCAN, &disk->state); } /** @@ -1330,7 +1329,7 @@ static int sd_open(struct block_device *bdev, fmode_t mode) if (!scsi_block_when_processing_errors(sdev)) goto error_out; - if (sd_need_revalidate(bdev, sdkp)) + if (sd_need_revalidate(bdev->bd_disk, sdkp)) sd_revalidate_disk(bdev->bd_disk); /* diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 6d33120ee5ba..1592e6e10c74 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -494,7 +494,7 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode) return -ENXIO; scsi_autopm_get_device(sdev); - if (bdev_check_media_change(bdev)) + if (disk_check_media_change(bdev->bd_disk)) sr_revalidate_disk(cd); mutex_lock(&cd->lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f4c339d9dd03..a1688eba7e5e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -817,7 +817,7 @@ int __register_blkdev(unsigned int major, const char *name, __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); -bool bdev_check_media_change(struct block_device *bdev); +bool disk_check_media_change(struct gendisk *disk); int __invalidate_device(struct block_device *bdev, bool kill_dirty); void set_capacity(struct gendisk *disk, sector_t size); From d32e2bf83791727a84ad5d3e3d713e82f9adbe30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:36 +0200 Subject: [PATCH 147/266] block: pass a gendisk to ->open ->open is only called on the whole device. Make that explicit by passing a gendisk instead of the block_device. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: Jack Wang [rnbd] Link: https://lore.kernel.org/r/20230608110258.189493-9-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 5 ++--- arch/xtensa/platforms/iss/simdisk.c | 4 ++-- block/bdev.c | 2 +- drivers/block/amiflop.c | 8 ++++---- drivers/block/aoe/aoeblk.c | 4 ++-- drivers/block/ataflop.c | 16 +++++++-------- drivers/block/drbd/drbd_main.c | 6 +++--- drivers/block/floppy.c | 30 +++++++++++++++-------------- drivers/block/nbd.c | 8 ++++---- drivers/block/pktcdvd.c | 6 +++--- drivers/block/rbd.c | 4 ++-- drivers/block/rnbd/rnbd-clt.c | 4 ++-- drivers/block/swim.c | 10 +++++----- drivers/block/swim3.c | 10 +++++----- drivers/block/ublk_drv.c | 4 ++-- drivers/block/z2ram.c | 6 ++---- drivers/block/zram/zram_drv.c | 13 +++++-------- drivers/cdrom/gdrom.c | 4 ++-- drivers/md/bcache/super.c | 4 ++-- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 6 +++--- drivers/mmc/core/block.c | 4 ++-- drivers/mtd/mtd_blkdevs.c | 4 ++-- drivers/mtd/ubi/block.c | 4 ++-- drivers/nvme/host/core.c | 4 ++-- drivers/nvme/host/multipath.c | 4 ++-- drivers/s390/block/dasd.c | 4 ++-- drivers/s390/block/dcssblk.c | 7 +++---- drivers/scsi/sd.c | 12 ++++++------ drivers/scsi/sr.c | 6 +++--- include/linux/blkdev.h | 2 +- 31 files changed, 102 insertions(+), 107 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index f4c1e6e97ad5..6b831f82881b 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,7 +108,7 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) static DEFINE_MUTEX(ubd_lock); static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ -static int ubd_open(struct block_device *bdev, fmode_t mode); +static int ubd_open(struct gendisk *disk, fmode_t mode); static void ubd_release(struct gendisk *disk, fmode_t mode); static int ubd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); @@ -1154,9 +1154,8 @@ static int __init ubd_driver_init(void){ device_initcall(ubd_driver_init); -static int ubd_open(struct block_device *bdev, fmode_t mode) +static int ubd_open(struct gendisk *disk, fmode_t mode) { - struct gendisk *disk = bdev->bd_disk; struct ubd *ubd_dev = disk->private_data; int err = 0; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index f50caaa1c249..38f95f79a127 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -120,9 +120,9 @@ static void simdisk_submit_bio(struct bio *bio) bio_endio(bio); } -static int simdisk_open(struct block_device *bdev, fmode_t mode) +static int simdisk_open(struct gendisk *disk, fmode_t mode) { - struct simdisk *dev = bdev->bd_disk->private_data; + struct simdisk *dev = disk->private_data; spin_lock(&dev->lock); ++dev->users; diff --git a/block/bdev.c b/block/bdev.c index 981f61357951..8a5fded303d4 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -652,7 +652,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) int ret; if (disk->fops->open) { - ret = disk->fops->open(bdev, mode); + ret = disk->fops->open(disk, mode); if (ret) { /* avoid ghost partitions on a removed medium */ if (ret == -ENOMEDIUM && diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 6de12b311749..0cf2e58294be 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1654,10 +1654,10 @@ static void fd_probe(int dev) * /dev/PS0 etc), and disallows simultaneous access to the same * drive with different device numbers. */ -static int floppy_open(struct block_device *bdev, fmode_t mode) +static int floppy_open(struct gendisk *disk, fmode_t mode) { - int drive = MINOR(bdev->bd_dev) & 3; - int system = (MINOR(bdev->bd_dev) & 4) >> 2; + int drive = disk->first_minor & 3; + int system = (disk->first_minor & 4) >> 2; int old_dev; unsigned long flags; @@ -1675,7 +1675,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) } if (mode & (FMODE_READ|FMODE_WRITE)) { - disk_check_media_change(bdev->bd_disk); + disk_check_media_change(disk); if (mode & FMODE_WRITE) { int wrprot; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 128722cf6c3c..4ca6bbb326d5 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -204,9 +204,9 @@ aoedisk_rm_debugfs(struct aoedev *d) } static int -aoeblk_open(struct block_device *bdev, fmode_t mode) +aoeblk_open(struct gendisk *disk, fmode_t mode) { - struct aoedev *d = bdev->bd_disk->private_data; + struct aoedev *d = disk->private_data; ulong flags; if (!virt_addr_valid(d)) { diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index da481ddbca90..4febd52be78c 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -447,7 +447,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int static void fd_probe( int drive ); static int fd_test_drive_present( int drive ); static void config_types( void ); -static int floppy_open(struct block_device *bdev, fmode_t mode); +static int floppy_open(struct gendisk *disk, fmode_t mode); static void floppy_release(struct gendisk *disk, fmode_t mode); /************************* End of Prototypes **************************/ @@ -1915,10 +1915,10 @@ static void __init config_types( void ) * drive with different device numbers. */ -static int floppy_open(struct block_device *bdev, fmode_t mode) +static int floppy_open(struct gendisk *disk, fmode_t mode) { - struct atari_floppy_struct *p = bdev->bd_disk->private_data; - int type = MINOR(bdev->bd_dev) >> 2; + struct atari_floppy_struct *p = disk->private_data; + int type = disk->first_minor >> 2; DPRINT(("fd_open: type=%d\n",type)); if (p->ref && p->type != type) @@ -1938,8 +1938,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return 0; if (mode & (FMODE_READ|FMODE_WRITE)) { - if (disk_check_media_change(bdev->bd_disk)) - floppy_revalidate(bdev->bd_disk); + if (disk_check_media_change(disk)) + floppy_revalidate(disk); if (mode & FMODE_WRITE) { if (p->wpstat) { if (p->ref < 0) @@ -1953,12 +1953,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return 0; } -static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +static int floppy_unlocked_open(struct gendisk *disk, fmode_t mode) { int ret; mutex_lock(&ataflop_mutex); - ret = floppy_open(bdev, mode); + ret = floppy_open(disk, mode); mutex_unlock(&ataflop_mutex); return ret; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 54223f64610a..8b6c19460f34 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -49,7 +49,7 @@ #include "drbd_debugfs.h" static DEFINE_MUTEX(drbd_main_mutex); -static int drbd_open(struct block_device *bdev, fmode_t mode); +static int drbd_open(struct gendisk *disk, fmode_t mode); static void drbd_release(struct gendisk *gd, fmode_t mode); static void md_sync_timer_fn(struct timer_list *t); static int w_bitmap_io(struct drbd_work *w, int unused); @@ -1882,9 +1882,9 @@ int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void return 0; } -static int drbd_open(struct block_device *bdev, fmode_t mode) +static int drbd_open(struct gendisk *disk, fmode_t mode) { - struct drbd_device *device = bdev->bd_disk->private_data; + struct drbd_device *device = disk->private_data; unsigned long flags; int rv = 0; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 3accafcbc95c..ef3bbb7c185b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -402,7 +402,7 @@ static struct floppy_drive_struct drive_state[N_DRIVE]; static struct floppy_write_errors write_errors[N_DRIVE]; static struct timer_list motor_off_timer[N_DRIVE]; static struct blk_mq_tag_set tag_sets[N_DRIVE]; -static struct block_device *opened_bdev[N_DRIVE]; +static struct gendisk *opened_disk[N_DRIVE]; static DEFINE_MUTEX(open_lock); static struct floppy_raw_cmd *raw_cmd, default_raw_cmd; @@ -3251,10 +3251,11 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g, floppy_type[type].size + 1; process_fd_request(); for (cnt = 0; cnt < N_DRIVE; cnt++) { - struct block_device *bdev = opened_bdev[cnt]; - if (!bdev || ITYPE(drive_state[cnt].fd_device) != type) + struct gendisk *disk = opened_disk[cnt]; + + if (!disk || ITYPE(drive_state[cnt].fd_device) != type) continue; - __invalidate_device(bdev, true); + __invalidate_device(disk->part0, true); } mutex_unlock(&open_lock); } else { @@ -3973,7 +3974,7 @@ static void floppy_release(struct gendisk *disk, fmode_t mode) drive_state[drive].fd_ref = 0; } if (!drive_state[drive].fd_ref) - opened_bdev[drive] = NULL; + opened_disk[drive] = NULL; mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); } @@ -3983,9 +3984,9 @@ static void floppy_release(struct gendisk *disk, fmode_t mode) * /dev/PS0 etc), and disallows simultaneous access to the same * drive with different device numbers. */ -static int floppy_open(struct block_device *bdev, fmode_t mode) +static int floppy_open(struct gendisk *disk, fmode_t mode) { - int drive = (long)bdev->bd_disk->private_data; + int drive = (long)disk->private_data; int old_dev, new_dev; int try; int res = -EBUSY; @@ -3994,7 +3995,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) mutex_lock(&floppy_mutex); mutex_lock(&open_lock); old_dev = drive_state[drive].fd_device; - if (opened_bdev[drive] && opened_bdev[drive] != bdev) + if (opened_disk[drive] && opened_disk[drive] != disk) goto out2; if (!drive_state[drive].fd_ref && (drive_params[drive].flags & FD_BROKEN_DCL)) { @@ -4004,7 +4005,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) drive_state[drive].fd_ref++; - opened_bdev[drive] = bdev; + opened_disk[drive] = disk; res = -ENXIO; @@ -4038,7 +4039,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) } } - new_dev = MINOR(bdev->bd_dev); + new_dev = disk->first_minor; drive_state[drive].fd_device = new_dev; set_capacity(disks[drive][ITYPE(new_dev)], floppy_sizes[new_dev]); if (old_dev != -1 && old_dev != new_dev) { @@ -4054,8 +4055,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) drive_state[drive].last_checked = 0; clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags); - if (disk_check_media_change(bdev->bd_disk)) - floppy_revalidate(bdev->bd_disk); + if (disk_check_media_change(disk)) + floppy_revalidate(disk); if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags)) goto out; if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags)) @@ -4073,7 +4074,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) drive_state[drive].fd_ref--; if (!drive_state[drive].fd_ref) - opened_bdev[drive] = NULL; + opened_disk[drive] = NULL; out2: mutex_unlock(&open_lock); mutex_unlock(&floppy_mutex); @@ -4203,7 +4204,8 @@ static int floppy_revalidate(struct gendisk *disk) drive_state[drive].generation++; if (drive_no_geom(drive)) { /* auto-sensing */ - res = __floppy_read_block_0(opened_bdev[drive], drive); + res = __floppy_read_block_0(opened_disk[drive]->part0, + drive); } else { if (cf) poll_drive(false, FD_RAW_NEED_DISK); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6457a094abcc..14202b6a3550 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1553,13 +1553,13 @@ static struct nbd_config *nbd_alloc_config(void) return config; } -static int nbd_open(struct block_device *bdev, fmode_t mode) +static int nbd_open(struct gendisk *disk, fmode_t mode) { struct nbd_device *nbd; int ret = 0; mutex_lock(&nbd_index_mutex); - nbd = bdev->bd_disk->private_data; + nbd = disk->private_data; if (!nbd) { ret = -ENXIO; goto out; @@ -1587,10 +1587,10 @@ static int nbd_open(struct block_device *bdev, fmode_t mode) refcount_inc(&nbd->refs); mutex_unlock(&nbd->config_lock); if (max_part) - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + set_bit(GD_NEED_PART_SCAN, &disk->state); } else if (nbd_disconnected(nbd->config)) { if (max_part) - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + set_bit(GD_NEED_PART_SCAN, &disk->state); } out: mutex_unlock(&nbd_index_mutex); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index af1140548adb..93478d5a3fc4 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2248,14 +2248,14 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) return pkt_devs[dev_minor]; } -static int pkt_open(struct block_device *bdev, fmode_t mode) +static int pkt_open(struct gendisk *disk, fmode_t mode) { struct pktcdvd_device *pd = NULL; int ret; mutex_lock(&pktcdvd_mutex); mutex_lock(&ctl_mutex); - pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); + pd = pkt_find_dev_from_minor(disk->first_minor); if (!pd) { ret = -ENODEV; goto out; @@ -2277,7 +2277,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) * needed here as well, since ext2 (among others) may change * the blocksize at mount time */ - set_blocksize(bdev, CD_FRAMESIZE); + set_blocksize(disk->part0, CD_FRAMESIZE); } mutex_unlock(&ctl_mutex); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 84ad3b17956f..93231061db2f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -660,9 +660,9 @@ static bool pending_result_dec(struct pending_result *pending, int *result) return true; } -static int rbd_open(struct block_device *bdev, fmode_t mode) +static int rbd_open(struct gendisk *disk, fmode_t mode) { - struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + struct rbd_device *rbd_dev = disk->private_data; bool removing = false; spin_lock_irq(&rbd_dev->lock); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 5eb8c7855970..8ec00f4caf6b 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -921,9 +921,9 @@ rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first) return sess; } -static int rnbd_client_open(struct block_device *block_device, fmode_t mode) +static int rnbd_client_open(struct gendisk *disk, fmode_t mode) { - struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + struct rnbd_clt_dev *dev = disk->private_data; if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE)) return -EPERM; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 105bc5fd1b8c..7ec8554187f7 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -608,9 +608,9 @@ static void setup_medium(struct floppy_state *fs) } } -static int floppy_open(struct block_device *bdev, fmode_t mode) +static int floppy_open(struct gendisk *disk, fmode_t mode) { - struct floppy_state *fs = bdev->bd_disk->private_data; + struct floppy_state *fs = disk->private_data; struct swim __iomem *base = fs->swd->base; int err; @@ -640,7 +640,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return 0; if (mode & (FMODE_READ|FMODE_WRITE)) { - if (disk_check_media_change(bdev->bd_disk) && fs->disk_in) + if (disk_check_media_change(disk) && fs->disk_in) fs->ejected = 0; if ((mode & FMODE_WRITE) && fs->write_protected) { err = -EROFS; @@ -659,12 +659,12 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) return err; } -static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +static int floppy_unlocked_open(struct gendisk *disk, fmode_t mode) { int ret; mutex_lock(&swim_mutex); - ret = floppy_open(bdev, mode); + ret = floppy_open(disk, mode); mutex_unlock(&swim_mutex); return ret; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 3d689ba312f5..c05a4e110d52 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -248,7 +248,7 @@ static void release_drive(struct floppy_state *fs); static int fd_eject(struct floppy_state *fs); static int floppy_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long param); -static int floppy_open(struct block_device *bdev, fmode_t mode); +static int floppy_open(struct gendisk *disk, fmode_t mode); static void floppy_release(struct gendisk *disk, fmode_t mode); static unsigned int floppy_check_events(struct gendisk *disk, unsigned int clearing); @@ -923,9 +923,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode, return ret; } -static int floppy_open(struct block_device *bdev, fmode_t mode) +static int floppy_open(struct gendisk *disk, fmode_t mode) { - struct floppy_state *fs = bdev->bd_disk->private_data; + struct floppy_state *fs = disk->private_data; struct swim3 __iomem *sw = fs->swim3; int n, err = 0; @@ -963,8 +963,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (err == 0 && (mode & FMODE_NDELAY) == 0 && (mode & (FMODE_READ|FMODE_WRITE))) { - if (disk_check_media_change(bdev->bd_disk)) - floppy_revalidate(bdev->bd_disk); + if (disk_check_media_change(disk)) + floppy_revalidate(disk); if (fs->ejected) err = -ENXIO; } diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 222a0341913f..92c900ac2ebc 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -447,9 +447,9 @@ static void ublk_store_owner_uid_gid(unsigned int *owner_uid, *owner_gid = from_kgid(&init_user_ns, gid); } -static int ublk_open(struct block_device *bdev, fmode_t mode) +static int ublk_open(struct gendisk *disk, fmode_t mode) { - struct ublk_device *ub = bdev->bd_disk->private_data; + struct ublk_device *ub = disk->private_data; if (capable(CAP_SYS_ADMIN)) return 0; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index c1e85f356e4d..a5575e012e29 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -140,16 +140,14 @@ static void get_chipram(void) return; } -static int z2_open(struct block_device *bdev, fmode_t mode) +static int z2_open(struct gendisk *disk, fmode_t mode) { - int device; + int device = disk->first_minor; int max_z2_map = (Z2RAM_SIZE / Z2RAM_CHUNKSIZE) * sizeof(z2ram_map[0]); int max_chip_map = (amiga_chip_size / Z2RAM_CHUNKSIZE) * sizeof(z2ram_map[0]); int rc = -ENOMEM; - device = MINOR(bdev->bd_dev); - mutex_lock(&z2ram_mutex); if (current_device != -1 && current_device != device) { rc = -EBUSY; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0bc779446c6f..f5644c606040 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2097,19 +2097,16 @@ static ssize_t reset_store(struct device *dev, return len; } -static int zram_open(struct block_device *bdev, fmode_t mode) +static int zram_open(struct gendisk *disk, fmode_t mode) { - int ret = 0; - struct zram *zram; + struct zram *zram = disk->private_data; - WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex)); + WARN_ON(!mutex_is_locked(&disk->open_mutex)); - zram = bdev->bd_disk->private_data; /* zram was claimed to reset so open request fails */ if (zram->claim) - ret = -EBUSY; - - return ret; + return -EBUSY; + return 0; } static const struct block_device_operations zram_devops = { diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 3cb92df38ebe..d35dd717e9fc 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -474,11 +474,11 @@ static const struct cdrom_device_ops gdrom_ops = { CDC_RESET | CDC_DRIVE_STATUS | CDC_CD_R, }; -static int gdrom_bdops_open(struct block_device *bdev, fmode_t mode) +static int gdrom_bdops_open(struct gendisk *disk, fmode_t mode) { int ret; - disk_check_media_change(bdev->bd_disk); + disk_check_media_change(disk); mutex_lock(&gdrom_mutex); ret = cdrom_open(gd.cd_info); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d84c09a73af8..6683f66e7011 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -732,9 +732,9 @@ static int prio_read(struct cache *ca, uint64_t bucket) /* Bcache device */ -static int open_dev(struct block_device *b, fmode_t mode) +static int open_dev(struct gendisk *disk, fmode_t mode) { - struct bcache_device *d = b->bd_disk->private_data; + struct bcache_device *d = disk->private_data; if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d759f8bdb3df..06047a0ca4b3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -310,13 +310,13 @@ int dm_deleting_md(struct mapped_device *md) return test_bit(DMF_DELETING, &md->flags); } -static int dm_blk_open(struct block_device *bdev, fmode_t mode) +static int dm_blk_open(struct gendisk *disk, fmode_t mode) { struct mapped_device *md; spin_lock(&_minor_lock); - md = bdev->bd_disk->private_data; + md = disk->private_data; if (!md) goto out; diff --git a/drivers/md/md.c b/drivers/md/md.c index 77046c91bea4..aba13830bdb5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7767,13 +7767,13 @@ static int md_set_read_only(struct block_device *bdev, bool ro) return err; } -static int md_open(struct block_device *bdev, fmode_t mode) +static int md_open(struct gendisk *disk, fmode_t mode) { struct mddev *mddev; int err; spin_lock(&all_mddevs_lock); - mddev = mddev_get(bdev->bd_disk->private_data); + mddev = mddev_get(disk->private_data); spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; @@ -7789,7 +7789,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); - disk_check_media_change(bdev->bd_disk); + disk_check_media_change(disk); return 0; out_unlock: diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 00c33edb9fb9..fe217658705d 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -357,9 +357,9 @@ static const struct attribute_group *mmc_disk_attr_groups[] = { NULL, }; -static int mmc_blk_open(struct block_device *bdev, fmode_t mode) +static int mmc_blk_open(struct gendisk *disk, fmode_t mode) { - struct mmc_blk_data *md = mmc_blk_get(bdev->bd_disk); + struct mmc_blk_data *md = mmc_blk_get(disk); int ret = -ENXIO; mutex_lock(&block_mutex); diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 60b222799871..95f3ee6bde84 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -182,9 +182,9 @@ static blk_status_t mtd_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int blktrans_open(struct block_device *bdev, fmode_t mode) +static int blktrans_open(struct gendisk *disk, fmode_t mode) { - struct mtd_blktrans_dev *dev = bdev->bd_disk->private_data; + struct mtd_blktrans_dev *dev = disk->private_data; int ret = 0; kref_get(&dev->ref); diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 3711d7f74600..2f3442963919 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -227,9 +227,9 @@ static blk_status_t ubiblock_read(struct request *req) return BLK_STS_OK; } -static int ubiblock_open(struct block_device *bdev, fmode_t mode) +static int ubiblock_open(struct gendisk *disk, fmode_t mode) { - struct ubiblock *dev = bdev->bd_disk->private_data; + struct ubiblock *dev = disk->private_data; int ret; mutex_lock(&dev->dev_mutex); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ccb6eb1282f8..b1c8af5d9376 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1591,9 +1591,9 @@ static void nvme_ns_release(struct nvme_ns *ns) nvme_put_ns(ns); } -static int nvme_open(struct block_device *bdev, fmode_t mode) +static int nvme_open(struct gendisk *disk, fmode_t mode) { - return nvme_ns_open(bdev->bd_disk->private_data); + return nvme_ns_open(disk->private_data); } static void nvme_release(struct gendisk *disk, fmode_t mode) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9171452e2f6d..e8d5d62efa6d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -402,9 +402,9 @@ static void nvme_ns_head_submit_bio(struct bio *bio) srcu_read_unlock(&head->srcu, srcu_idx); } -static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +static int nvme_ns_head_open(struct gendisk *disk, fmode_t mode) { - if (!nvme_tryget_ns_head(bdev->bd_disk->private_data)) + if (!nvme_tryget_ns_head(disk->private_data)) return -ENXIO; return 0; } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 9fbfce735d56..e445b5fbd7fd 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3234,12 +3234,12 @@ struct blk_mq_ops dasd_mq_ops = { .exit_hctx = dasd_exit_hctx, }; -static int dasd_open(struct block_device *bdev, fmode_t mode) +static int dasd_open(struct gendisk *disk, fmode_t mode) { struct dasd_device *base; int rc; - base = dasd_device_from_gendisk(bdev->bd_disk); + base = dasd_device_from_gendisk(disk); if (!base) return -ENODEV; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index c09f2e053bf8..6150d20b5843 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -28,7 +28,7 @@ #define DCSSBLK_PARM_LEN 400 #define DCSS_BUS_ID_SIZE 20 -static int dcssblk_open(struct block_device *bdev, fmode_t mode); +static int dcssblk_open(struct gendisk *disk, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); static void dcssblk_submit_bio(struct bio *bio); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, @@ -809,12 +809,11 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch } static int -dcssblk_open(struct block_device *bdev, fmode_t mode) +dcssblk_open(struct gendisk *disk, fmode_t mode) { - struct dcssblk_dev_info *dev_info; + struct dcssblk_dev_info *dev_info = disk->private_data; int rc; - dev_info = bdev->bd_disk->private_data; if (NULL == dev_info) { rc = -ENODEV; goto out; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index aab649d5bad3..c31a675db015 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1297,7 +1297,7 @@ static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) /** * sd_open - open a scsi disk device - * @bdev: Block device of the scsi disk to open + * @disk: disk to open * @mode: FMODE_* mask * * Returns 0 if successful. Returns a negated errno value in case @@ -1308,11 +1308,11 @@ static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * - * Locking: called with bdev->bd_disk->open_mutex held. + * Locking: called with disk->open_mutex held. **/ -static int sd_open(struct block_device *bdev, fmode_t mode) +static int sd_open(struct gendisk *disk, fmode_t mode) { - struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); + struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; int retval; @@ -1329,8 +1329,8 @@ static int sd_open(struct block_device *bdev, fmode_t mode) if (!scsi_block_when_processing_errors(sdev)) goto error_out; - if (sd_need_revalidate(bdev->bd_disk, sdkp)) - sd_revalidate_disk(bdev->bd_disk); + if (sd_need_revalidate(disk, sdkp)) + sd_revalidate_disk(disk); /* * If the drive is empty, just let the open fail. diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 1592e6e10c74..3ff3a2f96047 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -484,9 +484,9 @@ static void sr_revalidate_disk(struct scsi_cd *cd) get_sectorsize(cd); } -static int sr_block_open(struct block_device *bdev, fmode_t mode) +static int sr_block_open(struct gendisk *disk, fmode_t mode) { - struct scsi_cd *cd = scsi_cd(bdev->bd_disk); + struct scsi_cd *cd = scsi_cd(disk); struct scsi_device *sdev = cd->device; int ret; @@ -494,7 +494,7 @@ static int sr_block_open(struct block_device *bdev, fmode_t mode) return -ENXIO; scsi_autopm_get_device(sdev); - if (disk_check_media_change(bdev->bd_disk)) + if (disk_check_media_change(disk)) sr_revalidate_disk(cd); mutex_lock(&cd->lock); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a1688eba7e5e..1366eea88186 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1386,7 +1386,7 @@ struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); - int (*open) (struct block_device *, fmode_t); + int (*open)(struct gendisk *disk, fmode_t mode); void (*release) (struct gendisk *, fmode_t); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); From ae220766d87cd6799dbf918fea10613ae14c0654 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:37 +0200 Subject: [PATCH 148/266] block: remove the unused mode argument to ->release The mode argument to the ->release block_device_operation is never used, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: Jack Wang [rnbd] Link: https://lore.kernel.org/r/20230608110258.189493-10-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 4 ++-- arch/xtensa/platforms/iss/simdisk.c | 2 +- block/bdev.c | 14 +++++++------- drivers/block/amiflop.c | 2 +- drivers/block/aoe/aoeblk.c | 2 +- drivers/block/ataflop.c | 4 ++-- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/floppy.c | 2 +- drivers/block/loop.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/pktcdvd.c | 4 ++-- drivers/block/rbd.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/block/swim.c | 2 +- drivers/block/swim3.c | 3 +-- drivers/block/z2ram.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/dm.c | 2 +- drivers/md/md.c | 2 +- drivers/mmc/core/block.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/mtd/ubi/block.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 2 +- drivers/s390/block/dasd.c | 2 +- drivers/s390/block/dcssblk.c | 4 ++-- drivers/scsi/sd.c | 3 +-- drivers/scsi/sr.c | 2 +- include/linux/blkdev.h | 2 +- 30 files changed, 41 insertions(+), 43 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 6b831f82881b..8b79554968ad 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -109,7 +109,7 @@ static DEFINE_MUTEX(ubd_lock); static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ static int ubd_open(struct gendisk *disk, fmode_t mode); -static void ubd_release(struct gendisk *disk, fmode_t mode); +static void ubd_release(struct gendisk *disk); static int ubd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -1182,7 +1182,7 @@ static int ubd_open(struct gendisk *disk, fmode_t mode) return err; } -static void ubd_release(struct gendisk *disk, fmode_t mode) +static void ubd_release(struct gendisk *disk) { struct ubd *ubd_dev = disk->private_data; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 38f95f79a127..2ad9da3de0d9 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -130,7 +130,7 @@ static int simdisk_open(struct gendisk *disk, fmode_t mode) return 0; } -static void simdisk_release(struct gendisk *disk, fmode_t mode) +static void simdisk_release(struct gendisk *disk) { struct simdisk *dev = disk->private_data; spin_lock(&dev->lock); diff --git a/block/bdev.c b/block/bdev.c index 8a5fded303d4..2c6888ceb378 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -670,12 +670,12 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) return 0; } -static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) +static void blkdev_put_whole(struct block_device *bdev) { if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk, mode); + bdev->bd_disk->fops->release(bdev->bd_disk); } static int blkdev_get_part(struct block_device *part, fmode_t mode) @@ -699,11 +699,11 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) return 0; out_blkdev_put: - blkdev_put_whole(bdev_whole(part), mode); + blkdev_put_whole(bdev_whole(part)); return ret; } -static void blkdev_put_part(struct block_device *part, fmode_t mode) +static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); @@ -711,7 +711,7 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; } - blkdev_put_whole(whole, mode); + blkdev_put_whole(whole); } struct block_device *blkdev_get_no_open(dev_t dev) @@ -903,9 +903,9 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); if (bdev_is_partition(bdev)) - blkdev_put_part(bdev, mode); + blkdev_put_part(bdev); else - blkdev_put_whole(bdev, mode); + blkdev_put_whole(bdev); mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 0cf2e58294be..9a0e9dc74a8c 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1709,7 +1709,7 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) return 0; } -static void floppy_release(struct gendisk *disk, fmode_t mode) +static void floppy_release(struct gendisk *disk) { struct amiga_floppy_struct *p = disk->private_data; int drive = p - unit; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 4ca6bbb326d5..c3a39e02ab95 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -232,7 +232,7 @@ aoeblk_open(struct gendisk *disk, fmode_t mode) } static void -aoeblk_release(struct gendisk *disk, fmode_t mode) +aoeblk_release(struct gendisk *disk) { struct aoedev *d = disk->private_data; ulong flags; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 4febd52be78c..66a3242bb062 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -448,7 +448,7 @@ static void fd_probe( int drive ); static int fd_test_drive_present( int drive ); static void config_types( void ); static int floppy_open(struct gendisk *disk, fmode_t mode); -static void floppy_release(struct gendisk *disk, fmode_t mode); +static void floppy_release(struct gendisk *disk); /************************* End of Prototypes **************************/ @@ -1964,7 +1964,7 @@ static int floppy_unlocked_open(struct gendisk *disk, fmode_t mode) return ret; } -static void floppy_release(struct gendisk *disk, fmode_t mode) +static void floppy_release(struct gendisk *disk) { struct atari_floppy_struct *p = disk->private_data; mutex_lock(&ataflop_mutex); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 8b6c19460f34..7f3d7ca6ce6b 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -50,7 +50,7 @@ static DEFINE_MUTEX(drbd_main_mutex); static int drbd_open(struct gendisk *disk, fmode_t mode); -static void drbd_release(struct gendisk *gd, fmode_t mode); +static void drbd_release(struct gendisk *gd); static void md_sync_timer_fn(struct timer_list *t); static int w_bitmap_io(struct drbd_work *w, int unused); @@ -1908,7 +1908,7 @@ static int drbd_open(struct gendisk *disk, fmode_t mode) return rv; } -static void drbd_release(struct gendisk *gd, fmode_t mode) +static void drbd_release(struct gendisk *gd) { struct drbd_device *device = gd->private_data; mutex_lock(&drbd_main_mutex); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index ef3bbb7c185b..d79fac288a73 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3963,7 +3963,7 @@ static void __init config_types(void) pr_cont("\n"); } -static void floppy_release(struct gendisk *disk, fmode_t mode) +static void floppy_release(struct gendisk *disk) { int drive = (long)disk->private_data; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a73c857f5bfe..ca40d24572ae 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1727,7 +1727,7 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, } #endif -static void lo_release(struct gendisk *disk, fmode_t mode) +static void lo_release(struct gendisk *disk) { struct loop_device *lo = disk->private_data; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 14202b6a3550..cfb835238684 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1597,7 +1597,7 @@ static int nbd_open(struct gendisk *disk, fmode_t mode) return ret; } -static void nbd_release(struct gendisk *disk, fmode_t mode) +static void nbd_release(struct gendisk *disk) { struct nbd_device *nbd = disk->private_data; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 93478d5a3fc4..7bfc058cb665 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2292,7 +2292,7 @@ static int pkt_open(struct gendisk *disk, fmode_t mode) return ret; } -static void pkt_close(struct gendisk *disk, fmode_t mode) +static void pkt_release(struct gendisk *disk) { struct pktcdvd_device *pd = disk->private_data; @@ -2616,7 +2616,7 @@ static const struct block_device_operations pktcdvd_ops = { .owner = THIS_MODULE, .submit_bio = pkt_submit_bio, .open = pkt_open, - .release = pkt_close, + .release = pkt_release, .ioctl = pkt_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl, .check_events = pkt_check_events, diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 93231061db2f..5215eff94fe9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -679,7 +679,7 @@ static int rbd_open(struct gendisk *disk, fmode_t mode) return 0; } -static void rbd_release(struct gendisk *disk, fmode_t mode) +static void rbd_release(struct gendisk *disk) { struct rbd_device *rbd_dev = disk->private_data; unsigned long open_count_before; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 8ec00f4caf6b..d5261d36d786 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -935,7 +935,7 @@ static int rnbd_client_open(struct gendisk *disk, fmode_t mode) return 0; } -static void rnbd_client_release(struct gendisk *gen, fmode_t mode) +static void rnbd_client_release(struct gendisk *gen) { struct rnbd_clt_dev *dev = gen->private_data; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 7ec8554187f7..a629b38dec66 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -670,7 +670,7 @@ static int floppy_unlocked_open(struct gendisk *disk, fmode_t mode) return ret; } -static void floppy_release(struct gendisk *disk, fmode_t mode) +static void floppy_release(struct gendisk *disk) { struct floppy_state *fs = disk->private_data; struct swim __iomem *base = fs->swd->base; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c05a4e110d52..b696deff3d8b 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -249,7 +249,6 @@ static int fd_eject(struct floppy_state *fs); static int floppy_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long param); static int floppy_open(struct gendisk *disk, fmode_t mode); -static void floppy_release(struct gendisk *disk, fmode_t mode); static unsigned int floppy_check_events(struct gendisk *disk, unsigned int clearing); static int floppy_revalidate(struct gendisk *disk); @@ -1004,7 +1003,7 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) return ret; } -static void floppy_release(struct gendisk *disk, fmode_t mode) +static void floppy_release(struct gendisk *disk) { struct floppy_state *fs = disk->private_data; struct swim3 __iomem *sw = fs->swim3; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index a5575e012e29..a2e41cc084ca 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -288,7 +288,7 @@ static int z2_open(struct gendisk *disk, fmode_t mode) return rc; } -static void z2_release(struct gendisk *disk, fmode_t mode) +static void z2_release(struct gendisk *disk) { mutex_lock(&z2ram_mutex); if (current_device == -1) { diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index d35dd717e9fc..dac148d4d1fe 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -486,7 +486,7 @@ static int gdrom_bdops_open(struct gendisk *disk, fmode_t mode) return ret; } -static void gdrom_bdops_release(struct gendisk *disk, fmode_t mode) +static void gdrom_bdops_release(struct gendisk *disk) { mutex_lock(&gdrom_mutex); cdrom_release(gd.cd_info, mode); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6683f66e7011..94b91c45c9e2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -743,7 +743,7 @@ static int open_dev(struct gendisk *disk, fmode_t mode) return 0; } -static void release_dev(struct gendisk *b, fmode_t mode) +static void release_dev(struct gendisk *b) { struct bcache_device *d = b->private_data; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 06047a0ca4b3..246b8f028a98 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -334,7 +334,7 @@ static int dm_blk_open(struct gendisk *disk, fmode_t mode) return md ? 0 : -ENXIO; } -static void dm_blk_close(struct gendisk *disk, fmode_t mode) +static void dm_blk_close(struct gendisk *disk) { struct mapped_device *md; diff --git a/drivers/md/md.c b/drivers/md/md.c index aba13830bdb5..159197dd7b6d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7799,7 +7799,7 @@ static int md_open(struct gendisk *disk, fmode_t mode) return err; } -static void md_release(struct gendisk *disk, fmode_t mode) +static void md_release(struct gendisk *disk) { struct mddev *mddev = disk->private_data; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index fe217658705d..b16eedf22d4e 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -375,7 +375,7 @@ static int mmc_blk_open(struct gendisk *disk, fmode_t mode) return ret; } -static void mmc_blk_release(struct gendisk *disk, fmode_t mode) +static void mmc_blk_release(struct gendisk *disk) { struct mmc_blk_data *md = disk->private_data; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 95f3ee6bde84..f0bb09fde95e 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -225,7 +225,7 @@ static int blktrans_open(struct gendisk *disk, fmode_t mode) return ret; } -static void blktrans_release(struct gendisk *disk, fmode_t mode) +static void blktrans_release(struct gendisk *disk) { struct mtd_blktrans_dev *dev = disk->private_data; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 2f3442963919..e85fb9de0b70 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -270,7 +270,7 @@ static int ubiblock_open(struct gendisk *disk, fmode_t mode) return ret; } -static void ubiblock_release(struct gendisk *gd, fmode_t mode) +static void ubiblock_release(struct gendisk *gd) { struct ubiblock *dev = gd->private_data; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b1c8af5d9376..fd7f8e6d66fd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1596,7 +1596,7 @@ static int nvme_open(struct gendisk *disk, fmode_t mode) return nvme_ns_open(disk->private_data); } -static void nvme_release(struct gendisk *disk, fmode_t mode) +static void nvme_release(struct gendisk *disk) { nvme_ns_release(disk->private_data); } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index e8d5d62efa6d..698c0e70bcfa 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -409,7 +409,7 @@ static int nvme_ns_head_open(struct gendisk *disk, fmode_t mode) return 0; } -static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +static void nvme_ns_head_release(struct gendisk *disk) { nvme_put_ns_head(disk->private_data); } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index e445b5fbd7fd..19295b2df470 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3287,7 +3287,7 @@ static int dasd_open(struct gendisk *disk, fmode_t mode) return rc; } -static void dasd_release(struct gendisk *disk, fmode_t mode) +static void dasd_release(struct gendisk *disk) { struct dasd_device *base = dasd_device_from_gendisk(disk); if (base) { diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 6150d20b5843..5aee3106bfda 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -29,7 +29,7 @@ #define DCSS_BUS_ID_SIZE 20 static int dcssblk_open(struct gendisk *disk, fmode_t mode); -static void dcssblk_release(struct gendisk *disk, fmode_t mode); +static void dcssblk_release(struct gendisk *disk); static void dcssblk_submit_bio(struct bio *bio); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, void **kaddr, @@ -825,7 +825,7 @@ dcssblk_open(struct gendisk *disk, fmode_t mode) } static void -dcssblk_release(struct gendisk *disk, fmode_t mode) +dcssblk_release(struct gendisk *disk) { struct dcssblk_dev_info *dev_info = disk->private_data; struct segment_info *entry; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index c31a675db015..c67c84f6ba61 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1373,7 +1373,6 @@ static int sd_open(struct gendisk *disk, fmode_t mode) * sd_release - invoked when the (last) close(2) is called on this * scsi disk. * @disk: disk to release - * @mode: FMODE_* mask * * Returns 0. * @@ -1382,7 +1381,7 @@ static int sd_open(struct gendisk *disk, fmode_t mode) * * Locking: called with bdev->bd_disk->open_mutex held. **/ -static void sd_release(struct gendisk *disk, fmode_t mode) +static void sd_release(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 3ff3a2f96047..55082acb59bc 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -507,7 +507,7 @@ static int sr_block_open(struct gendisk *disk, fmode_t mode) return ret; } -static void sr_block_release(struct gendisk *disk, fmode_t mode) +static void sr_block_release(struct gendisk *disk) { struct scsi_cd *cd = scsi_cd(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1366eea88186..25bdd0cc74dc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1387,7 +1387,7 @@ struct block_device_operations { int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); int (*open)(struct gendisk *disk, fmode_t mode); - void (*release) (struct gendisk *, fmode_t); + void (*release)(struct gendisk *disk); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, From 7ee34cbc291a28134b60683b246ba58b4b676ec3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:38 +0200 Subject: [PATCH 149/266] block: rename blkdev_close to blkdev_release Make the function name match the method name. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-11-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/fops.c b/block/fops.c index 6a3087b750a6..26af2b39c758 100644 --- a/block/fops.c +++ b/block/fops.c @@ -500,7 +500,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) return 0; } -static int blkdev_close(struct inode *inode, struct file *filp) +static int blkdev_release(struct inode *inode, struct file *filp) { struct block_device *bdev = filp->private_data; @@ -677,7 +677,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, const struct file_operations def_blk_fops = { .open = blkdev_open, - .release = blkdev_close, + .release = blkdev_release, .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, From c889d0793d9dc07e94a5fddcc05356157fab00b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:39 +0200 Subject: [PATCH 150/266] swsusp: don't pass a stack address to blkdev_get_by_path holder is just an on-stack pointer that can easily be reused by other calls, replace it with a static variable that doesn't change. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20230608110258.189493-12-hch@lst.de Signed-off-by: Jens Axboe --- kernel/power/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 81aec3b2c605..b03ff1a33c7f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1510,6 +1510,8 @@ int swsusp_read(unsigned int *flags_p) return error; } +static void *swsusp_holder; + /** * swsusp_check - Check for swsusp signature in the resume device */ @@ -1517,14 +1519,13 @@ int swsusp_read(unsigned int *flags_p) int swsusp_check(bool snapshot_test) { int error; - void *holder; fmode_t mode = FMODE_READ; if (snapshot_test) mode |= FMODE_EXCL; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - mode, &holder, NULL); + mode, &swsusp_holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); From 29499ab060fec044161be73fb0e448eab97b4813 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:40 +0200 Subject: [PATCH 151/266] bcache: don't pass a stack address to blkdev_get_by_path sb is just an on-stack pointer that can easily be reused by other calls. Switch to use the bcache-wide bcache_kobj instead as there is no need to claim per-bcache device anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20230608110258.189493-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 94b91c45c9e2..4a2aed047aec 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2560,7 +2560,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, err = "failed to open device"; bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, - sb, NULL); + bcache_kobj, NULL); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { dev_t dev; From 5ee607675debef509946f8a251d4c30a21493ec2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:41 +0200 Subject: [PATCH 152/266] rnbd-srv: don't pass a holder for non-exclusive blkdev_get_by_path Passing a holder to blkdev_get_by_path when FMODE_EXCL isn't set doesn't make sense, so pass NULL instead. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: Jack Wang Link: https://lore.kernel.org/r/20230608110258.189493-14-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 0352c2585fb0..a92a4289d0ec 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -715,7 +715,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto reject; } - bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE, NULL); + bdev = blkdev_get_by_path(full_path, open_flags, NULL, NULL); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n", From 2ef789288afd365f4245ba97e56189062de5148e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:42 +0200 Subject: [PATCH 153/266] btrfs: don't pass a holder for non-exclusive blkdev_get_by_path Passing a holder to blkdev_get_by_path when FMODE_EXCL isn't set doesn't make sense, so pass NULL instead and remove the holder argument from the call chains the only end up in non-FMODE_EXCL blkdev_get_by_path calls. Exclusive mode for device scanning is not used since commit 50d281fc434c ("btrfs: scan device in non-exclusive mode")". Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: David Sterba Link: https://lore.kernel.org/r/20230608110258.189493-15-hch@lst.de Signed-off-by: Jens Axboe --- fs/btrfs/super.c | 16 ++++++---------- fs/btrfs/volumes.c | 17 ++++++++--------- fs/btrfs/volumes.h | 3 +-- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ec18e2210602..1a2ee9407f54 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -849,8 +849,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, * All other options will be parsed on much later in the mount process and * only when we need to allocate a new super block. */ -static int btrfs_parse_device_options(const char *options, fmode_t flags, - void *holder) +static int btrfs_parse_device_options(const char *options, fmode_t flags) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; @@ -884,8 +883,7 @@ static int btrfs_parse_device_options(const char *options, fmode_t flags, error = -ENOMEM; goto out; } - device = btrfs_scan_one_device(device_name, flags, - holder); + device = btrfs_scan_one_device(device_name, flags); kfree(device_name); if (IS_ERR(device)) { error = PTR_ERR(device); @@ -1477,13 +1475,13 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, } mutex_lock(&uuid_mutex); - error = btrfs_parse_device_options(data, mode, fs_type); + error = btrfs_parse_device_options(data, mode); if (error) { mutex_unlock(&uuid_mutex); goto error_fs_info; } - device = btrfs_scan_one_device(device_name, mode, fs_type); + device = btrfs_scan_one_device(device_name, mode); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); error = PTR_ERR(device); @@ -2190,8 +2188,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_root_fs_type); + device = btrfs_scan_one_device(vol->name, FMODE_READ); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2205,8 +2202,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, FMODE_READ, - &btrfs_root_fs_type); + device = btrfs_scan_one_device(vol->name, FMODE_READ); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR(device); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 784ccc8f6c69..035868cee3dd 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1348,8 +1348,7 @@ int btrfs_forget_devices(dev_t devt) * and we are not allowed to call set_blocksize during the scan. The superblock * is read via pagecache */ -struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, - void *holder) +struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags) { struct btrfs_super_block *disk_super; bool new_device_added = false; @@ -1368,16 +1367,16 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, */ /* - * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may - * initiate the device scan which may race with the user's mount - * or mkfs command, resulting in failure. - * Since the device scan is solely for reading purposes, there is - * no need for FMODE_EXCL. Additionally, the devices are read again + * Avoid an exclusive open here, as the systemd-udev may initiate the + * device scan which may race with the user's mount or mkfs command, + * resulting in failure. + * Since the device scan is solely for reading purposes, there is no + * need for an exclusive open. Additionally, the devices are read again * during the mount process. It is ok to get some inconsistent * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev = blkdev_get_by_path(path, flags, holder, NULL); + bdev = blkdev_get_by_path(path, flags, NULL, NULL); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -2381,7 +2380,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, return -ENOMEM; } - ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, + ret = btrfs_get_bdev_and_sb(path, FMODE_READ, NULL, 0, &bdev, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bf47a1a70813..eb97a397b3c3 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -600,8 +600,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_tree_free(struct extent_map_tree *tree); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, - fmode_t flags, void *holder); +struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); From 2736e8eeb0ccdc71d1f4256c9c9a28f58cc43307 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:43 +0200 Subject: [PATCH 154/266] block: use the holder as indication for exclusive opens The current interface for exclusive opens is rather confusing as it requires both the FMODE_EXCL flag and a holder. Remove the need to pass FMODE_EXCL and just key off the exclusive open off a non-NULL holder. For blkdev_put this requires adding the holder argument, which provides better debug checking that only the holder actually releases the hold, but at the same time allows removing the now superfluous mode argument. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: David Sterba [btrfs] Acked-by: Jack Wang [rnbd] Link: https://lore.kernel.org/r/20230608110258.189493-16-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 37 ++++++++++++++++------------ block/fops.c | 6 +++-- block/genhd.c | 5 ++-- block/ioctl.c | 5 ++-- drivers/block/drbd/drbd_nl.c | 23 ++++++++++------- drivers/block/pktcdvd.c | 13 +++++----- drivers/block/rnbd/rnbd-srv.c | 4 +-- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/zram/zram_drv.c | 8 +++--- drivers/md/bcache/super.c | 15 ++++++------ drivers/md/dm.c | 6 ++--- drivers/md/md.c | 38 +++++++++++++++-------------- drivers/mtd/devices/block2mtd.c | 4 +-- drivers/nvme/target/io-cmd-bdev.c | 2 +- drivers/s390/block/dasd_genhd.c | 2 +- drivers/target/target_core_iblock.c | 6 ++--- drivers/target/target_core_pscsi.c | 8 +++--- fs/btrfs/dev-replace.c | 6 ++--- fs/btrfs/ioctl.c | 12 ++++----- fs/btrfs/volumes.c | 28 ++++++++++----------- fs/btrfs/volumes.h | 6 ++--- fs/erofs/super.c | 7 +++--- fs/ext4/super.c | 11 +++------ fs/f2fs/super.c | 2 +- fs/jfs/jfs_logmgr.c | 6 ++--- fs/nfs/blocklayout/dev.c | 4 +-- fs/nilfs2/super.c | 6 ++--- fs/ocfs2/cluster/heartbeat.c | 4 +-- fs/reiserfs/journal.c | 19 +++++++-------- fs/reiserfs/reiserfs.h | 1 - fs/super.c | 20 +++++++-------- fs/xfs/xfs_super.c | 15 ++++++------ include/linux/blkdev.h | 2 +- kernel/power/hibernate.c | 12 +++------ kernel/power/power.h | 2 +- kernel/power/swap.c | 21 +++++++--------- mm/swapfile.c | 7 +++--- 37 files changed, 183 insertions(+), 192 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 2c6888ceb378..db63e5bcc46f 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -604,7 +604,7 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); -static void bd_end_claim(struct block_device *bdev) +static void bd_end_claim(struct block_device *bdev, void *holder) { struct block_device *whole = bdev_whole(bdev); bool unblock = false; @@ -614,6 +614,7 @@ static void bd_end_claim(struct block_device *bdev) * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. */ mutex_lock(&bdev_lock); + WARN_ON_ONCE(bdev->bd_holder != holder); WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--whole->bd_holders < 0); if (!bdev->bd_holders) { @@ -750,10 +751,9 @@ void blkdev_put_no_open(struct block_device *bdev) * @holder: exclusive holder identifier * @hops: holder operations * - * Open the block device described by device number @dev. If @mode includes - * %FMODE_EXCL, the block device is opened with exclusive access. Specifying - * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for - * the same @holder. + * Open the block device described by device number @dev. If @holder is not + * %NULL, the block device is opened with exclusive access. Exclusive opens may + * nest for the same @holder. * * Use this interface ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a device @@ -785,10 +785,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, return ERR_PTR(-ENXIO); disk = bdev->bd_disk; - if (mode & FMODE_EXCL) { + if (holder) { + mode |= FMODE_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) goto put_blkdev; + } else { + if (WARN_ON_ONCE(mode & FMODE_EXCL)) { + ret = -EIO; + goto put_blkdev; + } } disk_block_events(disk); @@ -805,7 +811,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; - if (mode & FMODE_EXCL) { + if (holder) { bd_finish_claiming(bdev, holder, hops); /* @@ -829,7 +835,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, put_module: module_put(disk->fops->owner); abort_claiming: - if (mode & FMODE_EXCL) + if (holder) bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); @@ -845,10 +851,9 @@ EXPORT_SYMBOL(blkdev_get_by_dev); * @mode: FMODE_* mask * @holder: exclusive holder identifier * - * Open the block device described by the device file at @path. If @mode - * includes %FMODE_EXCL, the block device is opened with exclusive access. - * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may - * nest for the same @holder. + * Open the block device described by the device file at @path. If @holder is + * not %NULL, the block device is opened with exclusive access. Exclusive opens + * may nest for the same @holder. * * CONTEXT: * Might sleep. @@ -869,7 +874,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, bdev = blkdev_get_by_dev(dev, mode, holder, hops); if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, mode); + blkdev_put(bdev, holder); return ERR_PTR(-EACCES); } @@ -877,7 +882,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); -void blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, void *holder) { struct gendisk *disk = bdev->bd_disk; @@ -892,8 +897,8 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - if (mode & FMODE_EXCL) - bd_end_claim(bdev); + if (holder) + bd_end_claim(bdev, holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE diff --git a/block/fops.c b/block/fops.c index 26af2b39c758..9f26e25bafa1 100644 --- a/block/fops.c +++ b/block/fops.c @@ -490,7 +490,9 @@ static int blkdev_open(struct inode *inode, struct file *filp) if ((filp->f_flags & O_ACCMODE) == 3) filp->f_mode |= FMODE_WRITE_IOCTL; - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp, NULL); + bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, + (filp->f_mode & FMODE_EXCL) ? filp : NULL, + NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -504,7 +506,7 @@ static int blkdev_release(struct inode *inode, struct file *filp) { struct block_device *bdev = filp->private_data; - blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, (filp->f_mode & FMODE_EXCL) ? filp : NULL); return 0; } diff --git a/block/genhd.c b/block/genhd.c index 4e5fd6aaa883..b56f8b5c88b3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -365,12 +365,11 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) } set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXCL, NULL, - NULL); + bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL, NULL); if (IS_ERR(bdev)) ret = PTR_ERR(bdev); else - blkdev_put(bdev, mode & ~FMODE_EXCL); + blkdev_put(bdev, NULL); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, diff --git a/block/ioctl.c b/block/ioctl.c index c7d7d4345edb..b39bd5b41ee4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -454,11 +454,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, if (mode & FMODE_EXCL) return set_blocksize(bdev, n); - if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev, - NULL))) + if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL))) return -EBUSY; ret = set_blocksize(bdev, n); - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, &bdev); return ret; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index cab59dab3410..10b1e5171332 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1640,8 +1640,7 @@ static struct block_device *open_backing_dev(struct drbd_device *device, struct block_device *bdev; int err = 0; - bdev = blkdev_get_by_path(bdev_path, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, + bdev = blkdev_get_by_path(bdev_path, FMODE_READ | FMODE_WRITE, claim_ptr, NULL); if (IS_ERR(bdev)) { drbd_err(device, "open(\"%s\") failed with %ld\n", @@ -1654,7 +1653,7 @@ static struct block_device *open_backing_dev(struct drbd_device *device, err = bd_link_disk_holder(bdev, device->vdisk); if (err) { - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, claim_ptr); drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", bdev_path, err); bdev = ERR_PTR(err); @@ -1696,13 +1695,13 @@ static int open_backing_devices(struct drbd_device *device, } static void close_backing_dev(struct drbd_device *device, struct block_device *bdev, - bool do_bd_unlink) + void *claim_ptr, bool do_bd_unlink) { if (!bdev) return; if (do_bd_unlink) bd_unlink_disk_holder(bdev, device->vdisk); - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, claim_ptr); } void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) @@ -1710,8 +1709,11 @@ void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev * if (ldev == NULL) return; - close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev); - close_backing_dev(device, ldev->backing_bdev, true); + close_backing_dev(device, ldev->md_bdev, + ldev->md.meta_dev_idx < 0 ? + (void *)device : (void *)drbd_m_holder, + ldev->md_bdev != ldev->backing_bdev); + close_backing_dev(device, ldev->backing_bdev, device, true); kfree(ldev->disk_conf); kfree(ldev); @@ -2127,8 +2129,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev); - close_backing_dev(device, nbc->backing_bdev, true); + close_backing_dev(device, nbc->md_bdev, + nbc->disk_conf->meta_dev_idx < 0 ? + (void *)device : (void *)drbd_m_holder, + nbc->md_bdev != nbc->backing_bdev); + close_backing_dev(device, nbc->backing_bdev, device, true); kfree(nbc); } kfree(new_disk_conf); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 7bfc058cb665..c3299e49edd5 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2167,8 +2167,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * to read/write from/to it. It is already opened in O_NONBLOCK mode * so open should not fail. */ - bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd, - NULL); + bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ, pd, NULL); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -2215,7 +2214,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) return 0; out_putdev: - blkdev_put(bdev, FMODE_READ | FMODE_EXCL); + blkdev_put(bdev, pd); out: return ret; } @@ -2234,7 +2233,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_lock_door(pd, 0); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); + blkdev_put(pd->bdev, pd); pkt_shrink_pktlist(pd); } @@ -2520,7 +2519,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) return PTR_ERR(bdev); sdev = scsi_device_from_queue(bdev->bd_disk->queue); if (!sdev) { - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); + blkdev_put(bdev, NULL); return -EINVAL; } put_device(&sdev->sdev_gendev); @@ -2545,7 +2544,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) return 0; out_mem: - blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); + blkdev_put(bdev, NULL); /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return -ENOMEM; @@ -2751,7 +2750,7 @@ static int pkt_remove_dev(dev_t pkt_dev) pkt_debugfs_dev_remove(pd); pkt_sysfs_dev_remove(pd); - blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); + blkdev_put(pd->bdev, NULL); remove_proc_entry(pd->disk->disk_name, pkt_proc); dev_notice(ddev, "writer unmapped\n"); diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index a92a4289d0ec..a909f8763ce7 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -219,7 +219,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) rnbd_put_sess_dev(sess_dev); wait_for_completion(&dc); /* wait for inflights to drop to zero */ - blkdev_put(sess_dev->bdev, sess_dev->open_flags); + blkdev_put(sess_dev->bdev, NULL); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); if (sess_dev->open_flags & FMODE_WRITE) @@ -791,7 +791,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, } rnbd_put_srv_dev(srv_dev); blkdev_put: - blkdev_put(bdev, open_flags); + blkdev_put(bdev, NULL); free_path: kfree(full_path); reject: diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 43b36da9b354..141b60aad570 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -473,7 +473,7 @@ static void xenvbd_sysfs_delif(struct xenbus_device *dev) static void xen_vbd_free(struct xen_vbd *vbd) { if (vbd->bdev) - blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE); + blkdev_put(vbd->bdev, NULL); vbd->bdev = NULL; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f5644c606040..21615d67a9bd 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -420,7 +420,7 @@ static void reset_bdev(struct zram *zram) return; bdev = zram->bdev; - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, zram); /* hope filp_close flush all of IO */ filp_close(zram->backing_dev, NULL); zram->backing_dev = NULL; @@ -507,8 +507,8 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - bdev = blkdev_get_by_dev(inode->i_rdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram, NULL); + bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE, zram, + NULL); if (IS_ERR(bdev)) { err = PTR_ERR(bdev); bdev = NULL; @@ -539,7 +539,7 @@ static ssize_t backing_dev_store(struct device *dev, kvfree(bitmap); if (bdev) - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, zram); if (backing_dev) filp_close(backing_dev, NULL); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 4a2aed047aec..7022fea396f2 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1369,7 +1369,7 @@ static void cached_dev_free(struct closure *cl) put_page(virt_to_page(dc->sb_disk)); if (!IS_ERR_OR_NULL(dc->bdev)) - blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(dc->bdev, bcache_kobj); wake_up(&unregister_wait); @@ -2218,7 +2218,7 @@ void bch_cache_release(struct kobject *kobj) put_page(virt_to_page(ca->sb_disk)); if (!IS_ERR_OR_NULL(ca->bdev)) - blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(ca->bdev, bcache_kobj); kfree(ca); module_put(THIS_MODULE); @@ -2359,7 +2359,7 @@ static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk, * call blkdev_put() to bdev in bch_cache_release(). So we * explicitly call blkdev_put() here. */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, bcache_kobj); if (ret == -ENOMEM) err = "cache_alloc(): -ENOMEM"; else if (ret == -EPERM) @@ -2461,7 +2461,7 @@ static void register_bdev_worker(struct work_struct *work) if (!dc) { fail = true; put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(args->bdev, bcache_kobj); goto out; } @@ -2491,7 +2491,7 @@ static void register_cache_worker(struct work_struct *work) if (!ca) { fail = true; put_page(virt_to_page(args->sb_disk)); - blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(args->bdev, bcache_kobj); goto out; } @@ -2558,8 +2558,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev = blkdev_get_by_path(strim(path), - FMODE_READ|FMODE_WRITE|FMODE_EXCL, + bdev = blkdev_get_by_path(strim(path), FMODE_READ | FMODE_WRITE, bcache_kobj, NULL); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { @@ -2648,7 +2647,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, out_put_sb_page: put_page(virt_to_page(sb_disk)); out_blkdev_put: - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, register_bcache); out_free_sb: kfree(sb); out_free_path: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 246b8f028a98..b16e37362c5a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -746,7 +746,7 @@ static struct table_device *open_table_device(struct mapped_device *md, return ERR_PTR(-ENOMEM); refcount_set(&td->count, 1); - bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr, NULL); + bdev = blkdev_get_by_dev(dev, mode, _dm_claim_ptr, NULL); if (IS_ERR(bdev)) { r = PTR_ERR(bdev); goto out_free_td; @@ -771,7 +771,7 @@ static struct table_device *open_table_device(struct mapped_device *md, return td; out_blkdev_put: - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, _dm_claim_ptr); out_free_td: kfree(td); return ERR_PTR(r); @@ -784,7 +784,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md { if (md->disk->slave_dir) bd_unlink_disk_holder(td->dm_dev.bdev, md->disk); - blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + blkdev_put(td->dm_dev.bdev, _dm_claim_ptr); put_dax(td->dm_dev.dax_dev); list_del(&td->list); kfree(td); diff --git a/drivers/md/md.c b/drivers/md/md.c index 159197dd7b6d..dad4a5539f9f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2449,7 +2449,10 @@ static void rdev_delayed_delete(struct work_struct *ws) void md_autodetect_dev(dev_t dev); -static void export_rdev(struct md_rdev *rdev) +/* just for claiming the bdev */ +static struct md_rdev claim_rdev; + +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); @@ -2457,7 +2460,7 @@ static void export_rdev(struct md_rdev *rdev) if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); #endif - blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev); rdev->bdev = NULL; kobject_put(&rdev->kobj); } @@ -2485,7 +2488,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) INIT_WORK(&rdev->del_work, rdev_delayed_delete); kobject_get(&rdev->kobj); queue_work(md_rdev_misc_wq, &rdev->del_work); - export_rdev(rdev); + export_rdev(rdev, rdev->mddev); } static void export_array(struct mddev *mddev) @@ -3612,6 +3615,7 @@ int md_rdev_init(struct md_rdev *rdev) return badblocks_init(&rdev->badblocks, 0); } EXPORT_SYMBOL_GPL(md_rdev_init); + /* * Import a device. If 'super_format' >= 0, then sanity check the superblock * @@ -3624,7 +3628,6 @@ EXPORT_SYMBOL_GPL(md_rdev_init); */ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) { - static struct md_rdev claim_rdev; /* just for claiming the bdev */ struct md_rdev *rdev; sector_t size; int err; @@ -3640,8 +3643,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; - rdev->bdev = blkdev_get_by_dev(newdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, + rdev->bdev = blkdev_get_by_dev(newdev, FMODE_READ | FMODE_WRITE, super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", @@ -3679,7 +3681,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe return rdev; out_blkdev_put: - blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(rdev->bdev, super_format == -2 ? &claim_rdev : rdev); out_clear_rdev: md_rdev_clear(rdev); out_free_rdev: @@ -4560,7 +4562,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) err = bind_rdev_to_array(rdev, mddev); out: if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); mddev_unlock(mddev); if (!err) md_new_event(); @@ -6498,7 +6500,7 @@ static void autorun_devices(int part) rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev); + export_rdev(rdev, mddev); } autorun_array(mddev); mddev_unlock(mddev); @@ -6508,7 +6510,7 @@ static void autorun_devices(int part) */ rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); - export_rdev(rdev); + export_rdev(rdev, mddev); } mddev_put(mddev); } @@ -6696,13 +6698,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); - export_rdev(rdev); + export_rdev(rdev, mddev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } @@ -6746,7 +6748,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* This was a hot-add request, but events doesn't * match, so reject it. */ - export_rdev(rdev); + export_rdev(rdev, mddev); return -EINVAL; } @@ -6772,7 +6774,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) } } if (has_journal || mddev->bitmap) { - export_rdev(rdev); + export_rdev(rdev, mddev); return -EBUSY; } set_bit(Journal, &rdev->flags); @@ -6787,7 +6789,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* --add initiated by this node */ err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } } @@ -6797,7 +6799,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev); + export_rdev(rdev, mddev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { @@ -6860,7 +6862,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) { - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } } @@ -6985,7 +6987,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) return 0; abort_export: - export_rdev(rdev); + export_rdev(rdev, mddev); return err; } diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 218eb2af564a..44fc23af4c3f 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -209,7 +209,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) if (dev->blkdev) { invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1); - blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(dev->blkdev, NULL); } kfree(dev); @@ -261,7 +261,7 @@ static struct block_device __ref *mdtblock_early_get_bdev(const char *devname, static struct block2mtd_dev *add_device(char *devname, int erase_size, char *label, int timeout) { - const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; + const fmode_t mode = FMODE_READ | FMODE_WRITE; struct block_device *bdev; struct block2mtd_dev *dev; char *name; diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 9b6d6d85c725..65ed2d478fac 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -51,7 +51,7 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) void nvmet_bdev_ns_disable(struct nvmet_ns *ns) { if (ns->bdev) { - blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ); + blkdev_put(ns->bdev, NULL); ns->bdev = NULL; } } diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index f21198bc483e..d2b27b84f854 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -179,7 +179,7 @@ void dasd_destroy_partitions(struct dasd_block *block) mutex_unlock(&bdev->bd_disk->open_mutex); /* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */ - blkdev_put(bdev, FMODE_READ); + blkdev_put(bdev, NULL); } int dasd_gendisk_init(void) diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index a5cbbefa78ee..c62f961f46e3 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -108,7 +108,7 @@ static int iblock_configure_device(struct se_device *dev) pr_debug( "IBLOCK: Claiming struct block_device: %s\n", ib_dev->ibd_udev_path); - mode = FMODE_READ|FMODE_EXCL; + mode = FMODE_READ; if (!ib_dev->ibd_readonly) mode |= FMODE_WRITE; else @@ -175,7 +175,7 @@ static int iblock_configure_device(struct se_device *dev) return 0; out_blkdev_put: - blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL); + blkdev_put(ib_dev->ibd_bd, ib_dev); out_free_bioset: bioset_exit(&ib_dev->ibd_bio_set); out: @@ -201,7 +201,7 @@ static void iblock_destroy_device(struct se_device *dev) struct iblock_dev *ib_dev = IBLOCK_DEV(dev); if (ib_dev->ibd_bd != NULL) - blkdev_put(ib_dev->ibd_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL); + blkdev_put(ib_dev->ibd_bd, ib_dev); bioset_exit(&ib_dev->ibd_bio_set); } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index e3494e036c6c..da3b5512d7ae 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -366,8 +366,7 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd) * Claim exclusive struct block_device access to struct scsi_device * for TYPE_DISK and TYPE_ZBC using supplied udev_path */ - bd = blkdev_get_by_path(dev->udev_path, - FMODE_WRITE|FMODE_READ|FMODE_EXCL, pdv, + bd = blkdev_get_by_path(dev->udev_path, FMODE_WRITE | FMODE_READ, pdv, NULL); if (IS_ERR(bd)) { pr_err("pSCSI: blkdev_get_by_path() failed\n"); @@ -378,7 +377,7 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd) ret = pscsi_add_device_to_list(dev, sd); if (ret) { - blkdev_put(pdv->pdv_bd, FMODE_WRITE|FMODE_READ|FMODE_EXCL); + blkdev_put(pdv->pdv_bd, pdv); scsi_device_put(sd); return ret; } @@ -566,8 +565,7 @@ static void pscsi_destroy_device(struct se_device *dev) */ if ((sd->type == TYPE_DISK || sd->type == TYPE_ZBC) && pdv->pdv_bd) { - blkdev_put(pdv->pdv_bd, - FMODE_WRITE|FMODE_READ|FMODE_EXCL); + blkdev_put(pdv->pdv_bd, pdv); pdv->pdv_bd = NULL; } /* diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4de4984fa99b..677e9d9e1527 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -257,7 +257,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE, fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); @@ -315,7 +315,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->bdev = bdev; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->mode = FMODE_EXCL; + device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; @@ -334,7 +334,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - blkdev_put(bdev, FMODE_EXCL); + blkdev_put(bdev, fs_info->bdev_holder); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2fa36f694daa..d99376a79ef4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2672,7 +2672,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; struct block_device *bdev = NULL; - fmode_t mode; + void *holder; int ret; bool cancel = false; @@ -2709,7 +2709,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); btrfs_exclop_finish(fs_info); @@ -2724,7 +2724,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) err_drop: mnt_drop_write_file(file); if (bdev) - blkdev_put(bdev, mode); + blkdev_put(bdev, holder); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2738,7 +2738,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; struct block_device *bdev = NULL; - fmode_t mode; + void *holder; int ret; bool cancel = false; @@ -2765,7 +2765,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); @@ -2773,7 +2773,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mnt_drop_write_file(file); if (bdev) - blkdev_put(bdev, mode); + blkdev_put(bdev, holder); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 035868cee3dd..7b12e05cdbf0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -507,14 +507,14 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, sync_blockdev(*bdev); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - blkdev_put(*bdev, flags); + blkdev_put(*bdev, holder); goto error; } invalidate_bdev(*bdev); *disk_super = btrfs_read_dev_super(*bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - blkdev_put(*bdev, flags); + blkdev_put(*bdev, holder); goto error; } @@ -642,7 +642,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - device->mode = flags; + device->holder = holder; fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && @@ -656,7 +656,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - blkdev_put(bdev, flags); + blkdev_put(bdev, holder); return -EINVAL; } @@ -1057,7 +1057,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, continue; if (device->bdev) { - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->holder); device->bdev = NULL; fs_devices->open_devices--; } @@ -1103,7 +1103,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - blkdev_put(device->bdev, device->mode); + blkdev_put(device->bdev, device->holder); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1213,8 +1213,6 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *latest_dev = NULL; struct btrfs_device *tmp_device; - flags |= FMODE_EXCL; - list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, dev_list) { int ret; @@ -1400,7 +1398,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags) btrfs_release_disk_super(disk_super); error_bdev_put: - blkdev_put(bdev, flags); + blkdev_put(bdev, NULL); return device; } @@ -2087,7 +2085,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, fmode_t *mode) + struct block_device **bdev, void **holder) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2226,7 +2224,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } *bdev = device->bdev; - *mode = device->mode; + *holder = device->holder; synchronize_rcu(); btrfs_free_device(device); @@ -2394,7 +2392,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - blkdev_put(bdev, FMODE_READ); + blkdev_put(bdev, NULL); return 0; } @@ -2627,7 +2625,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE, fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -2690,7 +2688,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->mode = FMODE_EXCL; + device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2848,7 +2846,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path error_free_device: btrfs_free_device(device); error: - blkdev_put(bdev, FMODE_EXCL); + blkdev_put(bdev, fs_info->bdev_holder); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index eb97a397b3c3..840a8df39907 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -94,8 +94,8 @@ struct btrfs_device { struct btrfs_zoned_device_info *zone_info; - /* the mode sent to blkdev_get */ - fmode_t mode; + /* block device holder for blkdev_get/put */ + void *holder; /* * Device's major-minor number. Must be set even if the device is not @@ -619,7 +619,7 @@ void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, fmode_t *mode); + struct block_device **bdev, void **holder); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 6c263e9cd38b..54dba967a2d4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -19,6 +19,7 @@ #include static struct kmem_cache *erofs_inode_cachep __read_mostly; +struct file_system_type erofs_fs_type; void _erofs_err(struct super_block *sb, const char *function, const char *fmt, ...) @@ -253,8 +254,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL, - sb->s_type, NULL); + bdev = blkdev_get_by_path(dif->path, FMODE_READ, sb->s_type, + NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; @@ -877,7 +878,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) - blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + blkdev_put(dif->bdev, &erofs_fs_type); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9070ea9154d7..92dd699139a3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1112,7 +1112,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb, + bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, sb, &ext4_holder_ops); if (IS_ERR(bdev)) goto fail; @@ -1128,17 +1128,12 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) /* * Release the journal device */ -static void ext4_blkdev_put(struct block_device *bdev) -{ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - static void ext4_blkdev_remove(struct ext4_sb_info *sbi) { struct block_device *bdev; bdev = sbi->s_journal_bdev; if (bdev) { - ext4_blkdev_put(bdev); + blkdev_put(bdev, sbi->s_es); sbi->s_journal_bdev = NULL; } } @@ -5915,7 +5910,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, out_journal: jbd2_journal_destroy(journal); out_bdev: - ext4_blkdev_put(bdev); + blkdev_put(bdev, sb); return NULL; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7c34ab082f13..a5adb1d316e3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1538,7 +1538,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) int i; for (i = 0; i < sbi->s_ndevs; i++) { - blkdev_put(FDEV(i).bdev, FMODE_EXCL); + blkdev_put(FDEV(i).bdev, sbi->sb->s_type); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 46d393c8088a..82f70d46f4e5 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1100,7 +1100,7 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ | FMODE_WRITE, log, NULL); if (IS_ERR(bdev)) { rc = PTR_ERR(bdev); @@ -1141,7 +1141,7 @@ int lmLogOpen(struct super_block *sb) lbmLogShutdown(log); close: /* close external log device */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, log); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1485,7 +1485,7 @@ int lmLogClose(struct super_block *sb) bdev = log->bdev; rc = lmLogShutdown(log); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, log); kfree(log); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 38b066ca699e..9be7f958f60e 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -35,7 +35,7 @@ bl_free_device(struct pnfs_block_dev *dev) } if (dev->bdev) - blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE); + blkdev_put(dev->bdev, NULL); } } @@ -374,7 +374,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE); + blkdev_put(d->bdev, NULL); return error; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 91bfbd973d1d..61d5e79a5e81 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1278,7 +1278,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; + fmode_t mode = FMODE_READ; struct dentry *root_dentry; int err, s_new = false; @@ -1357,7 +1357,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - blkdev_put(sd.bdev, mode); + blkdev_put(sd.bdev, fs_type); return root_dentry; @@ -1366,7 +1366,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - blkdev_put(sd.bdev, mode); + blkdev_put(sd.bdev, fs_type); return ERR_PTR(err); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6b13b8c3f2b8..c6ae9aee01ed 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1503,7 +1503,7 @@ static void o2hb_region_release(struct config_item *item) } if (reg->hr_bdev) - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(reg->hr_bdev, NULL); kfree(reg->hr_slots); @@ -1893,7 +1893,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, out3: if (ret < 0) { - blkdev_put(reg->hr_bdev, FMODE_READ | FMODE_WRITE); + blkdev_put(reg->hr_bdev, NULL); reg->hr_bdev = NULL; } out2: diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 5e4db9a0c8e5..905297ea5545 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2589,7 +2589,7 @@ static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal) { if (journal->j_dev_bd != NULL) { - blkdev_put(journal->j_dev_bd, journal->j_dev_mode); + blkdev_put(journal->j_dev_bd, journal); journal->j_dev_bd = NULL; } } @@ -2598,9 +2598,10 @@ static int journal_init_dev(struct super_block *super, struct reiserfs_journal *journal, const char *jdev_name) { + fmode_t blkdev_mode = FMODE_READ; + void *holder = journal; int result; dev_t jdev; - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; result = 0; @@ -2608,16 +2609,15 @@ static int journal_init_dev(struct super_block *super, jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; - if (bdev_read_only(super->s_bdev)) - blkdev_mode = FMODE_READ; + if (!bdev_read_only(super->s_bdev)) + blkdev_mode |= FMODE_WRITE; /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) - blkdev_mode &= ~FMODE_EXCL; - journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, - journal, NULL); - journal->j_dev_mode = blkdev_mode; + holder = NULL; + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder, + NULL); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; @@ -2631,8 +2631,7 @@ static int journal_init_dev(struct super_block *super, return 0; } - journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal, + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder, NULL); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 1bccf6a2e908..55e85256aae8 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -300,7 +300,6 @@ struct reiserfs_journal { struct reiserfs_journal_cnode *j_first; struct block_device *j_dev_bd; - fmode_t j_dev_mode; /* first block on s_dev of reserved area journal */ int j_1st_reserved_block; diff --git a/fs/super.c b/fs/super.c index f127589700ab..8563794a8bc4 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1255,7 +1255,7 @@ int get_tree_bdev(struct fs_context *fc, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; + fmode_t mode = FMODE_READ; int error = 0; if (!(fc->sb_flags & SB_RDONLY)) @@ -1279,7 +1279,7 @@ int get_tree_bdev(struct fs_context *fc, if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); return -EBUSY; } @@ -1288,7 +1288,7 @@ int get_tree_bdev(struct fs_context *fc, s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) { - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); return PTR_ERR(s); } @@ -1297,7 +1297,7 @@ int get_tree_bdev(struct fs_context *fc, if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", bdev); deactivate_locked_super(s); - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); return -EBUSY; } @@ -1309,7 +1309,7 @@ int get_tree_bdev(struct fs_context *fc, * holding an active reference. */ up_write(&s->s_umount); - blkdev_put(bdev, mode); + blkdev_put(bdev, fc->fs_type); down_write(&s->s_umount); } else { s->s_mode = mode; @@ -1344,7 +1344,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; + fmode_t mode = FMODE_READ; int error = 0; if (!(flags & SB_RDONLY)) @@ -1386,7 +1386,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, * holding an active reference. */ up_write(&s->s_umount); - blkdev_put(bdev, mode); + blkdev_put(bdev, fs_type); down_write(&s->s_umount); } else { s->s_mode = mode; @@ -1409,7 +1409,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - blkdev_put(bdev, mode); + blkdev_put(bdev, fs_type); error: return ERR_PTR(error); } @@ -1418,13 +1418,11 @@ EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - fmode_t mode = sb->s_mode; bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - WARN_ON_ONCE(!(mode & FMODE_EXCL)); - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, sb->s_type); } EXPORT_SYMBOL(kill_block_super); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1b4bd5c88f4a..3b7cf8268057 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -396,8 +396,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - mp, &xfs_holder_ops); + *bdevp = blkdev_get_by_path(name, FMODE_READ | FMODE_WRITE, mp, + &xfs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); @@ -408,10 +408,11 @@ xfs_blkdev_get( STATIC void xfs_blkdev_put( + struct xfs_mount *mp, struct block_device *bdev) { if (bdev) - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + blkdev_put(bdev, mp); } STATIC void @@ -422,13 +423,13 @@ xfs_close_devices( struct block_device *logdev = mp->m_logdev_targp->bt_bdev; xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(logdev); + xfs_blkdev_put(mp, logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(rtdev); + xfs_blkdev_put(mp, rtdev); } xfs_free_buftarg(mp->m_ddev_targp); } @@ -503,10 +504,10 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(rtdev); + xfs_blkdev_put(mp, rtdev); out_close_logdev: if (logdev && logdev != ddev) - xfs_blkdev_put(logdev); + xfs_blkdev_put(mp, logdev); return error; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 25bdd0cc74dc..d5b99796f12c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1480,7 +1480,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -void blkdev_put(struct block_device *bdev, fmode_t mode); +void blkdev_put(struct block_device *bdev, void *holder); /* just for blk-cgroup, don't use elsewhere */ struct block_device *blkdev_get_no_open(dev_t dev); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 7ae95ec72f99..f62e89d0d906 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -688,22 +688,18 @@ static int load_image_and_restore(bool snapshot_test) { int error; unsigned int flags; - fmode_t mode = FMODE_READ; - - if (snapshot_test) - mode |= FMODE_EXCL; pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { - swsusp_close(mode); + swsusp_close(snapshot_test); goto Unlock; } error = swsusp_read(&flags); - swsusp_close(mode); + swsusp_close(snapshot_test); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -956,7 +952,7 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(false); goto Unlock; } @@ -991,7 +987,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(false); goto Finish; } diff --git a/kernel/power/power.h b/kernel/power/power.h index 978189fcafd1..a8e0c44b804e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,7 +177,7 @@ int swsusp_check(bool snapshot_test); extern void swsusp_free(void); extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); -extern void swsusp_close(fmode_t); +void swsusp_close(bool snapshot_test); #ifdef CONFIG_SUSPEND extern int swsusp_unmark(void); #endif diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b03ff1a33c7f..cc9259307c94 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -363,7 +363,7 @@ static int swsusp_swap_check(void) res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(hib_resume_bdev, FMODE_WRITE); + blkdev_put(hib_resume_bdev, NULL); return res; } @@ -443,7 +443,7 @@ static int get_swap_writer(struct swap_map_handle *handle) err_rel: release_swap_writer(handle); err_close: - swsusp_close(FMODE_WRITE); + swsusp_close(false); return ret; } @@ -508,7 +508,7 @@ static int swap_writer_finish(struct swap_map_handle *handle, if (error) free_all_swap_pages(root_swap); release_swap_writer(handle); - swsusp_close(FMODE_WRITE); + swsusp_close(false); return error; } @@ -1518,14 +1518,11 @@ static void *swsusp_holder; int swsusp_check(bool snapshot_test) { + void *holder = snapshot_test ? &swsusp_holder : NULL; int error; - fmode_t mode = FMODE_READ; - if (snapshot_test) - mode |= FMODE_EXCL; - - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - mode, &swsusp_holder, NULL); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_READ, + holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1552,7 +1549,7 @@ int swsusp_check(bool snapshot_test) put: if (error) - blkdev_put(hib_resume_bdev, mode); + blkdev_put(hib_resume_bdev, holder); else pr_debug("Image signature found, resuming\n"); } else { @@ -1569,14 +1566,14 @@ int swsusp_check(bool snapshot_test) * swsusp_close - close swap device. */ -void swsusp_close(fmode_t mode) +void swsusp_close(bool snapshot_test) { if (IS_ERR(hib_resume_bdev)) { pr_debug("Image device not initialised\n"); return; } - blkdev_put(hib_resume_bdev, mode); + blkdev_put(hib_resume_bdev, snapshot_test ? &swsusp_holder : NULL); } /** diff --git a/mm/swapfile.c b/mm/swapfile.c index cfbcf7d5705f..16554256be65 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2539,7 +2539,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, old_block_size); - blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(bdev, p); } inode_lock(inode); @@ -2770,8 +2770,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = blkdev_get_by_dev(inode->i_rdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, p, - NULL); + FMODE_READ | FMODE_WRITE, p, NULL); if (IS_ERR(p->bdev)) { error = PTR_ERR(p->bdev); p->bdev = NULL; @@ -3222,7 +3221,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->cluster_next_cpu = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); - blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(p->bdev, p); } inode = NULL; destroy_swap_extents(p); From 3f0b3e785e8b54a40c530fa77b7ab37bec925c57 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:44 +0200 Subject: [PATCH 155/266] block: add a sb_open_mode helper Add a helper to return the open flags for blkdev_get_by* for passed in super block flags instead of open coding the logic in many places. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-17-hch@lst.de Signed-off-by: Jens Axboe --- fs/btrfs/super.c | 5 +---- fs/nilfs2/super.c | 7 ++----- fs/super.c | 15 ++++----------- include/linux/blkdev.h | 7 +++++++ 4 files changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1a2ee9407f54..fd02b92e3910 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1440,12 +1440,9 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; void *new_sec_opts = NULL; - fmode_t mode = FMODE_READ; + fmode_t mode = sb_open_mode(flags); int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - if (data) { error = security_sb_eat_lsm_opts(data, &new_sec_opts); if (error) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 61d5e79a5e81..a41fd84d4e28 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1278,14 +1278,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ; struct dentry *root_dentry; int err, s_new = false; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - - sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type, NULL); + sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, + NULL); if (IS_ERR(sd.bdev)) return ERR_CAST(sd.bdev); diff --git a/fs/super.c b/fs/super.c index 8563794a8bc4..dc7f32839833 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1255,17 +1255,13 @@ int get_tree_bdev(struct fs_context *fc, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; int error = 0; - if (!(fc->sb_flags & SB_RDONLY)) - mode |= FMODE_WRITE; - if (!fc->source) return invalf(fc, "No source specified"); - bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type, - &fs_holder_ops); + bdev = blkdev_get_by_path(fc->source, sb_open_mode(fc->sb_flags), + fc->fs_type, &fs_holder_ops); if (IS_ERR(bdev)) { errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev); @@ -1344,13 +1340,10 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; int error = 0; - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; - - bdev = blkdev_get_by_path(dev_name, mode, fs_type, &fs_holder_ops); + bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, + &fs_holder_ops); if (IS_ERR(bdev)) return ERR_CAST(bdev); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d5b99796f12c..978036039020 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1473,6 +1473,13 @@ struct blk_holder_ops { void (*mark_dead)(struct block_device *bdev); }; +/* + * Return the correct open flags for blkdev_get_by_* for super block flags + * as stored in sb->s_flags. + */ +#define sb_open_mode(flags) \ + (FMODE_READ | (((flags) & SB_RDONLY) ? 0 : FMODE_WRITE)) + struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, const struct blk_holder_ops *hops); struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, From 81b1fb7d17c0110df839e13468ada9e99bb6e5f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:45 +0200 Subject: [PATCH 156/266] fs: remove sb->s_mode There is no real need to store the open mode in the super_block now. It is only used by f2fs, which can easily recalculate it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-18-hch@lst.de Signed-off-by: Jens Axboe --- fs/f2fs/super.c | 10 ++++++---- fs/nilfs2/super.c | 1 - fs/super.c | 2 -- include/linux/fs.h | 1 - 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5adb1d316e3..5a764fecd1c7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3993,6 +3993,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; unsigned int logical_blksize; + fmode_t mode = sb_open_mode(sbi->sb->s_flags); int i; /* Initialize single device information */ @@ -4024,8 +4025,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (max_devices == 1) { /* Single zoned block device mount */ FDEV(0).bdev = - blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, - sbi->sb->s_mode, sbi->sb->s_type, NULL); + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode, + sbi->sb->s_type, NULL); } else { /* Multi-device mount */ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); @@ -4043,8 +4044,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; } - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, - sbi->sb->s_mode, sbi->sb->s_type, NULL); + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode, + sbi->sb->s_type, + NULL); } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index a41fd84d4e28..15a5a1099427 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1316,7 +1316,6 @@ nilfs_mount(struct file_system_type *fs_type, int flags, s_new = true; /* New superblock instance created */ - s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); diff --git a/fs/super.c b/fs/super.c index dc7f32839833..86f40f898198 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1308,7 +1308,6 @@ int get_tree_bdev(struct fs_context *fc, blkdev_put(bdev, fc->fs_type); down_write(&s->s_umount); } else { - s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fc->fs_type->name, s->s_id); @@ -1382,7 +1381,6 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, blkdev_put(bdev, fs_type); down_write(&s->s_umount); } else { - s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b2053649820..ad1d2c9afb3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1215,7 +1215,6 @@ struct super_block { uuid_t s_uuid; /* UUID */ unsigned int s_max_links; - fmode_t s_mode; /* * The next field is for VFS *only*. No filesystems have any business From 5f4eb9d5413fdfc779c099fdaf0ff417eb163145 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:46 +0200 Subject: [PATCH 157/266] scsi: replace the fmode_t argument to scsi_cmd_allowed with a simple bool Instead of passing a fmode_t and only checking it for FMODE_WRITE, pass a bool open_for_write to prepare for callers that won't have the fmode_t. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-19-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/scsi_bsg.c | 2 +- drivers/scsi/scsi_ioctl.c | 8 ++++---- drivers/scsi/sg.c | 2 +- include/scsi/scsi_ioctl.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index 96ee35256a16..12431f35f861 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -42,7 +42,7 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, if (copy_from_user(scmd->cmnd, uptr64(hdr->request), scmd->cmd_len)) goto out_put_request; ret = -EPERM; - if (!scsi_cmd_allowed(scmd->cmnd, mode)) + if (!scsi_cmd_allowed(scmd->cmnd, mode & FMODE_WRITE)) goto out_put_request; ret = 0; diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index e3b31d32b6a9..dda5468ca97f 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -248,7 +248,7 @@ static int scsi_send_start_stop(struct scsi_device *sdev, int data) * Only a subset of commands are allowed for unprivileged users. Commands used * to format the media, update the firmware, etc. are not permitted. */ -bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode) +bool scsi_cmd_allowed(unsigned char *cmd, bool open_for_write) { /* root can do any command. */ if (capable(CAP_SYS_RAWIO)) @@ -338,7 +338,7 @@ bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode) case GPCMD_SET_READ_AHEAD: /* ZBC */ case ZBC_OUT: - return (mode & FMODE_WRITE); + return open_for_write; default: return false; } @@ -354,7 +354,7 @@ static int scsi_fill_sghdr_rq(struct scsi_device *sdev, struct request *rq, return -EMSGSIZE; if (copy_from_user(scmd->cmnd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (!scsi_cmd_allowed(scmd->cmnd, mode)) + if (!scsi_cmd_allowed(scmd->cmnd, mode & FMODE_WRITE)) return -EPERM; scmd->cmd_len = hdr->cmd_len; @@ -554,7 +554,7 @@ static int sg_scsi_ioctl(struct request_queue *q, fmode_t mode, goto error; err = -EPERM; - if (!scsi_cmd_allowed(scmd->cmnd, mode)) + if (!scsi_cmd_allowed(scmd->cmnd, mode & FMODE_WRITE)) goto error; /* default. possible overridden later */ diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 037f8c98a6d3..e49ea693d0b6 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -237,7 +237,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; - if (!scsi_cmd_allowed(cmd, filp->f_mode)) + if (!scsi_cmd_allowed(cmd, filp->f_mode & FMODE_WRITE)) return -EPERM; return 0; } diff --git a/include/scsi/scsi_ioctl.h b/include/scsi/scsi_ioctl.h index beac64e38b87..914201a8cb94 100644 --- a/include/scsi/scsi_ioctl.h +++ b/include/scsi/scsi_ioctl.h @@ -49,7 +49,7 @@ int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, void __user *arg); int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp); int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp); -bool scsi_cmd_allowed(unsigned char *cmd, fmode_t mode); +bool scsi_cmd_allowed(unsigned char *cmd, bool open_for_write); #endif /* __KERNEL__ */ #endif /* _SCSI_IOCTL_H */ From 2e80089c18241699c41d0af0669cb93844ff0dc1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:47 +0200 Subject: [PATCH 158/266] scsi: replace the fmode_t argument to scsi_ioctl with a simple bool Instead of passing a fmode_t and only checking it for FMODE_WRITE, pass a bool open_for_write to prepare for callers that won't have the fmode_t. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-20-hch@lst.de Signed-off-by: Jens Axboe --- drivers/scsi/ch.c | 3 ++- drivers/scsi/scsi_ioctl.c | 34 +++++++++++++++++----------------- drivers/scsi/sd.c | 2 +- drivers/scsi/sg.c | 5 +++-- drivers/scsi/sr.c | 2 +- drivers/scsi/st.c | 2 +- include/scsi/scsi_ioctl.h | 2 +- 7 files changed, 26 insertions(+), 24 deletions(-) diff --git a/drivers/scsi/ch.c b/drivers/scsi/ch.c index ac648bb8f7e7..cb0a399be1cc 100644 --- a/drivers/scsi/ch.c +++ b/drivers/scsi/ch.c @@ -877,7 +877,8 @@ static long ch_ioctl(struct file *file, } default: - return scsi_ioctl(ch->device, file->f_mode, cmd, argp); + return scsi_ioctl(ch->device, file->f_mode & FMODE_WRITE, cmd, + argp); } } diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c index dda5468ca97f..6f6c5973c3ea 100644 --- a/drivers/scsi/scsi_ioctl.c +++ b/drivers/scsi/scsi_ioctl.c @@ -346,7 +346,7 @@ bool scsi_cmd_allowed(unsigned char *cmd, bool open_for_write) EXPORT_SYMBOL(scsi_cmd_allowed); static int scsi_fill_sghdr_rq(struct scsi_device *sdev, struct request *rq, - struct sg_io_hdr *hdr, fmode_t mode) + struct sg_io_hdr *hdr, bool open_for_write) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); @@ -354,7 +354,7 @@ static int scsi_fill_sghdr_rq(struct scsi_device *sdev, struct request *rq, return -EMSGSIZE; if (copy_from_user(scmd->cmnd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (!scsi_cmd_allowed(scmd->cmnd, mode & FMODE_WRITE)) + if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) return -EPERM; scmd->cmd_len = hdr->cmd_len; @@ -407,7 +407,8 @@ static int scsi_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, return ret; } -static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) +static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, + bool open_for_write) { unsigned long start_time; ssize_t ret = 0; @@ -448,7 +449,7 @@ static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) goto out_put_request; } - ret = scsi_fill_sghdr_rq(sdev, rq, hdr, mode); + ret = scsi_fill_sghdr_rq(sdev, rq, hdr, open_for_write); if (ret < 0) goto out_put_request; @@ -477,8 +478,7 @@ static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) /** * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl * @q: request queue to send scsi commands down - * @mode: mode used to open the file through which the ioctl has been - * submitted + * @open_for_write: is the file / block device opened for writing? * @sic: userspace structure describing the command to perform * * Send down the scsi command described by @sic to the device below @@ -501,7 +501,7 @@ static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode) * Positive numbers returned are the compacted SCSI error codes (4 * bytes in one int) where the lowest byte is the SCSI status. */ -static int sg_scsi_ioctl(struct request_queue *q, fmode_t mode, +static int sg_scsi_ioctl(struct request_queue *q, bool open_for_write, struct scsi_ioctl_command __user *sic) { struct request *rq; @@ -554,7 +554,7 @@ static int sg_scsi_ioctl(struct request_queue *q, fmode_t mode, goto error; err = -EPERM; - if (!scsi_cmd_allowed(scmd->cmnd, mode & FMODE_WRITE)) + if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) goto error; /* default. possible overridden later */ @@ -776,7 +776,7 @@ static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, return 0; } -static int scsi_cdrom_send_packet(struct scsi_device *sdev, fmode_t mode, +static int scsi_cdrom_send_packet(struct scsi_device *sdev, bool open_for_write, void __user *arg) { struct cdrom_generic_command cgc; @@ -817,7 +817,7 @@ static int scsi_cdrom_send_packet(struct scsi_device *sdev, fmode_t mode, hdr.cmdp = ((struct cdrom_generic_command __user *) arg)->cmd; hdr.cmd_len = sizeof(cgc.cmd); - err = sg_io(sdev, &hdr, mode); + err = sg_io(sdev, &hdr, open_for_write); if (err == -EFAULT) return -EFAULT; @@ -832,7 +832,7 @@ static int scsi_cdrom_send_packet(struct scsi_device *sdev, fmode_t mode, return err; } -static int scsi_ioctl_sg_io(struct scsi_device *sdev, fmode_t mode, +static int scsi_ioctl_sg_io(struct scsi_device *sdev, bool open_for_write, void __user *argp) { struct sg_io_hdr hdr; @@ -841,7 +841,7 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, fmode_t mode, error = get_sg_io_hdr(&hdr, argp); if (error) return error; - error = sg_io(sdev, &hdr, mode); + error = sg_io(sdev, &hdr, open_for_write); if (error == -EFAULT) return error; if (put_sg_io_hdr(&hdr, argp)) @@ -852,7 +852,7 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, fmode_t mode, /** * scsi_ioctl - Dispatch ioctl to scsi device * @sdev: scsi device receiving ioctl - * @mode: mode the block/char device is opened with + * @open_for_write: is the file / block device opened for writing? * @cmd: which ioctl is it * @arg: data associated with ioctl * @@ -860,7 +860,7 @@ static int scsi_ioctl_sg_io(struct scsi_device *sdev, fmode_t mode, * does not take a major/minor number as the dev field. Rather, it takes * a pointer to a &struct scsi_device. */ -int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, +int scsi_ioctl(struct scsi_device *sdev, bool open_for_write, int cmd, void __user *arg) { struct request_queue *q = sdev->request_queue; @@ -896,11 +896,11 @@ int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, case SG_EMULATED_HOST: return sg_emulated_host(q, arg); case SG_IO: - return scsi_ioctl_sg_io(sdev, mode, arg); + return scsi_ioctl_sg_io(sdev, open_for_write, arg); case SCSI_IOCTL_SEND_COMMAND: - return sg_scsi_ioctl(q, mode, arg); + return sg_scsi_ioctl(q, open_for_write, arg); case CDROM_SEND_PACKET: - return scsi_cdrom_send_packet(sdev, mode, arg); + return scsi_cdrom_send_packet(sdev, open_for_write, arg); case CDROMCLOSETRAY: return scsi_send_start_stop(sdev, 3); case CDROMEJECT: diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index c67c84f6ba61..02b6704ec2b4 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1463,7 +1463,7 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode, if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); - return scsi_ioctl(sdp, mode, cmd, p); + return scsi_ioctl(sdp, mode & FMODE_WRITE, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index e49ea693d0b6..138e28bb76b7 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1103,7 +1103,8 @@ sg_ioctl_common(struct file *filp, Sg_device *sdp, Sg_fd *sfp, case SCSI_IOCTL_SEND_COMMAND: if (atomic_read(&sdp->detaching)) return -ENODEV; - return scsi_ioctl(sdp->device, filp->f_mode, cmd_in, p); + return scsi_ioctl(sdp->device, filp->f_mode & FMODE_WRITE, + cmd_in, p); case SG_SET_DEBUG: result = get_user(val, ip); if (result) @@ -1159,7 +1160,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) ret = sg_ioctl_common(filp, sdp, sfp, cmd_in, p); if (ret != -ENOIOCTLCMD) return ret; - return scsi_ioctl(sdp->device, filp->f_mode, cmd_in, p); + return scsi_ioctl(sdp->device, filp->f_mode & FMODE_WRITE, cmd_in, p); } static __poll_t diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 55082acb59bc..00aaafc8dd78 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -543,7 +543,7 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (ret != -ENOSYS) goto put; } - ret = scsi_ioctl(sdev, mode, cmd, argp); + ret = scsi_ioctl(sdev, mode & FMODE_WRITE, cmd, argp); put: scsi_autopm_put_device(sdev); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index b90a440e135d..14d7981ddcdd 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3832,7 +3832,7 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) break; } - retval = scsi_ioctl(STp->device, file->f_mode, cmd_in, p); + retval = scsi_ioctl(STp->device, file->f_mode & FMODE_WRITE, cmd_in, p); if (!retval && cmd_in == SCSI_IOCTL_STOP_UNIT) { /* unload */ STp->rew_at_close = 0; diff --git a/include/scsi/scsi_ioctl.h b/include/scsi/scsi_ioctl.h index 914201a8cb94..a207c07da9d2 100644 --- a/include/scsi/scsi_ioctl.h +++ b/include/scsi/scsi_ioctl.h @@ -45,7 +45,7 @@ typedef struct scsi_fctargaddress { int scsi_ioctl_block_when_processing_errors(struct scsi_device *sdev, int cmd, bool ndelay); -int scsi_ioctl(struct scsi_device *sdev, fmode_t mode, int cmd, +int scsi_ioctl(struct scsi_device *sdev, bool open_for_write, int cmd, void __user *arg); int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp); int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp); From 1991299e49fa58c3ba7e91599932f84bf537d592 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:48 +0200 Subject: [PATCH 159/266] scsi: replace the fmode_t argument to ->sg_io_fn with a simple bool Instead of passing a fmode_t and only checking it for FMODE_WRITE, pass a bool open_for_write to prepare for callers that won't have the fmode_t. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-21-hch@lst.de Signed-off-by: Jens Axboe --- block/bsg-lib.c | 2 +- block/bsg.c | 8 +++++--- drivers/scsi/scsi_bsg.c | 4 ++-- include/linux/bsg.h | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 435c32373cd6..b3acdbdb6e7e 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -26,7 +26,7 @@ struct bsg_set { }; static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, - fmode_t mode, unsigned int timeout) + bool open_for_write, unsigned int timeout) { struct bsg_job *job; struct request *rq; diff --git a/block/bsg.c b/block/bsg.c index 7eca43f33d7f..bec4027842b3 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -54,7 +54,8 @@ static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr) return max_t(unsigned int, timeout, BLK_MIN_SG_TIMEOUT); } -static int bsg_sg_io(struct bsg_device *bd, fmode_t mode, void __user *uarg) +static int bsg_sg_io(struct bsg_device *bd, bool open_for_write, + void __user *uarg) { struct sg_io_v4 hdr; int ret; @@ -63,7 +64,8 @@ static int bsg_sg_io(struct bsg_device *bd, fmode_t mode, void __user *uarg) return -EFAULT; if (hdr.guard != 'Q') return -EINVAL; - ret = bd->sg_io_fn(bd->queue, &hdr, mode, bsg_timeout(bd, &hdr)); + ret = bd->sg_io_fn(bd->queue, &hdr, open_for_write, + bsg_timeout(bd, &hdr)); if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr))) return -EFAULT; return ret; @@ -146,7 +148,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case SG_EMULATED_HOST: return put_user(1, intp); case SG_IO: - return bsg_sg_io(bd, file->f_mode, uarg); + return bsg_sg_io(bd, file->f_mode & FMODE_WRITE, uarg); case SCSI_IOCTL_SEND_COMMAND: pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n", current->comm); diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index 12431f35f861..a9a9ec086a7e 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -10,7 +10,7 @@ #define uptr64(val) ((void __user *)(uintptr_t)(val)) static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, - fmode_t mode, unsigned int timeout) + bool open_for_write, unsigned int timeout) { struct scsi_cmnd *scmd; struct request *rq; @@ -42,7 +42,7 @@ static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, if (copy_from_user(scmd->cmnd, uptr64(hdr->request), scmd->cmd_len)) goto out_put_request; ret = -EPERM; - if (!scsi_cmd_allowed(scmd->cmnd, mode & FMODE_WRITE)) + if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) goto out_put_request; ret = 0; diff --git a/include/linux/bsg.h b/include/linux/bsg.h index 1ac81c809da9..ee2df73edf83 100644 --- a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -9,7 +9,7 @@ struct device; struct request_queue; typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr, - fmode_t mode, unsigned int timeout); + bool open_for_write, unsigned int timeout); struct bsg_device *bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, From 7d9d7d59d44b7e9236d168472aa222b6543fae25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:49 +0200 Subject: [PATCH 160/266] nvme: replace the fmode_t argument to the nvme ioctl handlers with a simple bool Instead of passing a fmode_t and only checking it fo0r FMODE_WRITE, pass a bool open_for_write to prepare for callers that won't have the fmode_t. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20230608110258.189493-22-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 62 +++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 81c5c9e38477..8bf09047348e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -14,7 +14,7 @@ enum { }; static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, - unsigned int flags, fmode_t mode) + unsigned int flags, bool open_for_write) { u32 effects; @@ -80,7 +80,7 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, * writing. */ if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) - return mode & FMODE_WRITE; + return open_for_write; return true; } @@ -337,7 +337,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd, unsigned int flags, - fmode_t mode) + bool open_for_write) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -365,7 +365,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); - if (!nvme_cmd_allowed(ns, &c, 0, mode)) + if (!nvme_cmd_allowed(ns, &c, 0, open_for_write)) return -EACCES; if (cmd.timeout_ms) @@ -385,7 +385,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd64 __user *ucmd, unsigned int flags, - fmode_t mode) + bool open_for_write) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; @@ -412,7 +412,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(cmd.cdw14); c.common.cdw15 = cpu_to_le32(cmd.cdw15); - if (!nvme_cmd_allowed(ns, &c, flags, mode)) + if (!nvme_cmd_allowed(ns, &c, flags, open_for_write)) return -EACCES; if (cmd.timeout_ms) @@ -583,7 +583,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14)); c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15)); - if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode)) + if (!nvme_cmd_allowed(ns, &c, 0, ioucmd->file->f_mode & FMODE_WRITE)) return -EACCES; d.metadata = READ_ONCE(cmd->metadata); @@ -649,13 +649,13 @@ static bool is_ctrl_ioctl(unsigned int cmd) } static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, - void __user *argp, fmode_t mode) + void __user *argp, bool open_for_write) { switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp, 0, mode); + return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, 0, mode); + return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -680,14 +680,14 @@ struct nvme_user_io32 { #endif /* COMPAT_FOR_U64_ALIGNMENT */ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, - void __user *argp, unsigned int flags, fmode_t mode) + void __user *argp, unsigned int flags, bool open_for_write) { switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); return ns->head->ns_id; case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->ctrl, ns, argp, flags, mode); + return nvme_user_cmd(ns->ctrl, ns, argp, flags, open_for_write); /* * struct nvme_user_io can have different padding on some 32-bit ABIs. * Just accept the compat version as all fields that are used are the @@ -702,7 +702,8 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, flags |= NVME_IOCTL_VEC; fallthrough; case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp, flags, mode); + return nvme_user_cmd64(ns->ctrl, ns, argp, flags, + open_for_write); default: return -ENOTTY; } @@ -712,6 +713,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; + bool open_for_write = mode & FMODE_WRITE; void __user *argp = (void __user *)arg; unsigned int flags = 0; @@ -719,19 +721,20 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode, flags |= NVME_IOCTL_PARTITION; if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode); - return nvme_ns_ioctl(ns, cmd, argp, flags, mode); + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); + return nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); } long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev); + bool open_for_write = file->f_mode & FMODE_WRITE; void __user *argp = (void __user *)arg; if (is_ctrl_ioctl(cmd)) - return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, file->f_mode); - return nvme_ns_ioctl(ns, cmd, argp, 0, file->f_mode); + return nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); + return nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); } static int nvme_uring_cmd_checks(unsigned int issue_flags) @@ -800,7 +803,7 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd, #ifdef CONFIG_NVME_MULTIPATH static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *argp, struct nvme_ns_head *head, int srcu_idx, - fmode_t mode) + bool open_for_write) __releases(&head->srcu) { struct nvme_ctrl *ctrl = ns->ctrl; @@ -808,7 +811,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, nvme_get_ctrl(ns->ctrl); srcu_read_unlock(&head->srcu, srcu_idx); - ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode); + ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, open_for_write); nvme_put_ctrl(ctrl); return ret; @@ -818,6 +821,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns_head *head = bdev->bd_disk->private_data; + bool open_for_write = mode & FMODE_WRITE; void __user *argp = (void __user *)arg; struct nvme_ns *ns; int srcu_idx, ret = -EWOULDBLOCK; @@ -838,9 +842,9 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, */ if (is_ctrl_ioctl(cmd)) return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, - mode); + open_for_write); - ret = nvme_ns_ioctl(ns, cmd, argp, flags, mode); + ret = nvme_ns_ioctl(ns, cmd, argp, flags, open_for_write); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -849,6 +853,7 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + bool open_for_write = file->f_mode & FMODE_WRITE; struct cdev *cdev = file_inode(file)->i_cdev; struct nvme_ns_head *head = container_of(cdev, struct nvme_ns_head, cdev); @@ -863,9 +868,9 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, if (is_ctrl_ioctl(cmd)) return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, - file->f_mode); + open_for_write); - ret = nvme_ns_ioctl(ns, cmd, argp, 0, file->f_mode); + ret = nvme_ns_ioctl(ns, cmd, argp, 0, open_for_write); out_unlock: srcu_read_unlock(&head->srcu, srcu_idx); return ret; @@ -940,7 +945,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) } static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, - fmode_t mode) + bool open_for_write) { struct nvme_ns *ns; int ret; @@ -964,7 +969,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, kref_get(&ns->kref); up_read(&ctrl->namespaces_rwsem); - ret = nvme_user_cmd(ctrl, ns, argp, 0, mode); + ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write); nvme_put_ns(ns); return ret; @@ -976,16 +981,17 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp, long nvme_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + bool open_for_write = file->f_mode & FMODE_WRITE; struct nvme_ctrl *ctrl = file->private_data; void __user *argp = (void __user *)arg; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ctrl, NULL, argp, 0, file->f_mode); + return nvme_user_cmd(ctrl, NULL, argp, 0, open_for_write); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp, 0, file->f_mode); + return nvme_user_cmd64(ctrl, NULL, argp, 0, open_for_write); case NVME_IOCTL_IO_CMD: - return nvme_dev_user_cmd(ctrl, argp, file->f_mode); + return nvme_dev_user_cmd(ctrl, argp, open_for_write); case NVME_IOCTL_RESET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; From 658afed19ceed54a52b9e9e69c0791c8868ff55d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:50 +0200 Subject: [PATCH 161/266] mtd: block: use a simple bool to track open for write Instead of propagating the fmode_t, just use a bool to track if a mtd block device was opened for writing. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: Richard Weinberger Link: https://lore.kernel.org/r/20230608110258.189493-23-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/mtd/mtdblock.c | 2 +- include/linux/mtd/blktrans.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index f0bb09fde95e..bd0b75453643 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -208,7 +208,7 @@ static int blktrans_open(struct gendisk *disk, fmode_t mode) ret = __get_mtd_device(dev->mtd); if (ret) goto error_release; - dev->file_mode = mode; + dev->writable = mode & FMODE_WRITE; unlock: dev->open++; diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c index a0a1194dc1d9..fa476fb4dffb 100644 --- a/drivers/mtd/mtdblock.c +++ b/drivers/mtd/mtdblock.c @@ -294,7 +294,7 @@ static void mtdblock_release(struct mtd_blktrans_dev *mbd) * It was the last usage. Free the cache, but only sync if * opened for writing. */ - if (mbd->file_mode & FMODE_WRITE) + if (mbd->writable) mtd_sync(mbd->mtd); vfree(mtdblk->cache_data); } diff --git a/include/linux/mtd/blktrans.h b/include/linux/mtd/blktrans.h index 15cc9b95e32b..6e471436bba5 100644 --- a/include/linux/mtd/blktrans.h +++ b/include/linux/mtd/blktrans.h @@ -34,7 +34,7 @@ struct mtd_blktrans_dev { struct blk_mq_tag_set *tag_set; spinlock_t queue_lock; void *priv; - fmode_t file_mode; + bool writable; }; struct mtd_blktrans_ops { From 99b07780814e89f16bec2773c237eb25121f8502 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:51 +0200 Subject: [PATCH 162/266] rnbd-srv: replace sess->open_flags with a "bool readonly" Stop passing the fmode_t around and just use a simple bool to track if an export is read-only. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: Jack Wang Link: https://lore.kernel.org/r/20230608110258.189493-24-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv-sysfs.c | 3 +-- drivers/block/rnbd/rnbd-srv.c | 15 +++++++-------- drivers/block/rnbd/rnbd-srv.h | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 4962826e9639..39b89f9b6bd9 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -87,8 +87,7 @@ static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr, sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); - return sysfs_emit(page, "%d\n", - !(sess_dev->open_flags & FMODE_WRITE)); + return sysfs_emit(page, "%d\n", sess_dev->readonly); } static struct kobj_attribute rnbd_srv_dev_session_ro_attr = diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index a909f8763ce7..591a1be370c4 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -222,7 +222,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) blkdev_put(sess_dev->bdev, NULL); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); - if (sess_dev->open_flags & FMODE_WRITE) + if (!sess_dev->readonly) sess_dev->dev->open_write_cnt--; mutex_unlock(&sess_dev->dev->lock); @@ -559,7 +559,7 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, static struct rnbd_srv_sess_dev * rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, const struct rnbd_msg_open *open_msg, - struct block_device *bdev, fmode_t open_flags, + struct block_device *bdev, bool readonly, struct rnbd_srv_dev *srv_dev) { struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); @@ -574,7 +574,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, sdev->bdev = bdev; sdev->sess = srv_sess; sdev->dev = srv_dev; - sdev->open_flags = open_flags; + sdev->readonly = readonly; sdev->access_mode = open_msg->access_mode; return sdev; @@ -677,13 +677,12 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, struct rnbd_srv_sess_dev *srv_sess_dev; const struct rnbd_msg_open *open_msg = msg; struct block_device *bdev; - fmode_t open_flags; + fmode_t open_flags = FMODE_READ; char *full_path; struct rnbd_msg_open_rsp *rsp = data; trace_process_msg_open(srv_sess, open_msg); - open_flags = FMODE_READ; if (open_msg->access_mode != RNBD_ACCESS_RO) open_flags |= FMODE_WRITE; @@ -732,9 +731,9 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, goto blkdev_put; } - srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, - bdev, open_flags, - srv_dev); + srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, bdev, + open_msg->access_mode == RNBD_ACCESS_RO, + srv_dev); if (IS_ERR(srv_sess_dev)) { pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", full_path, srv_sess->sessname, PTR_ERR(srv_sess_dev)); diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index 6b5e5ade18ae..1027656dedb0 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -52,7 +52,7 @@ struct rnbd_srv_sess_dev { struct kobject kobj; u32 device_id; bool keep_id; - fmode_t open_flags; + bool readonly; struct kref kref; struct completion *destroy_comp; char pathname[NAME_MAX]; From bd6abfc8e7898ce2163a1ffdbb9ec71a0a081267 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:52 +0200 Subject: [PATCH 163/266] ubd: remove commented out code in ubd_open This code has been dead forever, make sure it doesn't show up in code searches. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Acked-by: Richard Weinberger Link: https://lore.kernel.org/r/20230608110258.189493-25-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 8b79554968ad..20c1a16199c5 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1170,13 +1170,6 @@ static int ubd_open(struct gendisk *disk, fmode_t mode) } ubd_dev->count++; set_disk_ro(disk, !ubd_dev->openflags.w); - - /* This should no more be needed. And it didn't work anyway to exclude - * read-write remounting of filesystems.*/ - /*if((mode & FMODE_WRITE) && !ubd_dev->openflags.w){ - if(--ubd_dev->count == 0) ubd_close_dev(ubd_dev); - err = -EROFS; - }*/ out: mutex_unlock(&ubd_mutex); return err; From cfb425761c79b6056ae5bb73f8d400f03b513959 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:53 +0200 Subject: [PATCH 164/266] block: move a few internal definitions out of blkdev.h All these helpers are only used in core block code, so move them out of the public header. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-26-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 23 +++++++++++++++++++++-- include/linux/blkdev.h | 27 --------------------------- 2 files changed, 21 insertions(+), 29 deletions(-) diff --git a/block/blk.h b/block/blk.h index 9582fcd0df41..6910220aa030 100644 --- a/block/blk.h +++ b/block/blk.h @@ -394,10 +394,27 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); -#else +int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +#else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} static inline void disk_clear_zone_settings(struct gendisk *disk) {} -#endif +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} +static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); +void bdev_add(struct block_device *bdev, dev_t dev); int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); @@ -449,6 +466,8 @@ extern struct device_attribute dev_attr_events_poll_msecs; extern struct attribute_group blk_trace_attr_group; +int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, + loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 978036039020..6b65623e447c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -318,7 +318,6 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model); #ifdef CONFIG_BLK_DEV_ZONED - #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -328,33 +327,11 @@ extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, gfp_t gfp_mask); int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)); - -extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); -extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); - #else /* CONFIG_BLK_DEV_ZONED */ - static inline unsigned int bdev_nr_zones(struct block_device *bdev) { return 0; } - -static inline int blkdev_report_zones_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - return -ENOTTY; -} - #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -1493,11 +1470,7 @@ void blkdev_put(struct block_device *bdev, void *holder); struct block_device *blkdev_get_no_open(dev_t dev); void blkdev_put_no_open(struct block_device *bdev); -struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); -void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, - loff_t lend); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); From 5e4ea834676e3b8965344ca61d36e1ae236249eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:54 +0200 Subject: [PATCH 165/266] block: remove unused fmode_t arguments from ioctl handlers A few ioctl handlers have fmode_t arguments that are entirely unused, remove them. Signed-off-by: Christoph Hellwig Acked-by: Christian Brauner Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20230608110258.189493-27-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-zoned.c | 4 ++-- block/blk.h | 6 +++--- block/ioctl.c | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 096b6b47561f..02cc2c629ac9 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -323,8 +323,8 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, * BLKREPORTZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, + unsigned long arg) { void __user *argp = (void __user *)arg; struct zone_report_args args; diff --git a/block/blk.h b/block/blk.h index 6910220aa030..e28d5d67d31a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -394,15 +394,15 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); -int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); +int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, + unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} static inline void disk_clear_zone_settings(struct gendisk *disk) {} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, unsigned long arg) + unsigned int cmd, unsigned long arg) { return -ENOTTY; } diff --git a/block/ioctl.c b/block/ioctl.c index b39bd5b41ee4..3a10c34b8ef6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -344,8 +344,8 @@ static int blkdev_pr_clear(struct block_device *bdev, return ops->pr_clear(bdev, c.key); } -static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) +static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, + unsigned long arg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -354,8 +354,8 @@ static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, return 0; } -static int blkdev_roset(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) +static int blkdev_roset(struct block_device *bdev, unsigned cmd, + unsigned long arg) { int ret, n; @@ -475,9 +475,9 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKFLSBUF: - return blkdev_flushbuf(bdev, mode, cmd, arg); + return blkdev_flushbuf(bdev, cmd, arg); case BLKROSET: - return blkdev_roset(bdev, mode, cmd, arg); + return blkdev_roset(bdev, cmd, arg); case BLKDISCARD: return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: @@ -487,7 +487,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKGETDISKSEQ: return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: - return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: case BLKCLOSEZONE: From 05bdb9965305bbfdae79b31d22df03d1e2cfcb22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:55 +0200 Subject: [PATCH 166/266] block: replace fmode_t with a block-specific type for block open flags The only overlap between the block open flags mapped into the fmode_t and other uses of fmode_t are FMODE_READ and FMODE_WRITE. Define a new blk_mode_t instead for use in blkdev_get_by_{dev,path}, ->open and ->ioctl and stop abusing fmode_t. Signed-off-by: Christoph Hellwig Acked-by: Jack Wang [rnbd] Reviewed-by: Hannes Reinecke Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-28-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 8 +++--- arch/xtensa/platforms/iss/simdisk.c | 2 +- block/bdev.c | 32 +++++++++++----------- block/blk-zoned.c | 8 +++--- block/blk.h | 11 ++++---- block/fops.c | 32 +++++++++++++++++----- block/genhd.c | 8 +++--- block/ioctl.c | 42 +++++++++-------------------- drivers/block/amiflop.c | 12 ++++----- drivers/block/aoe/aoeblk.c | 4 +-- drivers/block/ataflop.c | 25 +++++++++-------- drivers/block/drbd/drbd_main.c | 7 ++--- drivers/block/drbd/drbd_nl.c | 2 +- drivers/block/floppy.c | 28 +++++++++---------- drivers/block/loop.c | 22 +++++++-------- drivers/block/mtip32xx/mtip32xx.c | 4 +-- drivers/block/nbd.c | 4 +-- drivers/block/pktcdvd.c | 17 ++++++------ drivers/block/rbd.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 4 +-- drivers/block/rnbd/rnbd-srv.c | 4 +-- drivers/block/sunvdc.c | 2 +- drivers/block/swim.c | 16 +++++------ drivers/block/swim3.c | 24 ++++++++--------- drivers/block/ublk_drv.c | 2 +- drivers/block/xen-blkback/xenbus.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/block/z2ram.c | 2 +- drivers/block/zram/zram_drv.c | 6 ++--- drivers/cdrom/cdrom.c | 6 ++--- drivers/cdrom/gdrom.c | 4 +-- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/request.c | 4 +-- drivers/md/bcache/super.c | 6 ++--- drivers/md/dm-cache-target.c | 12 ++++----- drivers/md/dm-clone-target.c | 10 +++---- drivers/md/dm-core.h | 7 +++-- drivers/md/dm-era-target.c | 6 +++-- drivers/md/dm-ioctl.c | 10 +++---- drivers/md/dm-snap.c | 4 +-- drivers/md/dm-table.c | 11 ++++---- drivers/md/dm-thin.c | 9 ++++--- drivers/md/dm-verity-fec.c | 2 +- drivers/md/dm-verity-target.c | 6 ++--- drivers/md/dm.c | 10 +++---- drivers/md/dm.h | 2 +- drivers/md/md.c | 8 +++--- drivers/mmc/core/block.c | 8 +++--- drivers/mtd/devices/block2mtd.c | 4 +-- drivers/mtd/mtd_blkdevs.c | 4 +-- drivers/mtd/ubi/block.c | 5 ++-- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/ioctl.c | 8 +++--- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 4 +-- drivers/nvme/target/io-cmd-bdev.c | 2 +- drivers/s390/block/dasd.c | 6 ++--- drivers/s390/block/dasd_genhd.c | 3 ++- drivers/s390/block/dasd_int.h | 3 ++- drivers/s390/block/dasd_ioctl.c | 2 +- drivers/s390/block/dcssblk.c | 4 +-- drivers/scsi/sd.c | 19 ++++++------- drivers/scsi/sr.c | 10 +++---- drivers/target/target_core_iblock.c | 5 ++-- drivers/target/target_core_pscsi.c | 4 +-- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/super.c | 8 +++--- fs/btrfs/volumes.c | 16 +++++------ fs/btrfs/volumes.h | 4 +-- fs/erofs/super.c | 2 +- fs/ext4/super.c | 2 +- fs/f2fs/super.c | 2 +- fs/jfs/jfs_logmgr.c | 2 +- fs/nfs/blocklayout/dev.c | 5 ++-- fs/ocfs2/cluster/heartbeat.c | 3 ++- fs/reiserfs/journal.c | 4 +-- fs/xfs/xfs_super.c | 2 +- include/linux/blkdev.h | 30 ++++++++++++++++----- include/linux/cdrom.h | 3 ++- include/linux/device-mapper.h | 8 +++--- kernel/power/swap.c | 6 ++--- mm/swapfile.c | 2 +- 82 files changed, 334 insertions(+), 315 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 20c1a16199c5..50206feac577 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,9 +108,9 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) static DEFINE_MUTEX(ubd_lock); static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ -static int ubd_open(struct gendisk *disk, fmode_t mode); +static int ubd_open(struct gendisk *disk, blk_mode_t mode); static void ubd_release(struct gendisk *disk); -static int ubd_ioctl(struct block_device *bdev, fmode_t mode, +static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -1154,7 +1154,7 @@ static int __init ubd_driver_init(void){ device_initcall(ubd_driver_init); -static int ubd_open(struct gendisk *disk, fmode_t mode) +static int ubd_open(struct gendisk *disk, blk_mode_t mode) { struct ubd *ubd_dev = disk->private_data; int err = 0; @@ -1389,7 +1389,7 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static int ubd_ioctl(struct block_device *bdev, fmode_t mode, +static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct ubd *ubd_dev = bdev->bd_disk->private_data; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 2ad9da3de0d9..178cf96ca10a 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -120,7 +120,7 @@ static void simdisk_submit_bio(struct bio *bio) bio_endio(bio); } -static int simdisk_open(struct gendisk *disk, fmode_t mode) +static int simdisk_open(struct gendisk *disk, blk_mode_t mode) { struct simdisk *dev = disk->private_data; diff --git a/block/bdev.c b/block/bdev.c index db63e5bcc46f..bd558a9ba3cd 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -93,7 +93,7 @@ EXPORT_SYMBOL(invalidate_bdev); * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). */ -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, +int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend) { /* @@ -101,14 +101,14 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ - if (!(mode & FMODE_EXCL)) { + if (!(mode & BLK_OPEN_EXCL)) { int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; @@ -647,7 +647,7 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } -static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) +static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret; @@ -679,7 +679,7 @@ static void blkdev_put_whole(struct block_device *bdev) bdev->bd_disk->fops->release(bdev->bd_disk); } -static int blkdev_get_part(struct block_device *part, fmode_t mode) +static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; int ret; @@ -743,11 +743,11 @@ void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } - + /** * blkdev_get_by_dev - open a block device by device number * @dev: device number of block device to open - * @mode: FMODE_* mask + * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * @hops: holder operations * @@ -765,7 +765,7 @@ void blkdev_put_no_open(struct block_device *bdev) * RETURNS: * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, +struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { bool unblock_events = true; @@ -775,8 +775,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), - ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | - ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); + ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | + ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) return ERR_PTR(ret); @@ -786,12 +786,12 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, disk = bdev->bd_disk; if (holder) { - mode |= FMODE_EXCL; + mode |= BLK_OPEN_EXCL; ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) goto put_blkdev; } else { - if (WARN_ON_ONCE(mode & FMODE_EXCL)) { + if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { ret = -EIO; goto put_blkdev; } @@ -821,7 +821,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, * writeable reference is too fragile given the way @mode is * used in blkdev_get/put(). */ - if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && + if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder && (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; unblock_events = false; @@ -848,7 +848,7 @@ EXPORT_SYMBOL(blkdev_get_by_dev); /** * blkdev_get_by_path - open a block device by name * @path: path to the block device to open - * @mode: FMODE_* mask + * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier * * Open the block device described by the device file at @path. If @holder is @@ -861,7 +861,7 @@ EXPORT_SYMBOL(blkdev_get_by_dev); * RETURNS: * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { struct block_device *bdev; @@ -873,7 +873,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, return ERR_PTR(error); bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { + if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, holder); return ERR_PTR(-EACCES); } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 02cc2c629ac9..0f9f97cdddd9 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -356,8 +356,8 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, return 0; } -static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, - const struct blk_zone_range *zrange) +static int blkdev_truncate_zone_range(struct block_device *bdev, + blk_mode_t mode, const struct blk_zone_range *zrange) { loff_t start, end; @@ -376,7 +376,7 @@ static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -390,7 +390,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!bdev_is_zoned(bdev)) return -ENOTTY; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) diff --git a/block/blk.h b/block/blk.h index e28d5d67d31a..768852a84fef 100644 --- a/block/blk.h +++ b/block/blk.h @@ -396,7 +396,7 @@ void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); -int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} @@ -407,7 +407,7 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev, return -ENOTTY; } static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, unsigned long arg) + blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; } @@ -451,7 +451,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page) struct request_queue *blk_alloc_queue(int node_id); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode); +int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); @@ -466,8 +466,9 @@ extern struct device_attribute dev_attr_events_poll_msecs; extern struct attribute_group blk_trace_attr_group; -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, - loff_t lend); +blk_mode_t file_to_blk_mode(struct file *file); +int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, + loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/block/fops.c b/block/fops.c index 9f26e25bafa1..086612103b9d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -470,6 +470,30 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, return error; } +blk_mode_t file_to_blk_mode(struct file *file) +{ + blk_mode_t mode = 0; + + if (file->f_mode & FMODE_READ) + mode |= BLK_OPEN_READ; + if (file->f_mode & FMODE_WRITE) + mode |= BLK_OPEN_WRITE; + if (file->f_mode & FMODE_EXCL) + mode |= BLK_OPEN_EXCL; + if (file->f_flags & O_NDELAY) + mode |= BLK_OPEN_NDELAY; + + /* + * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy + * driver has historically allowed ioctls as if the file was opened for + * writing, but does not allow and actual reads or writes. + */ + if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) + mode |= BLK_OPEN_WRITE_IOCTL; + + return mode; +} + static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; @@ -483,14 +507,10 @@ static int blkdev_open(struct inode *inode, struct file *filp) filp->f_flags |= O_LARGEFILE; filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - if (filp->f_flags & O_NDELAY) - filp->f_mode |= FMODE_NDELAY; if (filp->f_flags & O_EXCL) filp->f_mode |= FMODE_EXCL; - if ((filp->f_flags & O_ACCMODE) == 3) - filp->f_mode |= FMODE_WRITE_IOCTL; - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, + bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp), (filp->f_mode & FMODE_EXCL) ? filp : NULL, NULL); if (IS_ERR(bdev)) @@ -648,7 +668,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, filemap_invalidate_lock(inode->i_mapping); /* Invalidate the page cache, including dirty pages. */ - error = truncate_bdev_range(bdev, file->f_mode, start, end); + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; diff --git a/block/genhd.c b/block/genhd.c index b56f8b5c88b3..2c2f9a716822 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -339,7 +339,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode) +int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { struct block_device *bdev; int ret = 0; @@ -357,7 +357,7 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) * synchronize with other exclusive openers and other partition * scanners. */ - if (!(mode & FMODE_EXCL)) { + if (!(mode & BLK_OPEN_EXCL)) { ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, NULL); if (ret) @@ -377,7 +377,7 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) * creat partition for underlying disk. */ clear_bit(GD_NEED_PART_SCAN, &disk->state); - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(disk->part0, disk_scan_partitions); return ret; } @@ -505,7 +505,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) - disk_scan_partitions(disk, FMODE_READ); + disk_scan_partitions(disk, BLK_OPEN_READ); /* * Announce the disk and partitions after all partitions are diff --git a/block/ioctl.c b/block/ioctl.c index 3a10c34b8ef6..61bb94fd4281 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -82,7 +82,7 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, +static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; @@ -90,7 +90,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, struct inode *inode = bdev->bd_inode; int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_discard_sectors(bdev)) @@ -120,14 +120,14 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, return err; } -static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, +static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { uint64_t start, len; uint64_t range[2]; int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_secure_erase_sectors(bdev)) return -EOPNOTSUPP; @@ -151,7 +151,7 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, } -static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, +static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; @@ -159,7 +159,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, struct inode *inode = bdev->bd_inode; int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(range, (void __user *)arg, sizeof(range))) @@ -240,7 +240,7 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) * drivers that implement only commands that are completely compatible * between 32-bit and 64-bit user space */ -int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode, +int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; @@ -439,7 +439,7 @@ static int compat_hdio_getgeo(struct block_device *bdev, #endif /* set the logical block size */ -static int blkdev_bszset(struct block_device *bdev, fmode_t mode, +static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, int __user *argp) { int ret, n; @@ -451,7 +451,7 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, if (get_user(n, argp)) return -EFAULT; - if (mode & FMODE_EXCL) + if (mode & BLK_OPEN_EXCL) return set_blocksize(bdev, n); if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL))) @@ -467,7 +467,7 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ -static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, +static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg, void __user *argp) { @@ -560,18 +560,9 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; - fmode_t mode = file->f_mode; + blk_mode_t mode = file_to_blk_mode(file); int ret; - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: @@ -630,16 +621,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) void __user *argp = compat_ptr(arg); struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; + blk_mode_t mode = file_to_blk_mode(file); switch (cmd) { /* These need separate implementations for the data structure */ diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 9a0e9dc74a8c..e460c9799d9f 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1532,7 +1532,7 @@ static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, +static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param) { struct amiga_floppy_struct *p = bdev->bd_disk->private_data; @@ -1607,7 +1607,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int fd_ioctl(struct block_device *bdev, fmode_t mode, +static int fd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param) { int ret; @@ -1654,7 +1654,7 @@ static void fd_probe(int dev) * /dev/PS0 etc), and disallows simultaneous access to the same * drive with different device numbers. */ -static int floppy_open(struct gendisk *disk, fmode_t mode) +static int floppy_open(struct gendisk *disk, blk_mode_t mode) { int drive = disk->first_minor & 3; int system = (disk->first_minor & 4) >> 2; @@ -1673,10 +1673,9 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) mutex_unlock(&amiflop_mutex); return -ENXIO; } - - if (mode & (FMODE_READ|FMODE_WRITE)) { + if (mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) { disk_check_media_change(disk); - if (mode & FMODE_WRITE) { + if (mode & BLK_OPEN_WRITE) { int wrprot; get_fdc(drive); @@ -1691,7 +1690,6 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) } } } - local_irq_save(flags); fd_ref[drive]++; fd_device[drive] = system; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index c3a39e02ab95..cf6883756155 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -204,7 +204,7 @@ aoedisk_rm_debugfs(struct aoedev *d) } static int -aoeblk_open(struct gendisk *disk, fmode_t mode) +aoeblk_open(struct gendisk *disk, blk_mode_t mode) { struct aoedev *d = disk->private_data; ulong flags; @@ -285,7 +285,7 @@ aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static int -aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg) +aoeblk_ioctl(struct block_device *bdev, blk_mode_t mode, uint cmd, ulong arg) { struct aoedev *d; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 66a3242bb062..cd738cab725f 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -442,12 +442,12 @@ static void fd_times_out(struct timer_list *unused); static void finish_fdc( void ); static void finish_fdc_done( int dummy ); static void setup_req_params( int drive ); -static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int - cmd, unsigned long param); +static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long param); static void fd_probe( int drive ); static int fd_test_drive_present( int drive ); static void config_types( void ); -static int floppy_open(struct gendisk *disk, fmode_t mode); +static int floppy_open(struct gendisk *disk, blk_mode_t mode); static void floppy_release(struct gendisk *disk); /************************* End of Prototypes **************************/ @@ -1581,7 +1581,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, +static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param) { struct gendisk *disk = bdev->bd_disk; @@ -1768,7 +1768,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, } } -static int fd_ioctl(struct block_device *bdev, fmode_t mode, +static int fd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { int ret; @@ -1915,7 +1915,7 @@ static void __init config_types( void ) * drive with different device numbers. */ -static int floppy_open(struct gendisk *disk, fmode_t mode) +static int floppy_open(struct gendisk *disk, blk_mode_t mode) { struct atari_floppy_struct *p = disk->private_data; int type = disk->first_minor >> 2; @@ -1924,23 +1924,22 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) if (p->ref && p->type != type) return -EBUSY; - if (p->ref == -1 || (p->ref && mode & FMODE_EXCL)) + if (p->ref == -1 || (p->ref && mode & BLK_OPEN_EXCL)) return -EBUSY; - - if (mode & FMODE_EXCL) + if (mode & BLK_OPEN_EXCL) p->ref = -1; else p->ref++; p->type = type; - if (mode & FMODE_NDELAY) + if (mode & BLK_OPEN_NDELAY) return 0; - if (mode & (FMODE_READ|FMODE_WRITE)) { + if (mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) { if (disk_check_media_change(disk)) floppy_revalidate(disk); - if (mode & FMODE_WRITE) { + if (mode & BLK_OPEN_WRITE) { if (p->wpstat) { if (p->ref < 0) p->ref = 0; @@ -1953,7 +1952,7 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) return 0; } -static int floppy_unlocked_open(struct gendisk *disk, fmode_t mode) +static int floppy_unlocked_open(struct gendisk *disk, blk_mode_t mode) { int ret; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 7f3d7ca6ce6b..965f672557f2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -49,7 +49,7 @@ #include "drbd_debugfs.h" static DEFINE_MUTEX(drbd_main_mutex); -static int drbd_open(struct gendisk *disk, fmode_t mode); +static int drbd_open(struct gendisk *disk, blk_mode_t mode); static void drbd_release(struct gendisk *gd); static void md_sync_timer_fn(struct timer_list *t); static int w_bitmap_io(struct drbd_work *w, int unused); @@ -1882,7 +1882,7 @@ int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void return 0; } -static int drbd_open(struct gendisk *disk, fmode_t mode) +static int drbd_open(struct gendisk *disk, blk_mode_t mode) { struct drbd_device *device = disk->private_data; unsigned long flags; @@ -1894,7 +1894,7 @@ static int drbd_open(struct gendisk *disk, fmode_t mode) * and no race with updating open_cnt */ if (device->state.role != R_PRIMARY) { - if (mode & FMODE_WRITE) + if (mode & BLK_OPEN_WRITE) rv = -EROFS; else if (!drbd_allow_oos) rv = -EMEDIUMTYPE; @@ -1911,6 +1911,7 @@ static int drbd_open(struct gendisk *disk, fmode_t mode) static void drbd_release(struct gendisk *gd) { struct drbd_device *device = gd->private_data; + mutex_lock(&drbd_main_mutex); device->open_cnt--; mutex_unlock(&drbd_main_mutex); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 10b1e5171332..cddae6f4b00f 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1640,7 +1640,7 @@ static struct block_device *open_backing_dev(struct drbd_device *device, struct block_device *bdev; int err = 0; - bdev = blkdev_get_by_path(bdev_path, FMODE_READ | FMODE_WRITE, + bdev = blkdev_get_by_path(bdev_path, BLK_OPEN_READ | BLK_OPEN_WRITE, claim_ptr, NULL); if (IS_ERR(bdev)) { drbd_err(device, "open(\"%s\") failed with %ld\n", diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index d79fac288a73..2db9b186b977 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3394,8 +3394,8 @@ static bool valid_floppy_drive_params(const short autodetect[FD_AUTODETECT_SIZE] return true; } -static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long param) +static int fd_locked_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long param) { int drive = (long)bdev->bd_disk->private_data; int type = ITYPE(drive_state[drive].fd_device); @@ -3428,7 +3428,8 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int return ret; /* permission checks */ - if (((cmd & 0x40) && !(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) || + if (((cmd & 0x40) && + !(mode & (BLK_OPEN_WRITE | BLK_OPEN_WRITE_IOCTL))) || ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))) return -EPERM; @@ -3566,7 +3567,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int return 0; } -static int fd_ioctl(struct block_device *bdev, fmode_t mode, +static int fd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param) { int ret; @@ -3654,8 +3655,8 @@ struct compat_floppy_write_errors { #define FDGETFDCSTAT32 _IOR(2, 0x15, struct compat_floppy_fdc_state) #define FDWERRORGET32 _IOR(2, 0x17, struct compat_floppy_write_errors) -static int compat_set_geometry(struct block_device *bdev, fmode_t mode, unsigned int cmd, - struct compat_floppy_struct __user *arg) +static int compat_set_geometry(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, struct compat_floppy_struct __user *arg) { struct floppy_struct v; int drive, type; @@ -3664,7 +3665,7 @@ static int compat_set_geometry(struct block_device *bdev, fmode_t mode, unsigned BUILD_BUG_ON(offsetof(struct floppy_struct, name) != offsetof(struct compat_floppy_struct, name)); - if (!(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) + if (!(mode & (BLK_OPEN_WRITE | BLK_OPEN_WRITE_IOCTL))) return -EPERM; memset(&v, 0, sizeof(struct floppy_struct)); @@ -3861,8 +3862,8 @@ static int compat_werrorget(int drive, return 0; } -static int fd_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long param) +static int fd_compat_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long param) { int drive = (long)bdev->bd_disk->private_data; switch (cmd) { @@ -3984,7 +3985,7 @@ static void floppy_release(struct gendisk *disk) * /dev/PS0 etc), and disallows simultaneous access to the same * drive with different device numbers. */ -static int floppy_open(struct gendisk *disk, fmode_t mode) +static int floppy_open(struct gendisk *disk, blk_mode_t mode) { int drive = (long)disk->private_data; int old_dev, new_dev; @@ -4049,9 +4050,8 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) if (fdc_state[FDC(drive)].rawcmd == 1) fdc_state[FDC(drive)].rawcmd = 2; - - if (!(mode & FMODE_NDELAY)) { - if (mode & (FMODE_READ|FMODE_WRITE)) { + if (!(mode & BLK_OPEN_NDELAY)) { + if (mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) { drive_state[drive].last_checked = 0; clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags); @@ -4063,7 +4063,7 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) goto out; } res = -EROFS; - if ((mode & FMODE_WRITE) && + if ((mode & BLK_OPEN_WRITE) && !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags)) goto out; } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ca40d24572ae..37511d2b2caf 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -990,7 +990,7 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } -static int loop_configure(struct loop_device *lo, fmode_t mode, +static int loop_configure(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, const struct loop_config *config) { @@ -1014,7 +1014,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, * If we don't hold exclusive handle for the device, upgrade to it * here to avoid changing device under exclusive owner. */ - if (!(mode & FMODE_EXCL)) { + if (!(mode & BLK_OPEN_EXCL)) { error = bd_prepare_to_claim(bdev, loop_configure, NULL); if (error) goto out_putf; @@ -1050,7 +1050,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (error) goto out_unlock; - if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || + if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || !file->f_op->write_iter) lo->lo_flags |= LO_FLAGS_READ_ONLY; @@ -1116,7 +1116,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, if (partscan) loop_reread_partitions(lo); - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); return 0; @@ -1124,7 +1124,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, out_unlock: loop_global_unlock(lo, is_loop); out_bdev: - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, loop_configure); out_putf: fput(file); @@ -1528,7 +1528,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd, return err; } -static int lo_ioctl(struct block_device *bdev, fmode_t mode, +static int lo_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; @@ -1563,24 +1563,22 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, return loop_clr_fd(lo); case LOOP_SET_STATUS: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status_old(lo, argp); - } break; case LOOP_GET_STATUS: return loop_get_status_old(lo, argp); case LOOP_SET_STATUS64: err = -EPERM; - if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) { + if ((mode & BLK_OPEN_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_status64(lo, argp); - } break; case LOOP_GET_STATUS64: return loop_get_status64(lo, argp); case LOOP_SET_CAPACITY: case LOOP_SET_DIRECT_IO: case LOOP_SET_BLOCK_SIZE: - if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN)) + if (!(mode & BLK_OPEN_WRITE) && !capable(CAP_SYS_ADMIN)) return -EPERM; fallthrough; default: @@ -1691,7 +1689,7 @@ loop_get_status_compat(struct loop_device *lo, return err; } -static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, +static int lo_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct loop_device *lo = bdev->bd_disk->private_data; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 815d77ba6381..b200950e8fb5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3041,7 +3041,7 @@ static int rssd_disk_name_format(char *prefix, * structure pointer. */ static int mtip_block_ioctl(struct block_device *dev, - fmode_t mode, + blk_mode_t mode, unsigned cmd, unsigned long arg) { @@ -3079,7 +3079,7 @@ static int mtip_block_ioctl(struct block_device *dev, * structure pointer. */ static int mtip_block_compat_ioctl(struct block_device *dev, - fmode_t mode, + blk_mode_t mode, unsigned cmd, unsigned long arg) { diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index cfb835238684..8576d696c7a2 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1502,7 +1502,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return -ENOTTY; } -static int nbd_ioctl(struct block_device *bdev, fmode_t mode, +static int nbd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct nbd_device *nbd = bdev->bd_disk->private_data; @@ -1553,7 +1553,7 @@ static struct nbd_config *nbd_alloc_config(void) return config; } -static int nbd_open(struct gendisk *disk, fmode_t mode) +static int nbd_open(struct gendisk *disk, blk_mode_t mode) { struct nbd_device *nbd; int ret = 0; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index c3299e49edd5..a1428538bda5 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2154,7 +2154,7 @@ static int pkt_open_write(struct pktcdvd_device *pd) /* * called at open time. */ -static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) +static int pkt_open_dev(struct pktcdvd_device *pd, bool write) { struct device *ddev = disk_to_dev(pd->disk); int ret; @@ -2167,7 +2167,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * to read/write from/to it. It is already opened in O_NONBLOCK mode * so open should not fail. */ - bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ, pd, NULL); + bdev = blkdev_get_by_dev(pd->bdev->bd_dev, BLK_OPEN_READ, pd, NULL); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -2247,7 +2247,7 @@ static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) return pkt_devs[dev_minor]; } -static int pkt_open(struct gendisk *disk, fmode_t mode) +static int pkt_open(struct gendisk *disk, blk_mode_t mode) { struct pktcdvd_device *pd = NULL; int ret; @@ -2263,13 +2263,13 @@ static int pkt_open(struct gendisk *disk, fmode_t mode) pd->refcnt++; if (pd->refcnt > 1) { - if ((mode & FMODE_WRITE) && + if ((mode & BLK_OPEN_WRITE) && !test_bit(PACKET_WRITABLE, &pd->flags)) { ret = -EBUSY; goto out_dec; } } else { - ret = pkt_open_dev(pd, mode & FMODE_WRITE); + ret = pkt_open_dev(pd, mode & BLK_OPEN_WRITE); if (ret) goto out_dec; /* @@ -2278,7 +2278,6 @@ static int pkt_open(struct gendisk *disk, fmode_t mode) */ set_blocksize(disk->part0, CD_FRAMESIZE); } - mutex_unlock(&ctl_mutex); mutex_unlock(&pktcdvd_mutex); return 0; @@ -2514,7 +2513,8 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } } - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL, NULL); + bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_NDELAY, NULL, + NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); sdev = scsi_device_from_queue(bdev->bd_disk->queue); @@ -2550,7 +2550,8 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) return -ENOMEM; } -static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +static int pkt_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long arg) { struct pktcdvd_device *pd = bdev->bd_disk->private_data; struct device *ddev = disk_to_dev(pd->disk); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 5215eff94fe9..39f2903fe25f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -660,7 +660,7 @@ static bool pending_result_dec(struct pending_result *pending, int *result) return true; } -static int rbd_open(struct gendisk *disk, fmode_t mode) +static int rbd_open(struct gendisk *disk, blk_mode_t mode) { struct rbd_device *rbd_dev = disk->private_data; bool removing = false; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index d5261d36d786..b0550b68645d 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -921,11 +921,11 @@ rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first) return sess; } -static int rnbd_client_open(struct gendisk *disk, fmode_t mode) +static int rnbd_client_open(struct gendisk *disk, blk_mode_t mode) { struct rnbd_clt_dev *dev = disk->private_data; - if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE)) + if (get_disk_ro(dev->gd) && (mode & BLK_OPEN_WRITE)) return -EPERM; if (dev->dev_state == DEV_STATE_UNMAPPED || diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 591a1be370c4..c186df0ec641 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -677,14 +677,14 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, struct rnbd_srv_sess_dev *srv_sess_dev; const struct rnbd_msg_open *open_msg = msg; struct block_device *bdev; - fmode_t open_flags = FMODE_READ; + blk_mode_t open_flags = BLK_OPEN_READ; char *full_path; struct rnbd_msg_open_rsp *rsp = data; trace_process_msg_open(srv_sess, open_msg); if (open_msg->access_mode != RNBD_ACCESS_RO) - open_flags |= FMODE_WRITE; + open_flags |= BLK_OPEN_WRITE; mutex_lock(&srv_sess->lock); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 9fa821fa76b0..7bf4b48e2282 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -139,7 +139,7 @@ static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) * when vdisk_mtype is VD_MEDIA_TYPE_CD or VD_MEDIA_TYPE_DVD. * Needed to be able to install inside an ldom from an iso image. */ -static int vdc_ioctl(struct block_device *bdev, fmode_t mode, +static int vdc_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned command, unsigned long argument) { struct vdc_port *port = bdev->bd_disk->private_data; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index a629b38dec66..651009b3a601 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -608,20 +608,18 @@ static void setup_medium(struct floppy_state *fs) } } -static int floppy_open(struct gendisk *disk, fmode_t mode) +static int floppy_open(struct gendisk *disk, blk_mode_t mode) { struct floppy_state *fs = disk->private_data; struct swim __iomem *base = fs->swd->base; int err; - if (fs->ref_count == -1 || (fs->ref_count && mode & FMODE_EXCL)) + if (fs->ref_count == -1 || (fs->ref_count && mode & BLK_OPEN_EXCL)) return -EBUSY; - - if (mode & FMODE_EXCL) + if (mode & BLK_OPEN_EXCL) fs->ref_count = -1; else fs->ref_count++; - swim_write(base, setup, S_IBM_DRIVE | S_FCLK_DIV2); udelay(10); swim_drive(base, fs->location); @@ -636,10 +634,10 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) set_capacity(fs->disk, fs->total_secs); - if (mode & FMODE_NDELAY) + if (mode & BLK_OPEN_NDELAY) return 0; - if (mode & (FMODE_READ|FMODE_WRITE)) { + if (mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) { if (disk_check_media_change(disk) && fs->disk_in) fs->ejected = 0; if ((mode & FMODE_WRITE) && fs->write_protected) { @@ -659,7 +657,7 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) return err; } -static int floppy_unlocked_open(struct gendisk *disk, fmode_t mode) +static int floppy_unlocked_open(struct gendisk *disk, blk_mode_t mode) { int ret; @@ -686,7 +684,7 @@ static void floppy_release(struct gendisk *disk) mutex_unlock(&swim_mutex); } -static int floppy_ioctl(struct block_device *bdev, fmode_t mode, +static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param) { struct floppy_state *fs = bdev->bd_disk->private_data; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index b696deff3d8b..945a03154250 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -246,9 +246,9 @@ static int grab_drive(struct floppy_state *fs, enum swim_state state, int interruptible); static void release_drive(struct floppy_state *fs); static int fd_eject(struct floppy_state *fs); -static int floppy_ioctl(struct block_device *bdev, fmode_t mode, +static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param); -static int floppy_open(struct gendisk *disk, fmode_t mode); +static int floppy_open(struct gendisk *disk, blk_mode_t mode); static unsigned int floppy_check_events(struct gendisk *disk, unsigned int clearing); static int floppy_revalidate(struct gendisk *disk); @@ -882,7 +882,7 @@ static int fd_eject(struct floppy_state *fs) static struct floppy_struct floppy_type = { 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,NULL }; /* 7 1.44MB 3.5" */ -static int floppy_locked_ioctl(struct block_device *bdev, fmode_t mode, +static int floppy_locked_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long param) { struct floppy_state *fs = bdev->bd_disk->private_data; @@ -910,7 +910,7 @@ static int floppy_locked_ioctl(struct block_device *bdev, fmode_t mode, return -ENOTTY; } -static int floppy_ioctl(struct block_device *bdev, fmode_t mode, +static int floppy_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param) { int ret; @@ -922,7 +922,7 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode, return ret; } -static int floppy_open(struct gendisk *disk, fmode_t mode) +static int floppy_open(struct gendisk *disk, blk_mode_t mode) { struct floppy_state *fs = disk->private_data; struct swim3 __iomem *sw = fs->swim3; @@ -957,18 +957,18 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) swim3_action(fs, SETMFM); swim3_select(fs, RELAX); - } else if (fs->ref_count == -1 || mode & FMODE_EXCL) + } else if (fs->ref_count == -1 || mode & BLK_OPEN_EXCL) return -EBUSY; - if (err == 0 && (mode & FMODE_NDELAY) == 0 - && (mode & (FMODE_READ|FMODE_WRITE))) { + if (err == 0 && !(mode & BLK_OPEN_NDELAY) && + (mode & (BLK_OPEN_READ | BLK_OPEN_WRITE))) { if (disk_check_media_change(disk)) floppy_revalidate(disk); if (fs->ejected) err = -ENXIO; } - if (err == 0 && (mode & FMODE_WRITE)) { + if (err == 0 && (mode & BLK_OPEN_WRITE)) { if (fs->write_prot < 0) fs->write_prot = swim3_readbit(fs, WRITE_PROT); if (fs->write_prot) @@ -984,7 +984,7 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) return err; } - if (mode & FMODE_EXCL) + if (mode & BLK_OPEN_EXCL) fs->ref_count = -1; else ++fs->ref_count; @@ -992,12 +992,12 @@ static int floppy_open(struct gendisk *disk, fmode_t mode) return 0; } -static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +static int floppy_unlocked_open(struct gendisk *disk, blk_mode_t mode) { int ret; mutex_lock(&swim3_mutex); - ret = floppy_open(bdev, mode); + ret = floppy_open(disk, mode); mutex_unlock(&swim3_mutex); return ret; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 92c900ac2ebc..9fdc4c7f908d 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -447,7 +447,7 @@ static void ublk_store_owner_uid_gid(unsigned int *owner_uid, *owner_gid = from_kgid(&init_user_ns, gid); } -static int ublk_open(struct gendisk *disk, fmode_t mode) +static int ublk_open(struct gendisk *disk, blk_mode_t mode) { struct ublk_device *ub = disk->private_data; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 141b60aad570..bb66178c432b 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -492,7 +492,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, vbd->pdevice = MKDEV(major, minor); bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? - FMODE_READ : FMODE_WRITE, NULL, NULL); + BLK_OPEN_READ : BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(bdev)) { pr_warn("xen_vbd_create: device %08x could not be opened\n", diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 23ed258b57f0..52e74adbaad6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -509,7 +509,7 @@ static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) return 0; } -static int blkif_ioctl(struct block_device *bdev, fmode_t mode, +static int blkif_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned command, unsigned long argument) { struct blkfront_info *info = bdev->bd_disk->private_data; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index a2e41cc084ca..11493167b0a8 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -140,7 +140,7 @@ static void get_chipram(void) return; } -static int z2_open(struct gendisk *disk, fmode_t mode) +static int z2_open(struct gendisk *disk, blk_mode_t mode) { int device = disk->first_minor; int max_z2_map = (Z2RAM_SIZE / Z2RAM_CHUNKSIZE) * sizeof(z2ram_map[0]); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 21615d67a9bd..1867f378b319 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -507,8 +507,8 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - bdev = blkdev_get_by_dev(inode->i_rdev, FMODE_READ | FMODE_WRITE, zram, - NULL); + bdev = blkdev_get_by_dev(inode->i_rdev, BLK_OPEN_READ | BLK_OPEN_WRITE, + zram, NULL); if (IS_ERR(bdev)) { err = PTR_ERR(bdev); bdev = NULL; @@ -2097,7 +2097,7 @@ static ssize_t reset_store(struct device *dev, return len; } -static int zram_open(struct gendisk *disk, fmode_t mode) +static int zram_open(struct gendisk *disk, blk_mode_t mode) { struct zram *zram = disk->private_data; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 998b03fe976e..bd8cd59c758a 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -1146,7 +1146,7 @@ int open_for_data(struct cdrom_device_info *cdi) * is in their own interest: device control becomes a lot easier * this way. */ -int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode) +int cdrom_open(struct cdrom_device_info *cdi, blk_mode_t mode) { int ret; @@ -1155,7 +1155,7 @@ int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode) /* if this was a O_NONBLOCK open and we should honor the flags, * do a quick open without drive/disc integrity checks. */ cdi->use_count++; - if ((mode & FMODE_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) { + if ((mode & BLK_OPEN_NDELAY) && (cdi->options & CDO_USE_FFLAGS)) { ret = cdi->ops->open(cdi, 1); } else { ret = open_for_data(cdi); @@ -1163,7 +1163,7 @@ int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode) goto err; if (CDROM_CAN(CDC_GENERIC_PACKET)) cdrom_mmc3_profile(cdi); - if (mode & FMODE_WRITE) { + if (mode & BLK_OPEN_WRITE) { ret = -EROFS; if (cdrom_open_write(cdi)) goto err_release; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index dac148d4d1fe..3a46e27479ff 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -474,7 +474,7 @@ static const struct cdrom_device_ops gdrom_ops = { CDC_RESET | CDC_DRIVE_STATUS | CDC_CD_R, }; -static int gdrom_bdops_open(struct gendisk *disk, fmode_t mode) +static int gdrom_bdops_open(struct gendisk *disk, blk_mode_t mode) { int ret; @@ -499,7 +499,7 @@ static unsigned int gdrom_bdops_check_events(struct gendisk *disk, return cdrom_check_events(gd.cd_info, clearing); } -static int gdrom_bdops_ioctl(struct block_device *bdev, fmode_t mode, +static int gdrom_bdops_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { int ret; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index aebb7ef10e63..700dc5588d5f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -275,7 +275,7 @@ struct bcache_device { int (*cache_miss)(struct btree *b, struct search *s, struct bio *bio, unsigned int sectors); - int (*ioctl)(struct bcache_device *d, fmode_t mode, + int (*ioctl)(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg); }; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 67a2e29e0b40..a9b1f3896249 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1228,7 +1228,7 @@ void cached_dev_submit_bio(struct bio *bio) detached_dev_do_request(d, bio, orig_bdev, start_time); } -static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, +static int cached_dev_ioctl(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); @@ -1318,7 +1318,7 @@ void flash_dev_submit_bio(struct bio *bio) continue_at(cl, search_free, NULL); } -static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, +static int flash_dev_ioctl(struct bcache_device *d, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7022fea396f2..1f829e74db0a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -732,7 +732,7 @@ static int prio_read(struct cache *ca, uint64_t bucket) /* Bcache device */ -static int open_dev(struct gendisk *disk, fmode_t mode) +static int open_dev(struct gendisk *disk, blk_mode_t mode) { struct bcache_device *d = disk->private_data; @@ -750,7 +750,7 @@ static void release_dev(struct gendisk *b) closure_put(&d->cl); } -static int ioctl_dev(struct block_device *b, fmode_t mode, +static int ioctl_dev(struct block_device *b, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct bcache_device *d = b->bd_disk->private_data; @@ -2558,7 +2558,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, ret = -EINVAL; err = "failed to open device"; - bdev = blkdev_get_by_path(strim(path), FMODE_READ | FMODE_WRITE, + bdev = blkdev_get_by_path(strim(path), BLK_OPEN_READ | BLK_OPEN_WRITE, bcache_kobj, NULL); if (IS_ERR(bdev)) { if (bdev == ERR_PTR(-EBUSY)) { diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 872896218550..911f73f7ebba 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2051,8 +2051,8 @@ static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as, if (!at_least_one_arg(as, error)) return -EINVAL; - r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &ca->metadata_dev); + r = dm_get_device(ca->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->metadata_dev); if (r) { *error = "Error opening metadata device"; return r; @@ -2074,8 +2074,8 @@ static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as, if (!at_least_one_arg(as, error)) return -EINVAL; - r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &ca->cache_dev); + r = dm_get_device(ca->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->cache_dev); if (r) { *error = "Error opening cache device"; return r; @@ -2093,8 +2093,8 @@ static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as, if (!at_least_one_arg(as, error)) return -EINVAL; - r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &ca->origin_dev); + r = dm_get_device(ca->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &ca->origin_dev); if (r) { *error = "Error opening origin device"; return r; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index f467cdb5a022..94b2fc33f64b 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -1683,8 +1683,8 @@ static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char * int r; sector_t metadata_dev_size; - r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &clone->metadata_dev); + r = dm_get_device(clone->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->metadata_dev); if (r) { *error = "Error opening metadata device"; return r; @@ -1703,8 +1703,8 @@ static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **err int r; sector_t dest_dev_size; - r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE, - &clone->dest_dev); + r = dm_get_device(clone->ti, dm_shift_arg(as), + BLK_OPEN_READ | BLK_OPEN_WRITE, &clone->dest_dev); if (r) { *error = "Error opening destination device"; return r; @@ -1725,7 +1725,7 @@ static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **e int r; sector_t source_dev_size; - r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ, + r = dm_get_device(clone->ti, dm_shift_arg(as), BLK_OPEN_READ, &clone->source_dev); if (r) { *error = "Error opening source device"; diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index aecab0c0720f..ce913ad91a52 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -207,11 +207,10 @@ struct dm_table { unsigned integrity_added:1; /* - * Indicates the rw permissions for the new logical - * device. This should be a combination of FMODE_READ - * and FMODE_WRITE. + * Indicates the rw permissions for the new logical device. This + * should be a combination of BLK_OPEN_READ and BLK_OPEN_WRITE. */ - fmode_t mode; + blk_mode_t mode; /* a list of devices used by this table */ struct list_head devices; diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 0d70914217ee..6acfa5bf97a4 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1482,14 +1482,16 @@ static int era_ctr(struct dm_target *ti, unsigned int argc, char **argv) era->ti = ti; - r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &era->metadata_dev); + r = dm_get_device(ti, argv[0], BLK_OPEN_READ | BLK_OPEN_WRITE, + &era->metadata_dev); if (r) { ti->error = "Error opening metadata device"; era_destroy(era); return -EINVAL; } - r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &era->origin_dev); + r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, + &era->origin_dev); if (r) { ti->error = "Error opening data device"; era_destroy(era); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index cc77cf3d4109..8ba4cbb92351 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -861,7 +861,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param) table = dm_get_inactive_table(md, &srcu_idx); if (table) { - if (!(dm_table_get_mode(table) & FMODE_WRITE)) + if (!(dm_table_get_mode(table) & BLK_OPEN_WRITE)) param->flags |= DM_READONLY_FLAG; param->target_count = table->num_targets; } @@ -1192,7 +1192,7 @@ static int do_resume(struct dm_ioctl *param) if (old_size && new_size && old_size != new_size) need_resize_uevent = true; - if (dm_table_get_mode(new_map) & FMODE_WRITE) + if (dm_table_get_mode(new_map) & BLK_OPEN_WRITE) set_disk_ro(dm_disk(md), 0); else set_disk_ro(dm_disk(md), 1); @@ -1381,12 +1381,12 @@ static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_ return 0; } -static inline fmode_t get_mode(struct dm_ioctl *param) +static inline blk_mode_t get_mode(struct dm_ioctl *param) { - fmode_t mode = FMODE_READ | FMODE_WRITE; + blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_WRITE; if (param->flags & DM_READONLY_FLAG) - mode = FMODE_READ; + mode = BLK_OPEN_READ; return mode; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 7832974b73eb..bf7a574499a3 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1242,7 +1242,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int r = -EINVAL; char *origin_path, *cow_path; unsigned int args_used, num_flush_bios = 1; - fmode_t origin_mode = FMODE_READ; + blk_mode_t origin_mode = BLK_OPEN_READ; if (argc < 4) { ti->error = "requires 4 or more arguments"; @@ -1252,7 +1252,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (dm_target_is_snapshot_merge(ti)) { num_flush_bios = 2; - origin_mode = FMODE_WRITE; + origin_mode = BLK_OPEN_WRITE; } s = kzalloc(sizeof(*s), GFP_KERNEL); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2fd5826bfce1..7d208b2b1a19 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -126,7 +126,7 @@ static int alloc_targets(struct dm_table *t, unsigned int num) return 0; } -int dm_table_create(struct dm_table **result, fmode_t mode, +int dm_table_create(struct dm_table **result, blk_mode_t mode, unsigned int num_targets, struct mapped_device *md) { struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL); @@ -304,7 +304,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev, * device and not to touch the existing bdev field in case * it is accessed concurrently. */ -static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, +static int upgrade_mode(struct dm_dev_internal *dd, blk_mode_t new_mode, struct mapped_device *md) { int r; @@ -330,7 +330,7 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, * Note: the __ref annotation is because this function can call the __init * marked early_lookup_bdev when called during early boot code from dm-init.c. */ -int __ref dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, +int __ref dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, struct dm_dev **result) { int r; @@ -662,7 +662,8 @@ int dm_table_add_target(struct dm_table *t, const char *type, t->singleton = true; } - if (dm_target_always_writeable(ti->type) && !(t->mode & FMODE_WRITE)) { + if (dm_target_always_writeable(ti->type) && + !(t->mode & BLK_OPEN_WRITE)) { ti->error = "target type may not be included in a read-only table"; goto bad; } @@ -2033,7 +2034,7 @@ struct list_head *dm_table_get_devices(struct dm_table *t) return &t->devices; } -fmode_t dm_table_get_mode(struct dm_table *t) +blk_mode_t dm_table_get_mode(struct dm_table *t) { return t->mode; } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 2b13c949bd72..464c6b678417 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3301,7 +3301,7 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) unsigned long block_size; dm_block_t low_water_blocks; struct dm_dev *metadata_dev; - fmode_t metadata_mode; + blk_mode_t metadata_mode; /* * FIXME Remove validation from scope of lock. @@ -3334,7 +3334,8 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto out_unlock; - metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); + metadata_mode = BLK_OPEN_READ | + ((pf.mode == PM_READ_ONLY) ? 0 : BLK_OPEN_WRITE); r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); if (r) { ti->error = "Error opening metadata block device"; @@ -3342,7 +3343,7 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv) } warn_if_metadata_device_too_big(metadata_dev->bdev); - r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); + r = dm_get_device(ti, argv[1], BLK_OPEN_READ | BLK_OPEN_WRITE, &data_dev); if (r) { ti->error = "Error getting data device"; goto out_metadata; @@ -4223,7 +4224,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad_origin_dev; } - r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); + r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &origin_dev); if (r) { ti->error = "Error opening origin device"; goto bad_origin_dev; diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index a9ee2faa75a2..3ef9f018da60 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -607,7 +607,7 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v, (*argc)--; if (!strcasecmp(arg_name, DM_VERITY_OPT_FEC_DEV)) { - r = dm_get_device(ti, arg_value, FMODE_READ, &v->fec->dev); + r = dm_get_device(ti, arg_value, BLK_OPEN_READ, &v->fec->dev); if (r) { ti->error = "FEC device lookup failed"; return r; diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e35c16e06d06..26adcfea0302 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -1196,7 +1196,7 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { + if ((dm_table_get_mode(ti->table) & ~BLK_OPEN_READ)) { ti->error = "Device must be readonly"; r = -EINVAL; goto bad; @@ -1225,13 +1225,13 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) } v->version = num; - r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); + r = dm_get_device(ti, argv[1], BLK_OPEN_READ, &v->data_dev); if (r) { ti->error = "Data device lookup failed"; goto bad; } - r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); + r = dm_get_device(ti, argv[2], BLK_OPEN_READ, &v->hash_dev); if (r) { ti->error = "Hash device lookup failed"; goto bad; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b16e37362c5a..ca2dc079c3f4 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -310,7 +310,7 @@ int dm_deleting_md(struct mapped_device *md) return test_bit(DMF_DELETING, &md->flags); } -static int dm_blk_open(struct gendisk *disk, fmode_t mode) +static int dm_blk_open(struct gendisk *disk, blk_mode_t mode) { struct mapped_device *md; @@ -448,7 +448,7 @@ static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) dm_put_live_table(md, srcu_idx); } -static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, +static int dm_blk_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; @@ -734,7 +734,7 @@ static char *_dm_claim_ptr = "I belong to device-mapper"; * Open a table device so we can use it as a map destination. */ static struct table_device *open_table_device(struct mapped_device *md, - dev_t dev, fmode_t mode) + dev_t dev, blk_mode_t mode) { struct table_device *td; struct block_device *bdev; @@ -791,7 +791,7 @@ static void close_table_device(struct table_device *td, struct mapped_device *md } static struct table_device *find_table_device(struct list_head *l, dev_t dev, - fmode_t mode) + blk_mode_t mode) { struct table_device *td; @@ -802,7 +802,7 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev, return NULL; } -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, +int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode, struct dm_dev **result) { struct table_device *td; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index a856e0aee73b..63d9010d8e61 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -203,7 +203,7 @@ int dm_open_count(struct mapped_device *md); int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred); int dm_cancel_deferred_remove(struct mapped_device *md); int dm_request_based(struct mapped_device *md); -int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, +int dm_get_table_device(struct mapped_device *md, dev_t dev, blk_mode_t mode, struct dm_dev **result); void dm_put_table_device(struct mapped_device *md, struct dm_dev *d); diff --git a/drivers/md/md.c b/drivers/md/md.c index dad4a5539f9f..ca0de7ddd943 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3643,7 +3643,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe if (err) goto out_clear_rdev; - rdev->bdev = blkdev_get_by_dev(newdev, FMODE_READ | FMODE_WRITE, + rdev->bdev = blkdev_get_by_dev(newdev, BLK_OPEN_READ | BLK_OPEN_WRITE, super_format == -2 ? &claim_rdev : rdev, NULL); if (IS_ERR(rdev->bdev)) { pr_warn("md: could not open device unknown-block(%u,%u).\n", @@ -7488,7 +7488,7 @@ static int __md_set_array_info(struct mddev *mddev, void __user *argp) return err; } -static int md_ioctl(struct block_device *bdev, fmode_t mode, +static int md_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { int err = 0; @@ -7720,7 +7720,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, return err; } #ifdef CONFIG_COMPAT -static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, +static int md_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -7769,7 +7769,7 @@ static int md_set_read_only(struct block_device *bdev, bool ro) return err; } -static int md_open(struct gendisk *disk, fmode_t mode) +static int md_open(struct gendisk *disk, blk_mode_t mode) { struct mddev *mddev; int err; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index b16eedf22d4e..2a33b5073cc4 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -357,7 +357,7 @@ static const struct attribute_group *mmc_disk_attr_groups[] = { NULL, }; -static int mmc_blk_open(struct gendisk *disk, fmode_t mode) +static int mmc_blk_open(struct gendisk *disk, blk_mode_t mode) { struct mmc_blk_data *md = mmc_blk_get(disk); int ret = -ENXIO; @@ -365,7 +365,7 @@ static int mmc_blk_open(struct gendisk *disk, fmode_t mode) mutex_lock(&block_mutex); if (md) { ret = 0; - if ((mode & FMODE_WRITE) && md->read_only) { + if ((mode & BLK_OPEN_WRITE) && md->read_only) { mmc_blk_put(md); ret = -EROFS; } @@ -754,7 +754,7 @@ static int mmc_blk_check_blkdev(struct block_device *bdev) return 0; } -static int mmc_blk_ioctl(struct block_device *bdev, fmode_t mode, +static int mmc_blk_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct mmc_blk_data *md; @@ -791,7 +791,7 @@ static int mmc_blk_ioctl(struct block_device *bdev, fmode_t mode, } #ifdef CONFIG_COMPAT -static int mmc_blk_compat_ioctl(struct block_device *bdev, fmode_t mode, +static int mmc_blk_compat_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { return mmc_blk_ioctl(bdev, mode, cmd, (unsigned long) compat_ptr(arg)); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index 44fc23af4c3f..be106dc20ff3 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -220,7 +220,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) * early_lookup_bdev when called from the early boot code. */ static struct block_device __ref *mdtblock_early_get_bdev(const char *devname, - fmode_t mode, int timeout, struct block2mtd_dev *dev) + blk_mode_t mode, int timeout, struct block2mtd_dev *dev) { struct block_device *bdev = ERR_PTR(-ENODEV); #ifndef MODULE @@ -261,7 +261,7 @@ static struct block_device __ref *mdtblock_early_get_bdev(const char *devname, static struct block2mtd_dev *add_device(char *devname, int erase_size, char *label, int timeout) { - const fmode_t mode = FMODE_READ | FMODE_WRITE; + const blk_mode_t mode = BLK_OPEN_READ | BLK_OPEN_WRITE; struct block_device *bdev; struct block2mtd_dev *dev; char *name; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index bd0b75453643..ff18636e0889 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -182,7 +182,7 @@ static blk_status_t mtd_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static int blktrans_open(struct gendisk *disk, fmode_t mode) +static int blktrans_open(struct gendisk *disk, blk_mode_t mode) { struct mtd_blktrans_dev *dev = disk->private_data; int ret = 0; @@ -208,7 +208,7 @@ static int blktrans_open(struct gendisk *disk, fmode_t mode) ret = __get_mtd_device(dev->mtd); if (ret) goto error_release; - dev->writable = mode & FMODE_WRITE; + dev->writable = mode & BLK_OPEN_WRITE; unlock: dev->open++; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index e85fb9de0b70..437c5b83ffe5 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -227,7 +227,7 @@ static blk_status_t ubiblock_read(struct request *req) return BLK_STS_OK; } -static int ubiblock_open(struct gendisk *disk, fmode_t mode) +static int ubiblock_open(struct gendisk *disk, blk_mode_t mode) { struct ubiblock *dev = disk->private_data; int ret; @@ -246,11 +246,10 @@ static int ubiblock_open(struct gendisk *disk, fmode_t mode) * It's just a paranoid check, as write requests will get rejected * in any case. */ - if (mode & FMODE_WRITE) { + if (mode & BLK_OPEN_WRITE) { ret = -EROFS; goto out_unlock; } - dev->desc = ubi_open_volume(dev->ubi_num, dev->vol_id, UBI_READONLY); if (IS_ERR(dev->desc)) { dev_err(disk_to_dev(dev->gd), "failed to open ubi volume %d_%d", diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fd7f8e6d66fd..c3d72fc677f7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1591,7 +1591,7 @@ static void nvme_ns_release(struct nvme_ns *ns) nvme_put_ns(ns); } -static int nvme_open(struct gendisk *disk, fmode_t mode) +static int nvme_open(struct gendisk *disk, blk_mode_t mode) { return nvme_ns_open(disk->private_data); } diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 8bf09047348e..0fd0aa571cc9 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -709,11 +709,11 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, } } -int nvme_ioctl(struct block_device *bdev, fmode_t mode, +int nvme_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns *ns = bdev->bd_disk->private_data; - bool open_for_write = mode & FMODE_WRITE; + bool open_for_write = mode & BLK_OPEN_WRITE; void __user *argp = (void __user *)arg; unsigned int flags = 0; @@ -817,11 +817,11 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd, return ret; } -int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, +int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct nvme_ns_head *head = bdev->bd_disk->private_data; - bool open_for_write = mode & FMODE_WRITE; + bool open_for_write = mode & BLK_OPEN_WRITE; void __user *argp = (void __user *)arg; struct nvme_ns *ns; int srcu_idx, ret = -EWOULDBLOCK; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 698c0e70bcfa..91a9a55227fa 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -402,7 +402,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio) srcu_read_unlock(&head->srcu, srcu_idx); } -static int nvme_ns_head_open(struct gendisk *disk, fmode_t mode) +static int nvme_ns_head_open(struct gendisk *disk, blk_mode_t mode) { if (!nvme_tryget_ns_head(disk->private_data)) return -ENXIO; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index bf46f122e9e1..953e59f56139 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -836,10 +836,10 @@ void nvme_put_ns_head(struct nvme_ns_head *head); int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, const struct file_operations *fops, struct module *owner); void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device); -int nvme_ioctl(struct block_device *bdev, fmode_t mode, +int nvme_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode, +int nvme_ns_head_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 65ed2d478fac..2733e0158585 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -85,7 +85,7 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) return -ENOTBLK; ns->bdev = blkdev_get_by_path(ns->device_path, - FMODE_READ | FMODE_WRITE, NULL, NULL); + BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(ns->bdev)) { ret = PTR_ERR(ns->bdev); if (ret != -ENOTBLK) { diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 19295b2df470..45788955c4e6 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -3234,7 +3234,7 @@ struct blk_mq_ops dasd_mq_ops = { .exit_hctx = dasd_exit_hctx, }; -static int dasd_open(struct gendisk *disk, fmode_t mode) +static int dasd_open(struct gendisk *disk, blk_mode_t mode) { struct dasd_device *base; int rc; @@ -3268,14 +3268,12 @@ static int dasd_open(struct gendisk *disk, fmode_t mode) rc = -ENODEV; goto out; } - - if ((mode & FMODE_WRITE) && + if ((mode & BLK_OPEN_WRITE) && (test_bit(DASD_FLAG_DEVICE_RO, &base->flags) || (base->features & DASD_FEATURE_READONLY))) { rc = -EROFS; goto out; } - dasd_put_device(base); return 0; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index d2b27b84f854..fe5108a1b332 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -130,7 +130,8 @@ int dasd_scan_partitions(struct dasd_block *block) struct block_device *bdev; int rc; - bdev = blkdev_get_by_dev(disk_devt(block->gdp), FMODE_READ, NULL, NULL); + bdev = blkdev_get_by_dev(disk_devt(block->gdp), BLK_OPEN_READ, NULL, + NULL); if (IS_ERR(bdev)) { DBF_DEV_EVENT(DBF_ERR, block->base, "scan partitions error, blkdev_get returned %ld", diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index 33f812f0e515..0aa56351da72 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -965,7 +965,8 @@ int dasd_scan_partitions(struct dasd_block *); void dasd_destroy_partitions(struct dasd_block *); /* externals in dasd_ioctl.c */ -int dasd_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long); +int dasd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, + unsigned long arg); int dasd_set_read_only(struct block_device *bdev, bool ro); /* externals in dasd_proc.c */ diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 9327dcdd6e5e..838c9f5313e6 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -612,7 +612,7 @@ static int dasd_ioctl_readall_cmb(struct dasd_block *block, unsigned int cmd, return ret; } -int dasd_ioctl(struct block_device *bdev, fmode_t mode, +int dasd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct dasd_block *block; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 5aee3106bfda..200f88f0e451 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -28,7 +28,7 @@ #define DCSSBLK_PARM_LEN 400 #define DCSS_BUS_ID_SIZE 20 -static int dcssblk_open(struct gendisk *disk, fmode_t mode); +static int dcssblk_open(struct gendisk *disk, blk_mode_t mode); static void dcssblk_release(struct gendisk *disk); static void dcssblk_submit_bio(struct bio *bio); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, @@ -809,7 +809,7 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch } static int -dcssblk_open(struct gendisk *disk, fmode_t mode) +dcssblk_open(struct gendisk *disk, blk_mode_t mode) { struct dcssblk_dev_info *dev_info = disk->private_data; int rc; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 02b6704ec2b4..ab216976dbdc 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1298,7 +1298,7 @@ static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) /** * sd_open - open a scsi disk device * @disk: disk to open - * @mode: FMODE_* mask + * @mode: open mode * * Returns 0 if successful. Returns a negated errno value in case * of error. @@ -1310,7 +1310,7 @@ static bool sd_need_revalidate(struct gendisk *disk, struct scsi_disk *sdkp) * * Locking: called with disk->open_mutex held. **/ -static int sd_open(struct gendisk *disk, fmode_t mode) +static int sd_open(struct gendisk *disk, blk_mode_t mode) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; @@ -1336,7 +1336,8 @@ static int sd_open(struct gendisk *disk, fmode_t mode) * If the drive is empty, just let the open fail. */ retval = -ENOMEDIUM; - if (sdev->removable && !sdkp->media_present && !(mode & FMODE_NDELAY)) + if (sdev->removable && !sdkp->media_present && + !(mode & BLK_OPEN_NDELAY)) goto error_out; /* @@ -1344,7 +1345,7 @@ static int sd_open(struct gendisk *disk, fmode_t mode) * if the user expects to be able to write to the thing. */ retval = -EROFS; - if (sdkp->write_prot && (mode & FMODE_WRITE)) + if (sdkp->write_prot && (mode & BLK_OPEN_WRITE)) goto error_out; /* @@ -1379,7 +1380,7 @@ static int sd_open(struct gendisk *disk, fmode_t mode) * Note: may block (uninterruptible) if error recovery is underway * on this disk. * - * Locking: called with bdev->bd_disk->open_mutex held. + * Locking: called with disk->open_mutex held. **/ static void sd_release(struct gendisk *disk) { @@ -1424,7 +1425,7 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) /** * sd_ioctl - process an ioctl * @bdev: target block device - * @mode: FMODE_* mask + * @mode: open mode * @cmd: ioctl command number * @arg: this is third argument given to ioctl(2) system call. * Often contains a pointer. @@ -1435,7 +1436,7 @@ static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) * Note: most ioctls are forward onto the block subsystem or further * down in the scsi subsystem. **/ -static int sd_ioctl(struct block_device *bdev, fmode_t mode, +static int sd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; @@ -1457,13 +1458,13 @@ static int sd_ioctl(struct block_device *bdev, fmode_t mode, * access to the device is prohibited. */ error = scsi_ioctl_block_when_processing_errors(sdp, cmd, - (mode & FMODE_NDELAY) != 0); + (mode & BLK_OPEN_NDELAY)); if (error) return error; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); - return scsi_ioctl(sdp, mode & FMODE_WRITE, cmd, p); + return scsi_ioctl(sdp, mode & BLK_OPEN_WRITE, cmd, p); } static void set_media_not_present(struct scsi_disk *sdkp) diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 00aaafc8dd78..ce886c8c9dbe 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -484,7 +484,7 @@ static void sr_revalidate_disk(struct scsi_cd *cd) get_sectorsize(cd); } -static int sr_block_open(struct gendisk *disk, fmode_t mode) +static int sr_block_open(struct gendisk *disk, blk_mode_t mode) { struct scsi_cd *cd = scsi_cd(disk); struct scsi_device *sdev = cd->device; @@ -518,8 +518,8 @@ static void sr_block_release(struct gendisk *disk) scsi_device_put(cd->device); } -static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +static int sr_block_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned cmd, unsigned long arg) { struct scsi_cd *cd = scsi_cd(bdev->bd_disk); struct scsi_device *sdev = cd->device; @@ -532,7 +532,7 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, mutex_lock(&cd->lock); ret = scsi_ioctl_block_when_processing_errors(sdev, cmd, - (mode & FMODE_NDELAY) != 0); + (mode & BLK_OPEN_NDELAY)); if (ret) goto out; @@ -543,7 +543,7 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, if (ret != -ENOSYS) goto put; } - ret = scsi_ioctl(sdev, mode & FMODE_WRITE, cmd, argp); + ret = scsi_ioctl(sdev, mode & BLK_OPEN_WRITE, cmd, argp); put: scsi_autopm_put_device(sdev); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index c62f961f46e3..3c462d69daca 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -90,7 +90,7 @@ static int iblock_configure_device(struct se_device *dev) struct request_queue *q; struct block_device *bd = NULL; struct blk_integrity *bi; - fmode_t mode; + blk_mode_t mode = BLK_OPEN_READ; unsigned int max_write_zeroes_sectors; int ret; @@ -108,9 +108,8 @@ static int iblock_configure_device(struct se_device *dev) pr_debug( "IBLOCK: Claiming struct block_device: %s\n", ib_dev->ibd_udev_path); - mode = FMODE_READ; if (!ib_dev->ibd_readonly) - mode |= FMODE_WRITE; + mode |= BLK_OPEN_WRITE; else dev->dev_flags |= DF_READ_ONLY; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index da3b5512d7ae..0d4f09693ef4 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -366,8 +366,8 @@ static int pscsi_create_type_disk(struct se_device *dev, struct scsi_device *sd) * Claim exclusive struct block_device access to struct scsi_device * for TYPE_DISK and TYPE_ZBC using supplied udev_path */ - bd = blkdev_get_by_path(dev->udev_path, FMODE_WRITE | FMODE_READ, pdv, - NULL); + bd = blkdev_get_by_path(dev->udev_path, BLK_OPEN_WRITE | BLK_OPEN_READ, + pdv, NULL); if (IS_ERR(bd)) { pr_err("pSCSI: blkdev_get_by_path() failed\n"); scsi_device_put(sd); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 677e9d9e1527..2d00600ff413 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -257,7 +257,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev = blkdev_get_by_path(device_path, FMODE_WRITE, + bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fd02b92e3910..1c3c1d7ad68c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -849,7 +849,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, * All other options will be parsed on much later in the mount process and * only when we need to allocate a new super block. */ -static int btrfs_parse_device_options(const char *options, fmode_t flags) +static int btrfs_parse_device_options(const char *options, blk_mode_t flags) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; @@ -1440,7 +1440,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; void *new_sec_opts = NULL; - fmode_t mode = sb_open_mode(flags); + blk_mode_t mode = sb_open_mode(flags); int error = 0; if (data) { @@ -2185,7 +2185,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, FMODE_READ); + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2199,7 +2199,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, FMODE_READ); + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR(device); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7b12e05cdbf0..c85e54f86035 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -490,7 +490,7 @@ static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( static int -btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, +btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, int flush, struct block_device **bdev, struct btrfs_super_block **disk_super) { @@ -590,7 +590,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device * fs_devices->device_list_mutex here. */ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device, fmode_t flags, + struct btrfs_device *device, blk_mode_t flags, void *holder) { struct block_device *bdev; @@ -1207,7 +1207,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder) + blk_mode_t flags, void *holder) { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; @@ -1255,7 +1255,7 @@ static int devid_cmp(void *priv, const struct list_head *a, } int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder) + blk_mode_t flags, void *holder) { int ret; @@ -1346,7 +1346,7 @@ int btrfs_forget_devices(dev_t devt) * and we are not allowed to call set_blocksize during the scan. The superblock * is read via pagecache */ -struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags) +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) { struct btrfs_super_block *disk_super; bool new_device_added = false; @@ -2378,7 +2378,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, return -ENOMEM; } - ret = btrfs_get_bdev_and_sb(path, FMODE_READ, NULL, 0, + ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, &bdev, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); @@ -2625,7 +2625,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev = blkdev_get_by_path(device_path, FMODE_WRITE, + bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, fs_info->bdev_holder, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -6907,7 +6907,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (IS_ERR(fs_devices)) return fs_devices; - ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); + ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); return ERR_PTR(ret); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 840a8df39907..8227ba4d64b8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -599,8 +599,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, - fmode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags); + blk_mode_t flags, void *holder); +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 54dba967a2d4..3f080f0afc02 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -254,7 +254,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev = blkdev_get_by_path(dif->path, FMODE_READ, sb->s_type, + bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 92dd699139a3..94a7b56ed876 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1112,7 +1112,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, sb, + bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, &ext4_holder_ops); if (IS_ERR(bdev)) goto fail; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 5a764fecd1c7..e34197a70dc1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3993,7 +3993,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; unsigned int logical_blksize; - fmode_t mode = sb_open_mode(sbi->sb->s_flags); + blk_mode_t mode = sb_open_mode(sbi->sb->s_flags); int i; /* Initialize single device information */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 82f70d46f4e5..e855b8fde76c 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1100,7 +1100,7 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ | FMODE_WRITE, + bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); if (IS_ERR(bdev)) { rc = PTR_ERR(bdev); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 9be7f958f60e..70f5563a8e81 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -243,7 +243,8 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, if (!dev) return -EIO; - bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL, NULL); + bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, + NULL); if (IS_ERR(bdev)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); @@ -312,7 +313,7 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL, + bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(bdev)) { pr_warn("pNFS: failed to open device %s (%ld)\n", diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index c6ae9aee01ed..21472e3ed182 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1786,7 +1786,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, goto out2; reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, - FMODE_WRITE | FMODE_READ, NULL, NULL); + BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, + NULL); if (IS_ERR(reg->hr_bdev)) { ret = PTR_ERR(reg->hr_bdev); reg->hr_bdev = NULL; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 905297ea5545..62beee3c62b6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2598,7 +2598,7 @@ static int journal_init_dev(struct super_block *super, struct reiserfs_journal *journal, const char *jdev_name) { - fmode_t blkdev_mode = FMODE_READ; + blk_mode_t blkdev_mode = BLK_OPEN_READ; void *holder = journal; int result; dev_t jdev; @@ -2610,7 +2610,7 @@ static int journal_init_dev(struct super_block *super, new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; if (!bdev_read_only(super->s_bdev)) - blkdev_mode |= FMODE_WRITE; + blkdev_mode |= BLK_OPEN_WRITE; /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3b7cf8268057..67ad1c937637 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -396,7 +396,7 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, FMODE_READ | FMODE_WRITE, mp, + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, &xfs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6b65623e447c..824e31dd752a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -112,6 +112,19 @@ struct blk_integrity { unsigned char tag_size; }; +typedef unsigned int __bitwise blk_mode_t; + +/* open for reading */ +#define BLK_OPEN_READ ((__force blk_mode_t)(1 << 0)) +/* open for writing */ +#define BLK_OPEN_WRITE ((__force blk_mode_t)(1 << 1)) +/* open exclusively (vs other exclusive openers */ +#define BLK_OPEN_EXCL ((__force blk_mode_t)(1 << 2)) +/* opened with O_NDELAY */ +#define BLK_OPEN_NDELAY ((__force blk_mode_t)(1 << 3)) +/* open for "writes" only for ioctls (specialy hack for floppy.c) */ +#define BLK_OPEN_WRITE_IOCTL ((__force blk_mode_t)(1 << 4)) + struct gendisk { /* * major/first_minor/minors should not be set by any new driver, the @@ -187,6 +200,7 @@ struct gendisk { struct badblocks *bb; struct lockdep_map lockdep_map; u64 diskseq; + blk_mode_t open_mode; /* * Independent sector access ranges. This is always NULL for @@ -1363,10 +1377,12 @@ struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); - int (*open)(struct gendisk *disk, fmode_t mode); + int (*open)(struct gendisk *disk, blk_mode_t mode); void (*release)(struct gendisk *disk); - int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); + int (*ioctl)(struct block_device *bdev, blk_mode_t mode, + unsigned cmd, unsigned long arg); + int (*compat_ioctl)(struct block_device *bdev, blk_mode_t mode, + unsigned cmd, unsigned long arg); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); @@ -1393,7 +1409,7 @@ struct block_device_operations { }; #ifdef CONFIG_COMPAT -extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, +extern int blkdev_compat_ptr_ioctl(struct block_device *, blk_mode_t, unsigned int, unsigned long); #else #define blkdev_compat_ptr_ioctl NULL @@ -1455,11 +1471,11 @@ struct blk_holder_ops { * as stored in sb->s_flags. */ #define sb_open_mode(flags) \ - (FMODE_READ | (((flags) & SB_RDONLY) ? 0 : FMODE_WRITE)) + (BLK_OPEN_READ | (((flags) & SB_RDONLY) ? 0 : BLK_OPEN_WRITE)) -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder, +struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops); int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); diff --git a/include/linux/cdrom.h b/include/linux/cdrom.h index 3c253b29f4aa..98c6fd0b39b6 100644 --- a/include/linux/cdrom.h +++ b/include/linux/cdrom.h @@ -13,6 +13,7 @@ #include /* not really needed, later.. */ #include +#include #include #include @@ -101,7 +102,7 @@ int cdrom_read_tocentry(struct cdrom_device_info *cdi, struct cdrom_tocentry *entry); /* the general block_device operations structure: */ -int cdrom_open(struct cdrom_device_info *cdi, fmode_t mode); +int cdrom_open(struct cdrom_device_info *cdi, blk_mode_t mode); void cdrom_release(struct cdrom_device_info *cdi); int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev, unsigned int cmd, unsigned long arg); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index c27b84002d83..69d0435c7ebb 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -166,7 +166,7 @@ void dm_error(const char *message); struct dm_dev { struct block_device *bdev; struct dax_device *dax_dev; - fmode_t mode; + blk_mode_t mode; char name[16]; }; @@ -174,7 +174,7 @@ struct dm_dev { * Constructors should call these functions to ensure destination devices * are opened/closed correctly. */ -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, +int dm_get_device(struct dm_target *ti, const char *path, blk_mode_t mode, struct dm_dev **result); void dm_put_device(struct dm_target *ti, struct dm_dev *d); @@ -543,7 +543,7 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo); /* * First create an empty table. */ -int dm_table_create(struct dm_table **result, fmode_t mode, +int dm_table_create(struct dm_table **result, blk_mode_t mode, unsigned int num_targets, struct mapped_device *md); /* @@ -586,7 +586,7 @@ void dm_sync_table(struct mapped_device *md); * Queries */ sector_t dm_table_get_size(struct dm_table *t); -fmode_t dm_table_get_mode(struct dm_table *t); +blk_mode_t dm_table_get_mode(struct dm_table *t); struct mapped_device *dm_table_get_md(struct dm_table *t); const char *dm_table_device_name(struct dm_table *t); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index cc9259307c94..f6ebcd00c410 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -356,8 +356,8 @@ static int swsusp_swap_check(void) return res; root_swap = res; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_WRITE, - NULL, NULL); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + BLK_OPEN_WRITE, NULL, NULL); if (IS_ERR(hib_resume_bdev)) return PTR_ERR(hib_resume_bdev); @@ -1521,7 +1521,7 @@ int swsusp_check(bool snapshot_test) void *holder = snapshot_test ? &swsusp_holder : NULL; int error; - hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, FMODE_READ, + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, BLK_OPEN_READ, holder, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); diff --git a/mm/swapfile.c b/mm/swapfile.c index 16554256be65..6bc83060df9a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2770,7 +2770,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = blkdev_get_by_dev(inode->i_rdev, - FMODE_READ | FMODE_WRITE, p, NULL); + BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL); if (IS_ERR(p->bdev)) { error = PTR_ERR(p->bdev); p->bdev = NULL; From 4e762d8623448bb9d32711832ce977a65ff7636a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:56 +0200 Subject: [PATCH 167/266] block: always use I_BDEV on file->f_mapping->host to find the bdev Always use I_BDEV(file->f_mapping->host) to find the bdev for a file to free up file->private_data for other uses. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-29-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/block/fops.c b/block/fops.c index 086612103b9d..0d714d050a46 100644 --- a/block/fops.c +++ b/block/fops.c @@ -54,7 +54,7 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; @@ -170,7 +170,7 @@ static void blkdev_bio_end_io(struct bio *bio) static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; @@ -310,7 +310,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; @@ -451,7 +451,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct block_device *bdev = filp->private_data; + struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; error = file_write_and_wait_range(filp, start, end); @@ -516,7 +516,6 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (IS_ERR(bdev)) return PTR_ERR(bdev); - filp->private_data = bdev; filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return 0; @@ -524,9 +523,8 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_release(struct inode *inode, struct file *filp) { - struct block_device *bdev = filp->private_data; - - blkdev_put(bdev, (filp->f_mode & FMODE_EXCL) ? filp : NULL); + blkdev_put(I_BDEV(filp->f_mapping->host), + (filp->f_mode & FMODE_EXCL) ? filp : NULL); return 0; } @@ -539,7 +537,7 @@ static int blkdev_release(struct inode *inode, struct file *filp) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; @@ -575,7 +573,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; From ee3249a8ce78ef014a71b05157a43fba8dc764e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:57 +0200 Subject: [PATCH 168/266] block: store the holder in file->private_data Store the file struct used as the holder in file->private_data as an indicator that this file descriptor was opened exclusively to remove the last use of FMODE_EXCL. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20230608110258.189493-30-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/block/fops.c b/block/fops.c index 0d714d050a46..9871bd6052b4 100644 --- a/block/fops.c +++ b/block/fops.c @@ -478,7 +478,7 @@ blk_mode_t file_to_blk_mode(struct file *file) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; - if (file->f_mode & FMODE_EXCL) + if (file->private_data) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) mode |= BLK_OPEN_NDELAY; @@ -507,12 +507,15 @@ static int blkdev_open(struct inode *inode, struct file *filp) filp->f_flags |= O_LARGEFILE; filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + /* + * Use the file private data to store the holder for exclusive openes. + * file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL. + */ if (filp->f_flags & O_EXCL) - filp->f_mode |= FMODE_EXCL; + filp->private_data = filp; bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp), - (filp->f_mode & FMODE_EXCL) ? filp : NULL, - NULL); + filp->private_data, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); @@ -523,8 +526,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_release(struct inode *inode, struct file *filp) { - blkdev_put(I_BDEV(filp->f_mapping->host), - (filp->f_mode & FMODE_EXCL) ? filp : NULL); + blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data); return 0; } From 0733ad8002916b9dbbbcfe6e92ad44d2657de1c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 8 Jun 2023 13:02:58 +0200 Subject: [PATCH 169/266] fs: remove the now unused FMODE_* flags FMODE_NDELAY, FMODE_EXCL and FMODE_WRITE_IOCTL were only used for block internal purposed and are now entirely unused, so remove them. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20230608110258.189493-31-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/fs.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ad1d2c9afb3f..8045c7ef4000 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -119,13 +119,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) -/* File is opened with O_NDELAY (only set for block devices) */ -#define FMODE_NDELAY ((__force fmode_t)0x40) -/* File is opened with O_EXCL (only set for block devices) */ -#define FMODE_EXCL ((__force fmode_t)0x80) -/* File is opened using open(.., 3, ..) and is writeable only for ioctls - (specialy hack for floppy.c) */ -#define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ From 4f1731df60f9033669f024d06ae26a6301260b55 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 10 Jun 2023 10:30:43 +0800 Subject: [PATCH 170/266] blk-mq: fix potential io hang by wrong 'wake_batch' In __blk_mq_tag_busy/idle(), updating 'active_queues' and calculating 'wake_batch' is not atomic: t1: t2: _blk_mq_tag_busy blk_mq_tag_busy inc active_queues // assume 1->2 inc active_queues // 2 -> 3 blk_mq_update_wake_batch // calculate based on 3 blk_mq_update_wake_batch /* calculate based on 2, while active_queues is actually 3. */ Fix this problem by protecting them wih 'tags->lock', this is not a hot path, so performance should not be concerned. And now that all writers are inside the lock, switch 'actives_queues' from atomic to unsigned int. Fixes: 180dccb0dba4 ("blk-mq: fix tag_get wait task can't be awakened") Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230610023043.2559121-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 2 +- block/blk-mq-tag.c | 15 ++++++++++----- block/blk-mq.h | 3 +-- include/linux/blk-mq.h | 3 +-- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 68165a50951b..c3b5930106b2 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -401,7 +401,7 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, seq_printf(m, "nr_tags=%u\n", tags->nr_tags); seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", - atomic_read(&tags->active_queues)); + READ_ONCE(tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(&tags->bitmap_tags, m); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index d6af9d431dc6..426197312069 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -38,6 +38,7 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; + struct blk_mq_tags *tags = hctx->tags; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; @@ -51,9 +52,11 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state); } - users = atomic_inc_return(&hctx->tags->active_queues); - - blk_mq_update_wake_batch(hctx->tags, users); + spin_lock_irq(&tags->lock); + users = tags->active_queues + 1; + WRITE_ONCE(tags->active_queues, users); + blk_mq_update_wake_batch(tags, users); + spin_unlock_irq(&tags->lock); } /* @@ -86,9 +89,11 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) return; } - users = atomic_dec_return(&tags->active_queues); - + spin_lock_irq(&tags->lock); + users = tags->active_queues - 1; + WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); + spin_unlock_irq(&tags->lock); blk_mq_tag_wakeup_all(tags, false); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 8c642e9f32f1..1743857e0b01 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -412,8 +412,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return true; } - users = atomic_read(&hctx->tags->active_queues); - + users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 59b52ec155b1..f401067ac03a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -739,8 +739,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; - - atomic_t active_queues; + unsigned int active_queues; struct sbitmap_queue bitmap_tags; struct sbitmap_queue breserved_tags; From a836ca33c5b07d34dd5347af9f64d25651d12674 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 28 Apr 2023 00:31:12 -0700 Subject: [PATCH 171/266] nvme-core: fix memory leak in dhchap_secret_store Free dhchap_secret in nvme_ctrl_dhchap_secret_store() before we return fix following kmemleack:- unreferenced object 0xffff8886376ea800 (size 64): comm "check", pid 22048, jiffies 4344316705 (age 92.199s) hex dump (first 32 bytes): 44 48 48 43 2d 31 3a 30 30 3a 6e 78 72 35 4b 67 DHHC-1:00:nxr5Kg 75 58 34 75 6f 41 78 73 4a 61 34 63 2f 68 75 4c uX4uoAxsJa4c/huL backtrace: [<0000000030ce5d4b>] __kmalloc+0x4b/0x130 [<000000009be1cdc1>] nvme_ctrl_dhchap_secret_store+0x8f/0x160 [nvme_core] [<00000000ac06c96a>] kernfs_fop_write_iter+0x12b/0x1c0 [<00000000437e7ced>] vfs_write+0x2ba/0x3c0 [<00000000f9491baf>] ksys_write+0x5f/0xe0 [<000000001c46513d>] do_syscall_64+0x3b/0x90 [<00000000ecf348fe>] entry_SYSCALL_64_after_hwframe+0x72/0xdc unreferenced object 0xffff8886376eaf00 (size 64): comm "check", pid 22048, jiffies 4344316736 (age 92.168s) hex dump (first 32 bytes): 44 48 48 43 2d 31 3a 30 30 3a 6e 78 72 35 4b 67 DHHC-1:00:nxr5Kg 75 58 34 75 6f 41 78 73 4a 61 34 63 2f 68 75 4c uX4uoAxsJa4c/huL backtrace: [<0000000030ce5d4b>] __kmalloc+0x4b/0x130 [<000000009be1cdc1>] nvme_ctrl_dhchap_secret_store+0x8f/0x160 [nvme_core] [<00000000ac06c96a>] kernfs_fop_write_iter+0x12b/0x1c0 [<00000000437e7ced>] vfs_write+0x2ba/0x3c0 [<00000000f9491baf>] ksys_write+0x5f/0xe0 [<000000001c46513d>] do_syscall_64+0x3b/0x90 [<00000000ecf348fe>] entry_SYSCALL_64_after_hwframe+0x72/0xdc Fixes: f50fff73d620 ("nvme: implement In-Band authentication") Signed-off-by: Chaitanya Kulkarni Tested-by: Yi Zhang Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c3d72fc677f7..a529fddbf972 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3825,8 +3825,10 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, int ret; ret = nvme_auth_generate_key(dhchap_secret, &key); - if (ret) + if (ret) { + kfree(dhchap_secret); return ret; + } kfree(opts->dhchap_secret); opts->dhchap_secret = dhchap_secret; host_key = ctrl->host_key; @@ -3834,7 +3836,8 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, ctrl->host_key = key; mutex_unlock(&ctrl->dhchap_auth_mutex); nvme_auth_free_key(host_key); - } + } else + kfree(dhchap_secret); /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); queue_work(nvme_wq, &ctrl->dhchap_auth_work); From 99c2dcc8ffc24e210a3aa05c204d92f3ef460b05 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 28 Apr 2023 00:31:13 -0700 Subject: [PATCH 172/266] nvme-core: fix memory leak in dhchap_ctrl_secret Free dhchap_secret in nvme_ctrl_dhchap_ctrl_secret_store() before we return when nvme_auth_generate_key() returns error. Fixes: f50fff73d620 ("nvme: implement In-Band authentication") Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a529fddbf972..49800d6ffd81 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3882,8 +3882,10 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, int ret; ret = nvme_auth_generate_key(dhchap_secret, &key); - if (ret) + if (ret) { + kfree(dhchap_secret); return ret; + } kfree(opts->dhchap_ctrl_secret); opts->dhchap_ctrl_secret = dhchap_secret; ctrl_key = ctrl->ctrl_key; @@ -3891,7 +3893,8 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, ctrl->ctrl_key = key; mutex_unlock(&ctrl->dhchap_auth_mutex); nvme_auth_free_key(ctrl_key); - } + } else + kfree(dhchap_secret); /* Start re-authentication */ dev_info(ctrl->device, "re-authenticating controller\n"); queue_work(nvme_wq, &ctrl->dhchap_auth_work); From 3a12a0b868a512fcada564699d00f5e652c0998c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 28 Apr 2023 00:31:14 -0700 Subject: [PATCH 173/266] nvme-core: add missing fault-injection cleanup Add missing fault-injection cleanup in nvme_init_ctrl() in the error unwind path that also fixes following message for blktests:- linux-block (for-next) # grep debugfs debugfs-err.log [ 147.853464] debugfs: Directory 'nvme1' with parent '/' already present! [ 147.853973] nvme1: failed to create debugfs attr [ 148.802490] debugfs: Directory 'nvme1' with parent '/' already present! [ 148.803244] nvme1: failed to create debugfs attr [ 148.877304] debugfs: Directory 'nvme1' with parent '/' already present! [ 148.877775] nvme1: failed to create debugfs attr [ 149.816652] debugfs: Directory 'nvme1' with parent '/' already present! [ 149.818011] nvme1: failed to create debugfs attr Signed-off-by: Chaitanya Kulkarni Tested-by: Yi Zhang Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 49800d6ffd81..2013e756a43a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5201,6 +5201,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_free_cdev: + nvme_fault_inject_fini(&ctrl->fault_inject); cdev_device_del(&ctrl->cdev, ctrl->device); out_free_name: nvme_put_ctrl(ctrl); From 7ed5cf8e6d9bfb6a78d0471317edff14f0f2b4dd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 28 Apr 2023 00:31:15 -0700 Subject: [PATCH 174/266] nvme-core: fix dev_pm_qos memleak Call dev_pm_qos_hide_latency_tolerance() in the error unwind patch to avoid following kmemleak:- blktests (master) # kmemleak-clear; ./check nvme/044; blktests (master) # kmemleak-scan ; kmemleak-show nvme/044 (Test bi-directional authentication) [passed] runtime 2.111s ... 2.124s unreferenced object 0xffff888110c46240 (size 96): comm "nvme", pid 33461, jiffies 4345365353 (age 75.586s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000069ac2cec>] kmalloc_trace+0x25/0x90 [<000000006acc66d5>] dev_pm_qos_update_user_latency_tolerance+0x6f/0x100 [<00000000cc376ea7>] nvme_init_ctrl+0x38e/0x410 [nvme_core] [<000000007df61b4b>] 0xffffffffc05e88b3 [<00000000d152b985>] 0xffffffffc05744cb [<00000000f04a4041>] vfs_write+0xc5/0x3c0 [<00000000f9491baf>] ksys_write+0x5f/0xe0 [<000000001c46513d>] do_syscall_64+0x3b/0x90 [<00000000ecf348fe>] entry_SYSCALL_64_after_hwframe+0x72/0xdc Link: https://lore.kernel.org/linux-nvme/CAHj4cs-nDaKzMx2txO4dbE+Mz9ePwLtU0e3egz+StmzOUgWUrA@mail.gmail.com/ Fixes: f50fff73d620 ("nvme: implement In-Band authentication") Signed-off-by: Chaitanya Kulkarni Tested-by: Yi Zhang Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2013e756a43a..cfb98e6b94a7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5202,6 +5202,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_free_cdev: nvme_fault_inject_fini(&ctrl->fault_inject); + dev_pm_qos_hide_latency_tolerance(ctrl->device); cdev_device_del(&ctrl->cdev, ctrl->device); out_free_name: nvme_put_ctrl(ctrl); From f3f28373152da143da9d163b2529669508e52e8e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 20 Apr 2023 02:17:50 +0300 Subject: [PATCH 175/266] nvme-rdma: fix typo in comment There is no ib_stop_cq API and the need for the +1 is for ib_drain_qp. Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 0eb79696fb73..5e636a2d7ce1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -501,7 +501,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) } ibdev = queue->device->dev; - /* +1 for ib_stop_cq */ + /* +1 for ib_drain_qp */ queue->cq_size = cq_factor * queue->queue_size + 1; ret = nvme_rdma_create_cq(ibdev, queue); From 4a4d9bc0c86dcd7b6f9b5471962839e8ce7682e4 Mon Sep 17 00:00:00 2001 From: Irvin Cote Date: Fri, 12 May 2023 03:05:37 -0300 Subject: [PATCH 176/266] nvme-pci: cleaning up nvme_pci_init_request Erase the superfluous line that retrieves the nvme_dev. Signed-off-by: Irvin Cote Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7f25c0fe3a0b..b027e5e3f4ac 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -420,10 +420,9 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_dev *dev = to_nvme_dev(set->driver_data); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - nvme_req(req)->ctrl = &dev->ctrl; + nvme_req(req)->ctrl = set->driver_data; nvme_req(req)->cmd = &iod->cmd; return 0; } From a249d3066de62ce2ed68fdf6445556658ecba222 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 26 Apr 2023 08:04:41 -0700 Subject: [PATCH 177/266] nvme-fabrics: add queue setup helpers tcp and rdma transports have lots of duplicate code setting up the different queue mappings. Add common helpers. Cc: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 76 +++++++++++++++++++++++++++++++ drivers/nvme/host/fabrics.h | 11 +++++ drivers/nvme/host/rdma.c | 79 ++------------------------------ drivers/nvme/host/tcp.c | 90 ++----------------------------------- 4 files changed, 95 insertions(+), 161 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 0069ebff85df..eebe0faceb44 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -957,6 +957,82 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, return ret; } +void nvmf_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues, + u32 io_queues[HCTX_MAX_TYPES]) +{ + if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; + nr_io_queues -= io_queues[HCTX_TYPE_READ]; + io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_write_queues, nr_io_queues); + nr_io_queues -= io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. + */ + io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_io_queues, nr_io_queues); + nr_io_queues -= io_queues[HCTX_TYPE_DEFAULT]; + } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + io_queues[HCTX_TYPE_POLL] = + min(opts->nr_poll_queues, nr_io_queues); + } +} +EXPORT_SYMBOL_GPL(nvmf_set_io_queues); + +void nvmf_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl, + u32 io_queues[HCTX_MAX_TYPES]) +{ + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (opts->nr_write_queues && io_queues[HCTX_TYPE_READ]) { + /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + io_queues[HCTX_TYPE_READ]; + set->map[HCTX_TYPE_READ].queue_offset = + io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* shared read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_READ].queue_offset = 0; + } + + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + if (opts->nr_poll_queues && io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ + set->map[HCTX_TYPE_POLL].nr_queues = io_queues[HCTX_TYPE_POLL]; + set->map[HCTX_TYPE_POLL].queue_offset = + io_queues[HCTX_TYPE_DEFAULT] + + io_queues[HCTX_TYPE_READ]; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + + dev_info(ctrl->device, + "mapped %d/%d/%d default/read/poll queues.\n", + io_queues[HCTX_TYPE_DEFAULT], + io_queues[HCTX_TYPE_READ], + io_queues[HCTX_TYPE_POLL]); +} +EXPORT_SYMBOL_GPL(nvmf_map_queues); + static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, unsigned int required_opts) { diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index dcac3df8a5f7..e438d67a319b 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -203,6 +203,13 @@ static inline void nvmf_complete_timed_out_request(struct request *rq) } } +static inline unsigned int nvmf_nr_io_queues(struct nvmf_ctrl_options *opts) +{ + return min(opts->nr_io_queues, num_online_cpus()) + + min(opts->nr_write_queues, num_online_cpus()) + + min(opts->nr_poll_queues, num_online_cpus()); +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); @@ -215,5 +222,9 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, struct nvmf_ctrl_options *opts); +void nvmf_set_io_queues(struct nvmf_ctrl_options *opts, u32 nr_io_queues, + u32 io_queues[HCTX_MAX_TYPES]); +void nvmf_map_queues(struct blk_mq_tag_set *set, struct nvme_ctrl *ctrl, + u32 io_queues[HCTX_MAX_TYPES]); #endif /* _NVME_FABRICS_H */ diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5e636a2d7ce1..d433b2ec07a6 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -713,18 +713,10 @@ static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl, static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - struct ib_device *ibdev = ctrl->device->dev; - unsigned int nr_io_queues, nr_default_queues; - unsigned int nr_read_queues, nr_poll_queues; + unsigned int nr_io_queues; int i, ret; - nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors, - min(opts->nr_io_queues, num_online_cpus())); - nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors, - min(opts->nr_write_queues, num_online_cpus())); - nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus()); - nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues; - + nr_io_queues = nvmf_nr_io_queues(opts); ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) return ret; @@ -739,34 +731,7 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); - if (opts->nr_write_queues && nr_read_queues < nr_io_queues) { - /* - * separate read/write queues - * hand out dedicated default queues only after we have - * sufficient read queues. - */ - ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues; - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(nr_default_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* - * shared read/write queues - * either no write queues were requested, or we don't have - * sufficient queue count to have dedicated default queues. - */ - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(nr_read_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } - - if (opts->nr_poll_queues && nr_io_queues) { - /* map dedicated poll queues only if we have queues left */ - ctrl->io_queues[HCTX_TYPE_POLL] = - min(nr_poll_queues, nr_io_queues); - } - + nvmf_set_io_queues(opts, nr_io_queues, ctrl->io_queues); for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_rdma_alloc_queue(ctrl, i, ctrl->ctrl.sqsize + 1); @@ -2138,44 +2103,8 @@ static void nvme_rdma_complete_rq(struct request *rq) static void nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data); - struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { - /* separate read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_READ]; - set->map[HCTX_TYPE_READ].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* shared read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_READ].queue_offset = 0; - } - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); - blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); - - if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { - /* map dedicated poll queues only if we have queues left */ - set->map[HCTX_TYPE_POLL].nr_queues = - ctrl->io_queues[HCTX_TYPE_POLL]; - set->map[HCTX_TYPE_POLL].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT] + - ctrl->io_queues[HCTX_TYPE_READ]; - blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); - } - - dev_info(ctrl->ctrl.device, - "mapped %d/%d/%d default/read/poll queues.\n", - ctrl->io_queues[HCTX_TYPE_DEFAULT], - ctrl->io_queues[HCTX_TYPE_READ], - ctrl->io_queues[HCTX_TYPE_POLL]); + nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues); } static const struct blk_mq_ops nvme_rdma_mq_ops = { diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index bf0230442d57..260b3554d821 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1802,58 +1802,12 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) return ret; } -static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) -{ - unsigned int nr_io_queues; - - nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); - nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); - nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); - - return nr_io_queues; -} - -static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, - unsigned int nr_io_queues) -{ - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); - struct nvmf_ctrl_options *opts = nctrl->opts; - - if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { - /* - * separate read/write queues - * hand out dedicated default queues only after we have - * sufficient read queues. - */ - ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_write_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* - * shared read/write queues - * either no write queues were requested, or we don't have - * sufficient queue count to have dedicated default queues. - */ - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_io_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } - - if (opts->nr_poll_queues && nr_io_queues) { - /* map dedicated poll queues only if we have queues left */ - ctrl->io_queues[HCTX_TYPE_POLL] = - min(opts->nr_poll_queues, nr_io_queues); - } -} - static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { unsigned int nr_io_queues; int ret; - nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + nr_io_queues = nvmf_nr_io_queues(ctrl->opts); ret = nvme_set_queue_count(ctrl, &nr_io_queues); if (ret) return ret; @@ -1868,8 +1822,8 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); - nvme_tcp_set_io_queues(ctrl, nr_io_queues); - + nvmf_set_io_queues(ctrl->opts, nr_io_queues, + to_tcp_ctrl(ctrl)->io_queues); return __nvme_tcp_alloc_io_queues(ctrl); } @@ -2449,44 +2403,8 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, static void nvme_tcp_map_queues(struct blk_mq_tag_set *set) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); - struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { - /* separate read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_READ]; - set->map[HCTX_TYPE_READ].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* shared read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_READ].queue_offset = 0; - } - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); - blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); - - if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { - /* map dedicated poll queues only if we have queues left */ - set->map[HCTX_TYPE_POLL].nr_queues = - ctrl->io_queues[HCTX_TYPE_POLL]; - set->map[HCTX_TYPE_POLL].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT] + - ctrl->io_queues[HCTX_TYPE_READ]; - blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); - } - - dev_info(ctrl->ctrl.device, - "mapped %d/%d/%d default/read/poll queues.\n", - ctrl->io_queues[HCTX_TYPE_DEFAULT], - ctrl->io_queues[HCTX_TYPE_READ], - ctrl->io_queues[HCTX_TYPE_POLL]); + nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues); } static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) From c60651e32f1e886cd85fd9f591ad2d8706173605 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 1 May 2023 14:40:25 +0200 Subject: [PATCH 178/266] nvmet: reorder fields in 'struct nvmet_sq' Group some variables based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct nvmet_sq' from 472 to 464 bytes when CONFIG_NVME_TARGET_AUTH is defined. This structure is embedded into some other structures, so it helps reducing their sizes as well. Signed-off-by: Christophe JAILLET Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index dc60a22646f7..6cf723bc664e 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -109,8 +109,8 @@ struct nvmet_sq { u32 sqhd; bool sqhd_disabled; #ifdef CONFIG_NVME_TARGET_AUTH - struct delayed_work auth_expired_work; bool authenticated; + struct delayed_work auth_expired_work; u16 dhchap_tid; u16 dhchap_status; int dhchap_step; From 9d217fb0e778d69b2e3988efbc441976c0fb29b5 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 1 May 2023 14:40:26 +0200 Subject: [PATCH 179/266] nvme: reorder fields in 'struct nvme_ctrl' Group some variables based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct nvme_ctrl' from 5368 to 5344 bytes when all CONFIG_* are defined. This structure is embedded into some other structures, so it helps reducing their size as well. Signed-off-by: Christophe JAILLET Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 953e59f56139..9a585e60e1f2 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -246,8 +246,8 @@ enum nvme_ctrl_flags { struct nvme_ctrl { bool comp_seen; - enum nvme_ctrl_state state; bool identified; + enum nvme_ctrl_state state; spinlock_t lock; struct mutex scan_lock; const struct nvme_ctrl_ops *ops; @@ -279,8 +279,8 @@ struct nvme_ctrl { char name[12]; u16 cntlid; - u32 ctrl_config; u16 mtfa; + u32 ctrl_config; u32 queue_count; u64 cap; @@ -353,10 +353,10 @@ struct nvme_ctrl { bool apst_enabled; /* PCIe only: */ + u16 hmmaxd; u32 hmpre; u32 hmmin; u32 hmminds; - u16 hmmaxd; /* Fabrics only */ u32 ioccsz; From e64b0c807cdb7e1c8c6cd1a1b4e72027d1034d91 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 1 May 2023 14:40:27 +0200 Subject: [PATCH 180/266] nvmet: reorder fields in 'struct nvmf_ctrl_options' Group some variables based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct nvmf_ctrl_options' from 136 to 128 bytes. When such a structure is allocated in nvmf_create_ctrl(), because of the way memory allocation works, when 136 bytes were requested, 192 bytes were allocated. So this saves 64 bytes per allocation, 1 cache line to hold the whole structure and a few cycles when zeroing the memory in nvmf_create_ctrl(). Signed-off-by: Christophe JAILLET Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index e438d67a319b..b78dc3b98508 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -77,6 +77,9 @@ enum { * with the parsing opts enum. * @mask: Used by the fabrics library to parse through sysfs options * on adding a NVMe controller. + * @max_reconnects: maximum number of allowed reconnect attempts before removing + * the controller, (-1) means reconnect forever, zero means remove + * immediately; * @transport: Holds the fabric transport "technology name" (for a lack of * better description) that will be used by an NVMe controller * being added. @@ -96,9 +99,6 @@ enum { * @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN. * @kato: Keep-alive timeout. * @host: Virtual NVMe host, contains the NQN and Host ID. - * @max_reconnects: maximum number of allowed reconnect attempts before removing - * the controller, (-1) means reconnect forever, zero means remove - * immediately; * @dhchap_secret: DH-HMAC-CHAP secret * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional * authentication @@ -112,6 +112,7 @@ enum { */ struct nvmf_ctrl_options { unsigned mask; + int max_reconnects; char *transport; char *subsysnqn; char *traddr; @@ -125,7 +126,6 @@ struct nvmf_ctrl_options { bool duplicate_connect; unsigned int kato; struct nvmf_host *host; - int max_reconnects; char *dhchap_secret; char *dhchap_ctrl_secret; bool disable_sqflow; From 0f5335e15897fa989ffc301e4657ed5aa7b62f9f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 1 May 2023 14:40:28 +0200 Subject: [PATCH 181/266] nvmet: reorder fields in 'struct nvme_dhchap_queue_context' Group some variables based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct nvme_dhchap_queue_context' from 416 to 400 bytes. This structure is kvcalloc()'ed in nvme_auth_init_ctrl(), so it is likely that the allocation can be relatively big. Saving 16 bytes per structure may might a slight difference. Signed-off-by: Christophe JAILLET Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index ea16a0aba679..daf5d144a8ea 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -30,18 +30,18 @@ struct nvme_dhchap_queue_context { u32 s2; u16 transaction; u8 status; + u8 dhgroup_id; u8 hash_id; size_t hash_len; - u8 dhgroup_id; u8 c1[64]; u8 c2[64]; u8 response[64]; u8 *host_response; u8 *ctrl_key; - int ctrl_key_len; u8 *host_key; - int host_key_len; u8 *sess_key; + int ctrl_key_len; + int host_key_len; int sess_key_len; }; From 92bbe55182affa9f3b00a266d5f41fbc8a2114d6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 1 May 2023 14:40:29 +0200 Subject: [PATCH 182/266] nvmet: reorder fields in 'struct nvmefc_fcp_req' Group some variables based on their sizes to reduce holes. On x86_64, this shrinks the size of 'struct nvmefc_fcp_req' from 112 to 104 bytes. This structure is embedded in some other structures (nvme_fc_fcp_op which itself is embedded in nvme_fcp_op_w_sgl), so it helps reducing the size of these structures too. Signed-off-by: Christophe JAILLET Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme-fc-driver.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index fa092b9be2fd..4109f1bd6128 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -185,7 +185,6 @@ enum nvmefc_fcp_datadir { * @first_sgl: memory for 1st scatter/gather list segment for payload data * @sg_cnt: number of elements in the scatter/gather list * @io_dir: direction of the FCP request (see NVMEFC_FCP_xxx) - * @sqid: The nvme SQID the command is being issued on * @done: The callback routine the LLDD is to invoke upon completion of * the FCP operation. req argument is the pointer to the original * FCP IO operation. @@ -194,12 +193,13 @@ enum nvmefc_fcp_datadir { * while processing the operation. The length of the buffer * corresponds to the fcprqst_priv_sz value specified in the * nvme_fc_port_template supplied by the LLDD. + * @sqid: The nvme SQID the command is being issued on * * Values set by the LLDD indicating completion status of the FCP operation. * Must be set prior to calling the done() callback. + * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. * @transferred_length: amount of payload data, in bytes, that were * transferred. Should equal payload_length on success. - * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. * @status: Completion status of the FCP operation. must be 0 upon success, * negative errno value upon failure (ex: -EIO). Note: this is * NOT a reflection of the NVME CQE completion status. Only the @@ -219,14 +219,14 @@ struct nvmefc_fcp_req { int sg_cnt; enum nvmefc_fcp_datadir io_dir; - __le16 sqid; - void (*done)(struct nvmefc_fcp_req *req); void *private; - u32 transferred_length; + __le16 sqid; + u16 rcv_rsplen; + u32 transferred_length; u32 status; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ From b86d6595f73462c4086cadba63aacff4155e74ed Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Fri, 12 May 2023 18:41:53 +0300 Subject: [PATCH 183/266] nvme-fabrics: unify common code in admin and io queue connect To simplify code maintenance, it is recommended to avoid duplicating code. Tested-by: Noam Gottlieb Reviewed-by: Israel Rukshin Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 74 +++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index eebe0faceb44..3c7cfab1b2fb 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -349,6 +349,45 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, } } +static struct nvmf_connect_data *nvmf_connect_data_prep(struct nvme_ctrl *ctrl, + u16 cntlid) +{ + struct nvmf_connect_data *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + + uuid_copy(&data->hostid, &ctrl->opts->host->id); + data->cntlid = cpu_to_le16(cntlid); + strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); + strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); + + return data; +} + +static void nvmf_connect_cmd_prep(struct nvme_ctrl *ctrl, u16 qid, + struct nvme_command *cmd) +{ + cmd->connect.opcode = nvme_fabrics_command; + cmd->connect.fctype = nvme_fabrics_type_connect; + cmd->connect.qid = cpu_to_le16(qid); + + if (qid) { + cmd->connect.sqsize = cpu_to_le16(ctrl->sqsize); + } else { + cmd->connect.sqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); + + /* + * set keep-alive timeout in seconds granularity (ms * 1000) + */ + cmd->connect.kato = cpu_to_le32(ctrl->kato * 1000); + } + + if (ctrl->opts->disable_sqflow) + cmd->connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; +} + /** * nvmf_connect_admin_queue() - NVMe Fabrics Admin Queue "Connect" * API function. @@ -377,28 +416,12 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) int ret; u32 result; - cmd.connect.opcode = nvme_fabrics_command; - cmd.connect.fctype = nvme_fabrics_type_connect; - cmd.connect.qid = 0; - cmd.connect.sqsize = cpu_to_le16(NVME_AQ_DEPTH - 1); + nvmf_connect_cmd_prep(ctrl, 0, &cmd); - /* - * Set keep-alive timeout in seconds granularity (ms * 1000) - */ - cmd.connect.kato = cpu_to_le32(ctrl->kato * 1000); - - if (ctrl->opts->disable_sqflow) - cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; - - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = nvmf_connect_data_prep(ctrl, 0xffff); if (!data) return -ENOMEM; - uuid_copy(&data->hostid, &ctrl->opts->host->id); - data->cntlid = cpu_to_le16(0xffff); - strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); - strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); - ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, data, sizeof(*data), NVME_QID_ANY, 1, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); @@ -468,23 +491,12 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) int ret; u32 result; - cmd.connect.opcode = nvme_fabrics_command; - cmd.connect.fctype = nvme_fabrics_type_connect; - cmd.connect.qid = cpu_to_le16(qid); - cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize); + nvmf_connect_cmd_prep(ctrl, qid, &cmd); - if (ctrl->opts->disable_sqflow) - cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; - - data = kzalloc(sizeof(*data), GFP_KERNEL); + data = nvmf_connect_data_prep(ctrl, ctrl->cntlid); if (!data) return -ENOMEM; - uuid_copy(&data->hostid, &ctrl->opts->host->id); - data->cntlid = cpu_to_le16(ctrl->cntlid); - strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); - strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); - ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res, data, sizeof(*data), qid, 1, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); From 5e4b55fa522ec0839f33a73fe5facf609ee66b58 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Fri, 12 May 2023 18:41:54 +0300 Subject: [PATCH 184/266] nvme-fabrics: check hostid using uuid_equal Use a dedicated function to match uuids instead of duplicating it. Tested-by: Noam Gottlieb Reviewed-by: Israel Rukshin Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index b78dc3b98508..82e7a27ffbde 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -181,7 +181,7 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, ctrl->state == NVME_CTRL_DEAD || strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || strcmp(opts->host->nqn, ctrl->opts->host->nqn) || - memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) + !uuid_equal(&opts->host->id, &ctrl->opts->host->id)) return false; return true; From ae8bd606e09bbdb2607c8249872cc2aeaf2fcc72 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Fri, 12 May 2023 18:41:55 +0300 Subject: [PATCH 185/266] nvme-fabrics: prevent overriding of existing host When first connecting a target using the "default" host parameters, setting the hostid from the command line during a subsequent connection establishment would override the "default" hostid parameter. This would cause an existing connection that is already using the host definitions to lose its hostid. To address this issue, the code has been modified to allow only 1:1 mapping between hostnqn and hostid. This will maintain unambiguous host identification. Any non 1:1 mapping will be rejected during connection establishment. Tested-by: Noam Gottlieb Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 98 +++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 27 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 3c7cfab1b2fb..b1fa27b60917 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -21,35 +21,79 @@ static DEFINE_MUTEX(nvmf_hosts_mutex); static struct nvmf_host *nvmf_default_host; -static struct nvmf_host *__nvmf_host_find(const char *hostnqn) +/** + * __nvmf_host_find() - Find a matching to a previously created host + * @hostnqn: Host NQN to match + * @id: Host ID to match + * + * We have defined a host as how it is perceived by the target. + * Therefore, we don't allow different Host NQNs with the same Host ID. + * Similarly, we do not allow the usage of the same Host NQN with different + * Host IDs. This will maintain unambiguous host identification. + * + * Return: Returns host pointer on success, NULL in case of no match or + * ERR_PTR(-EINVAL) in case of error match. + */ +static struct nvmf_host *__nvmf_host_find(const char *hostnqn, uuid_t *id) { struct nvmf_host *host; + lockdep_assert_held(&nvmf_hosts_mutex); + list_for_each_entry(host, &nvmf_hosts, list) { - if (!strcmp(host->nqn, hostnqn)) + bool same_hostnqn = !strcmp(host->nqn, hostnqn); + bool same_hostid = uuid_equal(&host->id, id); + + if (same_hostnqn && same_hostid) return host; + + if (same_hostnqn) { + pr_err("found same hostnqn %s but different hostid %pUb\n", + hostnqn, id); + return ERR_PTR(-EINVAL); + } + if (same_hostid) { + pr_err("found same hostid %pUb but different hostnqn %s\n", + id, hostnqn); + return ERR_PTR(-EINVAL); + + } } return NULL; } -static struct nvmf_host *nvmf_host_add(const char *hostnqn) +static struct nvmf_host *nvmf_host_alloc(const char *hostnqn, uuid_t *id) +{ + struct nvmf_host *host; + + host = kmalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return NULL; + + kref_init(&host->ref); + uuid_copy(&host->id, id); + strscpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + + return host; +} + +static struct nvmf_host *nvmf_host_add(const char *hostnqn, uuid_t *id) { struct nvmf_host *host; mutex_lock(&nvmf_hosts_mutex); - host = __nvmf_host_find(hostnqn); - if (host) { + host = __nvmf_host_find(hostnqn, id); + if (IS_ERR(host)) { + goto out_unlock; + } else if (host) { kref_get(&host->ref); goto out_unlock; } - host = kmalloc(sizeof(*host), GFP_KERNEL); + host = nvmf_host_alloc(hostnqn, id); if (!host) - goto out_unlock; - - kref_init(&host->ref); - strscpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + return ERR_PTR(-ENOMEM); list_add_tail(&host->list, &nvmf_hosts); out_unlock: @@ -60,16 +104,17 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) static struct nvmf_host *nvmf_host_default(void) { struct nvmf_host *host; + char nqn[NVMF_NQN_SIZE]; + uuid_t id; - host = kmalloc(sizeof(*host), GFP_KERNEL); + uuid_gen(&id); + snprintf(nqn, NVMF_NQN_SIZE, + "nqn.2014-08.org.nvmexpress:uuid:%pUb", &id); + + host = nvmf_host_alloc(nqn, &id); if (!host) return NULL; - kref_init(&host->ref); - uuid_gen(&host->id); - snprintf(host->nqn, NVMF_NQN_SIZE, - "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id); - mutex_lock(&nvmf_hosts_mutex); list_add_tail(&host->list, &nvmf_hosts); mutex_unlock(&nvmf_hosts_mutex); @@ -633,6 +678,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, size_t nqnlen = 0; int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO; uuid_t hostid; + char hostnqn[NVMF_NQN_SIZE]; /* Set defaults */ opts->queue_size = NVMF_DEF_QUEUE_SIZE; @@ -649,7 +695,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, if (!options) return -ENOMEM; - uuid_gen(&hostid); + /* use default host if not given by user space */ + uuid_copy(&hostid, &nvmf_default_host->id); + strscpy(hostnqn, nvmf_default_host->nqn, NVMF_NQN_SIZE); while ((p = strsep(&o, ",\n")) != NULL) { if (!*p) @@ -795,12 +843,8 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -EINVAL; goto out; } - opts->host = nvmf_host_add(p); + strscpy(hostnqn, p, NVMF_NQN_SIZE); kfree(p); - if (!opts->host) { - ret = -ENOMEM; - goto out; - } break; case NVMF_OPT_RECONNECT_DELAY: if (match_int(args, &token)) { @@ -957,13 +1001,13 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->fast_io_fail_tmo, ctrl_loss_tmo); } - if (!opts->host) { - kref_get(&nvmf_default_host->ref); - opts->host = nvmf_default_host; + opts->host = nvmf_host_add(hostnqn, &hostid); + if (IS_ERR(opts->host)) { + ret = PTR_ERR(opts->host); + opts->host = NULL; + goto out; } - uuid_copy(&opts->host->id, &hostid); - out: kfree(options); return ret; From 942e21c042e6f735e13364e955cbd55a8930294b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 25 Apr 2023 00:12:42 +0300 Subject: [PATCH 186/266] nvme: move sysfs code to a dedicated sysfs.c file The core.c file became long and hard to maintain. Create a dedicated file to centralize the sysfs functionality. This is a common practice to separate sysfs/configfs related logic from the main driver logic .c file. For example, in the nvmet module the configfs interface has its own dedicated file. This patch does not include any functional changes. Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe Reviewed-by: Hannes Reinecke Signed-off-by: Max Gurtovoy [merged dhchap memleak fixes, include nvme-auth.h] Signed-off-by: Keith Busch --- drivers/nvme/host/Makefile | 2 +- drivers/nvme/host/core.c | 656 +----------------------------------- drivers/nvme/host/nvme.h | 4 + drivers/nvme/host/sysfs.c | 665 +++++++++++++++++++++++++++++++++++++ 4 files changed, 672 insertions(+), 655 deletions(-) create mode 100644 drivers/nvme/host/sysfs.c diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index e27202d22c7d..d3fc5063e4be 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -10,7 +10,7 @@ obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o obj-$(CONFIG_NVME_APPLE) += nvme-apple.o -nvme-core-y += core.o ioctl.o +nvme-core-y += core.o ioctl.o sysfs.o nvme-core-$(CONFIG_NVME_VERBOSE_ERRORS) += constants.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cfb98e6b94a7..43b906a59c8c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -237,7 +237,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl); -static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) { /* * Keep a reference until nvme_do_delete_ctrl() complete, @@ -2256,7 +2256,7 @@ static int nvme_report_zones(struct gendisk *disk, sector_t sector, #define nvme_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ -static const struct block_device_operations nvme_bdev_ops = { +const struct block_device_operations nvme_bdev_ops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl, @@ -2791,75 +2791,6 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) return NULL; } -#define SUBSYS_ATTR_RO(_name, _mode, _show) \ - struct device_attribute subsys_attr_##_name = \ - __ATTR(_name, _mode, _show, NULL) - -static ssize_t nvme_subsys_show_nqn(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_subsystem *subsys = - container_of(dev, struct nvme_subsystem, dev); - - return sysfs_emit(buf, "%s\n", subsys->subnqn); -} -static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); - -static ssize_t nvme_subsys_show_type(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_subsystem *subsys = - container_of(dev, struct nvme_subsystem, dev); - - switch (subsys->subtype) { - case NVME_NQN_DISC: - return sysfs_emit(buf, "discovery\n"); - case NVME_NQN_NVME: - return sysfs_emit(buf, "nvm\n"); - default: - return sysfs_emit(buf, "reserved\n"); - } -} -static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type); - -#define nvme_subsys_show_str_function(field) \ -static ssize_t subsys_##field##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct nvme_subsystem *subsys = \ - container_of(dev, struct nvme_subsystem, dev); \ - return sysfs_emit(buf, "%.*s\n", \ - (int)sizeof(subsys->field), subsys->field); \ -} \ -static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); - -nvme_subsys_show_str_function(model); -nvme_subsys_show_str_function(serial); -nvme_subsys_show_str_function(firmware_rev); - -static struct attribute *nvme_subsys_attrs[] = { - &subsys_attr_model.attr, - &subsys_attr_serial.attr, - &subsys_attr_firmware_rev.attr, - &subsys_attr_subsysnqn.attr, - &subsys_attr_subsystype.attr, -#ifdef CONFIG_NVME_MULTIPATH - &subsys_attr_iopolicy.attr, -#endif - NULL, -}; - -static const struct attribute_group nvme_subsys_attrs_group = { - .attrs = nvme_subsys_attrs, -}; - -static const struct attribute_group *nvme_subsys_attrs_groups[] = { - &nvme_subsys_attrs_group, - NULL, -}; - static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl) { return ctrl->opts && ctrl->opts->discovery_nqn; @@ -3393,589 +3324,6 @@ static const struct file_operations nvme_dev_fops = { .uring_cmd = nvme_dev_uring_cmd, }; -static ssize_t nvme_sysfs_reset(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - int ret; - - ret = nvme_reset_ctrl_sync(ctrl); - if (ret < 0) - return ret; - return count; -} -static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); - -static ssize_t nvme_sysfs_rescan(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - nvme_queue_scan(ctrl); - return count; -} -static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); - -static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (disk->fops == &nvme_bdev_ops) - return nvme_get_ns_from_dev(dev)->head; - else - return disk->private_data; -} - -static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct nvme_ns_head *head = dev_to_ns_head(dev); - struct nvme_ns_ids *ids = &head->ids; - struct nvme_subsystem *subsys = head->subsys; - int serial_len = sizeof(subsys->serial); - int model_len = sizeof(subsys->model); - - if (!uuid_is_null(&ids->uuid)) - return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); - - if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); - - if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - return sysfs_emit(buf, "eui.%8phN\n", ids->eui64); - - while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || - subsys->serial[serial_len - 1] == '\0')) - serial_len--; - while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || - subsys->model[model_len - 1] == '\0')) - model_len--; - - return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, - serial_len, subsys->serial, model_len, subsys->model, - head->ns_id); -} -static DEVICE_ATTR_RO(wwid); - -static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); -} -static DEVICE_ATTR_RO(nguid); - -static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; - - /* For backward compatibility expose the NGUID to userspace if - * we have no UUID set - */ - if (uuid_is_null(&ids->uuid)) { - dev_warn_ratelimited(dev, - "No UUID available providing old NGUID\n"); - return sysfs_emit(buf, "%pU\n", ids->nguid); - } - return sysfs_emit(buf, "%pU\n", &ids->uuid); -} -static DEVICE_ATTR_RO(uuid); - -static ssize_t eui_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); -} -static DEVICE_ATTR_RO(eui); - -static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id); -} -static DEVICE_ATTR_RO(nsid); - -static struct attribute *nvme_ns_id_attrs[] = { - &dev_attr_wwid.attr, - &dev_attr_uuid.attr, - &dev_attr_nguid.attr, - &dev_attr_eui.attr, - &dev_attr_nsid.attr, -#ifdef CONFIG_NVME_MULTIPATH - &dev_attr_ana_grpid.attr, - &dev_attr_ana_state.attr, -#endif - NULL, -}; - -static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) -{ - struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; - - if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ids->uuid) && - !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return 0; - } - if (a == &dev_attr_nguid.attr) { - if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) - return 0; - } - if (a == &dev_attr_eui.attr) { - if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) - return 0; - } -#ifdef CONFIG_NVME_MULTIPATH - if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { - if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ - return 0; - if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) - return 0; - } -#endif - return a->mode; -} - -static const struct attribute_group nvme_ns_id_attr_group = { - .attrs = nvme_ns_id_attrs, - .is_visible = nvme_ns_id_attrs_are_visible, -}; - -const struct attribute_group *nvme_ns_id_attr_groups[] = { - &nvme_ns_id_attr_group, - NULL, -}; - -#define nvme_show_str_function(field) \ -static ssize_t field##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sysfs_emit(buf, "%.*s\n", \ - (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ -} \ -static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); - -nvme_show_str_function(model); -nvme_show_str_function(serial); -nvme_show_str_function(firmware_rev); - -#define nvme_show_int_function(field) \ -static ssize_t field##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sysfs_emit(buf, "%d\n", ctrl->field); \ -} \ -static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); - -nvme_show_int_function(cntlid); -nvme_show_int_function(numa_node); -nvme_show_int_function(queue_count); -nvme_show_int_function(sqsize); -nvme_show_int_function(kato); - -static ssize_t nvme_sysfs_delete(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (device_remove_file_self(dev, attr)) - nvme_delete_ctrl_sync(ctrl); - return count; -} -static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); - -static ssize_t nvme_sysfs_show_transport(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%s\n", ctrl->ops->name); -} -static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); - -static ssize_t nvme_sysfs_show_state(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - static const char *const state_name[] = { - [NVME_CTRL_NEW] = "new", - [NVME_CTRL_LIVE] = "live", - [NVME_CTRL_RESETTING] = "resetting", - [NVME_CTRL_CONNECTING] = "connecting", - [NVME_CTRL_DELETING] = "deleting", - [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", - [NVME_CTRL_DEAD] = "dead", - }; - - if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && - state_name[ctrl->state]) - return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); - - return sysfs_emit(buf, "unknown state\n"); -} - -static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); - -static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn); -} -static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); - -static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn); -} -static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); - -static ssize_t nvme_sysfs_show_hostid(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id); -} -static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); - -static ssize_t nvme_sysfs_show_address(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); -} -static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); - -static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - - if (ctrl->opts->max_reconnects == -1) - return sysfs_emit(buf, "off\n"); - return sysfs_emit(buf, "%d\n", - opts->max_reconnects * opts->reconnect_delay); -} - -static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - int ctrl_loss_tmo, err; - - err = kstrtoint(buf, 10, &ctrl_loss_tmo); - if (err) - return -EINVAL; - - if (ctrl_loss_tmo < 0) - opts->max_reconnects = -1; - else - opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, - opts->reconnect_delay); - return count; -} -static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, - nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); - -static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (ctrl->opts->reconnect_delay == -1) - return sysfs_emit(buf, "off\n"); - return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay); -} - -static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - unsigned int v; - int err; - - err = kstrtou32(buf, 10, &v); - if (err) - return err; - - ctrl->opts->reconnect_delay = v; - return count; -} -static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, - nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); - -static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (ctrl->opts->fast_io_fail_tmo == -1) - return sysfs_emit(buf, "off\n"); - return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo); -} - -static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - int fast_io_fail_tmo, err; - - err = kstrtoint(buf, 10, &fast_io_fail_tmo); - if (err) - return -EINVAL; - - if (fast_io_fail_tmo < 0) - opts->fast_io_fail_tmo = -1; - else - opts->fast_io_fail_tmo = fast_io_fail_tmo; - return count; -} -static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, - nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); - -static ssize_t cntrltype_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - static const char * const type[] = { - [NVME_CTRL_IO] = "io\n", - [NVME_CTRL_DISC] = "discovery\n", - [NVME_CTRL_ADMIN] = "admin\n", - }; - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype]) - return sysfs_emit(buf, "reserved\n"); - - return sysfs_emit(buf, type[ctrl->cntrltype]); -} -static DEVICE_ATTR_RO(cntrltype); - -static ssize_t dctype_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - static const char * const type[] = { - [NVME_DCTYPE_NOT_REPORTED] = "none\n", - [NVME_DCTYPE_DDC] = "ddc\n", - [NVME_DCTYPE_CDC] = "cdc\n", - }; - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype]) - return sysfs_emit(buf, "reserved\n"); - - return sysfs_emit(buf, type[ctrl->dctype]); -} -static DEVICE_ATTR_RO(dctype); - -#ifdef CONFIG_NVME_AUTH -static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - - if (!opts->dhchap_secret) - return sysfs_emit(buf, "none\n"); - return sysfs_emit(buf, "%s\n", opts->dhchap_secret); -} - -static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - char *dhchap_secret; - - if (!ctrl->opts->dhchap_secret) - return -EINVAL; - if (count < 7) - return -EINVAL; - if (memcmp(buf, "DHHC-1:", 7)) - return -EINVAL; - - dhchap_secret = kzalloc(count + 1, GFP_KERNEL); - if (!dhchap_secret) - return -ENOMEM; - memcpy(dhchap_secret, buf, count); - nvme_auth_stop(ctrl); - if (strcmp(dhchap_secret, opts->dhchap_secret)) { - struct nvme_dhchap_key *key, *host_key; - int ret; - - ret = nvme_auth_generate_key(dhchap_secret, &key); - if (ret) { - kfree(dhchap_secret); - return ret; - } - kfree(opts->dhchap_secret); - opts->dhchap_secret = dhchap_secret; - host_key = ctrl->host_key; - mutex_lock(&ctrl->dhchap_auth_mutex); - ctrl->host_key = key; - mutex_unlock(&ctrl->dhchap_auth_mutex); - nvme_auth_free_key(host_key); - } else - kfree(dhchap_secret); - /* Start re-authentication */ - dev_info(ctrl->device, "re-authenticating controller\n"); - queue_work(nvme_wq, &ctrl->dhchap_auth_work); - - return count; -} -static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR, - nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store); - -static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - - if (!opts->dhchap_ctrl_secret) - return sysfs_emit(buf, "none\n"); - return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret); -} - -static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) -{ - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - struct nvmf_ctrl_options *opts = ctrl->opts; - char *dhchap_secret; - - if (!ctrl->opts->dhchap_ctrl_secret) - return -EINVAL; - if (count < 7) - return -EINVAL; - if (memcmp(buf, "DHHC-1:", 7)) - return -EINVAL; - - dhchap_secret = kzalloc(count + 1, GFP_KERNEL); - if (!dhchap_secret) - return -ENOMEM; - memcpy(dhchap_secret, buf, count); - nvme_auth_stop(ctrl); - if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) { - struct nvme_dhchap_key *key, *ctrl_key; - int ret; - - ret = nvme_auth_generate_key(dhchap_secret, &key); - if (ret) { - kfree(dhchap_secret); - return ret; - } - kfree(opts->dhchap_ctrl_secret); - opts->dhchap_ctrl_secret = dhchap_secret; - ctrl_key = ctrl->ctrl_key; - mutex_lock(&ctrl->dhchap_auth_mutex); - ctrl->ctrl_key = key; - mutex_unlock(&ctrl->dhchap_auth_mutex); - nvme_auth_free_key(ctrl_key); - } else - kfree(dhchap_secret); - /* Start re-authentication */ - dev_info(ctrl->device, "re-authenticating controller\n"); - queue_work(nvme_wq, &ctrl->dhchap_auth_work); - - return count; -} -static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR, - nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); -#endif - -static struct attribute *nvme_dev_attrs[] = { - &dev_attr_reset_controller.attr, - &dev_attr_rescan_controller.attr, - &dev_attr_model.attr, - &dev_attr_serial.attr, - &dev_attr_firmware_rev.attr, - &dev_attr_cntlid.attr, - &dev_attr_delete_controller.attr, - &dev_attr_transport.attr, - &dev_attr_subsysnqn.attr, - &dev_attr_address.attr, - &dev_attr_state.attr, - &dev_attr_numa_node.attr, - &dev_attr_queue_count.attr, - &dev_attr_sqsize.attr, - &dev_attr_hostnqn.attr, - &dev_attr_hostid.attr, - &dev_attr_ctrl_loss_tmo.attr, - &dev_attr_reconnect_delay.attr, - &dev_attr_fast_io_fail_tmo.attr, - &dev_attr_kato.attr, - &dev_attr_cntrltype.attr, - &dev_attr_dctype.attr, -#ifdef CONFIG_NVME_AUTH - &dev_attr_dhchap_secret.attr, - &dev_attr_dhchap_ctrl_secret.attr, -#endif - NULL -}; - -static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, - struct attribute *a, int n) -{ - struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - - if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) - return 0; - if (a == &dev_attr_address.attr && !ctrl->ops->get_address) - return 0; - if (a == &dev_attr_hostnqn.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_hostid.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) - return 0; -#ifdef CONFIG_NVME_AUTH - if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts) - return 0; - if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) - return 0; -#endif - - return a->mode; -} - -const struct attribute_group nvme_dev_attrs_group = { - .attrs = nvme_dev_attrs, - .is_visible = nvme_dev_attrs_are_visible, -}; -EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); - -static const struct attribute_group *nvme_dev_attr_groups[] = { - &nvme_dev_attrs_group, - NULL, -}; - static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl, unsigned nsid) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9a585e60e1f2..03cc7529d854 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -860,7 +860,11 @@ extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct pr_ops nvme_pr_ops; extern const struct block_device_operations nvme_ns_head_ops; extern const struct attribute_group nvme_dev_attrs_group; +extern const struct attribute_group *nvme_subsys_attrs_groups[]; +extern const struct attribute_group *nvme_dev_attr_groups[]; +extern const struct block_device_operations nvme_bdev_ops; +void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); #ifdef CONFIG_NVME_MULTIPATH static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c new file mode 100644 index 000000000000..796e1d373b7c --- /dev/null +++ b/drivers/nvme/host/sysfs.c @@ -0,0 +1,665 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Sysfs interface for the NVMe core driver. + * + * Copyright (c) 2011-2014, Intel Corporation. + */ + +#include + +#include "nvme.h" +#include "fabrics.h" + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = nvme_reset_ctrl_sync(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static ssize_t nvme_sysfs_rescan(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + nvme_queue_scan(ctrl); + return count; +} +static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); + +static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops == &nvme_bdev_ops) + return nvme_get_ns_from_dev(dev)->head; + else + return disk->private_data; +} + +static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns_head *head = dev_to_ns_head(dev); + struct nvme_ns_ids *ids = &head->ids; + struct nvme_subsystem *subsys = head->subsys; + int serial_len = sizeof(subsys->serial); + int model_len = sizeof(subsys->model); + + if (!uuid_is_null(&ids->uuid)) + return sysfs_emit(buf, "uuid.%pU\n", &ids->uuid); + + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return sysfs_emit(buf, "eui.%16phN\n", ids->nguid); + + if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return sysfs_emit(buf, "eui.%8phN\n", ids->eui64); + + while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || + subsys->serial[serial_len - 1] == '\0')) + serial_len--; + while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || + subsys->model[model_len - 1] == '\0')) + model_len--; + + return sysfs_emit(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + serial_len, subsys->serial, model_len, subsys->model, + head->ns_id); +} +static DEVICE_ATTR_RO(wwid); + +static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); +} +static DEVICE_ATTR_RO(nguid); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; + + /* For backward compatibility expose the NGUID to userspace if + * we have no UUID set + */ + if (uuid_is_null(&ids->uuid)) { + dev_warn_ratelimited(dev, + "No UUID available providing old NGUID\n"); + return sysfs_emit(buf, "%pU\n", ids->nguid); + } + return sysfs_emit(buf, "%pU\n", &ids->uuid); +} +static DEVICE_ATTR_RO(uuid); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); +} +static DEVICE_ATTR_RO(eui); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%d\n", dev_to_ns_head(dev)->ns_id); +} +static DEVICE_ATTR_RO(nsid); + +static struct attribute *nvme_ns_id_attrs[] = { + &dev_attr_wwid.attr, + &dev_attr_uuid.attr, + &dev_attr_nguid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, +#ifdef CONFIG_NVME_MULTIPATH + &dev_attr_ana_grpid.attr, + &dev_attr_ana_state.attr, +#endif + NULL, +}; + +static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; + + if (a == &dev_attr_uuid.attr) { + if (uuid_is_null(&ids->uuid) && + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } + if (a == &dev_attr_nguid.attr) { + if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return 0; + } +#ifdef CONFIG_NVME_MULTIPATH + if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { + if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ + return 0; + if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) + return 0; + } +#endif + return a->mode; +} + +static const struct attribute_group nvme_ns_id_attr_group = { + .attrs = nvme_ns_id_attrs, + .is_visible = nvme_ns_id_attrs_are_visible, +}; + +const struct attribute_group *nvme_ns_id_attr_groups[] = { + &nvme_ns_id_attr_group, + NULL, +}; + +#define nvme_show_str_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sysfs_emit(buf, "%.*s\n", \ + (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); + +#define nvme_show_int_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sysfs_emit(buf, "%d\n", ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_int_function(cntlid); +nvme_show_int_function(numa_node); +nvme_show_int_function(queue_count); +nvme_show_int_function(sqsize); +nvme_show_int_function(kato); + +static ssize_t nvme_sysfs_delete(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (device_remove_file_self(dev, attr)) + nvme_delete_ctrl_sync(ctrl); + return count; +} +static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); + +static ssize_t nvme_sysfs_show_transport(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%s\n", ctrl->ops->name); +} +static DEVICE_ATTR(transport, S_IRUGO, nvme_sysfs_show_transport, NULL); + +static ssize_t nvme_sysfs_show_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + static const char *const state_name[] = { + [NVME_CTRL_NEW] = "new", + [NVME_CTRL_LIVE] = "live", + [NVME_CTRL_RESETTING] = "resetting", + [NVME_CTRL_CONNECTING] = "connecting", + [NVME_CTRL_DELETING] = "deleting", + [NVME_CTRL_DELETING_NOIO]= "deleting (no IO)", + [NVME_CTRL_DEAD] = "dead", + }; + + if ((unsigned)ctrl->state < ARRAY_SIZE(state_name) && + state_name[ctrl->state]) + return sysfs_emit(buf, "%s\n", state_name[ctrl->state]); + + return sysfs_emit(buf, "unknown state\n"); +} + +static DEVICE_ATTR(state, S_IRUGO, nvme_sysfs_show_state, NULL); + +static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%s\n", ctrl->subsys->subnqn); +} +static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); + +static ssize_t nvme_sysfs_show_hostnqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%s\n", ctrl->opts->host->nqn); +} +static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL); + +static ssize_t nvme_sysfs_show_hostid(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%pU\n", &ctrl->opts->host->id); +} +static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL); + +static ssize_t nvme_sysfs_show_address(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + return ctrl->ops->get_address(ctrl, buf, PAGE_SIZE); +} +static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL); + +static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (ctrl->opts->max_reconnects == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", + opts->max_reconnects * opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ctrl_loss_tmo, err; + + err = kstrtoint(buf, 10, &ctrl_loss_tmo); + if (err) + return -EINVAL; + + if (ctrl_loss_tmo < 0) + opts->max_reconnects = -1; + else + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + return count; +} +static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store); + +static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->reconnect_delay == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->reconnect_delay); +} + +static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + unsigned int v; + int err; + + err = kstrtou32(buf, 10, &v); + if (err) + return err; + + ctrl->opts->reconnect_delay = v; + return count; +} +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, + nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store); + +static ssize_t nvme_ctrl_fast_io_fail_tmo_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->opts->fast_io_fail_tmo == -1) + return sysfs_emit(buf, "off\n"); + return sysfs_emit(buf, "%d\n", ctrl->opts->fast_io_fail_tmo); +} + +static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + int fast_io_fail_tmo, err; + + err = kstrtoint(buf, 10, &fast_io_fail_tmo); + if (err) + return -EINVAL; + + if (fast_io_fail_tmo < 0) + opts->fast_io_fail_tmo = -1; + else + opts->fast_io_fail_tmo = fast_io_fail_tmo; + return count; +} +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); + +static ssize_t cntrltype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + static const char * const type[] = { + [NVME_CTRL_IO] = "io\n", + [NVME_CTRL_DISC] = "discovery\n", + [NVME_CTRL_ADMIN] = "admin\n", + }; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype]) + return sysfs_emit(buf, "reserved\n"); + + return sysfs_emit(buf, type[ctrl->cntrltype]); +} +static DEVICE_ATTR_RO(cntrltype); + +static ssize_t dctype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + static const char * const type[] = { + [NVME_DCTYPE_NOT_REPORTED] = "none\n", + [NVME_DCTYPE_DDC] = "ddc\n", + [NVME_DCTYPE_CDC] = "cdc\n", + }; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype]) + return sysfs_emit(buf, "reserved\n"); + + return sysfs_emit(buf, type[ctrl->dctype]); +} +static DEVICE_ATTR_RO(dctype); + +#ifdef CONFIG_NVME_AUTH +static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (!opts->dhchap_secret) + return sysfs_emit(buf, "none\n"); + return sysfs_emit(buf, "%s\n", opts->dhchap_secret); +} + +static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + char *dhchap_secret; + + if (!ctrl->opts->dhchap_secret) + return -EINVAL; + if (count < 7) + return -EINVAL; + if (memcmp(buf, "DHHC-1:", 7)) + return -EINVAL; + + dhchap_secret = kzalloc(count + 1, GFP_KERNEL); + if (!dhchap_secret) + return -ENOMEM; + memcpy(dhchap_secret, buf, count); + nvme_auth_stop(ctrl); + if (strcmp(dhchap_secret, opts->dhchap_secret)) { + struct nvme_dhchap_key *key, *host_key; + int ret; + + ret = nvme_auth_generate_key(dhchap_secret, &key); + if (ret) { + kfree(dhchap_secret); + return ret; + } + kfree(opts->dhchap_secret); + opts->dhchap_secret = dhchap_secret; + host_key = ctrl->host_key; + mutex_lock(&ctrl->dhchap_auth_mutex); + ctrl->host_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); + nvme_auth_free_key(host_key); + } else + kfree(dhchap_secret); + /* Start re-authentication */ + dev_info(ctrl->device, "re-authenticating controller\n"); + queue_work(nvme_wq, &ctrl->dhchap_auth_work); + + return count; +} + +static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR, + nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store); + +static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (!opts->dhchap_ctrl_secret) + return sysfs_emit(buf, "none\n"); + return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret); +} + +static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + char *dhchap_secret; + + if (!ctrl->opts->dhchap_ctrl_secret) + return -EINVAL; + if (count < 7) + return -EINVAL; + if (memcmp(buf, "DHHC-1:", 7)) + return -EINVAL; + + dhchap_secret = kzalloc(count + 1, GFP_KERNEL); + if (!dhchap_secret) + return -ENOMEM; + memcpy(dhchap_secret, buf, count); + nvme_auth_stop(ctrl); + if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) { + struct nvme_dhchap_key *key, *ctrl_key; + int ret; + + ret = nvme_auth_generate_key(dhchap_secret, &key); + if (ret) { + kfree(dhchap_secret); + return ret; + } + kfree(opts->dhchap_ctrl_secret); + opts->dhchap_ctrl_secret = dhchap_secret; + ctrl_key = ctrl->ctrl_key; + mutex_lock(&ctrl->dhchap_auth_mutex); + ctrl->ctrl_key = key; + mutex_unlock(&ctrl->dhchap_auth_mutex); + nvme_auth_free_key(ctrl_key); + } else + kfree(dhchap_secret); + /* Start re-authentication */ + dev_info(ctrl->device, "re-authenticating controller\n"); + queue_work(nvme_wq, &ctrl->dhchap_auth_work); + + return count; +} + +static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR, + nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); +#endif + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_rescan_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + &dev_attr_cntlid.attr, + &dev_attr_delete_controller.attr, + &dev_attr_transport.attr, + &dev_attr_subsysnqn.attr, + &dev_attr_address.attr, + &dev_attr_state.attr, + &dev_attr_numa_node.attr, + &dev_attr_queue_count.attr, + &dev_attr_sqsize.attr, + &dev_attr_hostnqn.attr, + &dev_attr_hostid.attr, + &dev_attr_ctrl_loss_tmo.attr, + &dev_attr_reconnect_delay.attr, + &dev_attr_fast_io_fail_tmo.attr, + &dev_attr_kato.attr, + &dev_attr_cntrltype.attr, + &dev_attr_dctype.attr, +#ifdef CONFIG_NVME_AUTH + &dev_attr_dhchap_secret.attr, + &dev_attr_dhchap_ctrl_secret.attr, +#endif + NULL +}; + +static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (a == &dev_attr_delete_controller.attr && !ctrl->ops->delete_ctrl) + return 0; + if (a == &dev_attr_address.attr && !ctrl->ops->get_address) + return 0; + if (a == &dev_attr_hostnqn.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_hostid.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) + return 0; +#ifdef CONFIG_NVME_AUTH + if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) + return 0; +#endif + + return a->mode; +} + +const struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, + .is_visible = nvme_dev_attrs_are_visible, +}; +EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); + +const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + +#define SUBSYS_ATTR_RO(_name, _mode, _show) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, NULL) + +static ssize_t nvme_subsys_show_nqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return sysfs_emit(buf, "%s\n", subsys->subnqn); +} +static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); + +static ssize_t nvme_subsys_show_type(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + switch (subsys->subtype) { + case NVME_NQN_DISC: + return sysfs_emit(buf, "discovery\n"); + case NVME_NQN_NVME: + return sysfs_emit(buf, "nvm\n"); + default: + return sysfs_emit(buf, "reserved\n"); + } +} +static SUBSYS_ATTR_RO(subsystype, S_IRUGO, nvme_subsys_show_type); + +#define nvme_subsys_show_str_function(field) \ +static ssize_t subsys_##field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_subsystem *subsys = \ + container_of(dev, struct nvme_subsystem, dev); \ + return sysfs_emit(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ +} \ +static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); + +nvme_subsys_show_str_function(model); +nvme_subsys_show_str_function(serial); +nvme_subsys_show_str_function(firmware_rev); + +static struct attribute *nvme_subsys_attrs[] = { + &subsys_attr_model.attr, + &subsys_attr_serial.attr, + &subsys_attr_firmware_rev.attr, + &subsys_attr_subsysnqn.attr, + &subsys_attr_subsystype.attr, +#ifdef CONFIG_NVME_MULTIPATH + &subsys_attr_iopolicy.attr, +#endif + NULL, +}; + +static const struct attribute_group nvme_subsys_attrs_group = { + .attrs = nvme_subsys_attrs, +}; + +const struct attribute_group *nvme_subsys_attrs_groups[] = { + &nvme_subsys_attrs_group, + NULL, +}; From 2110a6bcd7afd313ae10e9808878cd7c419c6b65 Mon Sep 17 00:00:00 2001 From: Irvin Cote Date: Wed, 17 May 2023 19:09:15 -0300 Subject: [PATCH 187/266] nvme-core: remove redundant check from nvme_init_ns_head nvme_find_ns_head already checks that the list of namescpaces in an already existing namespace head is not empty Signed-off-by: Irvin Cote Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 43b906a59c8c..3263af1c2c15 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3563,7 +3563,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) goto out_put_ns_head; } - if (!multipath && !list_empty(&head->list)) { + if (!multipath) { dev_warn(ctrl->device, "Found shared namespace %d, but multipathing not supported.\n", info->nsid); From 2ad0713c73ffea42a7a69ac2fa8996454e9f7816 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 17 Apr 2023 22:41:13 +0200 Subject: [PATCH 188/266] nvmet-auth: remove some dead code 'status' is known to be 0 at the point. And nvmet_auth_challenge() return a -E or 0. So these lines of code should just be removed. Signed-off-by: Christophe JAILLET Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd-auth.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 7970a7640e58..038032e46145 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -483,15 +483,6 @@ void nvmet_execute_auth_receive(struct nvmet_req *req) status = NVME_SC_INTERNAL; break; } - if (status) { - req->sq->dhchap_status = status; - nvmet_auth_failure1(req, d, al); - pr_warn("ctrl %d qid %d: challenge status (%x)\n", - ctrl->cntlid, req->sq->qid, - req->sq->dhchap_status); - status = 0; - break; - } req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_REPLY; break; case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1: From 94c78ea124001f5b04630947197fda4b9a9e95c6 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 19 May 2023 02:40:51 -0700 Subject: [PATCH 189/266] nvmet-auth: remove unnecessary break after goto Remove dead break after goto. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd-auth.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 038032e46145..586458f765f1 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -295,13 +295,11 @@ void nvmet_execute_auth_send(struct nvmet_req *req) status = 0; } goto done_kfree; - break; case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2: req->sq->authenticated = true; pr_debug("%s: ctrl %d qid %d ctrl authenticated\n", __func__, ctrl->cntlid, req->sq->qid); goto done_kfree; - break; case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2: status = nvmet_auth_failure2(d); if (status) { @@ -312,7 +310,6 @@ void nvmet_execute_auth_send(struct nvmet_req *req) status = 0; } goto done_kfree; - break; default: req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; @@ -320,7 +317,6 @@ void nvmet_execute_auth_send(struct nvmet_req *req) NVME_AUTH_DHCHAP_MESSAGE_FAILURE2; req->sq->authenticated = false; goto done_kfree; - break; } done_failure1: req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; From e9227d486ede0428a00f011236753d83a390d2e1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 19 May 2023 02:40:52 -0700 Subject: [PATCH 190/266] nvme-fcloop: no need to return from void function Remove return at the end of void function. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/fcloop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index e940a7d37a9d..1ab3d900f2bf 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -645,8 +645,6 @@ fcloop_fcp_recv_work(struct work_struct *work) } if (ret) fcloop_call_host_done(fcpreq, tfcp_req, ret); - - return; } static void From bdbfcd5f6caa46e1ddbfd60cbf694d192b37805a Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Tue, 30 May 2023 15:42:53 +0000 Subject: [PATCH 191/266] nvme: Increase block size variable size to 32-bit Increase block size variable size to 32-bit unsigned to be able to support block devices larger than 32k (starting from 64 KiB). Physical and logical block size already support unsigned 32-bit. Signed-off-by: Daniel Gomez Reviewed-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3263af1c2c15..64c484c14d6f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1835,7 +1835,7 @@ static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze)); - unsigned short bs = 1 << ns->lba_shift; + u32 bs = 1U << ns->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; /* From 900095bfbbf6623fbfa9e5ceb3982f293b6f3275 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 1 Jun 2023 22:37:13 -0700 Subject: [PATCH 192/266] nvme-fabrics: error out to unlock the mutex Currently, in the nvmf_host_add() function, if the nvmf_host_alloc() call failed to allocate memory for the host, the code would directly return -ENOMEM without unlocking the nvmf_hosts_mutex. This could lead to potential issues with mutex synchronization. Fix that error handling mechanism by jumping to the out_unlock label when nvmf_host_alloc() fails. This ensures that the mutex is unlocked before returning the error code. The updated code enhances avoids possible deadlocks. Fixes: f0cebf82004d ("nvme-fabrics: prevent overriding of existing host") Reported-by: kernel test robot Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202306020909.MTUEBeIa-lkp@intel.com/ Signed-off-by: Chaitanya Kulkarni Reviewed-by: Julia Lawall Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index b1fa27b60917..c4345d1d98aa 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -92,8 +92,10 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn, uuid_t *id) } host = nvmf_host_alloc(hostnqn, id); - if (!host) - return ERR_PTR(-ENOMEM); + if (!host) { + host = ERR_PTR(-ENOMEM); + goto out_unlock; + } list_add_tail(&host->list, &nvmf_hosts); out_unlock: From 959ffef13bac792e4e2e3321d6e2bd2b00c0f5f9 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 1 Jun 2023 23:47:42 -0700 Subject: [PATCH 193/266] nvme-fabrics: open code __nvmf_host_find() There is no point in maintaining a separate funciton __nvmf_host_find() that has only one caller nvmf_host_add() especially when caller and callee both are small enough to merge. Due to this we are actually repeating the error handling code in both callee and caller for no reason that can be avoided, but instead we have to read both function to establish the correctness along with additional lockdep warning check due to involved locking. Just open code __nvmf_host_find() in nvme_host_alloc() with appropriate comment that removes repeated error checks in the callee/caller and lockdep check that is needed for the nvmf_hosts_mutex involvement, diffstats :- drivers/nvme/host/fabrics.c | 75 +++++++++++++------------------------ 1 file changed, 27 insertions(+), 48 deletions(-) Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 75 +++++++++++++------------------------ 1 file changed, 27 insertions(+), 48 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index c4345d1d98aa..8175d49f2909 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -21,48 +21,6 @@ static DEFINE_MUTEX(nvmf_hosts_mutex); static struct nvmf_host *nvmf_default_host; -/** - * __nvmf_host_find() - Find a matching to a previously created host - * @hostnqn: Host NQN to match - * @id: Host ID to match - * - * We have defined a host as how it is perceived by the target. - * Therefore, we don't allow different Host NQNs with the same Host ID. - * Similarly, we do not allow the usage of the same Host NQN with different - * Host IDs. This will maintain unambiguous host identification. - * - * Return: Returns host pointer on success, NULL in case of no match or - * ERR_PTR(-EINVAL) in case of error match. - */ -static struct nvmf_host *__nvmf_host_find(const char *hostnqn, uuid_t *id) -{ - struct nvmf_host *host; - - lockdep_assert_held(&nvmf_hosts_mutex); - - list_for_each_entry(host, &nvmf_hosts, list) { - bool same_hostnqn = !strcmp(host->nqn, hostnqn); - bool same_hostid = uuid_equal(&host->id, id); - - if (same_hostnqn && same_hostid) - return host; - - if (same_hostnqn) { - pr_err("found same hostnqn %s but different hostid %pUb\n", - hostnqn, id); - return ERR_PTR(-EINVAL); - } - if (same_hostid) { - pr_err("found same hostid %pUb but different hostnqn %s\n", - id, hostnqn); - return ERR_PTR(-EINVAL); - - } - } - - return NULL; -} - static struct nvmf_host *nvmf_host_alloc(const char *hostnqn, uuid_t *id) { struct nvmf_host *host; @@ -83,12 +41,33 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn, uuid_t *id) struct nvmf_host *host; mutex_lock(&nvmf_hosts_mutex); - host = __nvmf_host_find(hostnqn, id); - if (IS_ERR(host)) { - goto out_unlock; - } else if (host) { - kref_get(&host->ref); - goto out_unlock; + + /* + * We have defined a host as how it is perceived by the target. + * Therefore, we don't allow different Host NQNs with the same Host ID. + * Similarly, we do not allow the usage of the same Host NQN with + * different Host IDs. This'll maintain unambiguous host identification. + */ + list_for_each_entry(host, &nvmf_hosts, list) { + bool same_hostnqn = !strcmp(host->nqn, hostnqn); + bool same_hostid = uuid_equal(&host->id, id); + + if (same_hostnqn && same_hostid) { + kref_get(&host->ref); + goto out_unlock; + } + if (same_hostnqn) { + pr_err("found same hostnqn %s but different hostid %pUb\n", + hostnqn, id); + host = ERR_PTR(-EINVAL); + goto out_unlock; + } + if (same_hostid) { + pr_err("found same hostid %pUb but different hostnqn %s\n", + id, hostnqn); + host = ERR_PTR(-EINVAL); + goto out_unlock; + } } host = nvmf_host_alloc(hostnqn, id); From d97b4111b3228122284df0263200f02fd44751fd Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 6 Jun 2023 14:24:11 +0200 Subject: [PATCH 194/266] nvmet-fcloop: Do not wait on completion when unregister fails The nvme_fc_unregister_localport() returns an error code in case that the locaport pointer is NULL or has already been unegisterd. localport is is either in the ONLINE state (all resources allocated) or has already been put into DELETED state. In this case we will never receive an wakeup call and thus any caller will hang, e.g. module unload. Signed-off-by: Daniel Wagner Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/fcloop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1ab3d900f2bf..c65a73433c05 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1166,7 +1166,8 @@ __wait_localport_unreg(struct fcloop_lport *lport) ret = nvme_fc_unregister_localport(lport->localport); - wait_for_completion(&lport->unreg_done); + if (!ret) + wait_for_completion(&lport->unreg_done); kfree(lport); From 35e797b0246b49d116326ff23c2dbfd6507168a5 Mon Sep 17 00:00:00 2001 From: Irvin Cote Date: Thu, 18 May 2023 19:10:54 -0300 Subject: [PATCH 195/266] nvme-core: use nvme_ns_head_multipath instead of ns->head->disk Change the way we check for a multipath nshead so as to consistently use the same check to assert the same condition. Signed-off-by: Irvin Cote Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 64c484c14d6f..afd2e4a41aab 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3664,7 +3664,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) * instance as shared namespaces will show up as multiple block * devices. */ - if (ns->head->disk) { + if (nvme_ns_head_multipath(ns->head)) { sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, ctrl->instance, ns->head->instance); disk->flags |= GENHD_FL_HIDDEN; From c917dd96fe41c2fb2a4b606372bf64ec5661f509 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 30 May 2023 08:18:20 -0700 Subject: [PATCH 196/266] nvme: skip optional id ctrl csi if it failed A frequently recieved report is the driver requests the optional Command Set Specific Identify Controller structure. Some controllers report this in their error log, which tiggers other warnings to user space monitoring the devices. These error entries are harmless and of questionable value to save in the log, but let's reduce their occurance by not resending the command if it previously failed. This will not prevent the errors on the initial module load, but will greatly reduce their occurance on any rescans and resumes from suspend. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217445 Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 ++++- drivers/nvme/host/nvme.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index afd2e4a41aab..76e8f8b4098e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2995,7 +2995,8 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) ctrl->max_zeroes_sectors = 0; if (ctrl->subsys->subtype != NVME_NQN_NVME || - nvme_ctrl_limited_cns(ctrl)) + nvme_ctrl_limited_cns(ctrl) || + test_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags)) return 0; id = kzalloc(sizeof(*id), GFP_KERNEL); @@ -3017,6 +3018,8 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); free_data: + if (ret > 0) + set_bit(NVME_CTRL_SKIP_ID_CNS_CS, &ctrl->flags); kfree(id); return ret; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 03cc7529d854..78308f15e090 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -242,6 +242,7 @@ enum nvme_ctrl_flags { NVME_CTRL_ADMIN_Q_STOPPED = 1, NVME_CTRL_STARTED_ONCE = 2, NVME_CTRL_STOPPED = 3, + NVME_CTRL_SKIP_ID_CNS_CS = 4, }; struct nvme_ctrl { From 3dbd53c7be1c3dd04875a0672262b56417039869 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Jun 2023 17:43:09 +0200 Subject: [PATCH 197/266] swim3: fix the floppy_locked_ioctl prototype Add back the accidentally dropped mode parameter. Fixes: b60f7635788a ("swim3: fix the floppy_locked_ioctl prototype") Reported-by: Randy Dunlap Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230613154309.327557-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 945a03154250..dc43a63b3469 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -882,7 +882,7 @@ static int fd_eject(struct floppy_state *fs) static struct floppy_struct floppy_type = { 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,NULL }; /* 7 1.44MB 3.5" */ -static int floppy_locked_ioctl(struct block_device *bdev, +static int floppy_locked_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long param) { struct floppy_state *fs = bdev->bd_disk->private_data; From 3de13550a20fd570441c0f854abe58c6cc46c0bc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 15:22:12 +0200 Subject: [PATCH 198/266] raid6: neon: add missing prototypes The raid6 syndrome functions are generated for different sizes and have no generic prototype, while in the inner functions have a prototype in a header that cannot be included from the correct file. In both cases, the compiler warns about missing prototypes: lib/raid6/recov_neon_inner.c:27:6: warning: no previous prototype for '__raid6_2data_recov_neon' [-Wmissing-prototypes] lib/raid6/recov_neon_inner.c:77:6: warning: no previous prototype for '__raid6_datap_recov_neon' [-Wmissing-prototypes] lib/raid6/neon1.c:56:6: warning: no previous prototype for 'raid6_neon1_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon1.c:86:6: warning: no previous prototype for 'raid6_neon1_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon2.c:56:6: warning: no previous prototype for 'raid6_neon2_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon2.c:97:6: warning: no previous prototype for 'raid6_neon2_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon4.c:56:6: warning: no previous prototype for 'raid6_neon4_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon4.c:119:6: warning: no previous prototype for 'raid6_neon4_xor_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon8.c:56:6: warning: no previous prototype for 'raid6_neon8_gen_syndrome_real' [-Wmissing-prototypes] lib/raid6/neon8.c:163:6: warning: no previous prototype for 'raid6_neon8_xor_syndrome_real' [-Wmissing-prototypes] Add a new header file that contains the prototypes for both to avoid the warnings. Signed-off-by: Arnd Bergmann Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230517132220.937200-1-arnd@kernel.org --- lib/raid6/neon.h | 22 ++++++++++++++++++++++ lib/raid6/neon.uc | 1 + lib/raid6/recov_neon.c | 8 +------- lib/raid6/recov_neon_inner.c | 1 + 4 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 lib/raid6/neon.h diff --git a/lib/raid6/neon.h b/lib/raid6/neon.h new file mode 100644 index 000000000000..2ca41ee9b499 --- /dev/null +++ b/lib/raid6/neon.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only + +void raid6_neon1_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon1_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon2_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon2_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon4_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon4_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void raid6_neon8_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs); +void raid6_neon8_xor_syndrome_real(int disks, int start, int stop, + unsigned long bytes, void **ptrs); +void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, + uint8_t *dq, const uint8_t *pbmul, + const uint8_t *qmul); + +void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, + const uint8_t *qmul); + + diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc index b7c68030da4f..355270af0cd6 100644 --- a/lib/raid6/neon.uc +++ b/lib/raid6/neon.uc @@ -25,6 +25,7 @@ */ #include +#include "neon.h" typedef uint8x16_t unative_t; diff --git a/lib/raid6/recov_neon.c b/lib/raid6/recov_neon.c index d6fba8bf8c0a..1bfc14174d4d 100644 --- a/lib/raid6/recov_neon.c +++ b/lib/raid6/recov_neon.c @@ -8,6 +8,7 @@ #ifdef __KERNEL__ #include +#include "neon.h" #else #define kernel_neon_begin() #define kernel_neon_end() @@ -19,13 +20,6 @@ static int raid6_has_neon(void) return cpu_has_neon(); } -void __raid6_2data_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dp, - uint8_t *dq, const uint8_t *pbmul, - const uint8_t *qmul); - -void __raid6_datap_recov_neon(int bytes, uint8_t *p, uint8_t *q, uint8_t *dq, - const uint8_t *qmul); - static void raid6_2data_recov_neon(int disks, size_t bytes, int faila, int failb, void **ptrs) { diff --git a/lib/raid6/recov_neon_inner.c b/lib/raid6/recov_neon_inner.c index 90eb80d43790..f9e7e8f5a151 100644 --- a/lib/raid6/recov_neon_inner.c +++ b/lib/raid6/recov_neon_inner.c @@ -5,6 +5,7 @@ */ #include +#include "neon.h" #ifdef CONFIG_ARM /* From 301867b1c16805aebbc306aafa6ecdc68b73c7e5 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 15 May 2023 21:48:05 +0800 Subject: [PATCH 199/266] md/raid10: check slab-out-of-bounds in md_bitmap_get_counter If we write a large number to md/bitmap_set_bits, md_bitmap_checkpage() will return -EINVAL because 'page >= bitmap->pages', but the return value was not checked immediately in md_bitmap_get_counter() in order to set *blocks value and slab-out-of-bounds occurs. Move check of 'page >= bitmap->pages' to md_bitmap_get_counter() and return directly if true. Fixes: ef4256733506 ("md/bitmap: optimise scanning of empty bitmaps.") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230515134808.3936750-2-linan666@huaweicloud.com --- drivers/md/md-bitmap.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bc8d7565171d..358a06495902 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -54,14 +54,7 @@ __acquires(bitmap->lock) { unsigned char *mappage; - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - + WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -1387,6 +1380,14 @@ __acquires(bitmap->lock) sector_t csize; int err; + if (page >= bitmap->pages) { + /* + * This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page or + * user set a huge number to sysfs bitmap_set_bits. + */ + return NULL; + } err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || From 46038b30b308c3ebf49e79548f109d00a8d74b31 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:06 +0800 Subject: [PATCH 200/266] md/raid5: don't allow replacement while reshape is in progress If reshape is interrupted(for example, echo frozen to sync_action), then rdev replacement can be set. It's safe because reshape is always prior to resync in md_check_recovery(). However, if system reboots, then kernel will complain cannot handle concurrent replacement and reshape and this array is not able to assemble anymore. Fix this problem by don't allow replacement until reshape is done. Reported-by: Peter Neuwirth Link: https://lore.kernel.org/linux-raid/e2f96772-bfbc-f43b-6da1-f520e5164536@online.de/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-2-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4739ed891e75..5950932323fc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8377,6 +8377,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) p = conf->disks + disk; tmp = rdev_mdlock_deref(mddev, p->rdev); if (test_bit(WantReplacement, &tmp->flags) && + mddev->reshape_position == MaxSector && p->replacement == NULL) { clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); From 873f50ece41aad5c4f788a340960c53774b5526e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:07 +0800 Subject: [PATCH 201/266] md: fix data corruption for raid456 when reshape restart while grow up Currently, if reshape is interrupted, echo "reshape" to sync_action will restart reshape from scratch, for example: echo frozen > sync_action echo reshape > sync_action This will corrupt data before reshape_position if the array is growing, fix the problem by continue reshape from reshape_position. Reported-by: Peter Neuwirth Link: https://lore.kernel.org/linux-raid/e2f96772-bfbc-f43b-6da1-f520e5164536@online.de/ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-3-yukuai1@huaweicloud.com --- drivers/md/md.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ca0de7ddd943..b7f83784710b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4806,11 +4806,21 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; err = mddev_lock(mddev); if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; - else { + } else if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); + } else { + /* + * If reshape is still in progress, and + * md_check_recovery() can continue to reshape, + * don't restart reshape because data can be + * corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } From 431e61257d631157e1d374f1368febf37aa59f7c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:08 +0800 Subject: [PATCH 202/266] md: export md_is_rdwr() and is_md_suspended() The two apis will be used later to fix a deadlock in raid456, there are no functional changes. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-4-yukuai1@huaweicloud.com --- drivers/md/md.c | 16 ---------------- drivers/md/md.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index b7f83784710b..fb060e381ae7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -93,18 +93,6 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -360,10 +348,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -static bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. diff --git a/drivers/md/md.h b/drivers/md/md.h index fd8f260ed5f8..da173d6bf726 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -555,6 +555,23 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); From 3e00777d51572bdd75cef29c9c31106b52d7cc8f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:09 +0800 Subject: [PATCH 203/266] md: add a new api prepare_suspend() in md_personality There are no functional changes, the new api will be used later to do special handling for raid456 in md_suspend(). Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-5-yukuai1@huaweicloud.com --- drivers/md/md.c | 4 ++++ drivers/md/md.h | 1 + 2 files changed, 5 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index fb060e381ae7..2f29d4e365c5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -448,6 +448,10 @@ void mddev_suspend(struct mddev *mddev) wake_up(&mddev->sb_wait); set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); percpu_ref_kill(&mddev->active_io); + + if (mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); + wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); mddev->pers->quiesce(mddev, 1); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); diff --git a/drivers/md/md.h b/drivers/md/md.h index da173d6bf726..1eec65cf783c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -631,6 +631,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour From 868bba54a3bcbfc34314e963d5a7e66717f39d4e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 12 May 2023 09:56:10 +0800 Subject: [PATCH 204/266] md/raid5: fix a deadlock in the case that reshape is interrupted If reshape is in progress and io across reshape_position is issued, such io will wait for reshape to make progress(see details in the case that make_stripe_request() return STRIPE_SCHEDULE_AND_RETRY). It has been reported several times that if system reboot while growing raid5 to raid6, array assemble will hang infinitely([1, 2]). This is because following deadlock is triggered: 1) a normal io is waiting for reshape to progress, this io can be from system-udevd or mdadm. 2) while assemble, mdadm tries to suspend the array, hence 'reconfig_mutex' is held and mddev_suspend() must wait for normal io to be done. 3) daemon thread can't start reshape because 'reconfig_mutex' can't be held. 1) and 3) is unbreakable because they're foundation design. In order to break 2), following is possible solutions that I can think of: a) Let mddev_suspend() fail is not a good option, because this will break many scenarios since mddev_suspend() doesn't fail before. b) Fail the io that is waiting for reshape to make progress from mddev_suspend(). c) Return false for the io that is waiting for reshape to make progress from raid5_make_request(), and these io will wait for suspend to be done in md_handle_request(), where 'active_io' is not grabbed. c) sounds better than b), however, b) is used because it's easy and straightforward, and it's verified that mdadm can assemble in this case. On the other hand, c) breaks the logic that mddev_suspend() will wait for submitted io to be completely handled. Fix the problem by checking reshape in mddev_suspend(), if reshape can't make progress and there are still some io waiting for reshape, fail those io. [1] https://lore.kernel.org/all/CAFig2csUV2QiomUhj_t3dPOgV300dbQ6XtM9ygKPdXJFSH__Nw@mail.gmail.com/ [2] https://lore.kernel.org/all/CAO2ABipzbw6QL5eNa44CQHjiVa-LTvS696Mh9QaTw+qsUKFUCw@mail.gmail.com/ Reported-by: Jove Reported-by: David Gilmour Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230512015610.821290-6-yukuai1@huaweicloud.com --- drivers/md/md.c | 1 + drivers/md/raid5.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2f29d4e365c5..36af585b0e96 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9100,6 +9100,7 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); + wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5950932323fc..01c55f24ab09 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5966,6 +5966,19 @@ static int add_all_stripe_bios(struct r5conf *conf, return ret; } +static bool reshape_inprogress(struct mddev *mddev) +{ + return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + !test_bit(MD_RECOVERY_DONE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery); +} + +static bool reshape_disabled(struct mddev *mddev) +{ + return is_md_suspended(mddev) || !md_is_rdwr(mddev); +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5997,7 +6010,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6076,6 +6090,15 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) && + reshape_disabled(mddev)) { + bi->bi_status = BLK_STS_IOERR; + ret = STRIPE_FAIL; + pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n", + mdname(mddev)); + } + return ret; } @@ -9044,6 +9067,22 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wait_event(mddev->sb_wait, !reshape_inprogress(mddev) || + percpu_ref_is_zero(&mddev->active_io)); + if (percpu_ref_is_zero(&mddev->active_io)) + return; + + /* + * Reshape is not in progress, and array is suspended, io that is + * waiting for reshpape can never be done. + */ + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -9064,6 +9103,7 @@ static struct md_personality raid6_personality = .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9088,6 +9128,7 @@ static struct md_personality raid5_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9113,6 +9154,7 @@ static struct md_personality raid4_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, + .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, From 6beb489b2eed25978523f379a605073f99240c50 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 22 May 2023 15:25:33 +0800 Subject: [PATCH 205/266] md/raid10: fix overflow of md/safe_mode_delay There is no input check when echo md/safe_mode_delay in safe_delay_store(). And msec might also overflow when HZ < 1000 in safe_delay_show(), Fix it by checking overflow in safe_delay_store() and use unsigned long conversion in safe_delay_show(). Fixes: 72e02075a33f ("md: factor out parsing of fixed-point numbers") Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230522072535.1523740-2-linan666@huaweicloud.com --- drivers/md/md.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 36af585b0e96..2fc8d25f6e80 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3784,8 +3784,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) static ssize_t safe_delay_show(struct mddev *mddev, char *page) { - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) @@ -3797,7 +3798,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) return -EINVAL; } - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; From f8b20a405428803bd9881881d8242c9d72c6b2b2 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 22 May 2023 15:25:34 +0800 Subject: [PATCH 206/266] md/raid10: fix wrong setting of max_corr_read_errors There is no input check when echo md/max_read_errors and overflow might occur. Add check of input number. Fixes: 1e50915fe0bb ("raid: improve MD/raid10 handling of correctable read errors.") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230522072535.1523740-3-linan666@huaweicloud.com --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2fc8d25f6e80..49c83ed08937 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4468,6 +4468,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; + if (n > INT_MAX) + return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } From 3ce94ce5d05ae89190a23f6187f64d8f4b2d3782 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 09:27:27 +0800 Subject: [PATCH 207/266] md: fix duplicate filename for rdev Commit 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs") delays the deletion of rdev, however, this introduces a window that rdev can be added again while the deletion is not done yet, and sysfs will complain about duplicate filename. Follow up patches try to fix this problem by flushing workqueue, however, flush_rdev_wq() is just dead code, the progress in md_kick_rdev_from_array(): 1) list_del_rcu(&rdev->same_set); 2) synchronize_rcu(); 3) queue_work(md_rdev_misc_wq, &rdev->del_work); So in flush_rdev_wq(), if rdev is found in the list, work_pending() can never pass, in the meantime, if work is queued, then rdev can never be found in the list. flush_rdev_wq() can be replaced by flush_workqueue() directly, however, this approach is not good: - the workqueue is global, this synchronization for all raid disks is not necessary. - flush_workqueue can't be called under 'reconfig_mutex', there is still a small window between flush_workqueue() and mddev_lock() that other contexts can queue new work, hence the problem is not solved completely. sysfs already has apis to support delete itself through writer, and these apis, specifically sysfs_break/unbreak_active_protection(), is used to support deleting rdev synchronously. Therefore, the above commit can be reverted, and sysfs duplicate filename can be avoided. A new mdadm regression test is proposed as well([1]). [1] https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/ Fixes: 5792a2856a63 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523012727.3042247-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 86 +++++++++++++++++++++++++------------------------ drivers/md/md.h | 10 ++++-- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 49c83ed08937..724c7414b241 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -87,11 +87,11 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; -static struct workqueue_struct *md_rdev_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); /* * Default number of read corrections we'll attempt on an rdev @@ -643,9 +643,11 @@ void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); + mutex_init(&mddev->delete_mutex); mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->deleting); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); @@ -747,6 +749,24 @@ static void mddev_free(struct mddev *mddev) static const struct attribute_group md_redundancy_group; +static void md_free_rdev(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct md_rdev *tmp; + + mutex_lock(&mddev->delete_mutex); + if (list_empty(&mddev->deleting)) + goto out; + + list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) { + list_del_init(&rdev->same_set); + kobject_del(&rdev->kobj); + export_rdev(rdev, mddev); + } +out: + mutex_unlock(&mddev->delete_mutex); +} + void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev) } else mutex_unlock(&mddev->reconfig_mutex); + md_free_rdev(mddev); + /* As we've dropped the mutex we need a spinlock to * make sure the thread doesn't disappear */ @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) return err; } -static void rdev_delayed_delete(struct work_struct *ws) -{ - struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); - kobject_del(&rdev->kobj); - kobject_put(&rdev->kobj); -} - void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ @@ -2455,6 +2470,8 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) static void md_kick_rdev_from_array(struct md_rdev *rdev) { + struct mddev *mddev = rdev->mddev; + bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); @@ -2468,15 +2485,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) rdev->sysfs_unack_badblocks = NULL; rdev->sysfs_badblocks = NULL; rdev->badblocks.count = 0; - /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state". We also need - * to delay it due to rcu usage. - */ + synchronize_rcu(); - INIT_WORK(&rdev->del_work, rdev_delayed_delete); - kobject_get(&rdev->kobj); - queue_work(md_rdev_misc_wq, &rdev->del_work); - export_rdev(rdev, rdev->mddev); + + /* + * kobject_del() will wait for all in progress writers to be done, where + * reconfig_mutex is held, hence it can't be called under + * reconfig_mutex and it's delayed to mddev_unlock(). + */ + mutex_lock(&mddev->delete_mutex); + list_add(&rdev->same_set, &mddev->deleting); + mutex_unlock(&mddev->delete_mutex); } static void export_array(struct mddev *mddev) @@ -3544,6 +3563,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); + struct kernfs_node *kn = NULL; ssize_t rv; struct mddev *mddev = rdev->mddev; @@ -3551,6 +3571,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + if (entry->store == state_store && cmd_match(page, "remove")) + kn = sysfs_break_active_protection(kobj, attr); + rv = mddev ? mddev_lock(mddev) : -ENODEV; if (!rv) { if (rdev->mddev == NULL) @@ -3559,6 +3583,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, rv = entry->store(rdev, page, length); mddev_unlock(mddev); } + + if (kn) + sysfs_unbreak_active_protection(kn); + return rv; } @@ -4484,20 +4512,6 @@ null_show(struct mddev *mddev, char *page) return -EINVAL; } -/* need to ensure rdev_delayed_delete() has completed */ -static void flush_rdev_wq(struct mddev *mddev) -{ - struct md_rdev *rdev; - - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (work_pending(&rdev->del_work)) { - flush_workqueue(md_rdev_misc_wq); - break; - } - rcu_read_unlock(); -} - static ssize_t new_dev_store(struct mddev *mddev, const char *buf, size_t len) { @@ -4525,7 +4539,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; - flush_rdev_wq(mddev); err = mddev_lock(mddev); if (err) return err; @@ -5595,7 +5608,6 @@ struct mddev *md_alloc(dev_t dev, char *name) * removed (mddev_delayed_delete). */ flush_workqueue(md_misc_wq); - flush_workqueue(md_rdev_misc_wq); mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); @@ -7558,9 +7570,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, } - if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK) - flush_rdev_wq(mddev); - if (cmd == HOT_REMOVE_DISK) /* need to ensure recovery thread has run */ wait_event_interruptible_timeout(mddev->sb_wait, @@ -9623,10 +9632,6 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; - md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0); - if (!md_rdev_misc_wq) - goto err_rdev_misc_wq; - ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -9645,8 +9650,6 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: - destroy_workqueue(md_rdev_misc_wq); -err_rdev_misc_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9942,7 +9945,6 @@ static __exit void md_exit(void) } spin_unlock(&all_mddevs_lock); - destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 1eec65cf783c..7156fc05f834 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -122,8 +122,6 @@ struct md_rdev { struct serial_in_rdev *serial; /* used for raid1 io serialization */ - struct work_struct del_work; /* used for delayed sysfs removal */ - struct kernfs_node *sysfs_state; /* handle for 'state' * sysfs entry */ /* handle for 'unacknowledged_bad_blocks' sysfs dentry */ @@ -531,6 +529,14 @@ struct mddev { unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int noio_flag; /* for memalloc scope API */ + /* + * Temporarily store rdev that will be finally removed when + * reconfig_mutex is unlocked. + */ + struct list_head deleting; + /* Protect the deleting list */ + struct mutex delete_mutex; + bool has_superblocks:1; bool fail_last_dev:1; bool serialize_policy:1; From e5e9b9cb71a09d86d5e8d147e6a6457e1f8887b5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:13 +0800 Subject: [PATCH 208/266] md: factor out a helper to wake up md_thread directly md_wakeup_thread() can't wakeup md_thread->tsk if md_thread->run is still in progress, and in some cases md_thread->tsk need to be woke up directly, like md_set_readonly() and do_md_stop(). Commit 9dfbdafda3b3 ("md: unlock mddev before reap sync_thread in action_store") introduce a new scenario where unregister sync_thread is not protected by 'reconfig_mutex', this can cause null-ptr-deference in theroy: t1: md_set_readonly t2: action_store md_unregister_thread // 'reconfig_mutex' is not held // 'reconfig_mutex' is held by caller if (mddev->sync_thread) thread = *threadp *threadp = NULL wake_up_process(mddev->sync_thread->tsk) // null-ptr-deference Fix this problem by factoring out a helper to wake up md_thread directly, so that 'sync_thread' won't be accessed multiple times from the reader side. This helper also prepare to protect md_thread with rcu. Noted that later patches is going to fix that unregister sync_thread is not protected by 'reconfig_mutex' from action_store(). Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-2-yukuai1@huaweicloud.com --- drivers/md/md.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 724c7414b241..9d54de3441ef 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -92,6 +92,7 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void md_wakeup_thread_directly(struct md_thread *thread); /* * Default number of read corrections we'll attempt on an rdev @@ -6284,10 +6285,12 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) return -EBUSY; @@ -6348,10 +6351,12 @@ static int do_md_stop(struct mddev *mddev, int mode, } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) set_bit(MD_RECOVERY_INTR, &mddev->recovery); - if (mddev->sync_thread) - /* Thread might be blocked waiting for metadata update - * which will now never happen */ - wake_up_process(mddev->sync_thread->tsk); + + /* + * Thread might be blocked waiting for metadata update which will now + * never happen + */ + md_wakeup_thread_directly(mddev->sync_thread); mddev_unlock(mddev); wait_event(resync_wait, (mddev->sync_thread == NULL && @@ -7898,6 +7903,12 @@ static int md_thread(void *arg) return 0; } +static void md_wakeup_thread_directly(struct md_thread *thread) +{ + if (thread) + wake_up_process(thread->tsk); +} + void md_wakeup_thread(struct md_thread *thread) { if (thread) { From 955a257d69e44cea09b0375b8f2f3d4d9fcf7b4e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:14 +0800 Subject: [PATCH 209/266] dm-raid: remove useless checking in raid_message() md_wakeup_thread() handle the case that pass in md_thread is NULL, there is no need to check this. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-3-yukuai1@huaweicloud.com --- drivers/md/dm-raid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c8821fcb8299..8846bf510a35 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3750,11 +3750,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, * canceling read-auto mode */ mddev->ro = 0; - if (!mddev->suspended && mddev->sync_thread) + if (!mddev->suspended) md_wakeup_thread(mddev->sync_thread); } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - if (!mddev->suspended && mddev->thread) + if (!mddev->suspended) md_wakeup_thread(mddev->thread); return 0; From c333673a78307abe6b1f6998809288fcd86740ed Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:15 +0800 Subject: [PATCH 210/266] md/bitmap: always wake up md_thread in timeout_store md_wakeup_thread() can handle the case that pass in md_thread is NULL, the only difference is that md_wakeup_thread() will be called when current timeout is 'MAX_SCHEDULE_TIMEOUT', this should not matter because timeout_store() is not hot path, and the daemon process is woke up more than demand from other context already. Prepare to factor out a helper to set timeout. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-4-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 358a06495902..4b5ba81c53be 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2476,11 +2476,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len) * the bitmap is all clean and we don't need to * adjust the timeout right now */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { + if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) mddev->thread->timeout = timeout; - md_wakeup_thread(mddev->thread); - } } + + md_wakeup_thread(mddev->thread); return len; } From 4eeb6535cd51100460ec8873bb68addef17b3e81 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:16 +0800 Subject: [PATCH 211/266] md/bitmap: factor out a helper to set timeout Register/unregister 'mddev->thread' are both under 'reconfig_mutex', however, some context didn't hold the mutex to access mddev->thread, which can cause null-ptr-deference: 1) md_bitmap_daemon_work() can be called from md_check_recovery() where 'reconfig_mutex' is not held, deference 'mddev->thread' might cause null-ptr-deference, because md_unregister_thread() reset the pointer before stopping the thread. 2) timeout_store() access 'mddev->thread' multiple times, null-ptr-deference can be triggered if 'mddev->thread' is reset in the middle. This patch factor out a helper to set timeout, the new helper always check if 'mddev->thread' is null first, so that problem 1 can be fixed. Now that this helper only access 'mddev->thread' once, but it's possible that 'mddev->thread' can be freed while this helper is still in progress, hence the problem is not fixed yet. Follow up patches will fix this by protecting md_thread with rcu. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-5-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 4b5ba81c53be..23522df41ca5 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1234,11 +1234,22 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, sector_t offset, sector_t *blocks, int create); +static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, + bool force) +{ + struct md_thread *thread = mddev->thread; + + if (!thread) + return; + + if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) + thread->timeout = timeout; +} + /* * bitmap daemon -- periodically wakes up to clean bits and flush pages * out to disk */ - void md_bitmap_daemon_work(struct mddev *mddev) { struct bitmap *bitmap; @@ -1262,7 +1273,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); goto done; } bitmap->allclean = 1; @@ -1359,8 +1370,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) done: if (bitmap->allclean == 0) - mddev->thread->timeout = - mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); mutex_unlock(&mddev->bitmap_info.mutex); } @@ -1821,8 +1831,7 @@ void md_bitmap_destroy(struct mddev *mddev) mddev->bitmap = NULL; /* disconnect from the md device */ spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); - if (mddev->thread) - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + mddev_set_timeout(mddev, MAX_SCHEDULE_TIMEOUT, true); md_bitmap_free(bitmap); } @@ -1965,7 +1974,7 @@ int md_bitmap_load(struct mddev *mddev) /* Kick recovery in case any bits were set */ set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); md_wakeup_thread(mddev->thread); md_bitmap_update_sb(bitmap); @@ -2470,17 +2479,11 @@ timeout_store(struct mddev *mddev, const char *buf, size_t len) timeout = MAX_SCHEDULE_TIMEOUT-1; if (timeout < 1) timeout = 1; - mddev->bitmap_info.daemon_sleep = timeout; - if (mddev->thread) { - /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then - * the bitmap is all clean and we don't need to - * adjust the timeout right now - */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) - mddev->thread->timeout = timeout; - } + mddev->bitmap_info.daemon_sleep = timeout; + mddev_set_timeout(mddev, timeout, false); md_wakeup_thread(mddev->thread); + return len; } From 4469315439827290923fce4f3f672599cabeb366 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 23 May 2023 10:10:17 +0800 Subject: [PATCH 212/266] md: protect md_thread with rcu Currently, there are many places that md_thread can be accessed without protection, following are known scenarios that can cause null-ptr-dereference or uaf: 1) sync_thread that is allocated and started from md_start_sync() 2) mddev->thread can be accessed directly from timeout_store() and md_bitmap_daemon_work() 3) md_unregister_thread() from action_store(). Currently, a global spinlock 'pers_lock' is borrowed to protect 'mddev->thread' in some places, this problem can be fixed likewise, however, use a global lock for all the cases is not good. Fix this problem by protecting all md_thread with rcu. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230523021017.3048783-6-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 10 ++++-- drivers/md/md-cluster.c | 17 ++++++---- drivers/md/md-multipath.c | 4 +-- drivers/md/md.c | 69 ++++++++++++++++++--------------------- drivers/md/md.h | 8 ++--- drivers/md/raid1.c | 7 ++-- drivers/md/raid1.h | 2 +- drivers/md/raid10.c | 20 +++++++----- drivers/md/raid10.h | 2 +- drivers/md/raid5-cache.c | 22 ++++++++----- drivers/md/raid5.c | 15 +++++---- drivers/md/raid5.h | 2 +- 12 files changed, 97 insertions(+), 81 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 23522df41ca5..ad5a3456cd8a 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1237,13 +1237,19 @@ static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap, static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout, bool force) { - struct md_thread *thread = mddev->thread; + struct md_thread *thread; + + rcu_read_lock(); + thread = rcu_dereference(mddev->thread); if (!thread) - return; + goto out; if (force || thread->timeout < MAX_SCHEDULE_TIMEOUT) thread->timeout = timeout; + +out: + rcu_read_unlock(); } /* diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 10e0c5381d01..3d9fd74233df 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -75,14 +75,14 @@ struct md_cluster_info { sector_t suspend_hi; int suspend_from; /* the slot which broadcast suspend_lo/hi */ - struct md_thread *recovery_thread; + struct md_thread __rcu *recovery_thread; unsigned long recovery_map; /* communication loc resources */ struct dlm_lock_resource *ack_lockres; struct dlm_lock_resource *message_lockres; struct dlm_lock_resource *token_lockres; struct dlm_lock_resource *no_new_dev_lockres; - struct md_thread *recv_thread; + struct md_thread __rcu *recv_thread; struct completion newdisk_completion; wait_queue_head_t wait; unsigned long state; @@ -362,8 +362,8 @@ static void __recover_slot(struct mddev *mddev, int slot) set_bit(slot, &cinfo->recovery_map); if (!cinfo->recovery_thread) { - cinfo->recovery_thread = md_register_thread(recover_bitmaps, - mddev, "recover"); + rcu_assign_pointer(cinfo->recovery_thread, + md_register_thread(recover_bitmaps, mddev, "recover")); if (!cinfo->recovery_thread) { pr_warn("md-cluster: Could not create recovery thread\n"); return; @@ -526,11 +526,15 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { int got_lock = 0; + struct md_thread *thread; struct md_cluster_info *cinfo = mddev->cluster_info; mddev->good_device_nr = le32_to_cpu(msg->raid_slot); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); - wait_event(mddev->thread->wqueue, + + /* daemaon thread must exist */ + thread = rcu_dereference_protected(mddev->thread, true); + wait_event(thread->wqueue, (got_lock = mddev_trylock(mddev)) || test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)); md_reload_sb(mddev, mddev->good_device_nr); @@ -889,7 +893,8 @@ static int join(struct mddev *mddev, int nodes) } /* Initiate the communication resources */ ret = -ENOMEM; - cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); + rcu_assign_pointer(cinfo->recv_thread, + md_register_thread(recv_daemon, mddev, "cluster_recv")); if (!cinfo->recv_thread) { pr_err("md-cluster: cannot allocate memory for recv_thread!\n"); goto err; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 66edf5e72bd6..92c45be203d7 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -400,8 +400,8 @@ static int multipath_run (struct mddev *mddev) if (ret) goto out_free_conf; - mddev->thread = md_register_thread(multipathd, mddev, - "multipath"); + rcu_assign_pointer(mddev->thread, + md_register_thread(multipathd, mddev, "multipath")); if (!mddev->thread) goto out_free_conf; diff --git a/drivers/md/md.c b/drivers/md/md.c index 9d54de3441ef..a6ede3d1c99e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -70,11 +70,7 @@ #include "md-bitmap.h" #include "md-cluster.h" -/* pers_list is a list of registered personalities protected - * by pers_lock. - * pers_lock does extra service to protect accesses to - * mddev->thread when the mutex cannot be held. - */ +/* pers_list is a list of registered personalities protected by pers_lock. */ static LIST_HEAD(pers_list); static DEFINE_SPINLOCK(pers_lock); @@ -92,7 +88,7 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); -static void md_wakeup_thread_directly(struct md_thread *thread); +static void md_wakeup_thread_directly(struct md_thread __rcu *thread); /* * Default number of read corrections we'll attempt on an rdev @@ -442,8 +438,10 @@ static void md_submit_bio(struct bio *bio) */ void mddev_suspend(struct mddev *mddev) { - WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); - lockdep_assert_held(&mddev->reconfig_mutex); + struct md_thread *thread = rcu_dereference_protected(mddev->thread, + lockdep_is_held(&mddev->reconfig_mutex)); + + WARN_ON_ONCE(thread && current == thread->tsk); if (mddev->suspended++) return; wake_up(&mddev->sb_wait); @@ -811,13 +809,8 @@ void mddev_unlock(struct mddev *mddev) md_free_rdev(mddev); - /* As we've dropped the mutex we need a spinlock to - * make sure the thread doesn't disappear - */ - spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); wake_up(&mddev->sb_wait); - spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); @@ -7903,19 +7896,29 @@ static int md_thread(void *arg) return 0; } -static void md_wakeup_thread_directly(struct md_thread *thread) +static void md_wakeup_thread_directly(struct md_thread __rcu *thread) { - if (thread) - wake_up_process(thread->tsk); + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) + wake_up_process(t->tsk); + rcu_read_unlock(); } -void md_wakeup_thread(struct md_thread *thread) +void md_wakeup_thread(struct md_thread __rcu *thread) { - if (thread) { - pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - set_bit(THREAD_WAKEUP, &thread->flags); - wake_up(&thread->wqueue); + struct md_thread *t; + + rcu_read_lock(); + t = rcu_dereference(thread); + if (t) { + pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); + set_bit(THREAD_WAKEUP, &t->flags); + wake_up(&t->wqueue); } + rcu_read_unlock(); } EXPORT_SYMBOL(md_wakeup_thread); @@ -7945,22 +7948,15 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *), } EXPORT_SYMBOL(md_register_thread); -void md_unregister_thread(struct md_thread **threadp) +void md_unregister_thread(struct md_thread __rcu **threadp) { - struct md_thread *thread; + struct md_thread *thread = rcu_dereference_protected(*threadp, true); - /* - * Locking ensures that mddev_unlock does not wake_up a - * non-existent thread - */ - spin_lock(&pers_lock); - thread = *threadp; - if (!thread) { - spin_unlock(&pers_lock); + if (!thread) return; - } - *threadp = NULL; - spin_unlock(&pers_lock); + + rcu_assign_pointer(*threadp, NULL); + synchronize_rcu(); pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); @@ -9226,9 +9222,8 @@ static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - mddev->sync_thread = md_register_thread(md_do_sync, - mddev, - "resync"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "resync")); if (!mddev->sync_thread) { pr_warn("%s: could not start resync thread...\n", mdname(mddev)); diff --git a/drivers/md/md.h b/drivers/md/md.h index 7156fc05f834..a50122165fa1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -365,8 +365,8 @@ struct mddev { int new_chunk_sectors; int reshape_backwards; - struct md_thread *thread; /* management thread */ - struct md_thread *sync_thread; /* doing resync or reconstruct */ + struct md_thread __rcu *thread; /* management thread */ + struct md_thread __rcu *sync_thread; /* doing resync or reconstruct */ /* 'last_sync_action' is initialized to "none". It is set when a * sync operation (i.e "data-check", "requested-resync", "resync", @@ -758,8 +758,8 @@ extern struct md_thread *md_register_thread( void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); -extern void md_unregister_thread(struct md_thread **threadp); -extern void md_wakeup_thread(struct md_thread *thread); +extern void md_unregister_thread(struct md_thread __rcu **threadp); +extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern int mddev_init_writes_pending(struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 3570da63969b..220f6ce761a5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3087,7 +3087,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) } err = -ENOMEM; - conf->thread = md_register_thread(raid1d, mddev, "raid1"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid1d, mddev, "raid1")); if (!conf->thread) goto abort; @@ -3180,8 +3181,8 @@ static int raid1_run(struct mddev *mddev) /* * Ok, everything is just fine now */ - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index ebb6788820e7..468f189da7a0 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -130,7 +130,7 @@ struct r1conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* Keep track of cluster resync window to send to other * nodes. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 381c21f7fb06..0ae7e52983fa 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -982,6 +982,7 @@ static void lower_barrier(struct r10conf *conf) static bool stop_waiting_barrier(struct r10conf *conf) { struct bio_list *bio_list = current->bio_list; + struct md_thread *thread; /* barrier is dropped */ if (!conf->barrier) @@ -997,12 +998,14 @@ static bool stop_waiting_barrier(struct r10conf *conf) (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1]))) return true; + /* daemon thread must exist while handling io */ + thread = rcu_dereference_protected(conf->mddev->thread, true); /* * move on if io is issued from raid10d(), nr_pending is not released * from original io(see handle_read_error()). All raise barrier is * blocked until this io is done. */ - if (conf->mddev->thread->tsk == current) { + if (thread->tsk == current) { WARN_ON_ONCE(atomic_read(&conf->nr_pending) == 0); return true; } @@ -4107,7 +4110,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) atomic_set(&conf->nr_pending, 0); err = -ENOMEM; - conf->thread = md_register_thread(raid10d, mddev, "raid10"); + rcu_assign_pointer(conf->thread, + md_register_thread(raid10d, mddev, "raid10")); if (!conf->thread) goto out; @@ -4152,8 +4156,8 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); if (mddev_is_clustered(conf->mddev)) { int fc, fo; @@ -4296,8 +4300,8 @@ static int raid10_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto out_free_conf; } @@ -4698,8 +4702,8 @@ static int raid10_start_reshape(struct mddev *mddev) set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { ret = -EAGAIN; goto abort; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 8c072ce0bc54..63e48b11b552 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -100,7 +100,7 @@ struct r10conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; /* * Keep track of cluster resync window to send to other nodes. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 852b265c5db4..47ba7d9e81e1 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -120,7 +120,7 @@ struct r5l_log { struct bio_set bs; mempool_t meta_pool; - struct md_thread *reclaim_thread; + struct md_thread __rcu *reclaim_thread; unsigned long reclaim_target; /* number of space that need to be * reclaimed. if it's 0, reclaim spaces * used by io_units which are in @@ -1576,17 +1576,18 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) void r5l_quiesce(struct r5l_log *log, int quiesce) { - struct mddev *mddev; + struct mddev *mddev = log->rdev->mddev; + struct md_thread *thread = rcu_dereference_protected( + log->reclaim_thread, lockdep_is_held(&mddev->reconfig_mutex)); if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ - mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); - kthread_park(log->reclaim_thread->tsk); + kthread_park(thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); } else - kthread_unpark(log->reclaim_thread->tsk); + kthread_unpark(thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) @@ -3063,6 +3064,7 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) { struct r5l_log *log; + struct md_thread *thread; int ret; pr_debug("md/raid:%s: using device %pg as journal\n", @@ -3121,11 +3123,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->tree_lock); INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); - log->reclaim_thread = md_register_thread(r5l_reclaim_thread, - log->rdev->mddev, "reclaim"); - if (!log->reclaim_thread) + thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, + "reclaim"); + if (!thread) goto reclaim_thread; - log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + + thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL; + rcu_assign_pointer(log->reclaim_thread, thread); init_waitqueue_head(&log->iounit_wait); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 01c55f24ab09..7e2bbcfef325 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7731,7 +7731,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) } sprintf(pers_name, "raid%d", mddev->new_level); - conf->thread = md_register_thread(raid5d, mddev, pers_name); + rcu_assign_pointer(conf->thread, + md_register_thread(raid5d, mddev, pers_name)); if (!conf->thread) { pr_warn("md/raid:%s: couldn't allocate thread.\n", mdname(mddev)); @@ -7954,8 +7955,8 @@ static int raid5_run(struct mddev *mddev) } conf->min_offset_diff = min_offset_diff; - mddev->thread = conf->thread; - conf->thread = NULL; + rcu_assign_pointer(mddev->thread, conf->thread); + rcu_assign_pointer(conf->thread, NULL); mddev->private = conf; for (i = 0; i < conf->raid_disks && conf->previous_raid_disks; @@ -8052,8 +8053,8 @@ static int raid5_run(struct mddev *mddev) clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) goto abort; } @@ -8631,8 +8632,8 @@ static int raid5_start_reshape(struct mddev *mddev) clear_bit(MD_RECOVERY_DONE, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); - mddev->sync_thread = md_register_thread(md_do_sync, mddev, - "reshape"); + rcu_assign_pointer(mddev->sync_thread, + md_register_thread(md_do_sync, mddev, "reshape")); if (!mddev->sync_thread) { mddev->recovery = 0; spin_lock_irq(&conf->device_lock); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index e873938a6125..f19707189a7b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -679,7 +679,7 @@ struct r5conf { /* When taking over an array from a different personality, we store * the new thread here until we fully activate the array. */ - struct md_thread *thread; + struct md_thread __rcu *thread; struct list_head temp_inactive_list[NR_STRIPE_HASH_LOCKS]; struct r5worker_group *worker_groups; int group_cnt; From 75aa7a1b8f85b03971df1d0f5b1a3a9edf020dff Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:34:10 +0800 Subject: [PATCH 213/266] md/raid5: don't start reshape when recovery or replace is in progress When recovery is interrupted (reboot, etc.) check for MD_RECOVERY_RUNNING is not enough to tell recovery is in progress. Also check recovery_cp before starting reshape. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529133410.2125914-1-yukuai1@huaweicloud.com --- drivers/md/raid5.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7e2bbcfef325..f8bc74e16811 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8525,6 +8525,7 @@ static int raid5_start_reshape(struct mddev *mddev) struct r5conf *conf = mddev->private; struct md_rdev *rdev; int spares = 0; + int i; unsigned long flags; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) @@ -8536,6 +8537,13 @@ static int raid5_start_reshape(struct mddev *mddev) if (has_failed(conf)) return -EINVAL; + /* raid5 can't handle concurrent reshape and recovery */ + if (mddev->recovery_cp < MaxSector) + return -EBUSY; + for (i = 0; i < conf->raid_disks; i++) + if (rdev_mdlock_deref(mddev, conf->disks[i].replacement)) + return -EBUSY; + rdev_for_each(rdev, mddev) { if (!test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) From 34817a2441747b48e444cb0e05d84e14bc9443da Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 27 May 2023 15:22:15 +0800 Subject: [PATCH 214/266] md/raid10: fix null-ptr-deref of mreplace in raid10_sync_request There are two check of 'mreplace' in raid10_sync_request(). In the first check, 'need_replace' will be set and 'mreplace' will be used later if no-Faulty 'mreplace' exists, In the second check, 'mreplace' will be set to NULL if it is Faulty, but 'need_replace' will not be changed accordingly. null-ptr-deref occurs if Faulty is set between two check. Fix it by merging two checks into one. And replace 'need_replace' with 'mreplace' because their values are always the same. Fixes: ee37d7314a32 ("md/raid10: Fix raid10 replace hang when new added disk faulty") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230527072218.2365857-2-linan666@huaweicloud.com --- drivers/md/raid10.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0ae7e52983fa..51552280b325 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3441,7 +3441,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int must_sync; int any_working; int need_recover = 0; - int need_replace = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3453,11 +3452,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, !test_bit(Faulty, &mrdev->flags) && !test_bit(In_sync, &mrdev->flags)) need_recover = 1; - if (mreplace != NULL && - !test_bit(Faulty, &mreplace->flags)) - need_replace = 1; + if (mreplace && test_bit(Faulty, &mreplace->flags)) + mreplace = NULL; - if (!need_recover && !need_replace) { + if (!need_recover && !mreplace) { rcu_read_unlock(); continue; } @@ -3473,8 +3471,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } - if (mreplace && test_bit(Faulty, &mreplace->flags)) - mreplace = NULL; /* Unless we are doing a full sync, or a replacement * we only need to recover the block if it is set in * the bitmap @@ -3597,11 +3593,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - /* Note: if need_replace, then bio + /* Note: if replace is not NULL, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. */ - if (!need_replace) + if (!mreplace) break; bio->bi_next = biolist; biolist = bio; From 59f8f0b54c8ffb4521f6bbd1cb6f4dfa5022e75e Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 27 May 2023 15:22:16 +0800 Subject: [PATCH 215/266] md/raid10: improve code of mrdev in raid10_sync_request 'need_recover' and 'mrdev' are equivalent in raid10_sync_request(), and inc mrdev->nr_pending is unreasonable if don't need recovery. Replace 'need_recover' with 'mrdev', and only inc nr_pending when needed. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230527072218.2365857-3-linan666@huaweicloud.com --- drivers/md/raid10.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 51552280b325..8eda79932754 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3440,7 +3440,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; - int need_recover = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3448,14 +3447,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mrdev = rcu_dereference(mirror->rdev); mreplace = rcu_dereference(mirror->replacement); - if (mrdev != NULL && - !test_bit(Faulty, &mrdev->flags) && - !test_bit(In_sync, &mrdev->flags)) - need_recover = 1; + if (mrdev && (test_bit(Faulty, &mrdev->flags) || + test_bit(In_sync, &mrdev->flags))) + mrdev = NULL; if (mreplace && test_bit(Faulty, &mreplace->flags)) mreplace = NULL; - if (!need_recover && !mreplace) { + if (!mrdev && !mreplace) { rcu_read_unlock(); continue; } @@ -3489,7 +3487,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, rcu_read_unlock(); continue; } - atomic_inc(&mrdev->nr_pending); + if (mrdev) + atomic_inc(&mrdev->nr_pending); if (mreplace) atomic_inc(&mreplace->nr_pending); rcu_read_unlock(); @@ -3576,7 +3575,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - if (need_recover) { + if (mrdev) { bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; @@ -3621,7 +3620,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, for (k = 0; k < conf->copies; k++) if (r10_bio->devs[k].devnum == i) break; - if (!test_bit(In_sync, + if (mrdev && !test_bit(In_sync, &mrdev->flags) && !rdev_set_badblocks( mrdev, @@ -3647,12 +3646,14 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (rb2) atomic_dec(&rb2->remaining); r10_bio = rb2; - rdev_dec_pending(mrdev, mddev); + if (mrdev) + rdev_dec_pending(mrdev, mddev); if (mreplace) rdev_dec_pending(mreplace, mddev); break; } - rdev_dec_pending(mrdev, mddev); + if (mrdev) + rdev_dec_pending(mrdev, mddev); if (mreplace) rdev_dec_pending(mreplace, mddev); if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) { From 6090368abcb40554c660c8c3140ce19ff0f8f66f Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 27 May 2023 17:20:07 +0800 Subject: [PATCH 216/266] md/raid10: prioritize adding disk to 'removed' mirror When add a new disk to raid10, it will traverse conf->mirror from start and find one of the following mirror to add: 1. mirror->rdev is set to WantReplacement and it have no replacement, set new disk to mirror->replacement. 2. no mirror->rdev, set new disk to mirror->rdev. There is a array as below (sda is set to WantReplacement): Number Major Minor RaidDevice State 0 8 0 0 active sync set-A /dev/sda - 0 0 1 removed 2 8 32 2 active sync set-A /dev/sdc 3 8 48 3 active sync set-B /dev/sdd Use 'mdadm --add' to add a new disk to this array, the new disk will become sda's replacement instead of add to removed position, which is confusing for users. Meanwhile, after new disk recovery success, sda will be set to Faulty. Prioritize adding disk to 'removed' mirror is a better choice. In the above scenario, the behavior is the same as before, except sda will not be deleted. Before other disks are added, continued use sda is more reliable. Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230527092007.3008856-1-linan666@huaweicloud.com --- drivers/md/raid10.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8eda79932754..ab0e2485b2b7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2151,9 +2151,10 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r10conf *conf = mddev->private; int err = -EEXIST; - int mirror; + int mirror, repl_slot = -1; int first = 0; int last = conf->geo.raid_disks - 1; + struct raid10_info *p; if (mddev->recovery_cp < MaxSector) /* only hot-add to in-sync arrays, as recovery is @@ -2176,23 +2177,14 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) else mirror = first; for ( ; mirror <= last ; mirror++) { - struct raid10_info *p = &conf->mirrors[mirror]; + p = &conf->mirrors[mirror]; if (p->recovery_disabled == mddev->recovery_disabled) continue; if (p->rdev) { - if (!test_bit(WantReplacement, &p->rdev->flags) || - p->replacement != NULL) - continue; - clear_bit(In_sync, &rdev->flags); - set_bit(Replacement, &rdev->flags); - rdev->raid_disk = mirror; - err = 0; - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - conf->fullsync = 1; - rcu_assign_pointer(p->replacement, rdev); - break; + if (test_bit(WantReplacement, &p->rdev->flags) && + p->replacement == NULL && repl_slot < 0) + repl_slot = mirror; + continue; } if (mddev->gendisk) @@ -2209,6 +2201,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) break; } + if (err && repl_slot >= 0) { + p = &conf->mirrors[repl_slot]; + clear_bit(In_sync, &rdev->flags); + set_bit(Replacement, &rdev->flags); + rdev->raid_disk = repl_slot; + err = 0; + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + conf->fullsync = 1; + rcu_assign_pointer(p->replacement, rdev); + } + print_conf(conf); return err; } From 4d8a5754a694062f349b8bf66856561e3840c7e5 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Sat, 27 May 2023 18:18:51 +0800 Subject: [PATCH 217/266] md/raid10: clean up md_add_new_disk() Commit 1a855a060665 ("md: fix bug with re-adding of partially recovered device.") only add device which is set to In_sync. But it let devices without metadata cannot be added when they should be. Commit bf572541ab44 ("md: fix regression with re-adding devices to arrays with no metadata") fix the above issue, it set device without metadata to In_sync when add new disk. However, after commit f466722ca614 ("md: Change handling of save_raid_disk and metadata update during recovery.") deletes changes of the first patch, setting In_sync for devcie without metadata is meanless because the flag will be cleared soon and will not be used during this period. Clean it up. Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230527101851.3266500-2-linan666@huaweicloud.com --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a6ede3d1c99e..0aec2954e161 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6746,7 +6746,6 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) if (info->state & (1<raid_disk < mddev->raid_disks) { rdev->raid_disk = info->raid_disk; - set_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); } else rdev->raid_disk = -1; From 8d355a46c1e0cea59be3ea8395409a5e6eeed946 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 2 Jun 2023 17:18:38 +0800 Subject: [PATCH 218/266] md/raid10: Do not add spare disk when recovery fails In raid10_sync_request(), if data cannot be read from any disk for recovery, it will go to 'giveup' and let 'chunks_skipped' + 1. After multiple 'giveup', when 'chunks_skipped >= geo.raid_disks', it will return 'max_sector', indicating that the recovery has been completed. However, the recovery is just aborted and the data remains inconsistent. Fix it by setting mirror->recovery_disabled, which will prevent the spare disk from being added to this mirror. The same issue also exists during resync, it will be fixed afterwards. Signed-off-by: Li Nan Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230602091839.743798-2-linan666@huaweicloud.com --- drivers/md/raid10.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ab0e2485b2b7..1b953e788ce1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3311,6 +3311,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, int chunks_skipped = 0; sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; + int error_disk = -1; /* * Allow skipping a full rebuild for incremental assembly @@ -3394,8 +3395,21 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return reshape_request(mddev, sector_nr, skipped); if (chunks_skipped >= conf->geo.raid_disks) { - /* if there has been nothing to do on any drive, - * then there is nothing to do at all.. + pr_err("md/raid10:%s: %s fails\n", mdname(mddev), + test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? "resync" : "recovery"); + if (error_disk >= 0 && + !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* + * recovery fails, set mirrors.recovery_disabled, + * device shouldn't be added to there. + */ + conf->mirrors[error_disk].recovery_disabled = + mddev->recovery_disabled; + return 0; + } + /* + * if there has been nothing to do on any drive, + * then there is nothing to do at all. */ *skipped = 1; return (max_sector - sector_nr) + sectors_skipped; @@ -3646,6 +3660,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mdname(mddev)); mirror->recovery_disabled = mddev->recovery_disabled; + } else { + error_disk = i; } put_buf(r10_bio); if (rb2) From 2ae6aaf76912bae53c74b191569d2ab484f24bf3 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Fri, 2 Jun 2023 17:18:39 +0800 Subject: [PATCH 219/266] md/raid10: fix io loss while replacement replace rdev When removing a disk with replacement, the replacement will be used to replace rdev. During this process, there is a brief window in which both rdev and replacement are read as NULL in raid10_write_request(). This will result in io not being submitted but it should be. //remove //write raid10_remove_disk raid10_write_request mirror->rdev = NULL read rdev -> NULL mirror->rdev = mirror->replacement mirror->replacement = NULL read replacement -> NULL Fix it by reading replacement first and rdev later, meanwhile, use smp_mb() to prevent memory reordering. Fixes: 475b0321a4df ("md/raid10: writes should get directed to replacement as well as original.") Signed-off-by: Li Nan Reviewed-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230602091839.743798-3-linan666@huaweicloud.com --- drivers/md/raid10.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1b953e788ce1..d738f89800c8 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -779,8 +779,16 @@ static struct md_rdev *read_balance(struct r10conf *conf, disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || - r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + r10_bio->devs[slot].addr + sectors > + rdev->recovery_offset) { + /* + * Read replacement first to prevent reading both rdev + * and replacement as NULL during replacement replace + * rdev. + */ + smp_mb(); rdev = rcu_dereference(conf->mirrors[disk].rdev); + } if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; @@ -1482,9 +1490,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[d].replacement); + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(conf->mirrors[d].replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev == rrdev) rrdev = NULL; if (rdev && (test_bit(Faulty, &rdev->flags))) From 010444623e7f4da6b4a4dd603a7da7469981e293 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:11:00 +0800 Subject: [PATCH 220/266] md/raid10: prevent soft lockup while flush writes Currently, there is no limit for raid1/raid10 plugged bio. While flushing writes, raid1 has cond_resched() while raid10 doesn't, and too many writes can cause soft lockup. Follow up soft lockup can be triggered easily with writeback test for raid10 with ramdisks: watchdog: BUG: soft lockup - CPU#10 stuck for 27s! [md0_raid10:1293] Call Trace: call_rcu+0x16/0x20 put_object+0x41/0x80 __delete_object+0x50/0x90 delete_object_full+0x2b/0x40 kmemleak_free+0x46/0xa0 slab_free_freelist_hook.constprop.0+0xed/0x1a0 kmem_cache_free+0xfd/0x300 mempool_free_slab+0x1f/0x30 mempool_free+0x3a/0x100 bio_free+0x59/0x80 bio_put+0xcf/0x2c0 free_r10bio+0xbf/0xf0 raid_end_bio_io+0x78/0xb0 one_write_done+0x8a/0xa0 raid10_end_write_request+0x1b4/0x430 bio_endio+0x175/0x320 brd_submit_bio+0x3b9/0x9b7 [brd] __submit_bio+0x69/0xe0 submit_bio_noacct_nocheck+0x1e6/0x5a0 submit_bio_noacct+0x38c/0x7e0 flush_pending_writes+0xf0/0x240 raid10d+0xac/0x1ed0 Fix the problem by adding cond_resched() to raid10 like what raid1 did. Note that unlimited plugged bio still need to be optimized, for example, in the case of lots of dirty pages writeback, this will take lots of memory and io will spend a long time in plug, hence io latency is bad. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529131106.2123367-2-yukuai1@huaweicloud.com --- drivers/md/raid10.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d738f89800c8..09017a43cea5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -929,6 +929,7 @@ static void flush_pending_writes(struct r10conf *conf) else submit_bio_noacct(bio); bio = next; + cond_resched(); } blk_finish_plug(&plug); } else @@ -1153,6 +1154,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) else submit_bio_noacct(bio); bio = next; + cond_resched(); } kfree(plug); } From 5ec6ca140a034682e421e2e808ef5ddfdfd65242 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:11:01 +0800 Subject: [PATCH 221/266] md/raid1-10: factor out a helper to add bio to plug The code in raid1 and raid10 is identical, prepare to limit the number of plugged bios. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529131106.2123367-3-yukuai1@huaweicloud.com --- drivers/md/raid1-10.c | 16 ++++++++++++++++ drivers/md/raid1.c | 12 +----------- drivers/md/raid10.c | 11 +---------- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index cd349e69ed77..76cff924b70b 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -110,3 +110,19 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, size -= len; } while (idx++ < RESYNC_PAGES && size > 0); } + +static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, + blk_plug_cb_fn unplug) +{ + struct raid1_plug_cb *plug = NULL; + struct blk_plug_cb *cb = blk_check_plugged(unplug, mddev, + sizeof(*plug)); + + if (!cb) + return false; + + plug = container_of(cb, struct raid1_plug_cb, cb); + bio_list_add(&plug->pending, bio); + + return true; +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 220f6ce761a5..92180f3a9f28 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1346,8 +1346,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct bitmap *bitmap = mddev->bitmap; unsigned long flags; struct md_rdev *blocked_rdev; - struct blk_plug_cb *cb; - struct raid1_plug_cb *plug = NULL; int first_clone; int max_sectors; bool write_behind = false; @@ -1576,15 +1574,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; - - cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid1_plug_cb, cb); - else - plug = NULL; - if (plug) { - bio_list_add(&plug->pending, mbio); - } else { + if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 09017a43cea5..b8223da15ca9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1295,8 +1295,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; unsigned long flags; - struct blk_plug_cb *cb; - struct raid1_plug_cb *plug = NULL; struct r10conf *conf = mddev->private; struct md_rdev *rdev; int devnum = r10_bio->devs[n_copy].devnum; @@ -1336,14 +1334,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, atomic_inc(&r10_bio->remaining); - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid1_plug_cb, cb); - else - plug = NULL; - if (plug) { - bio_list_add(&plug->pending, mbio); - } else { + if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); From 8295efbe68c080047e98d9c0eb5cb933b238a8cb Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:11:02 +0800 Subject: [PATCH 222/266] md/raid1-10: factor out a helper to submit normal write There are multiple places to do the same thing, factor out a helper to prevent redundant code, and the helper will be used in following patch as well. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529131106.2123367-4-yukuai1@huaweicloud.com --- drivers/md/raid1-10.c | 17 +++++++++++++++++ drivers/md/raid1.c | 13 ++----------- drivers/md/raid10.c | 26 ++++---------------------- 3 files changed, 23 insertions(+), 33 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 76cff924b70b..21b0ff3ca4f0 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -111,6 +111,23 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp, } while (idx++ < RESYNC_PAGES && size > 0); } + +static inline void raid1_submit_write(struct bio *bio) +{ + struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev; + + bio->bi_next = NULL; + bio_set_dev(bio, rdev->bdev); + if (test_bit(Faulty, &rdev->flags)) + bio_io_error(bio); + else if (unlikely(bio_op(bio) == REQ_OP_DISCARD && + !bdev_max_discard_sectors(bio->bi_bdev))) + /* Just ignore it */ + bio_endio(bio); + else + submit_bio_noacct(bio); +} + static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, blk_plug_cb_fn unplug) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 92180f3a9f28..c6b2abb7d185 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -799,17 +799,8 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void *)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; cond_resched(); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b8223da15ca9..21cdb85baba4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -917,17 +917,8 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; cond_resched(); } @@ -1142,17 +1133,8 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_bdev; - bio->bi_next = NULL; - bio_set_dev(bio, rdev->bdev); - if (test_bit(Faulty, &rdev->flags)) { - bio_io_error(bio); - } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !bdev_max_discard_sectors(bio->bi_bdev))) - /* Just ignore it */ - bio_endio(bio); - else - submit_bio_noacct(bio); + + raid1_submit_write(bio); bio = next; cond_resched(); } From 7db922bae3abdf0a1db81ef7228cc0b996a0c1e3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:11:03 +0800 Subject: [PATCH 223/266] md/raid1-10: submit write io directly if bitmap is not enabled Commit 6cce3b23f6f8 ("[PATCH] md: write intent bitmap support for raid10") add bitmap support, and it changed that write io is submitted through daemon thread because bitmap need to be updated before write io. And later, plug is used to fix performance regression because all the write io will go to demon thread, which means io can't be issued concurrently. However, if bitmap is not enabled, the write io should not go to daemon thread in the first place, and plug is not needed as well. Fixes: 6cce3b23f6f8 ("[PATCH] md: write intent bitmap support for raid10") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529131106.2123367-5-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 4 +--- drivers/md/md-bitmap.h | 7 +++++++ drivers/md/raid1-10.c | 13 +++++++++++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index ad5a3456cd8a..3ee590cf12a7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1016,7 +1016,6 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) return set; } - /* this gets called when the md device is ready to unplug its underlying * (slave) device queues -- before we let any writes go down, we need to * sync the dirty pages of the bitmap file to disk */ @@ -1026,8 +1025,7 @@ void md_bitmap_unplug(struct bitmap *bitmap) int dirty, need_write; int writing = 0; - if (!bitmap || !bitmap->storage.filemap || - test_bit(BITMAP_STALE, &bitmap->flags)) + if (!md_bitmap_enabled(bitmap)) return; /* look at each page to see if there are any set bits that need to be diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index cfd7395de8fd..3a4750952b3a 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -273,6 +273,13 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *lo, sector_t *hi, bool clear_bits); void md_bitmap_free(struct bitmap *bitmap); void md_bitmap_wait_behind_writes(struct mddev *mddev); + +static inline bool md_bitmap_enabled(struct bitmap *bitmap) +{ + return bitmap && bitmap->storage.filemap && + !test_bit(BITMAP_STALE, &bitmap->flags); +} + #endif #endif diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 21b0ff3ca4f0..52469e460dab 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -132,9 +132,18 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, blk_plug_cb_fn unplug) { struct raid1_plug_cb *plug = NULL; - struct blk_plug_cb *cb = blk_check_plugged(unplug, mddev, - sizeof(*plug)); + struct blk_plug_cb *cb; + /* + * If bitmap is not enabled, it's safe to submit the io directly, and + * this can get optimal performance. + */ + if (!md_bitmap_enabled(mddev->bitmap)) { + raid1_submit_write(bio); + return true; + } + + cb = blk_check_plugged(unplug, mddev, sizeof(*plug)); if (!cb) return false; From a022325ab970cf04b66ca128a87345714aa44b99 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:11:04 +0800 Subject: [PATCH 224/266] md/md-bitmap: add a new helper to unplug bitmap asynchrously If bitmap is enabled, bitmap must update before submitting write io, this is why unplug callback must move these io to 'conf->pending_io_list' if 'current->bio_list' is not empty, which will suffer performance degradation. A new helper md_bitmap_unplug_async() is introduced to submit bitmap io in a kworker, so that submit bitmap io in raid10_unplug() doesn't require that 'current->bio_list' is empty. This patch prepare to limit the number of plugged bio. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529131106.2123367-6-yukuai1@huaweicloud.com --- drivers/md/md-bitmap.c | 29 +++++++++++++++++++++++++++++ drivers/md/md-bitmap.h | 1 + drivers/md/md.c | 9 +++++++++ drivers/md/md.h | 1 + 4 files changed, 40 insertions(+) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 3ee590cf12a7..1ff712889a3b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1054,6 +1054,35 @@ void md_bitmap_unplug(struct bitmap *bitmap) } EXPORT_SYMBOL(md_bitmap_unplug); +struct bitmap_unplug_work { + struct work_struct work; + struct bitmap *bitmap; + struct completion *done; +}; + +static void md_bitmap_unplug_fn(struct work_struct *work) +{ + struct bitmap_unplug_work *unplug_work = + container_of(work, struct bitmap_unplug_work, work); + + md_bitmap_unplug(unplug_work->bitmap); + complete(unplug_work->done); +} + +void md_bitmap_unplug_async(struct bitmap *bitmap) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct bitmap_unplug_work unplug_work; + + INIT_WORK_ONSTACK(&unplug_work.work, md_bitmap_unplug_fn); + unplug_work.bitmap = bitmap; + unplug_work.done = &done; + + queue_work(md_bitmap_wq, &unplug_work.work); + wait_for_completion(&done); +} +EXPORT_SYMBOL(md_bitmap_unplug_async); + static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); /* * bitmap_init_from_disk -- called at bitmap_create time to initialize * the in-memory bitmap from the on-disk bitmap -- also, sets up the diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 3a4750952b3a..8a3788c9bfef 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -264,6 +264,7 @@ void md_bitmap_sync_with_cluster(struct mddev *mddev, sector_t new_lo, sector_t new_hi); void md_bitmap_unplug(struct bitmap *bitmap); +void md_bitmap_unplug_async(struct bitmap *bitmap); void md_bitmap_daemon_work(struct mddev *mddev); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, diff --git a/drivers/md/md.c b/drivers/md/md.c index 0aec2954e161..cf3733c90c47 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -83,6 +83,7 @@ static struct module *md_cluster_mod; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); static struct workqueue_struct *md_wq; static struct workqueue_struct *md_misc_wq; +struct workqueue_struct *md_bitmap_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); @@ -9637,6 +9638,11 @@ static int __init md_init(void) if (!md_misc_wq) goto err_misc_wq; + md_bitmap_wq = alloc_workqueue("md_bitmap", WQ_MEM_RECLAIM | WQ_UNBOUND, + 0); + if (!md_bitmap_wq) + goto err_bitmap_wq; + ret = __register_blkdev(MD_MAJOR, "md", md_probe); if (ret < 0) goto err_md; @@ -9655,6 +9661,8 @@ static int __init md_init(void) err_mdp: unregister_blkdev(MD_MAJOR, "md"); err_md: + destroy_workqueue(md_bitmap_wq); +err_bitmap_wq: destroy_workqueue(md_misc_wq); err_misc_wq: destroy_workqueue(md_wq); @@ -9951,6 +9959,7 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); + destroy_workqueue(md_bitmap_wq); destroy_workqueue(md_wq); } diff --git a/drivers/md/md.h b/drivers/md/md.h index a50122165fa1..bfd2306bc750 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -852,6 +852,7 @@ struct mdu_array_info_s; struct mdu_disk_info_s; extern int mdp_major; +extern struct workqueue_struct *md_bitmap_wq; void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); From 9efcc2c3df7612eea02daa159ae7c6ac44420513 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:11:05 +0800 Subject: [PATCH 225/266] md/raid1-10: don't handle pluged bio by daemon thread current->bio_list will be set under submit_bio() context, in this case bitmap io will be added to the list and wait for current io submission to finish, while current io submission must wait for bitmap io to be done. commit 874807a83139 ("md/raid1{,0}: fix deadlock in bitmap_unplug.") fix the deadlock by handling plugged bio by daemon thread. On the one hand, the deadlock won't exist after commit a214b949d8e3 ("blk-mq: only flush requests from the plug in blk_mq_submit_bio"). On the other hand, current solution makes it impossible to flush plugged bio in raid1/10_make_request(), because this will cause that all the writes will goto daemon thread. In order to limit the number of plugged bio, commit 874807a83139 ("md/raid1{,0}: fix deadlock in bitmap_unplug.") is reverted, and the deadlock is fixed by handling bitmap io asynchronously. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529131106.2123367-7-yukuai1@huaweicloud.com --- drivers/md/raid1-10.c | 14 ++++++++++++++ drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 8 +++----- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 52469e460dab..c62bcb17c67b 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -152,3 +152,17 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, return true; } + +/* + * current->bio_list will be set under submit_bio() context, in this case bitmap + * io will be added to the list and wait for current io submission to finish, + * while current io submission must wait for bitmap io to be done. In order to + * avoid such deadlock, submit bitmap io asynchronously. + */ +static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) +{ + if (current->bio_list) + md_bitmap_unplug_async(bitmap); + else + md_bitmap_unplug(bitmap); +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c6b2abb7d185..334ed76d3c5a 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -794,7 +794,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect static void flush_bio_list(struct r1conf *conf, struct bio *bio) { /* flush any pending bitmap writes to disk before proceeding w/ I/O */ - md_bitmap_unplug(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1169,7 +1169,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r1conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 21cdb85baba4..f890f549086b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -910,9 +910,7 @@ static void flush_pending_writes(struct r10conf *conf) __set_current_state(TASK_RUNNING); blk_start_plug(&plug); - /* flush any pending bitmap writes to disk - * before proceeding w/ I/O */ - md_bitmap_unplug(conf->mddev->bitmap); + raid1_prepare_flush_writes(conf->mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ @@ -1116,7 +1114,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) struct r10conf *conf = mddev->private; struct bio *bio; - if (from_schedule || current->bio_list) { + if (from_schedule) { spin_lock_irq(&conf->device_lock); bio_list_merge(&conf->pending_bio_list, &plug->pending); spin_unlock_irq(&conf->device_lock); @@ -1128,7 +1126,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* we aren't scheduling, so we can do the write-out directly. */ bio = bio_list_get(&plug->pending); - md_bitmap_unplug(mddev->bitmap); + raid1_prepare_flush_writes(mddev->bitmap); wake_up(&conf->wait_barrier); while (bio) { /* submit pending writes */ From 460af1f9d9e62acce4a21f9bd00b5bcd5963bcd4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 29 May 2023 21:11:06 +0800 Subject: [PATCH 226/266] md/raid1-10: limit the number of plugged bio bio can be added to plug infinitely, and following writeback test can trigger huge amount of plugged bio: Test script: modprobe brd rd_nr=4 rd_size=10485760 mdadm -CR /dev/md0 -l10 -n4 /dev/ram[0123] --assume-clean --bitmap=internal echo 0 > /proc/sys/vm/dirty_background_ratio fio -filename=/dev/md0 -ioengine=libaio -rw=write -bs=4k -numjobs=1 -iodepth=128 -name=test Test result: Monitor /sys/block/md0/inflight will found that inflight keep increasing until fio finish writing, after running for about 2 minutes: [root@fedora ~]# cat /sys/block/md0/inflight 0 4474191 Fix the problem by limiting the number of plugged bio based on the number of copies for original bio. Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230529131106.2123367-8-yukuai1@huaweicloud.com --- drivers/md/raid1-10.c | 9 ++++++++- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index c62bcb17c67b..169ebe296f2d 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -21,6 +21,7 @@ #define IO_MADE_GOOD ((struct bio *)2) #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) +#define MAX_PLUG_BIO 32 /* for managing resync I/O pages */ struct resync_pages { @@ -31,6 +32,7 @@ struct resync_pages { struct raid1_plug_cb { struct blk_plug_cb cb; struct bio_list pending; + unsigned int count; }; static void rbio_pool_free(void *rbio, void *data) @@ -129,7 +131,7 @@ static inline void raid1_submit_write(struct bio *bio) } static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, - blk_plug_cb_fn unplug) + blk_plug_cb_fn unplug, int copies) { struct raid1_plug_cb *plug = NULL; struct blk_plug_cb *cb; @@ -149,6 +151,11 @@ static inline bool raid1_add_bio_to_plug(struct mddev *mddev, struct bio *bio, plug = container_of(cb, struct raid1_plug_cb, cb); bio_list_add(&plug->pending, bio); + if (++plug->count / MAX_PLUG_BIO >= copies) { + list_del(&cb->list); + cb->callback(cb, false); + } + return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 334ed76d3c5a..dd25832eb045 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1565,7 +1565,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; - if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug)) { + if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f890f549086b..d0de8c9fb3cf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1314,7 +1314,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, atomic_inc(&r10_bio->remaining); - if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug)) { + if (!raid1_add_bio_to_plug(mddev, mbio, raid10_unplug, conf->copies)) { spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); spin_unlock_irqrestore(&conf->device_lock, flags); From d44c404207831dfe3b301ff479e964b77914488b Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Jun 2023 22:54:39 +0100 Subject: [PATCH 227/266] block: Fix dio_cleanup() to advance the head index Fix dio_bio_cleanup() to advance the head index into the list of pages past the pages it has released, as __blockdev_direct_IO() will call it twice if do_direct_IO() fails. The issue was causing: WARNING: CPU: 6 PID: 2220 at mm/gup.c:76 try_get_folio This can be triggered by setting up a clean pair of UDF filesystems on loopback devices and running the generic/451 xfstest with them as the scratch and test partitions. Something like the following: fallocate /mnt2/udf_scratch -l 1G fallocate /mnt2/udf_test -l 1G mknod /dev/lo0 b 7 0 mknod /dev/lo1 b 7 1 losetup lo0 /mnt2/udf_scratch losetup lo1 /mnt2/udf_test mkfs -t udf /dev/lo0 mkfs -t udf /dev/lo1 cd xfstests ./check generic/451 with xfstests configured by putting the following into local.config: export FSTYP=udf export DISABLE_UDF_TEST=1 export TEST_DEV=/dev/lo1 export TEST_DIR=/xfstest.test export SCRATCH_DEV=/dev/lo0 export SCRATCH_MNT=/xfstest.scratch Fixes: 1ccf164ec866 ("block: Use iov_iter_extract_pages() and page pinning in direct-io.c") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202306120931.a9606b88-oliver.sang@intel.com Signed-off-by: David Howells cc: Christoph Hellwig cc: David Hildenbrand cc: Andrew Morton cc: Jens Axboe cc: Al Viro cc: Matthew Wilcox cc: Jan Kara cc: Jeff Layton cc: Jason Gunthorpe cc: Logan Gunthorpe cc: Hillf Danton cc: Christian Brauner cc: Linus Torvalds cc: linux-fsdevel@vger.kernel.org cc: linux-block@vger.kernel.org cc: linux-kernel@vger.kernel.org cc: linux-mm@kvack.org Reviewed-by: David Hildenbrand Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/1193485.1686693279@warthog.procyon.org.uk Signed-off-by: Jens Axboe --- fs/direct-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/direct-io.c b/fs/direct-io.c index 0643f1bb4b59..2ceb378b93c0 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -459,6 +459,7 @@ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) if (dio->is_pinned) unpin_user_pages(dio->pages + sdio->head, sdio->tail - sdio->head); + sdio->head = sdio->tail; } /* From 30654614f3d27230200b1650f6025a2ce67900b4 Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Wed, 14 Jun 2023 08:25:29 +0800 Subject: [PATCH 228/266] blk-mq: check on cpu id when there is only one ctx mapping commit f168420c62e7 ("blk-mq: don't redirect completion for hctx withs only one ctx mapping") When nvme applies a 1:1 mapping of hctx and ctx, there will be no remote request. But for ufs, the submission and completion queues could be asymmetric. (e.g. Multiple SQs share one CQ) Therefore, 1:1 mapping of hctx and ctx won't complete request on the submission cpu. In this situation, this nr_ctx check could violate the QUEUE_FLAG_SAME_FORCE, as a result, check on cpu id when there is only one ctx mapping. Signed-off-by: Ed Tsai Signed-off-by: Po-Wen Kao Suggested-by: Keith Busch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230614002529.6636-1-ed.tsai@mediatek.com [axboe: fixed up indentation] Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1749f5890606..24dc8fe0a9d2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1185,8 +1185,9 @@ bool blk_mq_complete_request_remote(struct request *rq) * or a polled request, always complete locally, * it's pointless to redirect the completion. */ - if (rq->mq_hctx->nr_ctx == 1 || - rq->cmd_flags & REQ_POLLED) + if ((rq->mq_hctx->nr_ctx == 1 && + rq->mq_ctx->cpu == raw_smp_processor_id()) || + rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { From 6dd4423f3f247b6f0ecb828cf62ea2bc4604f0b5 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Wed, 14 Jun 2023 15:35:38 +0200 Subject: [PATCH 229/266] brd: use cond_resched instead of cond_resched_rcu The body of the loop is run without RCU lock held. Use the regular cond_resched() instead of cond_resched_rcu(). Fixes: 786bb0245881 ("brd: use XArray instead of radix-tree to index backing pages") Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Pankaj Raghav Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20230614133538.1279369-1-p.raghav@samsung.com Signed-off-by: Jens Axboe --- drivers/block/brd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 2f71376afc71..970bd6ff38c4 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -111,7 +111,7 @@ static void brd_free_pages(struct brd_device *brd) xa_for_each(&brd->brd_pages, idx, page) { __free_page(page); - cond_resched_rcu(); + cond_resched(); } xa_destroy(&brd->brd_pages); From cbe7cff4a76bc749dd70264ca5cf924e2adf9296 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 10 Jun 2023 10:20:01 +0800 Subject: [PATCH 230/266] blktrace: use inline function for blk_trace_remove() while blktrace is disabled If config is disabled, call blk_trace_remove() directly will trigger build warning, hence use inline function instead, prepare to fix blktrace debugfs entries leakage. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230610022003.2557284-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index cfbda114348c..122c62e561fc 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -85,10 +85,14 @@ extern int blk_trace_remove(struct request_queue *q); # define blk_add_driver_data(rq, data, len) do {} while (0) # define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) # define blk_trace_startstop(q, start) (-ENOTTY) -# define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) # define blk_add_cgroup_trace_msg(q, cg, fmt, ...) do { } while (0) # define blk_trace_note_message_enabled(q) (false) + +static inline int blk_trace_remove(struct request_queue *q) +{ + return -ENOTTY; +} #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_COMPAT From db59133e927916d8a25ee1fd8264f2808040909d Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 10 Jun 2023 10:20:02 +0800 Subject: [PATCH 231/266] scsi: sg: fix blktrace debugfs entries leakage sg_ioctl() support to enable blktrace, which will create debugfs entries "/sys/kernel/debug/block/sgx/", however, there is no guarantee that user will remove these entries through ioctl, and deleting sg device doesn't cleanup these blktrace entries. This problem can be fixed by cleanup blktrace while releasing request_queue, however, it's not a good idea to do this special handling in common layer just for sg device. Fix this problem by shutdown bltkrace in sg_device_destroy(), where the device is deleted and all the users close the device, also grab a scsi_device reference from sg_add_device() to prevent scsi_device to be freed before sg_device_destroy(); Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20230610022003.2557284-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 138e28bb76b7..2433eeef042a 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1497,6 +1497,10 @@ sg_add_device(struct device *cl_dev) int error; unsigned long iflags; + error = scsi_device_get(scsidp); + if (error) + return error; + error = -ENOMEM; cdev = cdev_alloc(); if (!cdev) { @@ -1554,6 +1558,7 @@ sg_add_device(struct device *cl_dev) out: if (cdev) cdev_del(cdev); + scsi_device_put(scsidp); return error; } @@ -1561,6 +1566,7 @@ static void sg_device_destroy(struct kref *kref) { struct sg_device *sdp = container_of(kref, struct sg_device, d_ref); + struct request_queue *q = sdp->device->request_queue; unsigned long flags; /* CAUTION! Note that the device can still be found via idr_find() @@ -1568,6 +1574,9 @@ sg_device_destroy(struct kref *kref) * any other cleanup. */ + blk_trace_remove(q); + scsi_device_put(sdp->device); + write_lock_irqsave(&sg_index_lock, flags); idr_remove(&sg_index_idr, sdp->index); write_unlock_irqrestore(&sg_index_lock, flags); From dd7de3704af9989b780693d51eaea49a665bd9c2 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sat, 10 Jun 2023 10:20:03 +0800 Subject: [PATCH 232/266] block: fix blktrace debugfs entries leakage Commit 99d055b4fd4b ("block: remove per-disk debugfs files in blk_unregister_queue") moves blk_trace_shutdown() from blk_release_queue() to blk_unregister_queue(), this is safe if blktrace is created through sysfs, however, there is a regression in corner case. blktrace can still be enabled after del_gendisk() through ioctl if the disk is opened before del_gendisk(), and if blktrace is not shutdown through ioctl before closing the disk, debugfs entries will be leaked. Fix this problem by shutdown blktrace in disk_release(), this is safe because blk_trace_remove() is reentrant. Fixes: 99d055b4fd4b ("block: remove per-disk debugfs files in blk_unregister_queue") Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230610022003.2557284-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 2c2f9a716822..f71f82991434 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,8 +25,9 @@ #include #include #include -#include "blk-throttle.h" +#include +#include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" @@ -1147,6 +1148,8 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); + blk_trace_remove(disk->queue); + /* * To undo the all initialization from blk_mq_init_allocated_queue in * case of a probe failure where add_disk is never called we have to From a301b2deb66cd93bae0f676702356273ebf8abb6 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Thu, 15 Jun 2023 20:12:18 +0800 Subject: [PATCH 233/266] bcache: Convert to use sysfs_emit()/sysfs_emit_at() APIs Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20230615121223.22502-2-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index c6f677059214..0e2c1880f60b 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -1111,26 +1111,25 @@ SHOW(__bch_cache) vfree(p); - ret = scnprintf(buf, PAGE_SIZE, - "Unused: %zu%%\n" - "Clean: %zu%%\n" - "Dirty: %zu%%\n" - "Metadata: %zu%%\n" - "Average: %llu\n" - "Sectors per Q: %zu\n" - "Quantiles: [", - unused * 100 / (size_t) ca->sb.nbuckets, - available * 100 / (size_t) ca->sb.nbuckets, - dirty * 100 / (size_t) ca->sb.nbuckets, - meta * 100 / (size_t) ca->sb.nbuckets, sum, - n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + ret = sysfs_emit(buf, + "Unused: %zu%%\n" + "Clean: %zu%%\n" + "Dirty: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + available * 100 / (size_t) ca->sb.nbuckets, + dirty * 100 / (size_t) ca->sb.nbuckets, + meta * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); for (i = 0; i < ARRAY_SIZE(q); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, - "%u ", q[i]); + ret += sysfs_emit_at(buf, ret, "%u ", q[i]); ret--; - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); + ret += sysfs_emit_at(buf, ret, "]\n"); return ret; } From b98dd0b0a596fdeaca68396ce8f782883ed253a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 15 Jun 2023 20:12:19 +0800 Subject: [PATCH 234/266] bcache: make kobj_type structures constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit ee6d3dd4ed48 ("driver core: make kobj_type constant.") the driver core allows the usage of const struct kobj_type. Take advantage of this to constify the structure definitions to prevent modification at runtime. Signed-off-by: Thomas Weißschuh Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20230615121223.22502-3-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 10 +++++----- drivers/md/bcache/sysfs.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 700dc5588d5f..5a79bb3c272f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -1004,11 +1004,11 @@ extern struct workqueue_struct *bch_flush_wq; extern struct mutex bch_register_lock; extern struct list_head bch_cache_sets; -extern struct kobj_type bch_cached_dev_ktype; -extern struct kobj_type bch_flash_dev_ktype; -extern struct kobj_type bch_cache_set_ktype; -extern struct kobj_type bch_cache_set_internal_ktype; -extern struct kobj_type bch_cache_ktype; +extern const struct kobj_type bch_cached_dev_ktype; +extern const struct kobj_type bch_flash_dev_ktype; +extern const struct kobj_type bch_cache_set_ktype; +extern const struct kobj_type bch_cache_set_internal_ktype; +extern const struct kobj_type bch_cache_ktype; void bch_cached_dev_release(struct kobject *kobj); void bch_flash_dev_release(struct kobject *kobj); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index a2ff6447b699..65b8bd975ab1 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -3,7 +3,7 @@ #define _BCACHE_SYSFS_H_ #define KTYPE(type) \ -struct kobj_type type ## _ktype = { \ +const struct kobj_type type ## _ktype = { \ .release = type ## _release, \ .sysfs_ops = &((const struct sysfs_ops) { \ .show = type ## _show, \ From ccb8c3bd6d93e7986b702d1f66d5d56d08abc59f Mon Sep 17 00:00:00 2001 From: Andrea Tomassetti Date: Thu, 15 Jun 2023 20:12:20 +0800 Subject: [PATCH 235/266] bcache: Remove dead references to cache_readaheads The cache_readaheads stat counter is not used anymore and should be removed. Signed-off-by: Andrea Tomassetti Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20230615121223.22502-4-colyli@suse.de Signed-off-by: Jens Axboe --- Documentation/admin-guide/bcache.rst | 3 --- drivers/md/bcache/stats.h | 1 - 2 files changed, 4 deletions(-) diff --git a/Documentation/admin-guide/bcache.rst b/Documentation/admin-guide/bcache.rst index bb5032a99234..6fdb495ac466 100644 --- a/Documentation/admin-guide/bcache.rst +++ b/Documentation/admin-guide/bcache.rst @@ -508,9 +508,6 @@ cache_miss_collisions cache miss, but raced with a write and data was already present (usually 0 since the synchronization for cache misses was rewritten) -cache_readaheads - Count of times readahead occurred. - Sysfs - cache set ~~~~~~~~~~~~~~~~~ diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index bd3afc856d53..21b445f8af15 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -18,7 +18,6 @@ struct cache_stats { unsigned long cache_misses; unsigned long cache_bypass_hits; unsigned long cache_bypass_misses; - unsigned long cache_readaheads; unsigned long cache_miss_collisions; unsigned long sectors_bypassed; From 028ddcac477b691dd9205c92f991cc15259d033e Mon Sep 17 00:00:00 2001 From: Zheng Wang Date: Thu, 15 Jun 2023 20:12:21 +0800 Subject: [PATCH 236/266] bcache: Remove unnecessary NULL point check in node allocations Due to the previous fix of __bch_btree_node_alloc, the return value will never be a NULL pointer. So IS_ERR is enough to handle the failure situation. Fix it by replacing IS_ERR_OR_NULL check by an IS_ERR check. Fixes: cafe56359144 ("bcache: A block layer cache") Cc: stable@vger.kernel.org Signed-off-by: Zheng Wang Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20230615121223.22502-5-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 10 +++++----- drivers/md/bcache/super.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 147c493a989a..7c21e54468bf 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1138,7 +1138,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, { struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bkey_copy_key(&n->key, &b->key); @@ -1340,7 +1340,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); - while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) + while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; @@ -1352,7 +1352,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = 0; i < nodes; i++) { new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); - if (IS_ERR_OR_NULL(new_nodes[i])) + if (IS_ERR(new_nodes[i])) goto out_nocoalesce; } @@ -1487,7 +1487,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, bch_keylist_free(&keylist); for (i = 0; i < nodes; i++) - if (!IS_ERR_OR_NULL(new_nodes[i])) { + if (!IS_ERR(new_nodes[i])) { btree_node_free(new_nodes[i]); rw_unlock(true, new_nodes[i]); } @@ -1669,7 +1669,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (should_rewrite) { n = btree_node_alloc_replacement(b, NULL); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { bch_btree_node_write_sync(n); bch_btree_set_root(n); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1f829e74db0a..e2a803683105 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1723,7 +1723,7 @@ static void cache_set_flush(struct closure *cl) if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); - if (!IS_ERR_OR_NULL(c->root)) + if (!IS_ERR(c->root)) list_add(&c->root->list, &c->btree_cache); /* @@ -2087,7 +2087,7 @@ static int run_cache_set(struct cache_set *c) err = "cannot allocate new btree root"; c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; mutex_lock(&c->root->write_lock); From 80fca8a10b604afad6c14213fdfd816c4eda3ee4 Mon Sep 17 00:00:00 2001 From: Zheng Wang Date: Thu, 15 Jun 2023 20:12:22 +0800 Subject: [PATCH 237/266] bcache: Fix __bch_btree_node_alloc to make the failure behavior consistent In some specific situations, the return value of __bch_btree_node_alloc may be NULL. This may lead to a potential NULL pointer dereference in caller function like a calling chain : btree_split->bch_btree_node_alloc->__bch_btree_node_alloc. Fix it by initializing the return value in __bch_btree_node_alloc. Fixes: cafe56359144 ("bcache: A block layer cache") Cc: stable@vger.kernel.org Signed-off-by: Zheng Wang Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20230615121223.22502-6-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7c21e54468bf..0ddf91204782 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1090,10 +1090,12 @@ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, struct btree *parent) { BKEY_PADDED(key) k; - struct btree *b = ERR_PTR(-EAGAIN); + struct btree *b; mutex_lock(&c->bucket_lock); retry: + /* return ERR_PTR(-EAGAIN) when it fails */ + b = ERR_PTR(-EAGAIN); if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; From f0854489fc07d2456f7cc71a63f4faf9c716ffbe Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Thu, 15 Jun 2023 20:12:23 +0800 Subject: [PATCH 238/266] bcache: fixup btree_cache_wait list damage We get a kernel crash about "list_add corruption. next->prev should be prev (ffff9c801bc01210), but was ffff9c77b688237c. (next=ffffae586d8afe68)." crash> struct list_head 0xffff9c801bc01210 struct list_head { next = 0xffffae586d8afe68, prev = 0xffffae586d8afe68 } crash> struct list_head 0xffff9c77b688237c struct list_head { next = 0x0, prev = 0x0 } crash> struct list_head 0xffffae586d8afe68 struct list_head struct: invalid kernel virtual address: ffffae586d8afe68 type: "gdb_readmem_callback" Cannot access memory at address 0xffffae586d8afe68 [230469.019492] Call Trace: [230469.032041] prepare_to_wait+0x8a/0xb0 [230469.044363] ? bch_btree_keys_free+0x6c/0xc0 [escache] [230469.056533] mca_cannibalize_lock+0x72/0x90 [escache] [230469.068788] mca_alloc+0x2ae/0x450 [escache] [230469.080790] bch_btree_node_get+0x136/0x2d0 [escache] [230469.092681] bch_btree_check_thread+0x1e1/0x260 [escache] [230469.104382] ? finish_wait+0x80/0x80 [230469.115884] ? bch_btree_check_recurse+0x1a0/0x1a0 [escache] [230469.127259] kthread+0x112/0x130 [230469.138448] ? kthread_flush_work_fn+0x10/0x10 [230469.149477] ret_from_fork+0x35/0x40 bch_btree_check_thread() and bch_dirty_init_thread() may call mca_cannibalize() to cannibalize other cached btree nodes. Only one thread can do it at a time, so the op of other threads will be added to the btree_cache_wait list. We must call finish_wait() to remove op from btree_cache_wait before free it's memory address. Otherwise, the list will be damaged. Also should call bch_cannibalize_unlock() to release the btree_cache_alloc_lock and wake_up other waiters. Fixes: 8e7102273f59 ("bcache: make bch_btree_check() to be multithreaded") Fixes: b144e45fc576 ("bcache: make bch_sectors_dirty_init() to be multithreaded") Cc: stable@vger.kernel.org Signed-off-by: Mingzhe Zou Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20230615121223.22502-7-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/btree.c | 11 ++++++++++- drivers/md/bcache/btree.h | 1 + drivers/md/bcache/writeback.c | 10 ++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 0ddf91204782..68b9d7ca864e 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -885,7 +885,7 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ -static void bch_cannibalize_unlock(struct cache_set *c) +void bch_cannibalize_unlock(struct cache_set *c) { spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { @@ -1970,6 +1970,15 @@ static int bch_btree_check_thread(void *arg) c->gc_stats.nodes++; bch_btree_op_init(&op, 0); ret = bcache_btree(check_recurse, p, c->root, &op); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op)->wait); if (ret) goto out; } diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 1b5fdbc0d83e..a2920bbfcad5 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -282,6 +282,7 @@ void bch_initial_gc_finish(struct cache_set *c); void bch_moving_gc(struct cache_set *c); int bch_btree_check(struct cache_set *c); void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); +void bch_cannibalize_unlock(struct cache_set *c); static inline void wake_up_gc(struct cache_set *c) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d4a5fc0650bb..24c049067f61 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -890,6 +890,16 @@ static int bch_root_node_dirty_init(struct cache_set *c, if (ret < 0) pr_warn("sectors dirty init failed, ret=%d!\n", ret); + /* + * The op may be added to cache_set's btree_cache_wait + * in mca_cannibalize(), must ensure it is removed from + * the list and release btree_cache_alloc_lock before + * free op memory. + * Otherwise, the btree_cache_wait will be damaged. + */ + bch_cannibalize_unlock(c); + finish_wait(&c->btree_cache_wait, &(&op.op)->wait); + return ret; } From 1c606f7f056b266a84482d5522d35bbbbed062db Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 15 Jun 2023 08:17:57 -0700 Subject: [PATCH 239/266] nvme: forward port sysfs delete fix We had a late fix that modified nvme_sysfs_delete() after the staging branch for the next merge window relocated the function to a new file. Port commit 2eb94dd56a4a4 ("nvme: do not let the user delete a ctrl before a complete") to the latest to avoid a potentially confusing merge conflict. Cc: Maurizio Lombardi Cc: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 796e1d373b7c..45e91811f905 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -202,6 +202,9 @@ static ssize_t nvme_sysfs_delete(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + if (!test_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags)) + return -EBUSY; + if (device_remove_file_self(dev, attr)) nvme_delete_ctrl_sync(ctrl); return count; From 0b24be4691c9e6ea13ca70050d42a9f9032fa788 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jun 2023 16:03:38 +0200 Subject: [PATCH 240/266] splice: don't call file_accessed in copy_splice_read copy_splice_read calls into ->read_iter to read the data, which already calls file_accessed. Fixes: 33b3b041543e ("splice: Add a func to do a splice from an O_DIRECT file without ITER_PIPE") Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Reviewed-by: David Howells Link: https://lore.kernel.org/r/20230614140341.521331-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/splice.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index 2420ead610a7..87c69fdb333d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -368,7 +368,6 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, if (ret > 0) { keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; - file_accessed(in); } else if (ret < 0) { /* * callers of ->splice_read() expect -EAGAIN on From 2e82f6c3bfd1acde2610dd9feb4f2b264c4ef742 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jun 2023 16:03:39 +0200 Subject: [PATCH 241/266] splice: simplify a conditional in copy_splice_read Check for -EFAULT instead of wrapping the check in an ret < 0 block. Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Christian Brauner Reviewed-by: David Howells Link: https://lore.kernel.org/r/20230614140341.521331-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/splice.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 87c69fdb333d..7a9565d8ec4f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -368,15 +368,15 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, if (ret > 0) { keep = DIV_ROUND_UP(ret, PAGE_SIZE); *ppos = kiocb.ki_pos; - } else if (ret < 0) { - /* - * callers of ->splice_read() expect -EAGAIN on - * "can't put anything in there", rather than -EFAULT. - */ - if (ret == -EFAULT) - ret = -EAGAIN; } + /* + * Callers of ->splice_read() expect -EAGAIN on "can't put anything in + * there", rather than -EFAULT. + */ + if (ret == -EFAULT) + ret = -EAGAIN; + /* Free any pages that didn't get touched at all. */ if (keep < npages) release_pages(pages + keep, npages - keep); From e4cc64657becbd073c3ecc9d5938a1fe0d59913f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jun 2023 16:03:40 +0200 Subject: [PATCH 242/266] block: remove BIO_PAGE_REFFED Now that all block direct I/O helpers use page pinning, this flag is unused. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: Johannes Thumshirn Reviewed-by: David Howells Link: https://lore.kernel.org/r/20230614140341.521331-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 2 -- include/linux/bio.h | 3 +-- include/linux/blk_types.h | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/block/blk.h b/block/blk.h index 768852a84fef..608c5dcc516b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -445,8 +445,6 @@ static inline void bio_release_page(struct bio *bio, struct page *page) { if (bio_flagged(bio, BIO_PAGE_PINNED)) unpin_user_page(page); - else if (bio_flagged(bio, BIO_PAGE_REFFED)) - put_page(page); } struct request_queue *blk_alloc_queue(int node_id); diff --git a/include/linux/bio.h b/include/linux/bio.h index 617522928964..c4f5b5228105 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -492,8 +492,7 @@ void zero_fill_bio(struct bio *bio); static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { - if (bio_flagged(bio, BIO_PAGE_REFFED) || - bio_flagged(bio, BIO_PAGE_PINNED)) + if (bio_flagged(bio, BIO_PAGE_PINNED)) __bio_release_pages(bio, mark_dirty); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index deb69eeab6bd..752a54e3284b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -326,7 +326,6 @@ struct bio { */ enum { BIO_PAGE_PINNED, /* Unpin pages in bio_release_pages() */ - BIO_PAGE_REFFED, /* put pages in bio_release_pages() */ BIO_CLONED, /* doesn't own data */ BIO_BOUNCED, /* bio is a bounce bio */ BIO_QUIET, /* Make BIO Quiet */ From 84bd06c632c6d5279849f5f8ab47d9517d259422 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jun 2023 16:03:41 +0200 Subject: [PATCH 243/266] iov_iter: remove iov_iter_get_pages and iov_iter_get_pages_alloc Now that the direct I/O helpers have switched to use iov_iter_extract_pages, these helpers are unused. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Reviewed-by: David Howells Link: https://lore.kernel.org/r/20230614140341.521331-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/uio.h | 6 ------ lib/iov_iter.c | 35 +++++++---------------------------- 2 files changed, 7 insertions(+), 34 deletions(-) diff --git a/include/linux/uio.h b/include/linux/uio.h index 60c342bb7ab8..8e7d2c425340 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -277,14 +277,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); -ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, - size_t maxsize, unsigned maxpages, size_t *start, - iov_iter_extraction_t extraction_flags); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, - struct page ***pages, size_t maxsize, size_t *start, - iov_iter_extraction_t extraction_flags); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f18138e0292a..b667b1e2f688 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1082,8 +1082,7 @@ static struct page *first_bvec_segment(const struct iov_iter *i, static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, - unsigned int maxpages, size_t *start, - iov_iter_extraction_t extraction_flags) + unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; @@ -1093,8 +1092,6 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; - if (extraction_flags & ITER_ALLOW_P2PDMA) - gup_flags |= FOLL_PCI_P2PDMA; if (likely(user_backed_iter(i))) { unsigned long addr; @@ -1144,49 +1141,31 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, return -EFAULT; } -ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start, iov_iter_extraction_t extraction_flags) +ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, + size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); - return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, - start, extraction_flags); -} -EXPORT_SYMBOL_GPL(iov_iter_get_pages); - -ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, - size_t maxsize, unsigned maxpages, size_t *start) -{ - return iov_iter_get_pages(i, pages, maxsize, maxpages, start, 0); + return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start, iov_iter_extraction_t extraction_flags) +ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, + struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; - len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start, - extraction_flags); + len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } -EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc); - -ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, - struct page ***pages, size_t maxsize, size_t *start) -{ - return iov_iter_get_pages_alloc(i, pages, maxsize, start, 0); -} EXPORT_SYMBOL(iov_iter_get_pages_alloc2); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, From 245165658e1c9f95c0fecfe02b9b1ebd30a1198a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 16 Jun 2023 21:23:54 +0800 Subject: [PATCH 244/266] blk-mq: fix NULL dereference on q->elevator in blk_mq_elv_switch_none After grabbing q->sysfs_lock, q->elevator may become NULL because of elevator switch. Fix the NULL dereference on q->elevator by checking it with lock. Reported-by: Guangwu Zhang Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230616132354.415109-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 24dc8fe0a9d2..16c524e37123 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4604,9 +4604,6 @@ static bool blk_mq_elv_switch_none(struct list_head *head, { struct blk_mq_qe_pair *qe; - if (!q->elevator) - return true; - qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!qe) return false; @@ -4614,6 +4611,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head, /* q->elevator needs protection from ->sysfs_lock */ mutex_lock(&q->sysfs_lock); + /* the check has to be done with holding sysfs_lock */ + if (!q->elevator) { + kfree(qe); + goto unlock; + } + INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; @@ -4621,6 +4624,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); +unlock: mutex_unlock(&q->sysfs_lock); return true; From 9a7933f3aca9d3b77235953996126f0e87c1d496 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 06:30:51 +0200 Subject: [PATCH 245/266] swim: fix a missing FMODE_ -> BLK_OPEN_ conversion in floppy_open Fix a missing conversion to the new BLK_OPEN constant in swim. Fixes: 05bdb9965305 ("block: replace fmode_t with a block-specific type for block open flags") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620043051.707196-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/swim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 651009b3a601..f85b6af414b4 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -640,7 +640,7 @@ static int floppy_open(struct gendisk *disk, blk_mode_t mode) if (mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) { if (disk_check_media_change(disk) && fs->disk_in) fs->ejected = 0; - if ((mode & FMODE_WRITE) && fs->write_protected) { + if ((mode & BLK_OPEN_WRITE) && fs->write_protected) { err = -EROFS; goto out; } From b90ecc0379eb7bbe79337b0c7289390a98752646 Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Wed, 7 Jun 2023 13:08:37 -0400 Subject: [PATCH 246/266] block: increment diskseq on all media change events Currently, associating a loop device with a different file descriptor does not increment its diskseq. This allows the following race condition: 1. Program X opens a loop device 2. Program X gets the diskseq of the loop device. 3. Program X associates a file with the loop device. 4. Program X passes the loop device major, minor, and diskseq to something. 5. Program X exits. 6. Program Y detaches the file from the loop device. 7. Program Y attaches a different file to the loop device. 8. The opener finally gets around to opening the loop device and checks that the diskseq is what it expects it to be. Even though the diskseq is the expected value, the result is that the opener is accessing the wrong file. From discussions with Christoph Hellwig, it appears that disk_force_media_change() was supposed to call inc_diskseq(), but in fact it does not. Adding a Fixes: tag to indicate this. Christoph's Reported-by is because he stated that disk_force_media_change() calls inc_diskseq(), which is what led me to discover that it should but does not. Reported-by: Christoph Hellwig Signed-off-by: Demi Marie Obenour Fixes: e6138dc12de9 ("block: add a helper to raise a media changed event") Cc: stable@vger.kernel.org # 5.15+ Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230607170837.1559-1-demi@invisiblethingslab.com Signed-off-by: Jens Axboe --- block/disk-events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/disk-events.c b/block/disk-events.c index 8b1b63225738..0cfac464e6d1 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -307,6 +307,7 @@ bool disk_force_media_change(struct gendisk *disk, unsigned int events) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return false; + inc_diskseq(disk); if (__invalidate_device(disk->part0, true)) pr_warn("VFS: busy inodes on changed media %s\n", disk->disk_name); From e89e001f24bf7bc558d9ebccb97fd559443021da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 06:35:36 +0200 Subject: [PATCH 247/266] block: document the holder argument to blkdev_get_by_path Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620043536.707249-1-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bdev.c b/block/bdev.c index bd558a9ba3cd..9bb54d9d02a6 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -850,6 +850,7 @@ EXPORT_SYMBOL(blkdev_get_by_dev); * @path: path to the block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier + * @hops: holder operations * * Open the block device described by the device file at @path. If @holder is * not %NULL, the block device is opened with exclusive access. Exclusive opens From 985958b8584cc143555f1bd735e7ab5066c944a7 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Sun, 18 Jun 2023 22:04:02 +0800 Subject: [PATCH 248/266] block: fix wrong mode for blkdev_get_by_dev() from disk_scan_partitions() After commit 2736e8eeb0cc ("block: use the holder as indication for exclusive opens"), blkdev_get_by_dev() will warn if holder is NULL and mode contains 'FMODE_EXCL'. holder from blkdev_get_by_dev() from disk_scan_partitions() is always NULL, hence it should not use 'FMODE_EXCL', which is broben by the commit. For consequence, WARN_ON_ONCE() will be triggered from blkdev_get_by_dev() if user scan partitions with device opened exclusively. Fix this problem by removing 'FMODE_EXCL' from disk_scan_partitions(), as it used to be. Reported-by: syzbot+00cd27751f78817f167b@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=00cd27751f78817f167b Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") Signed-off-by: Yu Kuai Reviewed-by: Christian Brauner Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230618140402.7556-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index f71f82991434..a94952ae9e39 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -366,7 +366,8 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) } set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL, NULL); + bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXEC, NULL, + NULL); if (IS_ERR(bdev)) ret = PTR_ERR(bdev); else From c576c4bf9ecfa3fb9f7b11681cc2f60aba5276c4 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 20 Jun 2023 19:13:22 +0800 Subject: [PATCH 249/266] reiserfs: fix blkdev_put() warning from release_journal_dev() In journal_init_dev(), if super bdev is used as 'j_dev_bd', then blkdev_get_by_dev() is called with NULL holder, otherwise, holder will be journal. However, later in release_journal_dev(), blkdev_put() is called with journal unconditionally, cause following warning: WARNING: CPU: 1 PID: 5034 at block/bdev.c:617 bd_end_claim block/bdev.c:617 [inline] WARNING: CPU: 1 PID: 5034 at block/bdev.c:617 blkdev_put+0x562/0x8a0 block/bdev.c:901 RIP: 0010:blkdev_put+0x562/0x8a0 block/bdev.c:901 Call Trace: release_journal_dev fs/reiserfs/journal.c:2592 [inline] free_journal_ram+0x421/0x5c0 fs/reiserfs/journal.c:1896 do_journal_release fs/reiserfs/journal.c:1960 [inline] journal_release+0x276/0x630 fs/reiserfs/journal.c:1971 reiserfs_put_super+0xe4/0x5c0 fs/reiserfs/super.c:616 generic_shutdown_super+0x158/0x480 fs/super.c:499 kill_block_super+0x64/0xb0 fs/super.c:1422 deactivate_locked_super+0x98/0x160 fs/super.c:330 deactivate_super+0xb1/0xd0 fs/super.c:361 cleanup_mnt+0x2ae/0x3d0 fs/namespace.c:1247 task_work_run+0x16f/0x270 kernel/task_work.c:179 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xadc/0x2a30 kernel/exit.c:874 do_group_exit+0xd4/0x2a0 kernel/exit.c:1024 __do_sys_exit_group kernel/exit.c:1035 [inline] __se_sys_exit_group kernel/exit.c:1033 [inline] __x64_sys_exit_group+0x3e/0x50 kernel/exit.c:1033 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fix this problem by passing in NULL holder in this case. Reported-by: syzbot+04625c80899f4555de39@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=04625c80899f4555de39 Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") Signed-off-by: Yu Kuai Reviewed-by: Christian Brauner Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620111322.1014775-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- fs/reiserfs/journal.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 62beee3c62b6..479aa4a57602 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2589,7 +2589,12 @@ static void release_journal_dev(struct super_block *super, struct reiserfs_journal *journal) { if (journal->j_dev_bd != NULL) { - blkdev_put(journal->j_dev_bd, journal); + void *holder = NULL; + + if (journal->j_dev_bd->bd_dev != super->s_dev) + holder = journal; + + blkdev_put(journal->j_dev_bd, holder); journal->j_dev_bd = NULL; } } From 12629621669b239445727256d1a5dab616b30deb Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 13 Jun 2023 16:40:07 +0800 Subject: [PATCH 250/266] block: disallow Persistent Reservation on partitions Refuse Persistent Reservation operations on partitions as reservation on partitions doesn't make sense. Besides, introduce blkdev_pr_allowed() helper, where more policies could be placed here later. Signed-off-by: Jingbo Xu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230613084008.93795-2-jefflexu@linux.alibaba.com Signed-off-by: Jens Axboe --- block/ioctl.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 61bb94fd4281..c75299006bdd 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -254,13 +254,25 @@ int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif +static bool blkdev_pr_allowed(struct block_device *bdev) +{ + /* no sense to make reservations for partitions */ + if (bdev_is_partition(bdev)) + return false; + + if (capable(CAP_SYS_ADMIN)) + return true; + + return false; +} + static int blkdev_pr_register(struct block_device *bdev, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; @@ -278,7 +290,7 @@ static int blkdev_pr_reserve(struct block_device *bdev, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; @@ -296,7 +308,7 @@ static int blkdev_pr_release(struct block_device *bdev, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; @@ -314,7 +326,7 @@ static int blkdev_pr_preempt(struct block_device *bdev, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; @@ -332,7 +344,7 @@ static int blkdev_pr_clear(struct block_device *bdev, const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; From 9a72a02456a839676fe8f220a44ef00951596047 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 13 Jun 2023 16:40:08 +0800 Subject: [PATCH 251/266] block: fine-granular CAP_SYS_ADMIN for Persistent Reservation Allow of unprivileged Persistent Reservation operations on devices if the write permission check on the device node has passed. brw-rw---- 1 root disk 259, 0 Jun 13 07:09 /dev/nvme0n1 In the example above, the "disk" group of nvme0n1 is also allowed to make reservations on the device even without CAP_SYS_ADMIN. Signed-off-by: Jingbo Xu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230613084008.93795-3-jefflexu@linux.alibaba.com Signed-off-by: Jens Axboe --- block/ioctl.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index c75299006bdd..3be11941fb2d 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -254,7 +254,7 @@ int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif -static bool blkdev_pr_allowed(struct block_device *bdev) +static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) { /* no sense to make reservations for partitions */ if (bdev_is_partition(bdev)) @@ -262,17 +262,20 @@ static bool blkdev_pr_allowed(struct block_device *bdev) if (capable(CAP_SYS_ADMIN)) return true; - - return false; + /* + * Only allow unprivileged reservations if the file descriptor is open + * for writing. + */ + return mode & BLK_OPEN_WRITE; } -static int blkdev_pr_register(struct block_device *bdev, +static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; - if (!blkdev_pr_allowed(bdev)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; @@ -284,13 +287,13 @@ static int blkdev_pr_register(struct block_device *bdev, return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); } -static int blkdev_pr_reserve(struct block_device *bdev, +static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!blkdev_pr_allowed(bdev)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; @@ -302,13 +305,13 @@ static int blkdev_pr_reserve(struct block_device *bdev, return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); } -static int blkdev_pr_release(struct block_device *bdev, +static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!blkdev_pr_allowed(bdev)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; @@ -320,13 +323,13 @@ static int blkdev_pr_release(struct block_device *bdev, return ops->pr_release(bdev, rsv.key, rsv.type); } -static int blkdev_pr_preempt(struct block_device *bdev, +static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, struct pr_preempt __user *arg, bool abort) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; - if (!blkdev_pr_allowed(bdev)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; @@ -338,13 +341,13 @@ static int blkdev_pr_preempt(struct block_device *bdev, return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); } -static int blkdev_pr_clear(struct block_device *bdev, +static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, struct pr_clear __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; - if (!blkdev_pr_allowed(bdev)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; @@ -546,17 +549,17 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: - return blkdev_pr_register(bdev, argp); + return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: - return blkdev_pr_reserve(bdev, argp); + return blkdev_pr_reserve(bdev, mode, argp); case IOC_PR_RELEASE: - return blkdev_pr_release(bdev, argp); + return blkdev_pr_release(bdev, mode, argp); case IOC_PR_PREEMPT: - return blkdev_pr_preempt(bdev, argp, false); + return blkdev_pr_preempt(bdev, mode, argp, false); case IOC_PR_PREEMPT_ABORT: - return blkdev_pr_preempt(bdev, argp, true); + return blkdev_pr_preempt(bdev, mode, argp, true); case IOC_PR_CLEAR: - return blkdev_pr_clear(bdev, argp); + return blkdev_pr_clear(bdev, mode, argp); default: return -ENOIOCTLCMD; } From 6d4e80db4ebe76c4a4b6ffb6547cb168275204ef Mon Sep 17 00:00:00 2001 From: Min Li Date: Mon, 19 Jun 2023 09:12:14 +0000 Subject: [PATCH 252/266] block: add capacity validation in bdev_add_partition() In the function bdev_add_partition(),there is no check that the start and end sectors exceed the size of the disk before calling add_partition. When we call the block's ioctl interface directly to add a partition, and the capacity of the disk is set to 0 by driver,the command will continue to execute. Signed-off-by: Min Li Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20230619091214.31615-1-min15.li@samsung.com Signed-off-by: Jens Axboe --- block/partitions/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/partitions/core.c b/block/partitions/core.c index 87a21942d606..13a7341299a9 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -441,10 +441,21 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { + sector_t capacity = get_capacity(disk), end; struct block_device *part; int ret; mutex_lock(&disk->open_mutex); + if (check_add_overflow(start, length, &end)) { + ret = -EINVAL; + goto out; + } + + if (start >= capacity || end > capacity) { + ret = -EINVAL; + goto out; + } + if (!disk_live(disk)) { ret = -ENXIO; goto out; From fc3d092c6bb48d5865fec15ed5b333c12f36288c Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Wed, 21 Jun 2023 08:17:23 +1200 Subject: [PATCH 253/266] block: fix signed int overflow in Amiga partition support The Amiga partition parser module uses signed int for partition sector address and count, which will overflow for disks larger than 1 TB. Use sector_t as type for sector address and size to allow using disks up to 2 TB without LBD support, and disks larger than 2 TB with LBD. This bug was reported originally in 2012, and the fix was created by the RDB author, Joanne Dow . A patch had been discussed and reviewed on linux-m68k at that time but never officially submitted. This patch differs from Joanne's patch only in its use of sector_t instead of unsigned int. No checking for overflows is done (see patch 3 of this series for that). Reported-by: Martin Steigerwald Closes: https://bugzilla.kernel.org/show_bug.cgi?id=43511 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Message-ID: <201206192146.09327.Martin@lichtvoll.de> Cc: # 5.2 Signed-off-by: Michael Schmitz Tested-by: Martin Steigerwald Reviewed-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620201725.7020-2-schmitzmic@gmail.com Signed-off-by: Jens Axboe --- block/partitions/amiga.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 5c8624e26a54..85c5c79aae48 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -31,7 +31,8 @@ int amiga_partition(struct parsed_partitions *state) unsigned char *data; struct RigidDiskBlock *rdb; struct PartitionBlock *pb; - int start_sect, nr_sects, blk, part, res = 0; + sector_t start_sect, nr_sects; + int blk, part, res = 0; int blksize = 1; /* Multiplier for disk block size */ int slot = 1; @@ -96,14 +97,14 @@ int amiga_partition(struct parsed_partitions *state) /* Tell Kernel about it */ - nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - - be32_to_cpu(pb->pb_Environment[9])) * + nr_sects = ((sector_t)be32_to_cpu(pb->pb_Environment[10]) + 1 - + be32_to_cpu(pb->pb_Environment[9])) * be32_to_cpu(pb->pb_Environment[3]) * be32_to_cpu(pb->pb_Environment[5]) * blksize; if (!nr_sects) continue; - start_sect = be32_to_cpu(pb->pb_Environment[9]) * + start_sect = (sector_t)be32_to_cpu(pb->pb_Environment[9]) * be32_to_cpu(pb->pb_Environment[3]) * be32_to_cpu(pb->pb_Environment[5]) * blksize; From 95a55437dc49fb3342c82e61f5472a71c63d9ed0 Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Wed, 21 Jun 2023 08:17:24 +1200 Subject: [PATCH 254/266] block: change all __u32 annotations to __be32 in affs_hardblocks.h The Amiga partition parser module uses signed int for partition sector address and count, which will overflow for disks larger than 1 TB. Use u64 as type for sector address and size to allow using disks up to 2 TB without LBD support, and disks larger than 2 TB with LBD. The RBD format allows to specify disk sizes up to 2^128 bytes (though native OS limitations reduce this somewhat, to max 2^68 bytes), so check for u64 overflow carefully to protect against overflowing sector_t. This bug was reported originally in 2012, and the fix was created by the RDB author, Joanne Dow . A patch had been discussed and reviewed on linux-m68k at that time but never officially submitted (now resubmitted as patch 1 of this series). Patch 3 (this series) adds additional error checking and warning messages. One of the error checks now makes use of the previously unused rdb_CylBlocks field, which causes a 'sparse' warning (cast to restricted __be32). Annotate all 32 bit fields in affs_hardblocks.h as __be32, as the on-disk format of RDB and partition blocks is always big endian. Reported-by: Martin Steigerwald Closes: https://bugzilla.kernel.org/show_bug.cgi?id=43511 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Message-ID: <201206192146.09327.Martin@lichtvoll.de> Cc: # 5.2 Signed-off-by: Michael Schmitz Reviewed-by: Christoph Hellwig Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230620201725.7020-3-schmitzmic@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/affs_hardblocks.h | 68 ++++++++++++++-------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/include/uapi/linux/affs_hardblocks.h b/include/uapi/linux/affs_hardblocks.h index 5e2fb8481252..a5aff2eb5f70 100644 --- a/include/uapi/linux/affs_hardblocks.h +++ b/include/uapi/linux/affs_hardblocks.h @@ -7,42 +7,42 @@ /* Just the needed definitions for the RDB of an Amiga HD. */ struct RigidDiskBlock { - __u32 rdb_ID; + __be32 rdb_ID; __be32 rdb_SummedLongs; - __s32 rdb_ChkSum; - __u32 rdb_HostID; + __be32 rdb_ChkSum; + __be32 rdb_HostID; __be32 rdb_BlockBytes; - __u32 rdb_Flags; - __u32 rdb_BadBlockList; + __be32 rdb_Flags; + __be32 rdb_BadBlockList; __be32 rdb_PartitionList; - __u32 rdb_FileSysHeaderList; - __u32 rdb_DriveInit; - __u32 rdb_Reserved1[6]; - __u32 rdb_Cylinders; - __u32 rdb_Sectors; - __u32 rdb_Heads; - __u32 rdb_Interleave; - __u32 rdb_Park; - __u32 rdb_Reserved2[3]; - __u32 rdb_WritePreComp; - __u32 rdb_ReducedWrite; - __u32 rdb_StepRate; - __u32 rdb_Reserved3[5]; - __u32 rdb_RDBBlocksLo; - __u32 rdb_RDBBlocksHi; - __u32 rdb_LoCylinder; - __u32 rdb_HiCylinder; - __u32 rdb_CylBlocks; - __u32 rdb_AutoParkSeconds; - __u32 rdb_HighRDSKBlock; - __u32 rdb_Reserved4; + __be32 rdb_FileSysHeaderList; + __be32 rdb_DriveInit; + __be32 rdb_Reserved1[6]; + __be32 rdb_Cylinders; + __be32 rdb_Sectors; + __be32 rdb_Heads; + __be32 rdb_Interleave; + __be32 rdb_Park; + __be32 rdb_Reserved2[3]; + __be32 rdb_WritePreComp; + __be32 rdb_ReducedWrite; + __be32 rdb_StepRate; + __be32 rdb_Reserved3[5]; + __be32 rdb_RDBBlocksLo; + __be32 rdb_RDBBlocksHi; + __be32 rdb_LoCylinder; + __be32 rdb_HiCylinder; + __be32 rdb_CylBlocks; + __be32 rdb_AutoParkSeconds; + __be32 rdb_HighRDSKBlock; + __be32 rdb_Reserved4; char rdb_DiskVendor[8]; char rdb_DiskProduct[16]; char rdb_DiskRevision[4]; char rdb_ControllerVendor[8]; char rdb_ControllerProduct[16]; char rdb_ControllerRevision[4]; - __u32 rdb_Reserved5[10]; + __be32 rdb_Reserved5[10]; }; #define IDNAME_RIGIDDISK 0x5244534B /* "RDSK" */ @@ -50,16 +50,16 @@ struct RigidDiskBlock { struct PartitionBlock { __be32 pb_ID; __be32 pb_SummedLongs; - __s32 pb_ChkSum; - __u32 pb_HostID; + __be32 pb_ChkSum; + __be32 pb_HostID; __be32 pb_Next; - __u32 pb_Flags; - __u32 pb_Reserved1[2]; - __u32 pb_DevFlags; + __be32 pb_Flags; + __be32 pb_Reserved1[2]; + __be32 pb_DevFlags; __u8 pb_DriveName[32]; - __u32 pb_Reserved2[15]; + __be32 pb_Reserved2[15]; __be32 pb_Environment[17]; - __u32 pb_EReserved[15]; + __be32 pb_EReserved[15]; }; #define IDNAME_PARTITION 0x50415254 /* "PART" */ From b6f3f28f604ba3de4724ad82bea6adb1300c0b5f Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Wed, 21 Jun 2023 08:17:25 +1200 Subject: [PATCH 255/266] block: add overflow checks for Amiga partition support The Amiga partition parser module uses signed int for partition sector address and count, which will overflow for disks larger than 1 TB. Use u64 as type for sector address and size to allow using disks up to 2 TB without LBD support, and disks larger than 2 TB with LBD. The RBD format allows to specify disk sizes up to 2^128 bytes (though native OS limitations reduce this somewhat, to max 2^68 bytes), so check for u64 overflow carefully to protect against overflowing sector_t. Bail out if sector addresses overflow 32 bits on kernels without LBD support. This bug was reported originally in 2012, and the fix was created by the RDB author, Joanne Dow . A patch had been discussed and reviewed on linux-m68k at that time but never officially submitted (now resubmitted as patch 1 in this series). This patch adds additional error checking and warning messages. Reported-by: Martin Steigerwald Closes: https://bugzilla.kernel.org/show_bug.cgi?id=43511 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Message-ID: <201206192146.09327.Martin@lichtvoll.de> Cc: # 5.2 Signed-off-by: Michael Schmitz Reviewed-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620201725.7020-4-schmitzmic@gmail.com Signed-off-by: Jens Axboe --- block/partitions/amiga.c | 103 ++++++++++++++++++++++++++++++++------- 1 file changed, 85 insertions(+), 18 deletions(-) diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 85c5c79aae48..ed222b9c901b 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -11,10 +11,18 @@ #define pr_fmt(fmt) fmt #include +#include +#include #include #include "check.h" +/* magic offsets in partition DosEnvVec */ +#define NR_HD 3 +#define NR_SECT 5 +#define LO_CYL 9 +#define HI_CYL 10 + static __inline__ u32 checksum_block(__be32 *m, int size) { @@ -31,9 +39,12 @@ int amiga_partition(struct parsed_partitions *state) unsigned char *data; struct RigidDiskBlock *rdb; struct PartitionBlock *pb; - sector_t start_sect, nr_sects; - int blk, part, res = 0; - int blksize = 1; /* Multiplier for disk block size */ + u64 start_sect, nr_sects; + sector_t blk, end_sect; + u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */ + u32 nr_hd, nr_sect, lo_cyl, hi_cyl; + int part, res = 0; + unsigned int blksize = 1; /* Multiplier for disk block size */ int slot = 1; for (blk = 0; ; blk++, put_dev_sector(sect)) { @@ -41,7 +52,7 @@ int amiga_partition(struct parsed_partitions *state) goto rdb_done; data = read_part_sector(state, blk, §); if (!data) { - pr_err("Dev %s: unable to read RDB block %d\n", + pr_err("Dev %s: unable to read RDB block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; @@ -58,12 +69,12 @@ int amiga_partition(struct parsed_partitions *state) *(__be32 *)(data+0xdc) = 0; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { - pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n", + pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n", blk); break; } - pr_err("Dev %s: RDB in block %d has bad checksum\n", + pr_err("Dev %s: RDB in block %llu has bad checksum\n", state->disk->disk_name, blk); } @@ -80,10 +91,15 @@ int amiga_partition(struct parsed_partitions *state) blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { - blk *= blksize; /* Read in terms partition table understands */ + /* Read in terms partition table understands */ + if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { + pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", + state->disk->disk_name, blk, part); + break; + } data = read_part_sector(state, blk, §); if (!data) { - pr_err("Dev %s: unable to read partition block %d\n", + pr_err("Dev %s: unable to read partition block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; @@ -95,19 +111,70 @@ int amiga_partition(struct parsed_partitions *state) if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) continue; - /* Tell Kernel about it */ + /* RDB gives us more than enough rope to hang ourselves with, + * many times over (2^128 bytes if all fields max out). + * Some careful checks are in order, so check for potential + * overflows. + * We are multiplying four 32 bit numbers to one sector_t! + */ + + nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]); + nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]); + + /* CylBlocks is total number of blocks per cylinder */ + if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) { + pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n", + state->disk->disk_name, cylblk); + continue; + } + + /* check for consistency with RDB defined CylBlocks */ + if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) { + pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n", + state->disk->disk_name, cylblk, + be32_to_cpu(rdb->rdb_CylBlocks)); + } + + /* RDB allows for variable logical block size - + * normalize to 512 byte blocks and check result. + */ + + if (check_mul_overflow(cylblk, blksize, &cylblk)) { + pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n", + state->disk->disk_name, part); + continue; + } + + /* Calculate partition start and end. Limit of 32 bit on cylblk + * guarantees no overflow occurs if LBD support is enabled. + */ + + lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]); + start_sect = ((u64) lo_cyl * cylblk); + + hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]); + nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk); - nr_sects = ((sector_t)be32_to_cpu(pb->pb_Environment[10]) + 1 - - be32_to_cpu(pb->pb_Environment[9])) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; if (!nr_sects) continue; - start_sect = (sector_t)be32_to_cpu(pb->pb_Environment[9]) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; + + /* Warn user if partition end overflows u32 (AmigaDOS limit) */ + + if ((start_sect + nr_sects) > UINT_MAX) { + pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n", + state->disk->disk_name, part, + start_sect, start_sect + nr_sects); + } + + if (check_add_overflow(start_sect, nr_sects, &end_sect)) { + pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n", + state->disk->disk_name, part, + start_sect, end_sect); + continue; + } + + /* Tell Kernel about it */ + put_partition(state,slot++,start_sect,nr_sects); { /* Be even more informative to aid mounting */ From 56e71bdf324d6ab263eba1fc3fa1f3fd8bb5678e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Jun 2023 14:49:14 +0200 Subject: [PATCH 256/266] block: fix the exclusive open mask in disk_scan_partitions FMODE_EXEC has nothing to do with exclusive opens, and even is of the wrong type. We need to check for BLK_OPEN_EXCL here. Fixes: 985958b8584c ("block: fix wrong mode for blkdev_get_by_dev() from disk_scan_partitions()") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230621124914.185992-1-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index a94952ae9e39..3d287b32d50d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -366,7 +366,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) } set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXEC, NULL, + bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, NULL); if (IS_ERR(bdev)) ret = PTR_ERR(bdev); From 137380c0ec40710cbaf57c7878726c41a6da81cd Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 20 Jun 2023 20:01:30 +0200 Subject: [PATCH 257/266] block/rnbd: make all 'class' structures const Now that the driver core allows for struct class to be in read-only memory, making all 'class' structures to be declared at build time placing them into read-only memory, instead of having to be dynamically allocated at load time. Cc: "Md. Haris Iqbal" Cc: Jack Wang Cc: Jens Axboe Cc: linux-block@vger.kernel.org Suggested-by: Greg Kroah-Hartman Signed-off-by: Ivan Orlov Signed-off-by: Greg Kroah-Hartman Acked-by: Jack Wang Link: https://lore.kernel.org/r/20230620180129.645646-5-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 20 +++++++++++--------- drivers/block/rnbd/rnbd-srv-sysfs.c | 22 ++++++++++++---------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index a0b49a0c0bdd..c36d8b1ceeed 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -24,7 +24,9 @@ #include "rnbd-clt.h" static struct device *rnbd_dev; -static struct class *rnbd_dev_class; +static const struct class rnbd_dev_class = { + .name = "rnbd_client", +}; static struct kobject *rnbd_devs_kobj; enum { @@ -646,11 +648,11 @@ int rnbd_clt_create_sysfs_files(void) { int err; - rnbd_dev_class = class_create("rnbd-client"); - if (IS_ERR(rnbd_dev_class)) - return PTR_ERR(rnbd_dev_class); + err = class_register(&rnbd_dev_class); + if (err) + return err; - rnbd_dev = device_create_with_groups(rnbd_dev_class, NULL, + rnbd_dev = device_create_with_groups(&rnbd_dev_class, NULL, MKDEV(0, 0), NULL, default_attr_groups, "ctl"); if (IS_ERR(rnbd_dev)) { @@ -666,9 +668,9 @@ int rnbd_clt_create_sysfs_files(void) return 0; dev_destroy: - device_destroy(rnbd_dev_class, MKDEV(0, 0)); + device_destroy(&rnbd_dev_class, MKDEV(0, 0)); cls_destroy: - class_destroy(rnbd_dev_class); + class_unregister(&rnbd_dev_class); return err; } @@ -678,6 +680,6 @@ void rnbd_clt_destroy_sysfs_files(void) sysfs_remove_group(&rnbd_dev->kobj, &default_attr_group); kobject_del(rnbd_devs_kobj); kobject_put(rnbd_devs_kobj); - device_destroy(rnbd_dev_class, MKDEV(0, 0)); - class_destroy(rnbd_dev_class); + device_destroy(&rnbd_dev_class, MKDEV(0, 0)); + class_unregister(&rnbd_dev_class); } diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index 39b89f9b6bd9..cba6ba43c2c2 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -19,7 +19,9 @@ #include "rnbd-srv.h" static struct device *rnbd_dev; -static struct class *rnbd_dev_class; +static const struct class rnbd_dev_class = { + .name = "rnbd-server", +}; static struct kobject *rnbd_devs_kobj; static void rnbd_srv_dev_release(struct kobject *kobj) @@ -213,12 +215,12 @@ int rnbd_srv_create_sysfs_files(void) { int err; - rnbd_dev_class = class_create("rnbd-server"); - if (IS_ERR(rnbd_dev_class)) - return PTR_ERR(rnbd_dev_class); + err = class_register(&rnbd_dev_class); + if (err) + return err; - rnbd_dev = device_create(rnbd_dev_class, NULL, - MKDEV(0, 0), NULL, "ctl"); + rnbd_dev = device_create(&rnbd_dev_class, NULL, + MKDEV(0, 0), NULL, "ctl"); if (IS_ERR(rnbd_dev)) { err = PTR_ERR(rnbd_dev); goto cls_destroy; @@ -232,9 +234,9 @@ int rnbd_srv_create_sysfs_files(void) return 0; dev_destroy: - device_destroy(rnbd_dev_class, MKDEV(0, 0)); + device_destroy(&rnbd_dev_class, MKDEV(0, 0)); cls_destroy: - class_destroy(rnbd_dev_class); + class_unregister(&rnbd_dev_class); return err; } @@ -243,6 +245,6 @@ void rnbd_srv_destroy_sysfs_files(void) { kobject_del(rnbd_devs_kobj); kobject_put(rnbd_devs_kobj); - device_destroy(rnbd_dev_class, MKDEV(0, 0)); - class_destroy(rnbd_dev_class); + device_destroy(&rnbd_dev_class, MKDEV(0, 0)); + class_unregister(&rnbd_dev_class); } From 65d7a37d4e3e226bb4a4ddf73a827d0dbc77f530 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 20 Jun 2023 20:01:31 +0200 Subject: [PATCH 258/266] aoe: make aoe_class a static const structure Now that the driver core allows for struct class to be in read-only memory, move the aoe_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Justin Sanders Cc: Jens Axboe Cc: linux-block@vger.kernel.org Suggested-by: Greg Kroah-Hartman Signed-off-by: Ivan Orlov Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230620180129.645646-6-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- drivers/block/aoe/aoechr.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index 4c666f72203f..a42c4bcc85ba 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -49,7 +49,7 @@ static int emsgs_head_idx, emsgs_tail_idx; static struct completion emsgs_comp; static spinlock_t emsgs_lock; static int nblocked_emsgs_readers; -static struct class *aoe_class; + static struct aoe_chardev chardevs[] = { { MINOR_ERR, "err" }, { MINOR_DISCOVER, "discover" }, @@ -58,6 +58,16 @@ static struct aoe_chardev chardevs[] = { { MINOR_FLUSH, "flush" }, }; +static char *aoe_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev)); +} + +static const struct class aoe_class = { + .name = "aoe", + .devnode = aoe_devnode, +}; + static int discover(void) { @@ -273,11 +283,6 @@ static const struct file_operations aoe_fops = { .llseek = noop_llseek, }; -static char *aoe_devnode(const struct device *dev, umode_t *mode) -{ - return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev)); -} - int __init aoechr_init(void) { @@ -290,15 +295,14 @@ aoechr_init(void) } init_completion(&emsgs_comp); spin_lock_init(&emsgs_lock); - aoe_class = class_create("aoe"); - if (IS_ERR(aoe_class)) { + n = class_register(&aoe_class); + if (n) { unregister_chrdev(AOE_MAJOR, "aoechr"); - return PTR_ERR(aoe_class); + return n; } - aoe_class->devnode = aoe_devnode; for (i = 0; i < ARRAY_SIZE(chardevs); ++i) - device_create(aoe_class, NULL, + device_create(&aoe_class, NULL, MKDEV(AOE_MAJOR, chardevs[i].minor), NULL, chardevs[i].name); @@ -311,8 +315,8 @@ aoechr_exit(void) int i; for (i = 0; i < ARRAY_SIZE(chardevs); ++i) - device_destroy(aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor)); - class_destroy(aoe_class); + device_destroy(&aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor)); + class_unregister(&aoe_class); unregister_chrdev(AOE_MAJOR, "aoechr"); } From 2eefd399d28a52739fdbeebe84775275f016171c Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 20 Jun 2023 20:01:32 +0200 Subject: [PATCH 259/266] ublk: make ublk_chr_class a static const structure Now that the driver core allows for struct class to be in read-only memory, move the ublk_chr_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Ming Lei Cc: Jens Axboe Cc: linux-block@vger.kernel.org Suggested-by: Greg Kroah-Hartman Signed-off-by: Ivan Orlov Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230620180129.645646-7-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 9fdc4c7f908d..6287f13d1620 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -189,7 +189,9 @@ static inline void __ublk_complete_rq(struct request *req); static void ublk_complete_rq(struct kref *ref); static dev_t ublk_chr_devt; -static struct class *ublk_chr_class; +static const struct class ublk_chr_class = { + .name = "ublk-char", +}; static DEFINE_IDR(ublk_index_idr); static DEFINE_SPINLOCK(ublk_idr_lock); @@ -1755,7 +1757,7 @@ static int ublk_add_chdev(struct ublk_device *ub) dev->parent = ublk_misc.this_device; dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); - dev->class = ublk_chr_class; + dev->class = &ublk_chr_class; dev->release = ublk_cdev_rel; device_initialize(dev); @@ -2581,11 +2583,10 @@ static int __init ublk_init(void) if (ret) goto unregister_mis; - ublk_chr_class = class_create("ublk-char"); - if (IS_ERR(ublk_chr_class)) { - ret = PTR_ERR(ublk_chr_class); + ret = class_register(&ublk_chr_class); + if (ret) goto free_chrdev_region; - } + return 0; free_chrdev_region: @@ -2603,7 +2604,7 @@ static void __exit ublk_exit(void) idr_for_each_entry(&ublk_index_idr, ub, id) ublk_remove(ub); - class_destroy(ublk_chr_class); + class_unregister(&ublk_chr_class); misc_deregister(&ublk_misc); idr_destroy(&ublk_index_idr); From 72ef02b8dfa009029fa713e8a731a92d27d14e35 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 20 Jun 2023 20:01:33 +0200 Subject: [PATCH 260/266] bsg: make bsg_class a static const structure Now that the driver core allows for struct class to be in read-only memory, move the bsg_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: FUJITA Tomonori Cc: Jens Axboe Cc: linux-scsi@vger.kernel.org Cc: linux-block@vger.kernel.org Suggested-by: Greg Kroah-Hartman Signed-off-by: Ivan Orlov Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20230620180129.645646-8-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- block/bsg.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index bec4027842b3..1a9396a3b7d7 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -39,7 +39,7 @@ static inline struct bsg_device *to_bsg_device(struct inode *inode) #define BSG_MAX_DEVS 32768 static DEFINE_IDA(bsg_minor_ida); -static struct class *bsg_class; +static const struct class bsg_class; static int bsg_major; static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr) @@ -208,7 +208,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, return ERR_PTR(ret); } bd->device.devt = MKDEV(bsg_major, ret); - bd->device.class = bsg_class; + bd->device.class = &bsg_class; bd->device.parent = parent; bd->device.release = bsg_device_release; dev_set_name(&bd->device, "%s", name); @@ -242,15 +242,19 @@ static char *bsg_devnode(const struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); } +static const struct class bsg_class = { + .name = "bsg", + .devnode = bsg_devnode, +}; + static int __init bsg_init(void) { dev_t devid; int ret; - bsg_class = class_create("bsg"); - if (IS_ERR(bsg_class)) - return PTR_ERR(bsg_class); - bsg_class->devnode = bsg_devnode; + ret = class_register(&bsg_class); + if (ret) + return ret; ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); if (ret) @@ -262,7 +266,7 @@ static int __init bsg_init(void) return 0; destroy_bsg_class: - class_destroy(bsg_class); + class_unregister(&bsg_class); return ret; } From 2293cae703cda162684ae966db6b1b4a11b5e88f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 21 Jun 2023 21:22:08 +0800 Subject: [PATCH 261/266] blk-mq: don't insert passthrough request into sw queue In case of real io scheduler, q->elevator is set, so blk_mq_run_hw_queue() may just check if scheduler queue has request to dispatch, see __blk_mq_sched_dispatch_requests(). Then IO hang may be caused because all passthorugh requests may stay in sw queue. And any passthrough request should have been inserted to hctx->dispatch always. Reported-by: Guangwu Zhang Fixes: d97217e7f024 ("blk-mq: don't queue plugged passthrough requests into scheduler") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230621132208.1142318-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 16c524e37123..720b5061ffe8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2728,7 +2728,12 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ - if (this_hctx->queue->elevator && !is_passthrough) { + if (is_passthrough) { + spin_lock(&this_hctx->lock); + list_splice_tail_init(&list, &this_hctx->dispatch); + spin_unlock(&this_hctx->lock); + blk_mq_run_hw_queue(this_hctx, from_sched); + } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); From 017fb83ee0612595ec70c65ddd83472706b02a50 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 21 Jun 2023 09:50:54 -0700 Subject: [PATCH 262/266] block: Improve kernel-doc headers Fix the documentation of the devt_from_partuuid() return value. Fix the following two recently introduced kernel-doc warnings: block/bdev.c:570: warning: Function parameter or member 'hops' not described in 'bd_finish_claiming' block/early-lookup.c:46: warning: Function parameter or member 'devt' not described in 'devt_from_partuuid' Cc: Christoph Hellwig Fixes: 0718afd47f70 ("block: introduce holder ops") Fixes: cf056a431215 ("init: improve the name_to_dev_t interface") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230621165054.743815-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/bdev.c | 1 + block/early-lookup.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index 9bb54d9d02a6..979e28a46b98 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -561,6 +561,7 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev + * @hops: block device holder operations * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. diff --git a/block/early-lookup.c b/block/early-lookup.c index a5be3c68ed07..91002b19d09c 100644 --- a/block/early-lookup.c +++ b/block/early-lookup.c @@ -32,6 +32,7 @@ static int __init match_dev_by_uuid(struct device *dev, const void *data) /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID * @uuid_str: char array containing ascii UUID + * @devt: dev_t result * * The function will return the first partition which contains a matching * UUID value in its partition_meta_info struct. This does not search @@ -40,7 +41,7 @@ static int __init match_dev_by_uuid(struct device *dev, const void *data) * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be * extracted and used as an offset from the partition identified by the UUID. * - * Returns the matching dev_t on success or 0 on failure. + * Returns 0 on success or a negative error code on failure. */ static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt) { From 8270cb10c0681d52fce508f827dfa1688d3acc3a Mon Sep 17 00:00:00 2001 From: Jordy Zomer Date: Sat, 17 Jun 2023 12:38:28 +0100 Subject: [PATCH 263/266] cdrom: Fix spectre-v1 gadget This patch fixes a spectre-v1 gadget in cdrom. The gadget could be triggered by speculatively bypassing the cdi->capacity check. Signed-off-by: Jordy Zomer Link: https://lore.kernel.org/all/20230612110040.849318-2-jordyzomer@google.com Reviewed-by: Phillip Potter Link: https://lore.kernel.org/all/ZI1+1OG9Ut1MqsUC@equinox Signed-off-by: Phillip Potter Link: https://lore.kernel.org/r/20230617113828.1230-2-phil@philpotter.co.uk Signed-off-by: Jens Axboe --- drivers/cdrom/cdrom.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index bd8cd59c758a..cc2839805983 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -264,6 +264,7 @@ #include #include #include +#include #include #include #include @@ -2311,6 +2312,9 @@ static int cdrom_ioctl_media_changed(struct cdrom_device_info *cdi, if (arg >= cdi->capacity) return -EINVAL; + /* Prevent arg from speculatively bypassing the length check */ + barrier_nospec(); + info = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; From 648fa60fa7de3ca6f6303e1721591ad73def9cf0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Jun 2023 17:06:44 +0200 Subject: [PATCH 264/266] block: don't return -EINVAL for not found names in devt_from_devname When we didn't find a device and didn't guess it might be a partition, it might still show up later, so don't disable rootwait for it by returning -EINVAL. Fixes: 079caa35f786 ("init: clear root_wait on all invalid root= strings") Reported-by: Guenter Roeck Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230622150644.600327-1-hch@lst.de Signed-off-by: Jens Axboe --- block/early-lookup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/early-lookup.c b/block/early-lookup.c index 91002b19d09c..3effbd0d35e9 100644 --- a/block/early-lookup.c +++ b/block/early-lookup.c @@ -175,7 +175,7 @@ static int __init devt_from_devname(const char *name, dev_t *devt) while (p > s && isdigit(p[-1])) p--; if (p == s || !*p || *p == '0') - return -EINVAL; + return -ENODEV; /* try disk name without */ part = simple_strtoul(p, NULL, 10); @@ -186,7 +186,7 @@ static int __init devt_from_devname(const char *name, dev_t *devt) /* try disk name without p */ if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') - return -EINVAL; + return -ENODEV; p[-1] = '\0'; *devt = blk_lookup_devt(s, part); if (*devt) From a42fb5a75ccc37dfd69aa9bde5ba2866e802ff3c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2023 18:51:07 +0200 Subject: [PATCH 265/266] ext4: Fix warning in blkdev_put() ext4_blkdev_remove() passes a wrong holder pointer to blkdev_put() which triggers a warning there. Fix it. Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens") Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230622165107.13687-1-jack@suse.cz Signed-off-by: Jens Axboe --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 94a7b56ed876..64342adcd679 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1133,7 +1133,7 @@ static void ext4_blkdev_remove(struct ext4_sb_info *sbi) struct block_device *bdev; bdev = sbi->s_journal_bdev; if (bdev) { - blkdev_put(bdev, sbi->s_es); + blkdev_put(bdev, sbi->s_sb); sbi->s_journal_bdev = NULL; } } From fcaa174a9c995cf0af3967e55644a1543ea07e36 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 22 Jun 2023 00:01:11 +0800 Subject: [PATCH 266/266] scsi/sg: don't grab scsi host module reference In order to prevent request_queue to be freed before cleaning up blktrace debugfs entries, commit db59133e9279 ("scsi: sg: fix blktrace debugfs entries leakage") use scsi_device_get(), however, scsi_device_get() will also grab scsi module reference and scsi module can't be removed. It's reported that blktests can't unload scsi_debug after block/001: blktests (master) # ./check block block/001 (stress device hotplugging) [failed] +++ /root/blktests/results/nodev/block/001.out.bad 2023-06-19 Running block/001 Stressing sd +modprobe: FATAL: Module scsi_debug is in use. Fix this problem by grabbing request_queue reference directly, so that scsi host module can still be unloaded while request_queue will be pinged by sg device. Reported-by: Chaitanya Kulkarni Link: https://lore.kernel.org/all/1760da91-876d-fc9c-ab51-999a6f66ad50@nvidia.com/ Fixes: db59133e9279 ("scsi: sg: fix blktrace debugfs entries leakage") Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230621160111.1433521-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 2433eeef042a..dcb73787c29d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1497,7 +1497,7 @@ sg_add_device(struct device *cl_dev) int error; unsigned long iflags; - error = scsi_device_get(scsidp); + error = blk_get_queue(scsidp->request_queue); if (error) return error; @@ -1558,7 +1558,7 @@ sg_add_device(struct device *cl_dev) out: if (cdev) cdev_del(cdev); - scsi_device_put(scsidp); + blk_put_queue(scsidp->request_queue); return error; } @@ -1575,7 +1575,7 @@ sg_device_destroy(struct kref *kref) */ blk_trace_remove(q); - scsi_device_put(sdp->device); + blk_put_queue(q); write_lock_irqsave(&sg_index_lock, flags); idr_remove(&sg_index_idr, sdp->index);