From a3f143c461444c0b56360bbf468615fa814a8372 Mon Sep 17 00:00:00 2001 From: Manas Date: Mon, 18 Nov 2024 20:06:58 +0530 Subject: [PATCH 01/36] rust: block: simplify Result<()> in validate_block_size return `Result` is used in place of `Result<()>` because the default type parameters are unit `()` and `Error` types, which are automatically inferred. Thus keep the usage consistent throughout codebase. Suggested-by: Miguel Ojeda Link: https://github.com/Rust-for-Linux/linux/issues/1128 Signed-off-by: Manas Reviewed-by: Miguel Ojeda Link: https://lore.kernel.org/r/20241118-simplify-result-v3-1-6b1566a77eab@iiitd.ac.in Signed-off-by: Jens Axboe --- rust/kernel/block/mq/gen_disk.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs index 708125dce96a..798c4ae0bded 100644 --- a/rust/kernel/block/mq/gen_disk.rs +++ b/rust/kernel/block/mq/gen_disk.rs @@ -45,7 +45,7 @@ impl GenDiskBuilder { /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`, /// and that it is a power of two. - fn validate_block_size(size: u32) -> Result<()> { + fn validate_block_size(size: u32) -> Result { if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() { Err(error::code::EINVAL) } else { From 5dd18f09ce7399df6fffe80d1598add46c395ae9 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 5 Nov 2024 06:42:46 -0800 Subject: [PATCH 02/36] nvme/multipath: Fix RCU list traversal to use SRCU primitive The code currently uses list_for_each_entry_rcu() while holding an SRCU lock, triggering false positive warnings with CONFIG_PROVE_RCU=y enabled: drivers/nvme/host/multipath.c:168 RCU-list traversed in non-reader section!! drivers/nvme/host/multipath.c:227 RCU-list traversed in non-reader section!! drivers/nvme/host/multipath.c:260 RCU-list traversed in non-reader section!! While the list is properly protected by SRCU lock, the code uses the wrong list traversal primitive. Replace list_for_each_entry_rcu() with list_for_each_entry_srcu() to correctly indicate SRCU-based protection and eliminate the false warning. Signed-off-by: Breno Leitao Fixes: be647e2c76b2 ("nvme: use srcu for iterating namespace list") Signed-off-by: Keith Busch --- drivers/nvme/host/multipath.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f04cfe3fb936..a85d190942bd 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -165,7 +165,8 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) { if (!ns->head->disk) continue; kblockd_schedule_work(&ns->head->requeue_work); @@ -209,7 +210,8 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) int srcu_idx; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) { nvme_mpath_clear_current_path(ns); kblockd_schedule_work(&ns->head->requeue_work); } @@ -224,7 +226,8 @@ void nvme_mpath_revalidate_paths(struct nvme_ns *ns) int srcu_idx; srcu_idx = srcu_read_lock(&head->srcu); - list_for_each_entry_rcu(ns, &head->list, siblings) { + list_for_each_entry_srcu(ns, &head->list, siblings, + srcu_read_lock_held(&head->srcu)) { if (capacity != get_capacity(ns->disk)) clear_bit(NVME_NS_READY, &ns->flags); } @@ -257,7 +260,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; struct nvme_ns *found = NULL, *fallback = NULL, *ns; - list_for_each_entry_rcu(ns, &head->list, siblings) { + list_for_each_entry_srcu(ns, &head->list, siblings, + srcu_read_lock_held(&head->srcu)) { if (nvme_path_is_disabled(ns)) continue; @@ -356,7 +360,8 @@ static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head) unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX; unsigned int depth; - list_for_each_entry_rcu(ns, &head->list, siblings) { + list_for_each_entry_srcu(ns, &head->list, siblings, + srcu_read_lock_held(&head->srcu)) { if (nvme_path_is_disabled(ns)) continue; @@ -424,7 +429,8 @@ static bool nvme_available_path(struct nvme_ns_head *head) if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) return NULL; - list_for_each_entry_rcu(ns, &head->list, siblings) { + list_for_each_entry_srcu(ns, &head->list, siblings, + srcu_read_lock_held(&head->srcu)) { if (test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ns->ctrl->flags)) continue; switch (nvme_ctrl_state(ns->ctrl)) { @@ -783,7 +789,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, return 0; srcu_idx = srcu_read_lock(&ctrl->srcu); - list_for_each_entry_rcu(ns, &ctrl->namespaces, list) { + list_for_each_entry_srcu(ns, &ctrl->namespaces, list, + srcu_read_lock_held(&ctrl->srcu)) { unsigned nsid; again: nsid = le32_to_cpu(desc->nsids[n]); From 979c6342f9c0a48696a6420f14f9dd409591657f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 15 Nov 2024 13:41:21 -0800 Subject: [PATCH 03/36] nvme-pci: add support for sgl metadata Supporting this mode allows creating and merging multi-segment metadata requests that wouldn't be possible otherwise. It also allows directly using user space requests that straddle physically discontiguous pages. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 7 ++ drivers/nvme/host/pci.c | 144 +++++++++++++++++++++++++++++++++++---- include/linux/nvme.h | 1 + 3 files changed, 137 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 900719c4c70c..5ef284a376cc 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1126,6 +1126,13 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) return ctrl->sgls & ((1 << 0) | (1 << 1)); } +static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl) +{ + if (ctrl->ops->flags & NVME_F_FABRICS) + return true; + return ctrl->sgls & NVME_CTRL_SGLS_MSDS; +} + #ifdef CONFIG_NVME_HOST_AUTH int __init nvme_init_auth(void); void __exit nvme_exit_auth(void); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5f2e3ad2cc52..c6c3ae3a7c43 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -43,6 +43,7 @@ */ #define NVME_MAX_KB_SZ 8192 #define NVME_MAX_SEGS 128 +#define NVME_MAX_META_SEGS 15 #define NVME_MAX_NR_ALLOCATIONS 5 static int use_threaded_interrupts; @@ -144,6 +145,7 @@ struct nvme_dev { struct sg_table *hmb_sgt; mempool_t *iod_mempool; + mempool_t *iod_meta_mempool; /* shadow doorbell buffer support: */ __le32 *dbbuf_dbs; @@ -239,6 +241,8 @@ struct nvme_iod { dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; + struct sg_table meta_sgt; + union nvme_descriptor meta_list; union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; }; @@ -506,6 +510,14 @@ static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx) spin_unlock(&nvmeq->sq_lock); } +static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev, + struct request *req) +{ + if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) + return false; + return req->nr_integrity_segments > 1; +} + static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, int nseg) { @@ -518,6 +530,8 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, return false; if (!nvmeq->qid) return false; + if (nvme_pci_metadata_use_sgls(dev, req)) + return true; if (!sgl_threshold || avg_seg_size < sgl_threshold) return false; return true; @@ -780,7 +794,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { - if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + + if (!nvme_pci_metadata_use_sgls(dev, req) && + (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2) return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); @@ -824,11 +839,69 @@ out_free_sg: return ret; } -static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, - struct nvme_command *cmnd) +static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev, + struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_rw_command *cmnd = &iod->cmd.rw; + struct nvme_sgl_desc *sg_list; + struct scatterlist *sgl, *sg; + unsigned int entries; + dma_addr_t sgl_dma; + int rc, i; + + iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC); + if (!iod->meta_sgt.sgl) + return BLK_STS_RESOURCE; + + sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments); + iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req, + iod->meta_sgt.sgl); + if (!iod->meta_sgt.orig_nents) + goto out_free_sg; + + rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), + DMA_ATTR_NO_WARN); + if (rc) + goto out_free_sg; + + sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) + goto out_unmap_sg; + + entries = iod->meta_sgt.nents; + iod->meta_list.sg_list = sg_list; + iod->meta_dma = sgl_dma; + + cmnd->flags = NVME_CMD_SGL_METASEG; + cmnd->metadata = cpu_to_le64(sgl_dma); + + sgl = iod->meta_sgt.sgl; + if (entries == 1) { + nvme_pci_sgl_set_data(sg_list, sgl); + return BLK_STS_OK; + } + + sgl_dma += sizeof(*sg_list); + nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries); + for_each_sg(sgl, sg, entries, i) + nvme_pci_sgl_set_data(&sg_list[i + 1], sg); + + return BLK_STS_OK; + +out_unmap_sg: + dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); +out_free_sg: + mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); + return BLK_STS_RESOURCE; +} + +static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev, + struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct bio_vec bv = rq_integrity_vec(req); + struct nvme_command *cmnd = &iod->cmd; iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0); if (dma_mapping_error(dev->dev, iod->meta_dma)) @@ -837,6 +910,13 @@ static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req, return BLK_STS_OK; } +static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req) +{ + if (nvme_pci_metadata_use_sgls(dev, req)) + return nvme_pci_setup_meta_sgls(dev, req); + return nvme_pci_setup_meta_mptr(dev, req); +} + static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -845,6 +925,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) iod->aborted = false; iod->nr_allocations = -1; iod->sgt.nents = 0; + iod->meta_sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret) @@ -857,7 +938,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) } if (blk_integrity_rq(req)) { - ret = nvme_map_metadata(dev, req, &iod->cmd); + ret = nvme_map_metadata(dev, req); if (ret) goto out_unmap_data; } @@ -955,17 +1036,31 @@ static void nvme_queue_rqs(struct rq_list *rqlist) *rqlist = requeue_list; } +static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev, + struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + + if (!iod->meta_sgt.nents) { + dma_unmap_page(dev->dev, iod->meta_dma, + rq_integrity_vec(req).bv_len, + rq_dma_dir(req)); + return; + } + + dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list, + iod->meta_dma); + dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0); + mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool); +} + static __always_inline void nvme_pci_unmap_rq(struct request *req) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; - if (blk_integrity_rq(req)) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - - dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req).bv_len, rq_dma_dir(req)); - } + if (blk_integrity_rq(req)) + nvme_unmap_metadata(dev, req); if (blk_rq_nr_phys_segments(req)) nvme_unmap_data(dev, req); @@ -2761,6 +2856,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) { + size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1); size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS; dev->iod_mempool = mempool_create_node(1, @@ -2769,7 +2865,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev) dev_to_node(dev->dev)); if (!dev->iod_mempool) return -ENOMEM; + + dev->iod_meta_mempool = mempool_create_node(1, + mempool_kmalloc, mempool_kfree, + (void *)meta_size, GFP_KERNEL, + dev_to_node(dev->dev)); + if (!dev->iod_meta_mempool) + goto free; + return 0; +free: + mempool_destroy(dev->iod_mempool); + return -ENOMEM; } static void nvme_free_tagset(struct nvme_dev *dev) @@ -2834,6 +2941,11 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; + else + dev->ctrl.max_integrity_segments = 1; + nvme_dbbuf_dma_alloc(dev); result = nvme_setup_host_mem(dev); @@ -3101,11 +3213,6 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, dev->ctrl.max_hw_sectors = min_t(u32, NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9); dev->ctrl.max_segments = NVME_MAX_SEGS; - - /* - * There is no support for SGLs for metadata (yet), so we are limited to - * a single integrity segment for the separate metadata pointer. - */ dev->ctrl.max_integrity_segments = 1; return dev; @@ -3168,6 +3275,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto out_disable; + if (nvme_ctrl_meta_sgl_supported(&dev->ctrl)) + dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS; + else + dev->ctrl.max_integrity_segments = 1; + nvme_dbbuf_dma_alloc(dev); result = nvme_setup_host_mem(dev); @@ -3210,6 +3322,7 @@ out_disable: nvme_free_queues(dev, 0); out_release_iod_mempool: mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->iod_meta_mempool); out_release_prp_pools: nvme_release_prp_pools(dev); out_dev_unmap: @@ -3275,6 +3388,7 @@ static void nvme_remove(struct pci_dev *pdev) nvme_dbbuf_dma_free(dev); nvme_free_queues(dev, 0); mempool_destroy(dev->iod_mempool); + mempool_destroy(dev->iod_meta_mempool); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); nvme_uninit_ctrl(&dev->ctrl); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 0a6e22038ce3..5873ce859cc8 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -389,6 +389,7 @@ enum { NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, + NVME_CTRL_SGLS_MSDS = 1 << 19, }; struct nvme_lbaf { From 6399a0db8cd61eedbfb4b7809a4f4699157a9bf8 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 13 Nov 2024 13:57:04 -0800 Subject: [PATCH 04/36] nvme: define the remaining used sgls constants This provides a little more context when reading the code than hardcoded magic numbers. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 3 ++- drivers/nvme/host/rdma.c | 4 ++-- drivers/nvme/target/admin-cmd.c | 7 ++++--- include/linux/nvme.h | 4 ++++ 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5ef284a376cc..611b02c8a8b3 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1123,7 +1123,8 @@ static inline void nvme_start_request(struct request *rq) static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) { - return ctrl->sgls & ((1 << 0) | (1 << 1)); + return ctrl->sgls & (NVME_CTRL_SGLS_BYTE_ALIGNED | + NVME_CTRL_SGLS_DWORD_ALIGNED); } static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 24a2759798d0..baf7d2490152 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1019,7 +1019,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) goto destroy_admin; } - if (!(ctrl->ctrl.sgls & (1 << 2))) { + if (!(ctrl->ctrl.sgls & NVME_CTRL_SGLS_KSDBDS)) { ret = -EOPNOTSUPP; dev_err(ctrl->ctrl.device, "Mandatory keyed sgls are not supported!\n"); @@ -1051,7 +1051,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1; } - if (ctrl->ctrl.sgls & (1 << 20)) + if (ctrl->ctrl.sgls & NVME_CTRL_SGLS_SAOS) ctrl->use_inline_data = true; if (ctrl->ctrl.queue_count > 1) { diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 934b401fbc2f..4fa8496a4d96 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -601,11 +601,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->awun = 0; id->awupf = 0; - id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + /* we always support SGLs */ + id->sgls = cpu_to_le32(NVME_CTRL_SGLS_BYTE_ALIGNED); if (ctrl->ops->flags & NVMF_KEYED_SGLS) - id->sgls |= cpu_to_le32(1 << 2); + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_KSDBDS); if (req->port->inline_data_size) - id->sgls |= cpu_to_le32(1 << 20); + id->sgls |= cpu_to_le32(NVME_CTRL_SGLS_SAOS); strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5873ce859cc8..2baf9a80b470 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -389,7 +389,11 @@ enum { NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5, NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7, NVME_CTRL_CTRATT_UUID_LIST = 1 << 9, + NVME_CTRL_SGLS_BYTE_ALIGNED = 1, + NVME_CTRL_SGLS_DWORD_ALIGNED = 2, + NVME_CTRL_SGLS_KSDBDS = 1 << 2, NVME_CTRL_SGLS_MSDS = 1 << 19, + NVME_CTRL_SGLS_SAOS = 1 << 20, }; struct nvme_lbaf { From 6fad84a4d624c300d03ebba457cc641765050c43 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 8 Nov 2024 15:41:08 -0800 Subject: [PATCH 05/36] nvme-pci: use sgls for all user requests if possible If the device supports SGLs, use these for all user requests. This format encodes the expected transfer length so it can catch short buffer errors in a user command, whether it occurred accidently or maliciously. For controllers that support SGL data mode, this is a viable mitigation to CVE-2023-6238. For controllers that don't support SGLs, log a warning in the passthrough path since not having the capability can corrupt data if the interface is not used correctly. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/ioctl.c | 12 ++++++++++-- drivers/nvme/host/pci.c | 5 +++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index cb7f61e2077d..64b5542fb3b7 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -120,12 +120,20 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, struct nvme_ns *ns = q->queuedata; struct block_device *bdev = ns ? ns->disk->part0 : NULL; bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; bool has_metadata = meta_buffer && meta_len; struct bio *bio = NULL; int ret; - if (has_metadata && !supports_metadata) - return -EINVAL; + if (!nvme_ctrl_sgl_supported(ctrl)) + dev_warn_once(ctrl->device, "using unchecked data buffer\n"); + if (has_metadata) { + if (!supports_metadata) + return -EINVAL; + if (!nvme_ctrl_meta_sgl_supported(ctrl)) + dev_warn_once(ctrl->device, + "using unchecked metadata buffer\n"); + } if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { struct iov_iter iter; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c6c3ae3a7c43..4c644bb7f069 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -515,7 +515,8 @@ static inline bool nvme_pci_metadata_use_sgls(struct nvme_dev *dev, { if (!nvme_ctrl_meta_sgl_supported(&dev->ctrl)) return false; - return req->nr_integrity_segments > 1; + return req->nr_integrity_segments > 1 || + nvme_req(req)->flags & NVME_REQ_USERCMD; } static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, @@ -533,7 +534,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, if (nvme_pci_metadata_use_sgls(dev, req)) return true; if (!sgl_threshold || avg_seg_size < sgl_threshold) - return false; + return nvme_req(req)->flags & NVME_REQ_USERCMD; return true; } From 9c0ba14828d64744ccd195c610594ba254a1a9ab Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 18 Nov 2024 15:52:50 +0100 Subject: [PATCH 06/36] blk-settings: round down io_opt to physical_block_size There was a bug report [1] where the user got a warning alignment inconsistency. The user has optimal I/O 16776704 (0xFFFE00) and physical block size 4096. Note that the optimal I/O size may be set by the DMA engines or SCSI controllers and they have no knowledge about the disks attached to them, so the situation with optimal I/O not aligned to physical block size may happen. This commit makes blk_validate_limits round down optimal I/O size to the physical block size of the block device. Closes: https://lore.kernel.org/dm-devel/1426ad71-79b4-4062-b2bf-84278be66a5d@redhat.com/T/ [1] Signed-off-by: Mikulas Patocka Fixes: a23634644afc ("block: take io_opt and io_min into account for max_sectors") Cc: stable@vger.kernel.org # v6.11+ Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/3dc0014b-9690-dc38-81c9-4a316a2d4fb2@redhat.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index f1d4dfdc37a7..0fe16f987cde 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -248,6 +248,13 @@ int blk_validate_limits(struct queue_limits *lim) if (lim->io_min < lim->physical_block_size) lim->io_min = lim->physical_block_size; + /* + * The optimal I/O size may not be aligned to physical block size + * (because it may be limited by dma engines which have no clue about + * block size of the disks attached to them), so we round it down here. + */ + lim->io_opt = round_down(lim->io_opt, lim->physical_block_size); + /* * max_hw_sectors has a somewhat weird default for historical reason, * but driver really should set their own instead of relying on this From 3802f73bd80766d70f319658f334754164075bc3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 4 Nov 2024 19:00:05 +0800 Subject: [PATCH 07/36] block: fix uaf for flush rq while iterating tags blk_mq_clear_flush_rq_mapping() is not called during scsi probe, by checking blk_queue_init_done(). However, QUEUE_FLAG_INIT_DONE is cleared in del_gendisk by commit aec89dc5d421 ("block: keep q_usage_counter in atomic mode after del_gendisk"), hence for disk like scsi, following blk_mq_destroy_queue() will not clear flush rq from tags->rqs[] as well, cause following uaf that is found by our syzkaller for v6.6: ================================================================== BUG: KASAN: slab-use-after-free in blk_mq_find_and_get_req+0x16e/0x1a0 block/blk-mq-tag.c:261 Read of size 4 at addr ffff88811c969c20 by task kworker/1:2H/224909 CPU: 1 PID: 224909 Comm: kworker/1:2H Not tainted 6.6.0-ga836a5060850 #32 Workqueue: kblockd blk_mq_timeout_work Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x91/0xf0 lib/dump_stack.c:106 print_address_description.constprop.0+0x66/0x300 mm/kasan/report.c:364 print_report+0x3e/0x70 mm/kasan/report.c:475 kasan_report+0xb8/0xf0 mm/kasan/report.c:588 blk_mq_find_and_get_req+0x16e/0x1a0 block/blk-mq-tag.c:261 bt_iter block/blk-mq-tag.c:288 [inline] __sbitmap_for_each_set include/linux/sbitmap.h:295 [inline] sbitmap_for_each_set include/linux/sbitmap.h:316 [inline] bt_for_each+0x455/0x790 block/blk-mq-tag.c:325 blk_mq_queue_tag_busy_iter+0x320/0x740 block/blk-mq-tag.c:534 blk_mq_timeout_work+0x1a3/0x7b0 block/blk-mq.c:1673 process_one_work+0x7c4/0x1450 kernel/workqueue.c:2631 process_scheduled_works kernel/workqueue.c:2704 [inline] worker_thread+0x804/0xe40 kernel/workqueue.c:2785 kthread+0x346/0x450 kernel/kthread.c:388 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:293 Allocated by task 942: kasan_save_stack+0x22/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:374 [inline] __kasan_kmalloc mm/kasan/common.c:383 [inline] __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:380 kasan_kmalloc include/linux/kasan.h:198 [inline] __do_kmalloc_node mm/slab_common.c:1007 [inline] __kmalloc_node+0x69/0x170 mm/slab_common.c:1014 kmalloc_node include/linux/slab.h:620 [inline] kzalloc_node include/linux/slab.h:732 [inline] blk_alloc_flush_queue+0x144/0x2f0 block/blk-flush.c:499 blk_mq_alloc_hctx+0x601/0x940 block/blk-mq.c:3788 blk_mq_alloc_and_init_hctx+0x27f/0x330 block/blk-mq.c:4261 blk_mq_realloc_hw_ctxs+0x488/0x5e0 block/blk-mq.c:4294 blk_mq_init_allocated_queue+0x188/0x860 block/blk-mq.c:4350 blk_mq_init_queue_data block/blk-mq.c:4166 [inline] blk_mq_init_queue+0x8d/0x100 block/blk-mq.c:4176 scsi_alloc_sdev+0x843/0xd50 drivers/scsi/scsi_scan.c:335 scsi_probe_and_add_lun+0x77c/0xde0 drivers/scsi/scsi_scan.c:1189 __scsi_scan_target+0x1fc/0x5a0 drivers/scsi/scsi_scan.c:1727 scsi_scan_channel drivers/scsi/scsi_scan.c:1815 [inline] scsi_scan_channel+0x14b/0x1e0 drivers/scsi/scsi_scan.c:1791 scsi_scan_host_selected+0x2fe/0x400 drivers/scsi/scsi_scan.c:1844 scsi_scan+0x3a0/0x3f0 drivers/scsi/scsi_sysfs.c:151 store_scan+0x2a/0x60 drivers/scsi/scsi_sysfs.c:191 dev_attr_store+0x5c/0x90 drivers/base/core.c:2388 sysfs_kf_write+0x11c/0x170 fs/sysfs/file.c:136 kernfs_fop_write_iter+0x3fc/0x610 fs/kernfs/file.c:338 call_write_iter include/linux/fs.h:2083 [inline] new_sync_write+0x1b4/0x2d0 fs/read_write.c:493 vfs_write+0x76c/0xb00 fs/read_write.c:586 ksys_write+0x127/0x250 fs/read_write.c:639 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x70/0x120 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x78/0xe2 Freed by task 244687: kasan_save_stack+0x22/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x50 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] __kasan_slab_free+0x12a/0x1b0 mm/kasan/common.c:244 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1815 [inline] slab_free_freelist_hook mm/slub.c:1841 [inline] slab_free mm/slub.c:3807 [inline] __kmem_cache_free+0xe4/0x520 mm/slub.c:3820 blk_free_flush_queue+0x40/0x60 block/blk-flush.c:520 blk_mq_hw_sysfs_release+0x4a/0x170 block/blk-mq-sysfs.c:37 kobject_cleanup+0x136/0x410 lib/kobject.c:689 kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x119/0x140 lib/kobject.c:737 blk_mq_release+0x24f/0x3f0 block/blk-mq.c:4144 blk_free_queue block/blk-core.c:298 [inline] blk_put_queue+0xe2/0x180 block/blk-core.c:314 blkg_free_workfn+0x376/0x6e0 block/blk-cgroup.c:144 process_one_work+0x7c4/0x1450 kernel/workqueue.c:2631 process_scheduled_works kernel/workqueue.c:2704 [inline] worker_thread+0x804/0xe40 kernel/workqueue.c:2785 kthread+0x346/0x450 kernel/kthread.c:388 ret_from_fork+0x4d/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1b/0x30 arch/x86/entry/entry_64.S:293 Other than blk_mq_clear_flush_rq_mapping(), the flag is only used in blk_register_queue() from initialization path, hence it's safe not to clear the flag in del_gendisk. And since QUEUE_FLAG_REGISTERED already make sure that queue should only be registered once, there is no need to test the flag as well. Fixes: 6cfeadbff3f8 ("blk-mq: don't clear flush_rq from tags->rqs[]") Depends-on: commit aec89dc5d421 ("block: keep q_usage_counter in atomic mode after del_gendisk") Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241104110005.1412161-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 ++---- block/genhd.c | 9 +++------ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d80a202cd170..4241aea84161 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -810,10 +810,8 @@ int blk_register_queue(struct gendisk *disk) * faster to shut down and is made fully functional here as * request_queues for non-existent devices never get registered. */ - if (!blk_queue_init_done(q)) { - blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); - percpu_ref_switch_to_percpu(&q->q_usage_counter); - } + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); return ret; diff --git a/block/genhd.c b/block/genhd.c index 9130e163e191..79230c109fca 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -742,13 +742,10 @@ void del_gendisk(struct gendisk *disk) * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. */ - if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { - blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + if (!test_bit(GD_OWNS_QUEUE, &disk->state)) __blk_mq_unfreeze_queue(q, true); - } else { - if (queue_is_mq(q)) - blk_mq_exit_queue(q); - } + else if (queue_is_mq(q)) + blk_mq_exit_queue(q); if (start_drain) blk_unfreeze_release_lock(q, true, queue_dying); From 46fd48ab3ea3eb3bb215684bd66ea3d260b091a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 08:26:02 +0100 Subject: [PATCH 08/36] block: return unsigned int from bdev_io_min The underlying limit is defined as an unsigned int, so return that from bdev_io_min as well. Fixes: ac481c20ef8f ("block: Topology ioctls") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: John Garry Link: https://lore.kernel.org/r/20241119072602.1059488-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a1fd0ddce5cf..195db38fda16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1261,7 +1261,7 @@ static inline unsigned int queue_io_min(const struct request_queue *q) return q->limits.io_min; } -static inline int bdev_io_min(struct block_device *bdev) +static inline unsigned int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } From b49125574cae26458d4aa02ce8f4523ba9a2a328 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 19 Nov 2024 23:42:23 +0900 Subject: [PATCH 09/36] loop: Fix ABBA locking race Current loop calls vfs_statfs() while holding the q->limits_lock. If FS takes some locking in vfs_statfs callback, this may lead to ABBA locking bug (at least, FAT fs has this issue actually). So this patch calls vfs_statfs() outside q->limits_locks instead, because looks like no reason to hold q->limits_locks while getting discord configs. Chain exists of: &sbi->fat_lock --> &q->q_usage_counter(io)#17 --> &q->limits_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&q->limits_lock); lock(&q->q_usage_counter(io)#17); lock(&q->limits_lock); lock(&sbi->fat_lock); *** DEADLOCK *** Reported-by: syzbot+a5d8c609c02f508672cc@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=a5d8c609c02f508672cc Reviewed-by: Ming Lei Signed-off-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- drivers/block/loop.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fe9bb4fb5f1b..8f6761c27c68 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -770,12 +770,11 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } -static void loop_config_discard(struct loop_device *lo, - struct queue_limits *lim) +static void loop_get_discard_config(struct loop_device *lo, + u32 *granularity, u32 *max_discard_sectors) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; - u32 granularity = 0, max_discard_sectors = 0; struct kstatfs sbuf; /* @@ -788,24 +787,17 @@ static void loop_config_discard(struct loop_device *lo, if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); - max_discard_sectors = bdev_write_zeroes_sectors(bdev); - granularity = bdev_discard_granularity(bdev); + *max_discard_sectors = bdev_write_zeroes_sectors(bdev); + *granularity = bdev_discard_granularity(bdev); /* * We use punch hole to reclaim the free space used by the * image a.k.a. discard. */ } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { - max_discard_sectors = UINT_MAX >> 9; - granularity = sbuf.f_bsize; + *max_discard_sectors = UINT_MAX >> 9; + *granularity = sbuf.f_bsize; } - - lim->max_hw_discard_sectors = max_discard_sectors; - lim->max_write_zeroes_sectors = max_discard_sectors; - if (max_discard_sectors) - lim->discard_granularity = granularity; - else - lim->discard_granularity = 0; } struct loop_worker { @@ -991,6 +983,7 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) struct inode *inode = file->f_mapping->host; struct block_device *backing_bdev = NULL; struct queue_limits lim; + u32 granularity = 0, max_discard_sectors = 0; if (S_ISBLK(inode->i_mode)) backing_bdev = I_BDEV(inode); @@ -1000,6 +993,8 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) if (!bsize) bsize = loop_default_blocksize(lo, backing_bdev); + loop_get_discard_config(lo, &granularity, &max_discard_sectors); + lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = bsize; lim.physical_block_size = bsize; @@ -1009,7 +1004,12 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) lim.features |= BLK_FEAT_WRITE_CACHE; if (backing_bdev && !bdev_nonrot(backing_bdev)) lim.features |= BLK_FEAT_ROTATIONAL; - loop_config_discard(lo, &lim); + lim.max_hw_discard_sectors = max_discard_sectors; + lim.max_write_zeroes_sectors = max_discard_sectors; + if (max_discard_sectors) + lim.discard_granularity = granularity; + else + lim.discard_granularity = 0; return queue_limits_commit_update(lo->lo_queue, &lim); } From 84488282166de6b6760ada8030e87aaa08bce3aa Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 5 Nov 2024 11:42:08 +0530 Subject: [PATCH 10/36] Revert "nvme: make keep-alive synchronous operation" This reverts commit d06923670b5a5f609603d4a9fee4dec02d38de9c. It was realized that the fix implemented to contain the race condition among the keep alive task and the fabric shutdown code path in the commit d06923670b5ia ("nvme: make keep-alive synchronous operation") is not optimal. The reason being keep-alive runs under the workqueue and making it synchronous would waste a workqueue context. Furthermore, we later found that the above race condition is a regression caused due to the changes implemented in commit a54a93d0e359 ("nvme: move stopping keep-alive into nvme_uninit_ctrl()"). So we decided to revert the commit d06923670b5a ("nvme: make keep-alive synchronous operation") and then fix the regression. Link: https://lore.kernel.org/all/196f4013-3bbf-43ff-98b4-9cb2a96c20c2@grimberg.me/ Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7360e9c3acff..1221ebe399d7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1294,9 +1294,10 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl) queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); } -static void nvme_keep_alive_finish(struct request *rq, - blk_status_t status, struct nvme_ctrl *ctrl) +static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq, + blk_status_t status) { + struct nvme_ctrl *ctrl = rq->end_io_data; unsigned long rtt = jiffies - (rq->deadline - rq->timeout); unsigned long delay = nvme_keep_alive_work_period(ctrl); enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); @@ -1313,17 +1314,20 @@ static void nvme_keep_alive_finish(struct request *rq, delay = 0; } + blk_mq_free_request(rq); + if (status) { dev_err(ctrl->device, "failed nvme_keep_alive_end_io error=%d\n", status); - return; + return RQ_END_IO_NONE; } ctrl->ka_last_check_time = jiffies; ctrl->comp_seen = false; if (state == NVME_CTRL_LIVE || state == NVME_CTRL_CONNECTING) queue_delayed_work(nvme_wq, &ctrl->ka_work, delay); + return RQ_END_IO_NONE; } static void nvme_keep_alive_work(struct work_struct *work) @@ -1332,7 +1336,6 @@ static void nvme_keep_alive_work(struct work_struct *work) struct nvme_ctrl, ka_work); bool comp_seen = ctrl->comp_seen; struct request *rq; - blk_status_t status; ctrl->ka_last_check_time = jiffies; @@ -1355,9 +1358,9 @@ static void nvme_keep_alive_work(struct work_struct *work) nvme_init_request(rq, &ctrl->ka_cmd); rq->timeout = ctrl->kato * HZ; - status = blk_execute_rq(rq, false); - nvme_keep_alive_finish(rq, status, ctrl); - blk_mq_free_request(rq); + rq->end_io = nvme_keep_alive_end_io; + rq->end_io_data = ctrl; + blk_execute_rq_nowait(rq, false); } static void nvme_start_keep_alive(struct nvme_ctrl *ctrl) From e9869c85c81168a1275f909d5972a3fc435304be Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Tue, 5 Nov 2024 11:42:09 +0530 Subject: [PATCH 11/36] nvme-fabrics: fix kernel crash while shutting down controller The nvme keep-alive operation, which executes at a periodic interval, could potentially sneak in while shutting down a fabric controller. This may lead to a race between the fabric controller admin queue destroy code path (invoked while shutting down controller) and hw/hctx queue dispatcher called from the nvme keep-alive async request queuing operation. This race could lead to the kernel crash shown below: Call Trace: autoremove_wake_function+0x0/0xbc (unreliable) __blk_mq_sched_dispatch_requests+0x114/0x24c blk_mq_sched_dispatch_requests+0x44/0x84 blk_mq_run_hw_queue+0x140/0x220 nvme_keep_alive_work+0xc8/0x19c [nvme_core] process_one_work+0x200/0x4e0 worker_thread+0x340/0x504 kthread+0x138/0x140 start_kernel_thread+0x14/0x18 While shutting down fabric controller, if nvme keep-alive request sneaks in then it would be flushed off. The nvme_keep_alive_end_io function is then invoked to handle the end of the keep-alive operation which decrements the admin->q_usage_counter and assuming this is the last/only request in the admin queue then the admin->q_usage_counter becomes zero. If that happens then blk-mq destroy queue operation (blk_mq_destroy_ queue()) which could be potentially running simultaneously on another cpu (as this is the controller shutdown code path) would forward progress and deletes the admin queue. So, now from this point onward we are not supposed to access the admin queue resources. However the issue here's that the nvme keep-alive thread running hw/hctx queue dispatch operation hasn't yet finished its work and so it could still potentially access the admin queue resource while the admin queue had been already deleted and that causes the above crash. The above kernel crash is regression caused due to changes implemented in commit a54a93d0e359 ("nvme: move stopping keep-alive into nvme_uninit_ctrl()"). Ideally we should stop keep-alive before destroyin g the admin queue and freeing the admin tagset so that it wouldn't sneak in during the shutdown operation. However we removed the keep alive stop operation from the beginning of the controller shutdown code path in commit a54a93d0e359 ("nvme: move stopping keep-alive into nvme_uninit_ctrl()") and added it under nvme_uninit_ctrl() which executes very late in the shutdown code path after the admin queue is destroyed and its tagset is removed. So this change created the possibility of keep-alive sneaking in and interfering with the shutdown operation and causing observed kernel crash. To fix the observed crash, we decided to move nvme_stop_keep_alive() from nvme_uninit_ctrl() to nvme_remove_admin_tag_set(). This change would ensure that we don't forward progress and delete the admin queue until the keep- alive operation is finished (if it's in-flight) or cancelled and that would help contain the race condition explained above and hence avoid the crash. Moving nvme_stop_keep_alive() to nvme_remove_admin_tag_set() instead of adding nvme_stop_keep_alive() to the beginning of the controller shutdown code path in nvme_stop_ctrl(), as was the case earlier before commit a54a93d0e359 ("nvme: move stopping keep-alive into nvme_uninit_ctrl()"), would help save one callsite of nvme_stop_keep_alive(). Fixes: a54a93d0e359 ("nvme: move stopping keep-alive into nvme_uninit_ctrl()") Link: https://lore.kernel.org/all/1a21f37b-0f2a-4745-8c56-4dc8628d3983@linux.ibm.com/ Reviewed-by: Ming Lei Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1221ebe399d7..bfd71511c85f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4574,6 +4574,11 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) { + /* + * As we're about to destroy the queue and free tagset + * we can not have keep-alive work running. + */ + nvme_stop_keep_alive(ctrl); blk_mq_destroy_queue(ctrl->admin_q); blk_put_queue(ctrl->admin_q); if (ctrl->ops->flags & NVME_F_FABRICS) { From e924da7d6622b72f9eee78a3aad3e75b859da341 Mon Sep 17 00:00:00 2001 From: John Garry Date: Tue, 12 Nov 2024 09:21:44 +0000 Subject: [PATCH 12/36] block: Drop granularity check in queue_limit_discard_alignment() lim->discard_granularity is always at least SECTOR_SIZE, so drop the pointless check for granularity less than SECTOR_SIZE. Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241112092144.4059847-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 0fe16f987cde..2f1005da530c 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -465,8 +465,6 @@ static unsigned int queue_limit_discard_alignment( /* Why are these in bytes, not sectors? */ alignment = lim->discard_alignment >> SECTOR_SHIFT; granularity = lim->discard_granularity >> SECTOR_SHIFT; - if (!granularity) - return 0; /* Offset of the partition start in 'granularity' sectors */ offset = sector_div(sector, granularity); From 34c1227035b3ab930a1ae6ab6f22fec1af8ab09e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 19 Nov 2024 11:06:46 +0800 Subject: [PATCH 13/36] ublk: fix error code for unsupported command ENOTSUPP is for kernel use only, and shouldn't be sent to userspace. Fix it by replacing it with EOPNOTSUPP. Cc: stable@vger.kernel.org Fixes: bfbcef036396 ("ublk_drv: move ublk_get_device_from_id into ublk_ctrl_uring_cmd") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241119030646.2319030-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c6d18cd8af44..d4aed12dd436 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -3041,7 +3041,7 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_end_recovery(ub, cmd); break; default: - ret = -ENOTSUPP; + ret = -EOPNOTSUPP; break; } From d00eea91deaf363f83599532cb49fa528ab8e00e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 18 Nov 2024 10:50:14 +0000 Subject: [PATCH 14/36] block: Add extra checks in blk_validate_atomic_write_limits() It is so far expected that the limits passed are valid. In future atomic writes will be supported for stacked block devices, and calculating the limits there will be complicated, so add extra sanity checks to ensure that the values are always valid. Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 2f1005da530c..03c67620f98a 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -178,9 +178,26 @@ static void blk_validate_atomic_write_limits(struct queue_limits *lim) if (!lim->atomic_write_hw_max) goto unsupported; + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_min))) + goto unsupported; + + if (WARN_ON_ONCE(!is_power_of_2(lim->atomic_write_hw_unit_max))) + goto unsupported; + + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_min > + lim->atomic_write_hw_unit_max)) + goto unsupported; + + if (WARN_ON_ONCE(lim->atomic_write_hw_unit_max > + lim->atomic_write_hw_max)) + goto unsupported; + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; if (boundary_sectors) { + if (WARN_ON_ONCE(lim->atomic_write_hw_max > + lim->atomic_write_hw_boundary)) + goto unsupported; /* * A feature of boundary support is that it disallows bios to * be merged which would result in a merged request which From d7f36dc446e894e0f57b5f05c5628f03c5f9e2d2 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 18 Nov 2024 10:50:15 +0000 Subject: [PATCH 15/36] block: Support atomic writes limits for stacked devices Allow stacked devices to support atomic writes by aggregating the minimum capability of all bottom devices. Flag BLK_FEAT_ATOMIC_WRITES_STACKED is set for stacked devices which have been enabled to support atomic writes. Some things to note on the implementation: - For simplicity, all bottom devices must have same atomic write boundary value (if any) - The atomic write boundary must be a power-of-2 already, but this restriction could be relaxed. Furthermore, it is now required that the chunk sectors for a top device must be aligned with this boundary. - If a bottom device atomic write unit min/max are not aligned with the top device chunk sectors, the top device atomic write unit min/max are reduced to a value which works for the chunk sectors. Reviewed-by: Christoph Hellwig Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 115 +++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 4 ++ 2 files changed, 119 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 03c67620f98a..8f09e33f41f6 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -501,6 +501,119 @@ static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lb return sectors; } +/* Check if second and later bottom devices are compliant */ +static bool blk_stack_atomic_writes_tail(struct queue_limits *t, + struct queue_limits *b) +{ + /* We're not going to support different boundary sizes.. yet */ + if (t->atomic_write_hw_boundary != b->atomic_write_hw_boundary) + return false; + + /* Can't support this */ + if (t->atomic_write_hw_unit_min > b->atomic_write_hw_unit_max) + return false; + + /* Or this */ + if (t->atomic_write_hw_unit_max < b->atomic_write_hw_unit_min) + return false; + + t->atomic_write_hw_max = min(t->atomic_write_hw_max, + b->atomic_write_hw_max); + t->atomic_write_hw_unit_min = max(t->atomic_write_hw_unit_min, + b->atomic_write_hw_unit_min); + t->atomic_write_hw_unit_max = min(t->atomic_write_hw_unit_max, + b->atomic_write_hw_unit_max); + return true; +} + +/* Check for valid boundary of first bottom device */ +static bool blk_stack_atomic_writes_boundary_head(struct queue_limits *t, + struct queue_limits *b) +{ + /* + * Ensure atomic write boundary is aligned with chunk sectors. Stacked + * devices store chunk sectors in t->io_min. + */ + if (b->atomic_write_hw_boundary > t->io_min && + b->atomic_write_hw_boundary % t->io_min) + return false; + if (t->io_min > b->atomic_write_hw_boundary && + t->io_min % b->atomic_write_hw_boundary) + return false; + + t->atomic_write_hw_boundary = b->atomic_write_hw_boundary; + return true; +} + + +/* Check stacking of first bottom device */ +static bool blk_stack_atomic_writes_head(struct queue_limits *t, + struct queue_limits *b) +{ + if (b->atomic_write_hw_boundary && + !blk_stack_atomic_writes_boundary_head(t, b)) + return false; + + if (t->io_min <= SECTOR_SIZE) { + /* No chunk sectors, so use bottom device values directly */ + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + t->atomic_write_hw_unit_min = b->atomic_write_hw_unit_min; + t->atomic_write_hw_max = b->atomic_write_hw_max; + return true; + } + + /* + * Find values for limits which work for chunk size. + * b->atomic_write_hw_unit_{min, max} may not be aligned with chunk + * size (t->io_min), as chunk size is not restricted to a power-of-2. + * So we need to find highest power-of-2 which works for the chunk + * size. + * As an example scenario, we could have b->unit_max = 16K and + * t->io_min = 24K. For this case, reduce t->unit_max to a value + * aligned with both limits, i.e. 8K in this example. + */ + t->atomic_write_hw_unit_max = b->atomic_write_hw_unit_max; + while (t->io_min % t->atomic_write_hw_unit_max) + t->atomic_write_hw_unit_max /= 2; + + t->atomic_write_hw_unit_min = min(b->atomic_write_hw_unit_min, + t->atomic_write_hw_unit_max); + t->atomic_write_hw_max = min(b->atomic_write_hw_max, t->io_min); + + return true; +} + +static void blk_stack_atomic_writes_limits(struct queue_limits *t, + struct queue_limits *b) +{ + if (!(t->features & BLK_FEAT_ATOMIC_WRITES_STACKED)) + goto unsupported; + + if (!b->atomic_write_unit_min) + goto unsupported; + + /* + * If atomic_write_hw_max is set, we have already stacked 1x bottom + * device, so check for compliance. + */ + if (t->atomic_write_hw_max) { + if (!blk_stack_atomic_writes_tail(t, b)) + goto unsupported; + return; + } + + if (!blk_stack_atomic_writes_head(t, b)) + goto unsupported; + return; + +unsupported: + t->atomic_write_hw_max = 0; + t->atomic_write_hw_unit_max = 0; + t->atomic_write_hw_unit_min = 0; + t->atomic_write_hw_boundary = 0; + t->features &= ~BLK_FEAT_ATOMIC_WRITES_STACKED; +} + /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top device) @@ -661,6 +774,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->zone_write_granularity = 0; t->max_zone_append_sectors = 0; } + blk_stack_atomic_writes_limits(t, b); + return ret; } EXPORT_SYMBOL(blk_stack_limits); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 195db38fda16..31867de88213 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -333,6 +333,10 @@ typedef unsigned int __bitwise blk_features_t; #define BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE \ ((__force blk_features_t)(1u << 15)) +/* stacked device can/does support atomic writes */ +#define BLK_FEAT_ATOMIC_WRITES_STACKED \ + ((__force blk_features_t)(1u << 16)) + /* * Flags automatically inherited when stacking limits. */ From fa6fec82811bc6ebd3c4337ae4dae36c802c0fc1 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 18 Nov 2024 10:50:16 +0000 Subject: [PATCH 16/36] md/raid0: Atomic write support Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes. All other stacked device request queue limits should automatically be set properly. With regards to atomic write max bytes limit, this will be set at hw_max_sectors and this is limited by the stripe width, which we want. Reviewed-by: Yu Kuai Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-4-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/md/raid0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index baaf5f8b80ae..7049ec7fb8eb 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -384,6 +384,7 @@ static int raid0_set_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = mddev->chunk_sectors; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * mddev->raid_disks; + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) { queue_limits_cancel_update(mddev->gendisk->queue); From f2a38abf5f1c5aeb3be8e9f4d3d815c867fff7ca Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 18 Nov 2024 10:50:17 +0000 Subject: [PATCH 17/36] md/raid1: Atomic write support Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes. For an attempt to atomic write to a region which has bad blocks, error the write as we just cannot do this. It is unlikely to find devices which support atomic writes and bad blocks. Reviewed-by: Yu Kuai Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-5-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a5adf08ee174..519c56f0ee3d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1571,7 +1571,21 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, continue; } if (is_bad) { - int good_sectors = first_bad - r1_bio->sector; + int good_sectors; + + /* + * We cannot atomically write this, so just + * error in that case. It could be possible to + * atomically write other mirrors, but the + * complexity of supporting that is not worth + * the benefit. + */ + if (bio->bi_opf & REQ_ATOMIC) { + error = -EIO; + goto err_handle; + } + + good_sectors = first_bad - r1_bio->sector; if (good_sectors < max_sectors) max_sectors = good_sectors; } @@ -1657,7 +1671,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_iter.bi_sector = (r1_bio->sector + rdev->data_offset); mbio->bi_end_io = raid1_end_write_request; - mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA)); + mbio->bi_opf = bio_op(bio) | + (bio->bi_opf & (REQ_SYNC | REQ_FUA | REQ_ATOMIC)); if (test_bit(FailFast, &rdev->flags) && !test_bit(WriteMostly, &rdev->flags) && conf->raid_disks - mddev->degraded > 1) @@ -3224,6 +3239,7 @@ static int raid1_set_limits(struct mddev *mddev) md_init_stacking_limits(&lim); lim.max_write_zeroes_sectors = 0; + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) { queue_limits_cancel_update(mddev->gendisk->queue); From a1d9b4fd42d93f46c11e7e9d919a55a3f6ca6126 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 18 Nov 2024 10:50:18 +0000 Subject: [PATCH 18/36] md/raid10: Atomic write support Set BLK_FEAT_ATOMIC_WRITES_STACKED to enable atomic writes. For an attempt to atomic write to a region which has bad blocks, error the write as we just cannot do this. It is unlikely to find devices which support atomic writes and bad blocks. Reviewed-by: Yu Kuai Signed-off-by: John Garry Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20241118105018.1870052-6-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/md/raid10.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8c7f5daa073a..f779d8225667 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1255,6 +1255,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, const enum req_op op = bio_op(bio); const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; const blk_opf_t do_fua = bio->bi_opf & REQ_FUA; + const blk_opf_t do_atomic = bio->bi_opf & REQ_ATOMIC; unsigned long flags; struct r10conf *conf = mddev->private; struct md_rdev *rdev; @@ -1273,7 +1274,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr + choose_data_offset(r10_bio, rdev)); mbio->bi_end_io = raid10_end_write_request; - mbio->bi_opf = op | do_sync | do_fua; + mbio->bi_opf = op | do_sync | do_fua | do_atomic; if (!replacement && test_bit(FailFast, &conf->mirrors[devnum].rdev->flags) && enough(conf, devnum)) @@ -1468,7 +1469,21 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, continue; } if (is_bad) { - int good_sectors = first_bad - dev_sector; + int good_sectors; + + /* + * We cannot atomically write this, so just + * error in that case. It could be possible to + * atomically write other mirrors, but the + * complexity of supporting that is not worth + * the benefit. + */ + if (bio->bi_opf & REQ_ATOMIC) { + error = -EIO; + goto err_handle; + } + + good_sectors = first_bad - dev_sector; if (good_sectors < max_sectors) max_sectors = good_sectors; } @@ -4025,6 +4040,7 @@ static int raid10_set_queue_limits(struct mddev *mddev) lim.max_write_zeroes_sectors = 0; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * raid10_nr_stripes(conf); + lim.features |= BLK_FEAT_ATOMIC_WRITES_STACKED; err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); if (err) { queue_limits_cancel_update(mddev->gendisk->queue); From cf5a60d971c7b59efb89927919404be655a9e35a Mon Sep 17 00:00:00 2001 From: Zach Wade Date: Tue, 19 Nov 2024 23:34:10 +0800 Subject: [PATCH 19/36] Revert "block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator()" This reverts commit bc3b1e9e7c50e1de0f573eea3871db61dd4787de. The bic is associated with sync_bfqq, and bfq_release_process_ref cannot be put into bfq_put_cooperator. kasan report: [ 400.347277] ================================================================== [ 400.347287] BUG: KASAN: slab-use-after-free in bic_set_bfqq+0x200/0x230 [ 400.347420] Read of size 8 at addr ffff88881cab7d60 by task dockerd/5800 [ 400.347430] [ 400.347436] CPU: 24 UID: 0 PID: 5800 Comm: dockerd Kdump: loaded Tainted: G E 6.12.0 #32 [ 400.347450] Tainted: [E]=UNSIGNED_MODULE [ 400.347454] Hardware name: VMware, Inc. VMware20,1/440BX Desktop Reference Platform, BIOS VMW201.00V.20192059.B64.2207280713 07/28/2022 [ 400.347460] Call Trace: [ 400.347464] [ 400.347468] dump_stack_lvl+0x5d/0x80 [ 400.347490] print_report+0x174/0x505 [ 400.347521] kasan_report+0xe0/0x160 [ 400.347541] bic_set_bfqq+0x200/0x230 [ 400.347549] bfq_bic_update_cgroup+0x419/0x740 [ 400.347560] bfq_bio_merge+0x133/0x320 [ 400.347584] blk_mq_submit_bio+0x1761/0x1e20 [ 400.347625] __submit_bio+0x28b/0x7b0 [ 400.347664] submit_bio_noacct_nocheck+0x6b2/0xd30 [ 400.347690] iomap_readahead+0x50c/0x680 [ 400.347731] read_pages+0x17f/0x9c0 [ 400.347785] page_cache_ra_unbounded+0x366/0x4a0 [ 400.347795] filemap_fault+0x83d/0x2340 [ 400.347819] __xfs_filemap_fault+0x11a/0x7d0 [xfs] [ 400.349256] __do_fault+0xf1/0x610 [ 400.349270] do_fault+0x977/0x11a0 [ 400.349281] __handle_mm_fault+0x5d1/0x850 [ 400.349314] handle_mm_fault+0x1f8/0x560 [ 400.349324] do_user_addr_fault+0x324/0x970 [ 400.349337] exc_page_fault+0x76/0xf0 [ 400.349350] asm_exc_page_fault+0x26/0x30 [ 400.349360] RIP: 0033:0x55a480d77375 [ 400.349384] Code: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc 49 3b 66 10 0f 86 ae 02 00 00 55 48 89 e5 48 83 ec 58 48 8b 10 <83> 7a 10 00 0f 84 27 02 00 00 44 0f b6 42 28 44 0f b6 4a 29 41 80 [ 400.349392] RSP: 002b:00007f18c37fd8b8 EFLAGS: 00010216 [ 400.349401] RAX: 00007f18c37fd9d0 RBX: 0000000000000000 RCX: 0000000000000000 [ 400.349407] RDX: 000055a484407d38 RSI: 000000c000e8b0c0 RDI: 0000000000000000 [ 400.349412] RBP: 00007f18c37fd910 R08: 000055a484017f60 R09: 000055a484066f80 [ 400.349417] R10: 0000000000194000 R11: 0000000000000005 R12: 0000000000000008 [ 400.349422] R13: 0000000000000000 R14: 000000c000476a80 R15: 0000000000000000 [ 400.349430] [ 400.349452] [ 400.349454] Allocated by task 5800: [ 400.349459] kasan_save_stack+0x30/0x50 [ 400.349469] kasan_save_track+0x14/0x30 [ 400.349475] __kasan_slab_alloc+0x89/0x90 [ 400.349482] kmem_cache_alloc_node_noprof+0xdc/0x2a0 [ 400.349492] bfq_get_queue+0x1ef/0x1100 [ 400.349502] __bfq_get_bfqq_handle_split+0x11a/0x510 [ 400.349511] bfq_insert_requests+0xf55/0x9030 [ 400.349519] blk_mq_flush_plug_list+0x446/0x14c0 [ 400.349527] __blk_flush_plug+0x27c/0x4e0 [ 400.349534] blk_finish_plug+0x52/0xa0 [ 400.349540] _xfs_buf_ioapply+0x739/0xc30 [xfs] [ 400.350246] __xfs_buf_submit+0x1b2/0x640 [xfs] [ 400.350967] xfs_buf_read_map+0x306/0xa20 [xfs] [ 400.351672] xfs_trans_read_buf_map+0x285/0x7d0 [xfs] [ 400.352386] xfs_imap_to_bp+0x107/0x270 [xfs] [ 400.353077] xfs_iget+0x70d/0x1eb0 [xfs] [ 400.353786] xfs_lookup+0x2ca/0x3a0 [xfs] [ 400.354506] xfs_vn_lookup+0x14e/0x1a0 [xfs] [ 400.355197] __lookup_slow+0x19c/0x340 [ 400.355204] lookup_one_unlocked+0xfc/0x120 [ 400.355211] ovl_lookup_single+0x1b3/0xcf0 [overlay] [ 400.355255] ovl_lookup_layer+0x316/0x490 [overlay] [ 400.355295] ovl_lookup+0x844/0x1fd0 [overlay] [ 400.355351] lookup_one_qstr_excl+0xef/0x150 [ 400.355357] do_unlinkat+0x22a/0x620 [ 400.355366] __x64_sys_unlinkat+0x109/0x1e0 [ 400.355375] do_syscall_64+0x82/0x160 [ 400.355384] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 400.355393] [ 400.355395] Freed by task 5800: [ 400.355400] kasan_save_stack+0x30/0x50 [ 400.355407] kasan_save_track+0x14/0x30 [ 400.355413] kasan_save_free_info+0x3b/0x70 [ 400.355422] __kasan_slab_free+0x4f/0x70 [ 400.355429] kmem_cache_free+0x176/0x520 [ 400.355438] bfq_put_queue+0x67e/0x980 [ 400.355447] bfq_bic_update_cgroup+0x407/0x740 [ 400.355454] bfq_bio_merge+0x133/0x320 [ 400.355460] blk_mq_submit_bio+0x1761/0x1e20 [ 400.355467] __submit_bio+0x28b/0x7b0 [ 400.355473] submit_bio_noacct_nocheck+0x6b2/0xd30 [ 400.355480] iomap_readahead+0x50c/0x680 [ 400.355490] read_pages+0x17f/0x9c0 [ 400.355498] page_cache_ra_unbounded+0x366/0x4a0 [ 400.355505] filemap_fault+0x83d/0x2340 [ 400.355514] __xfs_filemap_fault+0x11a/0x7d0 [xfs] [ 400.356204] __do_fault+0xf1/0x610 [ 400.356213] do_fault+0x977/0x11a0 [ 400.356221] __handle_mm_fault+0x5d1/0x850 [ 400.356230] handle_mm_fault+0x1f8/0x560 [ 400.356238] do_user_addr_fault+0x324/0x970 [ 400.356248] exc_page_fault+0x76/0xf0 [ 400.356258] asm_exc_page_fault+0x26/0x30 [ 400.356266] [ 400.356269] The buggy address belongs to the object at ffff88881cab7bc0 which belongs to the cache bfq_queue of size 576 [ 400.356276] The buggy address is located 416 bytes inside of freed 576-byte region [ffff88881cab7bc0, ffff88881cab7e00) [ 400.356285] [ 400.356287] The buggy address belongs to the physical page: [ 400.356292] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88881cab0b00 pfn:0x81cab0 [ 400.356300] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [ 400.356323] flags: 0x50000000000040(head|node=1|zone=2) [ 400.356331] page_type: f5(slab) [ 400.356340] raw: 0050000000000040 ffff88880a00c280 dead000000000122 0000000000000000 [ 400.356347] raw: ffff88881cab0b00 00000000802e0025 00000001f5000000 0000000000000000 [ 400.356354] head: 0050000000000040 ffff88880a00c280 dead000000000122 0000000000000000 [ 400.356359] head: ffff88881cab0b00 00000000802e0025 00000001f5000000 0000000000000000 [ 400.356365] head: 0050000000000003 ffffea002072ac01 ffffffffffffffff 0000000000000000 [ 400.356370] head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 [ 400.356378] page dumped because: kasan: bad access detected [ 400.356381] [ 400.356383] Memory state around the buggy address: [ 400.356387] ffff88881cab7c00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356392] ffff88881cab7c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356397] >ffff88881cab7d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356400] ^ [ 400.356405] ffff88881cab7d80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 400.356409] ffff88881cab7e00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 400.356413] ================================================================== Cc: stable@vger.kernel.org Fixes: bc3b1e9e7c50 ("block, bfq: merge bfq_release_process_ref() into bfq_put_cooperator()") Signed-off-by: Zach Wade Cc: Ding Hui Reviewed-by: Yu Kuai Link: https://lore.kernel.org/r/20241119153410.2546-1-zachwade.k@gmail.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 1 + block/bfq-iosched.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index e831aedb4643..9fb9f3533150 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -736,6 +736,7 @@ static void bfq_sync_bfqq_move(struct bfq_data *bfqd, */ bfq_put_cooperator(sync_bfqq); bic_set_bfqq(bic, NULL, true, act_idx); + bfq_release_process_ref(bfqd, sync_bfqq); } } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0747d9d0e48c..28c2bb06e859 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5434,8 +5434,6 @@ void bfq_put_cooperator(struct bfq_queue *bfqq) bfq_put_queue(__bfqq); __bfqq = next; } - - bfq_release_process_ref(bfqq->bfqd, bfqq); } static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -5448,6 +5446,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); bfq_put_cooperator(bfqq); + + bfq_release_process_ref(bfqd, bfqq); } static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, @@ -6734,6 +6734,8 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); + + bfq_release_process_ref(bfqq->bfqd, bfqq); return NULL; } From dcbb598e689e30d4636201d35582e167d1b8dfa4 Mon Sep 17 00:00:00 2001 From: Suraj Sonawane Date: Tue, 19 Nov 2024 22:14:12 +0530 Subject: [PATCH 20/36] block: blk-mq: fix uninit-value in blk_rq_prep_clone and refactor Fix an issue detected by the `smatch` tool: block/blk-mq.c:3314 blk_rq_prep_clone() error: uninitialized symbol 'bio'. This patch refactors `blk_rq_prep_clone()` to improve code readability and ensure safety by addressing potential misuse of the `bio` variable: - Move the bio_put(bio); call to the bio_ctr error handling block, which is the only place where it can be triggered. - Move the bio variable into the __rq_for_each_bio loop scope. This change removes the need to set bio to NULL at the loop's end. discussion on why bio remains uninitialized: https://lore.kernel.org/lkml/20241004141037.43277-1-surajsonawane0215@gmail.com Summary of above discussion: - I pointed out that `bio` can remain uninitialized if the allocation with `bio_alloc_clone` fails. - Keith Busch explained that `bio` is initialized to `NULL` when `bio_alloc_clone()` fails, preventing uninitialized usage. - John Garry questioned whether `rq_src->bio` being `NULL` could leave `bio` uninitialized. Keith clarified that in such cases, `bio` is not referenced, so it does not need initialization. - Christoph Hellwig recommended code improvements: - move the bio_put to the bio_ctr error handling, which is the only case where it can happen - move the bio variable into the __rq_for_each_bio scope, which also removed the need to zero it at the end of the loop These changes enhance code clarity, address static analysis tool warnings, and make the function more maintainable. thread of previous version patch discussion: https://lore.kernel.org/lkml/20241004100842.9052-1-surajsonawane0215@gmail.com Signed-off-by: Suraj Sonawane Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241119164412.37609-1-surajsonawane0215@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 270cfd9fc6b0..abdf44736e08 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3273,19 +3273,21 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { - struct bio *bio, *bio_src; + struct bio *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, - bs); + struct bio *bio = bio_alloc_clone(rq->q->disk->part0, bio_src, + gfp_mask, bs); if (!bio) goto free_and_out; - if (bio_ctr && bio_ctr(bio, bio_src, data)) + if (bio_ctr && bio_ctr(bio, bio_src, data)) { + bio_put(bio); goto free_and_out; + } if (rq->bio) { rq->biotail->bi_next = bio; @@ -3293,7 +3295,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } else { rq->bio = rq->biotail = bio; } - bio = NULL; } /* Copy attributes of the original request to the clone request. */ @@ -3311,8 +3312,6 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, return 0; free_and_out: - if (bio) - bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; From 9f8d68283342a48f692f2c02231318bb4a7b207f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:11:50 +0100 Subject: [PATCH 21/36] block: don't bother checking the data direction for merges Because it already is encoded in the opcode. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119161157.1328171-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index e0b28e9298c9..64860cbd5e27 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -864,9 +864,6 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (rq_data_dir(req) != rq_data_dir(next)) - return NULL; - if (req->bio && next->bio) { /* Don't merge requests with different write hints. */ if (req->bio->bi_write_hint != next->bio->bi_write_hint) @@ -986,10 +983,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (req_op(rq) != bio_op(bio)) return false; - /* different data direction or already started, don't merge */ - if (bio_data_dir(bio) != rq_data_dir(rq)) - return false; - /* don't merge across cgroup boundaries */ if (!blk_cgroup_mergeable(rq, bio)) return false; From 81314bfbde9d089fa2318adba54891dfaadb1c05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:11:51 +0100 Subject: [PATCH 22/36] block: req->bio is always set in the merge code As smatch, which is a lot smarter than me noticed. So remove the checks for it, and condense these checks a bit including the comments stating the obvious. Reported-by: Dan Carpenter Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119161157.1328171-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 64860cbd5e27..e01383c6e534 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -864,14 +864,10 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (req->bio && next->bio) { - /* Don't merge requests with different write hints. */ - if (req->bio->bi_write_hint != next->bio->bi_write_hint) - return NULL; - if (req->bio->bi_ioprio != next->bio->bi_ioprio) - return NULL; - } - + if (req->bio->bi_write_hint != next->bio->bi_write_hint) + return NULL; + if (req->bio->bi_ioprio != next->bio->bi_ioprio) + return NULL; if (!blk_atomic_write_mergeable_rqs(req, next)) return NULL; @@ -983,26 +979,16 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (req_op(rq) != bio_op(bio)) return false; - /* don't merge across cgroup boundaries */ if (!blk_cgroup_mergeable(rq, bio)) return false; - - /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; - - /* Only merge if the crypt contexts are compatible */ if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; - - if (rq->bio) { - /* Don't merge requests with different write hints. */ - if (rq->bio->bi_write_hint != bio->bi_write_hint) - return false; - if (rq->bio->bi_ioprio != bio->bi_ioprio) - return false; - } - + if (rq->bio->bi_write_hint != bio->bi_write_hint) + return false; + if (rq->bio->bi_ioprio != bio->bi_ioprio) + return false; if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) return false; From 5a9d1b83e5334915c651604648c20a9fc64d47a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:18 +0100 Subject: [PATCH 23/36] block: return unsigned int from bdev_io_opt The underlying limit is defined as an unsigned int, so return that from bdev_io_opt as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 31867de88213..97c89c0c1398 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1275,7 +1275,7 @@ static inline unsigned int queue_io_opt(const struct request_queue *q) return q->limits.io_opt; } -static inline int bdev_io_opt(struct block_device *bdev) +static inline unsigned int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } From ed5db174cf39374215934f21b04639a7a1513023 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:19 +0100 Subject: [PATCH 24/36] block: return unsigned int from queue_dma_alignment The underlying limit is defined as an unsigned int, so return that from queue_dma_alignment as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-3-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 97c89c0c1398..d8de261c2b99 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1421,7 +1421,7 @@ static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) return is_seq; } -static inline int queue_dma_alignment(const struct request_queue *q) +static inline unsigned int queue_dma_alignment(const struct request_queue *q) { return q->limits.dma_alignment; } From e769489a54401d0c89555f7ad8672038b5c2b767 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:20 +0100 Subject: [PATCH 25/36] block: return unsigned int from blk_lim_dma_alignment_and_pad The underlying limits are defined as unsigned int, so return that from blk_lim_dma_alignment_and_pad as well. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-4-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d8de261c2b99..d0d8190429c8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1462,7 +1462,8 @@ static inline bool bdev_iter_is_aligned(struct block_device *bdev, bdev_logical_block_size(bdev) - 1); } -static inline int blk_lim_dma_alignment_and_pad(struct queue_limits *lim) +static inline unsigned int +blk_lim_dma_alignment_and_pad(struct queue_limits *lim) { return lim->dma_alignment | lim->dma_pad_mask; } From da77d9b23700708d0d22a4407d32a8755a3596e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:21 +0100 Subject: [PATCH 26/36] block: return bool from blk_rq_aligned blk_rq_aligned returns a boolean condition, don't mascquerade it as int. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-5-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d0d8190429c8..5270117d54ac 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1468,7 +1468,7 @@ blk_lim_dma_alignment_and_pad(struct queue_limits *lim) return lim->dma_alignment | lim->dma_pad_mask; } -static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, +static inline bool blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = blk_lim_dma_alignment_and_pad(&q->limits); From e888810bc4f471f85989a0991aff28d2ac9f783b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:22 +0100 Subject: [PATCH 27/36] block: remove a duplicate definition for bdev_read_only bdev_read_only is already defined as an inline function in blkdev.h. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-6-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5270117d54ac..8b4e4692e7fb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1586,7 +1586,6 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } -int bdev_read_only(struct block_device *bdev); int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); From 766a71ef65bb217ed8bf1c068ac14c7d3c15d487 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2024 17:09:23 +0100 Subject: [PATCH 28/36] block: return bool from get_disk_ro and bdev_read_only get_disk_ro and bdev_read_only return boolean conditions, don't masquerade them as int. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20241119160932.1327864-7-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8b4e4692e7fb..08a727b40816 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -779,13 +779,13 @@ static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag) atomic_andnot(flag, &bdev->__bd_flags); } -static inline int get_disk_ro(struct gendisk *disk) +static inline bool get_disk_ro(struct gendisk *disk) { return bdev_test_flag(disk->part0, BD_READ_ONLY) || test_bit(GD_READ_ONLY, &disk->state); } -static inline int bdev_read_only(struct block_device *bdev) +static inline bool bdev_read_only(struct block_device *bdev) { return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk); } From 7d2f9f870f26798699585f96fa7a2a2cab38dc02 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Nov 2024 15:10:07 +0800 Subject: [PATCH 29/36] nvme: introduce change ptpl and iekey definition This is for the next tuning pr code more readble patch, make linux/nvme.h's changes separately. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2baf9a80b470..13377dde4527 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2171,4 +2171,13 @@ enum nvme_pr_release_action { NVME_PR_RELEASE_ACT_CLEAR = 1, }; +enum nvme_pr_change_ptpl { + NVME_PR_CPTPL_NO_CHANGE = 0, + NVME_PR_CPTPL_RESV = 1 << 30, + NVME_PR_CPTPL_CLEARED = 2 << 30, + NVME_PR_CPTPL_PERSIST = 3 << 30, +}; + +#define NVME_PR_IGNORE_KEY (1 << 3) + #endif /* _LINUX_NVME_H */ From 029cc98dec2eadb5d0978b5fea9ae6c427f2a020 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Nov 2024 15:10:08 +0800 Subject: [PATCH 30/36] nvme: tuning pr code by using defined structs and macros All the modifications are simply to make the code more readable, and this patch does not include any functional changes. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pr.c | 126 +++++++++++++++++++++++++---------------- 1 file changed, 77 insertions(+), 49 deletions(-) diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index dc7922f22600..cf2d2c5039dd 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -94,109 +94,137 @@ static int nvme_status_to_pr_err(int status) } } -static int nvme_send_pr_command(struct block_device *bdev, - struct nvme_command *c, void *data, unsigned int data_len) +static int __nvme_send_pr_command(struct block_device *bdev, u32 cdw10, + u32 cdw11, u8 op, void *data, unsigned int data_len) { - if (nvme_disk_is_ns_head(bdev->bd_disk)) - return nvme_send_ns_head_pr_command(bdev, c, data, data_len); - - return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data, - data_len); -} - -static int nvme_pr_command(struct block_device *bdev, u32 cdw10, - u64 key, u64 sa_key, u8 op) -{ - struct nvme_command c = { }; - u8 data[16] = { 0, }; - int ret; - - put_unaligned_le64(key, &data[0]); - put_unaligned_le64(sa_key, &data[8]); + struct nvme_command c = { 0 }; c.common.opcode = op; c.common.cdw10 = cpu_to_le32(cdw10); + c.common.cdw11 = cpu_to_le32(cdw11); - ret = nvme_send_pr_command(bdev, &c, data, sizeof(data)); - if (ret < 0) - return ret; - - return nvme_status_to_pr_err(ret); + if (nvme_disk_is_ns_head(bdev->bd_disk)) + return nvme_send_ns_head_pr_command(bdev, &c, data, data_len); + return nvme_send_ns_pr_command(bdev->bd_disk->private_data, &c, + data, data_len); } -static int nvme_pr_register(struct block_device *bdev, u64 old, - u64 new, unsigned flags) +static int nvme_send_pr_command(struct block_device *bdev, u32 cdw10, u32 cdw11, + u8 op, void *data, unsigned int data_len) { + int ret; + + ret = __nvme_send_pr_command(bdev, cdw10, cdw11, op, data, data_len); + return ret < 0 ? ret : nvme_status_to_pr_err(ret); +} + +static int nvme_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, + unsigned int flags) +{ + struct nvmet_pr_register_data data = { 0 }; u32 cdw10; if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; - cdw10 = old ? 2 : 0; - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); + data.crkey = cpu_to_le64(old_key); + data.nrkey = cpu_to_le64(new_key); + + cdw10 = old_key ? NVME_PR_REGISTER_ACT_REPLACE : + NVME_PR_REGISTER_ACT_REG; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0; + cdw10 |= NVME_PR_CPTPL_PERSIST; + + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_register, + &data, sizeof(data)); } static int nvme_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, unsigned flags) { + struct nvmet_pr_acquire_data data = { 0 }; u32 cdw10; if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; - cdw10 = nvme_pr_type_from_blk(type) << 8; - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); + data.crkey = cpu_to_le64(key); + + cdw10 = NVME_PR_ACQUIRE_ACT_ACQUIRE; + cdw10 |= nvme_pr_type_from_blk(type) << 8; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0; + + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire, + &data, sizeof(data)); } static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, enum pr_type type, bool abort) { - u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (abort ? 2 : 1); + struct nvmet_pr_acquire_data data = { 0 }; + u32 cdw10; - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); + data.crkey = cpu_to_le64(old); + data.prkey = cpu_to_le64(new); + + cdw10 = abort ? NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT : + NVME_PR_ACQUIRE_ACT_PREEMPT; + cdw10 |= nvme_pr_type_from_blk(type) << 8; + + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_acquire, + &data, sizeof(data)); } static int nvme_pr_clear(struct block_device *bdev, u64 key) { - u32 cdw10 = 1 | (key ? 0 : 1 << 3); + struct nvmet_pr_release_data data = { 0 }; + u32 cdw10; - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); + data.crkey = cpu_to_le64(key); + + cdw10 = NVME_PR_RELEASE_ACT_CLEAR; + cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY; + + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release, + &data, sizeof(data)); } static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { - u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (key ? 0 : 1 << 3); + struct nvmet_pr_release_data data = { 0 }; + u32 cdw10; - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); + data.crkey = cpu_to_le64(key); + + cdw10 = NVME_PR_RELEASE_ACT_RELEASE; + cdw10 |= nvme_pr_type_from_blk(type) << 8; + cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY; + + return nvme_send_pr_command(bdev, cdw10, 0, nvme_cmd_resv_release, + &data, sizeof(data)); } static int nvme_pr_resv_report(struct block_device *bdev, void *data, u32 data_len, bool *eds) { - struct nvme_command c = { }; + u32 cdw10, cdw11; int ret; - c.common.opcode = nvme_cmd_resv_report; - c.common.cdw10 = cpu_to_le32(nvme_bytes_to_numd(data_len)); - c.common.cdw11 = cpu_to_le32(NVME_EXTENDED_DATA_STRUCT); + cdw10 = nvme_bytes_to_numd(data_len); + cdw11 = NVME_EXTENDED_DATA_STRUCT; *eds = true; retry: - ret = nvme_send_pr_command(bdev, &c, data, data_len); + ret = __nvme_send_pr_command(bdev, cdw10, cdw11, nvme_cmd_resv_report, + data, data_len); if (ret == NVME_SC_HOST_ID_INCONSIST && - c.common.cdw11 == cpu_to_le32(NVME_EXTENDED_DATA_STRUCT)) { - c.common.cdw11 = 0; + cdw11 == NVME_EXTENDED_DATA_STRUCT) { + cdw11 = 0; *eds = false; goto retry; } - if (ret < 0) - return ret; - - return nvme_status_to_pr_err(ret); + return ret < 0 ? ret : nvme_status_to_pr_err(ret); } static int nvme_pr_read_keys(struct block_device *bdev, From edc80c585772cac59ef780899269436a0823fe67 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 25 Nov 2024 10:02:58 +0000 Subject: [PATCH 31/36] block: Remove extra part pointer NULLify in blk_rq_init() The rq->part pointer is already NULLified in the memset() call, so - like for other pointers in rq - don't re-NULLify. Signed-off-by: John Garry Link: https://lore.kernel.org/r/20241125100258.4172774-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index abdf44736e08..424239c075e2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -388,7 +388,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = blk_time_get_ns(); - rq->part = NULL; blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); From 0b83c86b444ab467134b0e618f45ad2216a4973c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 26 Nov 2024 19:47:05 +0900 Subject: [PATCH 32/36] block: Prevent potential deadlock in blk_revalidate_disk_zones() The function blk_revalidate_disk_zones() calls the function disk_update_zone_resources() after freezing the device queue. In turn, disk_update_zone_resources() calls queue_limits_start_update() which takes a queue limits mutex lock, resulting in the ordering: q->q_usage_counter check -> q->limits_lock. However, the usual ordering is to always take a queue limit lock before freezing the queue to commit the limits updates, e.g., the code pattern: lim = queue_limits_start_update(q); ... blk_mq_freeze_queue(q); ret = queue_limits_commit_update(q, &lim); blk_mq_unfreeze_queue(q); Thus, blk_revalidate_disk_zones() introduces a potential circular locking dependency deadlock that lockdep sometimes catches with the splat: [ 51.934109] ====================================================== [ 51.935916] WARNING: possible circular locking dependency detected [ 51.937561] 6.12.0+ #2107 Not tainted [ 51.938648] ------------------------------------------------------ [ 51.940351] kworker/u16:4/157 is trying to acquire lock: [ 51.941805] ffff9fff0aa0bea8 (&q->limits_lock){+.+.}-{4:4}, at: disk_update_zone_resources+0x86/0x170 [ 51.944314] but task is already holding lock: [ 51.945688] ffff9fff0aa0b890 (&q->q_usage_counter(queue)#3){++++}-{0:0}, at: blk_revalidate_disk_zones+0x15f/0x340 [ 51.948527] which lock already depends on the new lock. [ 51.951296] the existing dependency chain (in reverse order) is: [ 51.953708] -> #1 (&q->q_usage_counter(queue)#3){++++}-{0:0}: [ 51.956131] blk_queue_enter+0x1c9/0x1e0 [ 51.957290] blk_mq_alloc_request+0x187/0x2a0 [ 51.958365] scsi_execute_cmd+0x78/0x490 [scsi_mod] [ 51.959514] read_capacity_16+0x111/0x410 [sd_mod] [ 51.960693] sd_revalidate_disk.isra.0+0x872/0x3240 [sd_mod] [ 51.962004] sd_probe+0x2d7/0x520 [sd_mod] [ 51.962993] really_probe+0xd5/0x330 [ 51.963898] __driver_probe_device+0x78/0x110 [ 51.964925] driver_probe_device+0x1f/0xa0 [ 51.965916] __driver_attach_async_helper+0x60/0xe0 [ 51.967017] async_run_entry_fn+0x2e/0x140 [ 51.968004] process_one_work+0x21f/0x5a0 [ 51.968987] worker_thread+0x1dc/0x3c0 [ 51.969868] kthread+0xe0/0x110 [ 51.970377] ret_from_fork+0x31/0x50 [ 51.970983] ret_from_fork_asm+0x11/0x20 [ 51.971587] -> #0 (&q->limits_lock){+.+.}-{4:4}: [ 51.972479] __lock_acquire+0x1337/0x2130 [ 51.973133] lock_acquire+0xc5/0x2d0 [ 51.973691] __mutex_lock+0xda/0xcf0 [ 51.974300] disk_update_zone_resources+0x86/0x170 [ 51.975032] blk_revalidate_disk_zones+0x16c/0x340 [ 51.975740] sd_zbc_revalidate_zones+0x73/0x160 [sd_mod] [ 51.976524] sd_revalidate_disk.isra.0+0x465/0x3240 [sd_mod] [ 51.977824] sd_probe+0x2d7/0x520 [sd_mod] [ 51.978917] really_probe+0xd5/0x330 [ 51.979915] __driver_probe_device+0x78/0x110 [ 51.981047] driver_probe_device+0x1f/0xa0 [ 51.982143] __driver_attach_async_helper+0x60/0xe0 [ 51.983282] async_run_entry_fn+0x2e/0x140 [ 51.984319] process_one_work+0x21f/0x5a0 [ 51.985873] worker_thread+0x1dc/0x3c0 [ 51.987289] kthread+0xe0/0x110 [ 51.988546] ret_from_fork+0x31/0x50 [ 51.989926] ret_from_fork_asm+0x11/0x20 [ 51.991376] other info that might help us debug this: [ 51.994127] Possible unsafe locking scenario: [ 51.995651] CPU0 CPU1 [ 51.996694] ---- ---- [ 51.997716] lock(&q->q_usage_counter(queue)#3); [ 51.998817] lock(&q->limits_lock); [ 52.000043] lock(&q->q_usage_counter(queue)#3); [ 52.001638] lock(&q->limits_lock); [ 52.002485] *** DEADLOCK *** Prevent this issue by moving the calls to blk_mq_freeze_queue() and blk_mq_unfreeze_queue() around the call to queue_limits_commit_update() in disk_update_zone_resources(). In case of revalidation failure, the call to disk_free_zone_resources() in blk_revalidate_disk_zones() is still done with the queue frozen as before. Fixes: 843283e96e5a ("block: Fake max open zones limit when there is no limit") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241126104705.183996-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 70211751df16..263e28b72053 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1551,6 +1551,7 @@ static int disk_update_zone_resources(struct gendisk *disk, unsigned int nr_seq_zones, nr_conv_zones; unsigned int pool_size; struct queue_limits lim; + int ret; disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; @@ -1601,7 +1602,11 @@ static int disk_update_zone_resources(struct gendisk *disk, } commit: - return queue_limits_commit_update(q, &lim); + blk_mq_freeze_queue(q); + ret = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + + return ret; } static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, @@ -1816,14 +1821,15 @@ int blk_revalidate_disk_zones(struct gendisk *disk) * Set the new disk zone parameters only once the queue is frozen and * all I/Os are completed. */ - blk_mq_freeze_queue(q); if (ret > 0) ret = disk_update_zone_resources(disk, &args); else pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - if (ret) + if (ret) { + blk_mq_freeze_queue(q); disk_free_zone_resources(disk); - blk_mq_unfreeze_queue(q); + blk_mq_unfreeze_queue(q); + } return ret; } From 1b0cab327e060ccf397ae634a34c84dd1d4d2bb2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Nov 2024 11:21:36 +0100 Subject: [PATCH 33/36] mq-deadline: don't call req_get_ioprio from the I/O completion handler req_get_ioprio looks at req->bio to find the I/O priority, which is not set when completing bios that the driver fully iterated through. Stash away the dd_per_prio in the elevator private data instead of looking it up again to optimize the code a bit while fixing the regression from removing the per-request ioprio value. Fixes: 6975c1a486a4 ("block: remove the ioprio field from struct request") Reported-by: Chris Bainbridge Reported-by: Sam Protsenko Signed-off-by: Christoph Hellwig Tested-by: Chris Bainbridge Tested-by: Sam Protsenko Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20241126102136.619067-1-hch@lst.de Signed-off-by: Jens Axboe --- block/mq-deadline.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index acdc28756d9d..91b3789f710e 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -685,10 +685,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; - if (!rq->elv.priv[0]) { + if (!rq->elv.priv[0]) per_prio->stats.inserted++; - rq->elv.priv[0] = (void *)(uintptr_t)1; - } + rq->elv.priv[0] = per_prio; if (blk_mq_sched_try_insert_merge(q, rq, free)) return; @@ -753,18 +752,14 @@ static void dd_prepare_request(struct request *rq) */ static void dd_finish_request(struct request *rq) { - struct request_queue *q = rq->q; - struct deadline_data *dd = q->elevator->elevator_data; - const u8 ioprio_class = dd_rq_ioclass(rq); - const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - struct dd_per_prio *per_prio = &dd->per_prio[prio]; + struct dd_per_prio *per_prio = rq->elv.priv[0]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ - if (rq->elv.priv[0]) + if (per_prio) atomic_inc(&per_prio->stats.completed); } From 2cbd51f1f8739fd2fdf4bae1386bcf75ce0176ba Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 27 Nov 2024 09:23:18 +0000 Subject: [PATCH 34/36] block: Don't allow an atomic write be truncated in blkdev_write_iter() A write which goes past the end of the bdev in blkdev_write_iter() will be truncated. Truncating cannot tolerated for an atomic write, so error that condition. Fixes: caf336f81b3a ("block: Add fops atomic write support") Signed-off-by: John Garry Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241127092318.632790-1-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- block/fops.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/block/fops.c b/block/fops.c index 2d01c9007681..13a67940d040 100644 --- a/block/fops.c +++ b/block/fops.c @@ -677,6 +677,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(bd_inode); + bool atomic = iocb->ki_flags & IOCB_ATOMIC; loff_t size = bdev_nr_bytes(bdev); size_t shorted = 0; ssize_t ret; @@ -696,7 +697,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; - if (iocb->ki_flags & IOCB_ATOMIC) { + if (atomic) { ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; @@ -704,6 +705,8 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) size -= iocb->ki_pos; if (iov_iter_count(from) > size) { + if (atomic) + return -EINVAL; shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } From e8b8344de3980709080d86c157d24e7de07d70ad Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 29 Nov 2024 17:15:09 +0800 Subject: [PATCH 35/36] block, bfq: fix bfqq uaf in bfq_limit_depth() Set new allocated bfqq to bic or remove freed bfqq from bic are both protected by bfqd->lock, however bfq_limit_depth() is deferencing bfqq from bic without the lock, this can lead to UAF if the io_context is shared by multiple tasks. For example, test bfq with io_uring can trigger following UAF in v6.6: ================================================================== BUG: KASAN: slab-use-after-free in bfqq_group+0x15/0x50 Call Trace: dump_stack_lvl+0x47/0x80 print_address_description.constprop.0+0x66/0x300 print_report+0x3e/0x70 kasan_report+0xb4/0xf0 bfqq_group+0x15/0x50 bfqq_request_over_limit+0x130/0x9a0 bfq_limit_depth+0x1b5/0x480 __blk_mq_alloc_requests+0x2b5/0xa00 blk_mq_get_new_requests+0x11d/0x1d0 blk_mq_submit_bio+0x286/0xb00 submit_bio_noacct_nocheck+0x331/0x400 __block_write_full_folio+0x3d0/0x640 writepage_cb+0x3b/0xc0 write_cache_pages+0x254/0x6c0 write_cache_pages+0x254/0x6c0 do_writepages+0x192/0x310 filemap_fdatawrite_wbc+0x95/0xc0 __filemap_fdatawrite_range+0x99/0xd0 filemap_write_and_wait_range.part.0+0x4d/0xa0 blkdev_read_iter+0xef/0x1e0 io_read+0x1b6/0x8a0 io_issue_sqe+0x87/0x300 io_wq_submit_work+0xeb/0x390 io_worker_handle_work+0x24d/0x550 io_wq_worker+0x27f/0x6c0 ret_from_fork_asm+0x1b/0x30 Allocated by task 808602: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_slab_alloc+0x83/0x90 kmem_cache_alloc_node+0x1b1/0x6d0 bfq_get_queue+0x138/0xfa0 bfq_get_bfqq_handle_split+0xe3/0x2c0 bfq_init_rq+0x196/0xbb0 bfq_insert_request.isra.0+0xb5/0x480 bfq_insert_requests+0x156/0x180 blk_mq_insert_request+0x15d/0x440 blk_mq_submit_bio+0x8a4/0xb00 submit_bio_noacct_nocheck+0x331/0x400 __blkdev_direct_IO_async+0x2dd/0x330 blkdev_write_iter+0x39a/0x450 io_write+0x22a/0x840 io_issue_sqe+0x87/0x300 io_wq_submit_work+0xeb/0x390 io_worker_handle_work+0x24d/0x550 io_wq_worker+0x27f/0x6c0 ret_from_fork+0x2d/0x50 ret_from_fork_asm+0x1b/0x30 Freed by task 808589: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x27/0x40 __kasan_slab_free+0x126/0x1b0 kmem_cache_free+0x10c/0x750 bfq_put_queue+0x2dd/0x770 __bfq_insert_request.isra.0+0x155/0x7a0 bfq_insert_request.isra.0+0x122/0x480 bfq_insert_requests+0x156/0x180 blk_mq_dispatch_plug_list+0x528/0x7e0 blk_mq_flush_plug_list.part.0+0xe5/0x590 __blk_flush_plug+0x3b/0x90 blk_finish_plug+0x40/0x60 do_writepages+0x19d/0x310 filemap_fdatawrite_wbc+0x95/0xc0 __filemap_fdatawrite_range+0x99/0xd0 filemap_write_and_wait_range.part.0+0x4d/0xa0 blkdev_read_iter+0xef/0x1e0 io_read+0x1b6/0x8a0 io_issue_sqe+0x87/0x300 io_wq_submit_work+0xeb/0x390 io_worker_handle_work+0x24d/0x550 io_wq_worker+0x27f/0x6c0 ret_from_fork+0x2d/0x50 ret_from_fork_asm+0x1b/0x30 Fix the problem by protecting bic_to_bfqq() with bfqd->lock. CC: Jan Kara Fixes: 76f1df88bbc2 ("bfq: Limit number of requests consumed by each cgroup") Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20241129091509.2227136-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 28c2bb06e859..95dd7b795935 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -582,23 +582,31 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, #define BFQ_LIMIT_INLINE_DEPTH 16 #ifdef CONFIG_BFQ_GROUP_IOSCHED -static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +static bool bfqq_request_over_limit(struct bfq_data *bfqd, + struct bfq_io_cq *bic, blk_opf_t opf, + unsigned int act_idx, int limit) { - struct bfq_data *bfqd = bfqq->bfqd; - struct bfq_entity *entity = &bfqq->entity; struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; struct bfq_entity **entities = inline_entities; - int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH; - int class_idx = bfqq->ioprio_class - 1; + int alloc_depth = BFQ_LIMIT_INLINE_DEPTH; struct bfq_sched_data *sched_data; + struct bfq_entity *entity; + struct bfq_queue *bfqq; unsigned long wsum; bool ret = false; - - if (!entity->on_st_or_in_serv) - return false; + int depth; + int level; retry: spin_lock_irq(&bfqd->lock); + bfqq = bic_to_bfqq(bic, op_is_sync(opf), act_idx); + if (!bfqq) + goto out; + + entity = &bfqq->entity; + if (!entity->on_st_or_in_serv) + goto out; + /* +1 for bfqq entity, root cgroup not included */ depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; if (depth > alloc_depth) { @@ -643,7 +651,7 @@ retry: * class. */ wsum = 0; - for (i = 0; i <= class_idx; i++) { + for (i = 0; i <= bfqq->ioprio_class - 1; i++) { wsum = wsum * IOPRIO_BE_NR + sched_data->service_tree[i].wsum; } @@ -666,7 +674,9 @@ out: return ret; } #else -static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +static bool bfqq_request_over_limit(struct bfq_data *bfqd, + struct bfq_io_cq *bic, blk_opf_t opf, + unsigned int act_idx, int limit) { return false; } @@ -704,8 +714,9 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) } for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { - struct bfq_queue *bfqq = - bic_to_bfqq(bic, op_is_sync(opf), act_idx); + /* Fast path to check if bfqq is already allocated. */ + if (!bic_to_bfqq(bic, op_is_sync(opf), act_idx)) + continue; /* * Does queue (or any parent entity) exceed number of @@ -713,7 +724,7 @@ static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) * limit depth so that it cannot consume more * available requests and thus starve other entities. */ - if (bfqq && bfqq_request_over_limit(bfqq, limit)) { + if (bfqq_request_over_limit(bfqd, bic, opf, act_idx, limit)) { depth = 1; break; } From 82734209bedd65a8b508844bab652b464379bfdd Mon Sep 17 00:00:00 2001 From: Zhang Xianwei Date: Thu, 28 Nov 2024 17:00:56 +0800 Subject: [PATCH 36/36] brd: decrease the number of allocated pages which discarded The number of allocated pages which discarded will not decrease. Fix it. Fixes: 9ead7efc6f3f ("brd: implement discard support") Signed-off-by: Zhang Xianwei Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20241128170056565nPKSz2vsP8K8X2uk2iaDG@zte.com.cn Signed-off-by: Jens Axboe --- drivers/block/brd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 5a95671d8151..292f127cae0a 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -231,8 +231,10 @@ static void brd_do_discard(struct brd_device *brd, sector_t sector, u32 size) xa_lock(&brd->brd_pages); while (size >= PAGE_SIZE && aligned_sector < rd_size * 2) { page = __xa_erase(&brd->brd_pages, aligned_sector >> PAGE_SECTORS_SHIFT); - if (page) + if (page) { __free_page(page); + brd->brd_nr_pages--; + } aligned_sector += PAGE_SECTORS; size -= PAGE_SIZE; }