From 84909f7decbd8981a24be829f110c248ecb8c51a Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Sun, 24 Nov 2024 18:25:53 +0530 Subject: [PATCH 01/15] nvmet: use kzalloc instead of ZERO_PAGE in nvme_execute_identify_ns_nvm() The nvme_execute_identify_ns_nvm function uses ZERO_PAGE for copying SG list with all zeros. As ZERO_PAGE would not necessarily return the virtual-address of the zero page, we need to first convert the page address to kernel virtual-address and then use it as source address for copying the data to SG list with all zeros. Using return address of ZERO_PAGE(0) as source address for copying data to SG list would fill the target buffer with random/garbage value and causes the undesired side effect. As other identify implemenations uses kzalloc for allocating a zero filled buffer, we decided use kzalloc for allocating a zero filled buffer in nvme_execute_identify_ns_nvm function and then use this buffer for copying all zeros to SG list buffers. So esentially, we now avoid using ZERO_PAGE. Reported-by: Yi Zhang Fixes: 64a51080eaba ("nvmet: implement id ns for nvm command set") Link: https://lore.kernel.org/all/CAHj4cs8OVyxmn4XTvA=y4uQ3qWpdw-x3M3FSUYr-KpE-nhaFEA@mail.gmail.com/ Signed-off-by: Nilay Shroff Tested-by: Yi Zhang Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 4fa8496a4d96..2962794ce881 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -902,13 +902,18 @@ static void nvmet_execute_identify_ctrl_nvm(struct nvmet_req *req) static void nvme_execute_identify_ns_nvm(struct nvmet_req *req) { u16 status; + struct nvme_id_ns_nvm *id; status = nvmet_req_find_ns(req); if (status) goto out; - status = nvmet_copy_to_sgl(req, 0, ZERO_PAGE(0), - NVME_IDENTIFY_DATA_SIZE); + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); out: nvmet_req_complete(req, status); } From 58a0c875ce028678c9594c7bdf3fe33462392808 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 Nov 2024 07:42:18 +0100 Subject: [PATCH 02/15] nvme: don't apply NVME_QUIRK_DEALLOCATE_ZEROES when DSM is not supported Commit 63dfa1004322 ("nvme: move NVME_QUIRK_DEALLOCATE_ZEROES out of nvme_config_discard") started applying the NVME_QUIRK_DEALLOCATE_ZEROES quirk even then the Dataset Management is not supported. It turns out that there versions of these old Intel SSDs that have DSM support disabled in the firmware, which will now lead to errors everytime a Write Zeroes command is issued. Fix this by checking for DSM support before applying the quirk. Reported-by: Saeed Mirzamohammadi Fixes: 63dfa1004322 ("nvme: move NVME_QUIRK_DEALLOCATE_ZEROES out of nvme_config_discard") Tested-by: Saeed Mirzamohammadi Signed-off-by: Christoph Hellwig Reviewed-by: Nitesh Shetty Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bfd71511c85f..5e5c9e15ad0c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2043,7 +2043,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, lim->physical_block_size = min(phys_bs, atomic_bs); lim->io_min = phys_bs; lim->io_opt = io_opt; - if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) && + (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)) lim->max_write_zeroes_sectors = UINT_MAX; else lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors; From 3c93e4e4a2aeb92ea99e1eac3e1180f5ed49538c Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 30 Nov 2024 18:45:21 +0900 Subject: [PATCH 03/15] block: rnull: add missing MODULE_DESCRIPTION Add the missing description to fix the following warning: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/block/rnull_mod.o Signed-off-by: FUJITA Tomonori Acked-by: Andreas Hindborg Link: https://lore.kernel.org/r/20241130094521.193924-1-fujita.tomonori@gmail.com Signed-off-by: Jens Axboe --- drivers/block/rnull.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs index 5de7223beb4d..9cca05dcf772 100644 --- a/drivers/block/rnull.rs +++ b/drivers/block/rnull.rs @@ -28,6 +28,7 @@ type: NullBlkModule, name: "rnull_mod", author: "Andreas Hindborg", + description: "Rust implementation of the C null block driver", license: "GPL v2", } From b0de5456e201c475d6a860ceeb3ed8ee2923695a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 Dec 2024 09:45:48 -0800 Subject: [PATCH 04/15] nvme-pci: remove two deallocate zeroes quirks The quirk was initially used as a signal to set the discard_zeroes_data queue limit because there were some use cases that relied on that behavior. The queue limit no longer exists as every user of it has been converted to use the write zeroes operation instead. The quirk now means to use a discard command as an alias to a write zeroes request. Two of the devices previously using the quirk support the write zeroes command directly, so these don't need or want to use discard when the desired operation is to write zeroes. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4c644bb7f069..9535e35ef18a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3588,12 +3588,10 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_DEALLOCATE_ZEROES, }, { PCI_VDEVICE(INTEL, 0x0a54), /* Intel P4500/P4600 */ .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES | NVME_QUIRK_IGNORE_DEV_SUBNQN | NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(INTEL, 0x0a55), /* Dell Express Flash P4600 */ - .driver_data = NVME_QUIRK_STRIPE_SIZE | - NVME_QUIRK_DEALLOCATE_ZEROES, }, + .driver_data = NVME_QUIRK_STRIPE_SIZE, }, { PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS | NVME_QUIRK_MEDIUM_PRIO_SQ | From b2e382ae12a63560fca35050498e19e760adf8c0 Mon Sep 17 00:00:00 2001 From: Liequan Che Date: Mon, 2 Dec 2024 19:56:38 +0800 Subject: [PATCH 05/15] bcache: revert replacing IS_ERR_OR_NULL with IS_ERR again Commit 028ddcac477b ("bcache: Remove unnecessary NULL point check in node allocations") leads a NULL pointer deference in cache_set_flush(). 1721 if (!IS_ERR_OR_NULL(c->root)) 1722 list_add(&c->root->list, &c->btree_cache); >From the above code in cache_set_flush(), if previous registration code fails before allocating c->root, it is possible c->root is NULL as what it is initialized. __bch_btree_node_alloc() never returns NULL but c->root is possible to be NULL at above line 1721. This patch replaces IS_ERR() by IS_ERR_OR_NULL() to fix this. Fixes: 028ddcac477b ("bcache: Remove unnecessary NULL point check in node allocations") Signed-off-by: Liequan Che Cc: stable@vger.kernel.org Cc: Zheng Wang Reviewed-by: Mingzhe Zou Signed-off-by: Coly Li Link: https://lore.kernel.org/r/20241202115638.28957-1-colyli@suse.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e7abfdd77c3b..e42f1400cea9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1718,7 +1718,7 @@ static CLOSURE_CALLBACK(cache_set_flush) if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); - if (!IS_ERR(c->root)) + if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); /* From 88c23a32b851e36adc4ab36f796d9b711f47e2bb Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 29 Nov 2024 15:17:06 +0100 Subject: [PATCH 06/15] nvme-fabrics: handle zero MAXCMD without closing the connection The NVMe specification states that MAXCMD is mandatory for NVMe-over-Fabrics implementations. However, some NVMe/TCP and NVMe/FC arrays from major vendors have buggy firmware that reports MAXCMD as zero in the Identify Controller data structure. Currently, the implementation closes the connection in such cases, completely preventing the host from connecting to the target. Fix the issue by printing a clear error message about the firmware bug and allowing the connection to proceed. It assumes that the target supports a MAXCMD value of SQSIZE + 1. If any issues arise, the user can manually adjust SQSIZE to mitigate them. Fixes: 4999568184e5 ("nvme-fabrics: check max outstanding commands") Signed-off-by: Maurizio Lombardi Reviewed-by: Laurence Oberman Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5e5c9e15ad0c..1e904a794a4f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3233,8 +3233,9 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct } if (!ctrl->maxcmd) { - dev_err(ctrl->device, "Maximum outstanding commands is 0\n"); - return -EINVAL; + dev_warn(ctrl->device, + "Firmware bug: maximum outstanding commands is 0\n"); + ctrl->maxcmd = ctrl->sqsize + 1; } return 0; From 41d826c8a9de37af5d1710a37b55720bd5ad8709 Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Sun, 1 Dec 2024 01:02:58 +0800 Subject: [PATCH 07/15] nvmet: replace kmalloc + memset with kzalloc for data allocation cocci warnings: (new ones prefixed by >>) >> drivers/nvme/target/pr.c:831:8-15: WARNING: kzalloc should be used for data, instead of kmalloc/memset The pattern of using 'kmalloc' followed by 'memset' is replaced with 'kzalloc', which is functionally equivalent to 'kmalloc' + 'memset', but more efficient. 'kzalloc' automatically zeroes the allocated memory, making it a faster and more streamlined solution. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411301434.LEckbcWx-lkp@intel.com/ Reviewed-by: Kuan-Wei Chiu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Yu-Chun Lin Signed-off-by: Keith Busch --- drivers/nvme/target/pr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/target/pr.c b/drivers/nvme/target/pr.c index 25a02b50d9f3..90e9f5bbe581 100644 --- a/drivers/nvme/target/pr.c +++ b/drivers/nvme/target/pr.c @@ -828,12 +828,11 @@ static void nvmet_execute_pr_report(struct nvmet_req *req) goto out; } - data = kmalloc(num_bytes, GFP_KERNEL); + data = kzalloc(num_bytes, GFP_KERNEL); if (!data) { status = NVME_SC_INTERNAL; goto out; } - memset(data, 0, num_bytes); data->gen = cpu_to_le32(atomic_read(&pr->generation)); data->ptpls = 0; ctrl_eds = data->regctl_eds; From ad0cf42e1fc4810170a8e8e232e85d69073e4d25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 4 Dec 2024 09:42:40 +0900 Subject: [PATCH 08/15] nvme-pci: don't use dma_alloc_noncontiguous with 0 merge boundary Only call into nvme_alloc_host_mem_single which uses dma_alloc_noncontiguous when there is non-null dma merge boundary. Without this we'll call into dma_alloc_noncontiguous for device using dma-direct, which can work fine as long as the preferred size is below the MAX_ORDER of the page allocator, but blows up with a warning if it is too large. Fixes: 63a5c7a4b4c4 ("nvme-pci: use dma_alloc_noncontigous if possible") Reported-by: Leon Romanovsky Reported-by: Chaitanya Kumar Borah Signed-off-by: Christoph Hellwig Reviewed-by: Leon Romanovsky Tested-by: Chaitanya Kumar Borah Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 9535e35ef18a..1a5ba80f1811 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2172,6 +2172,7 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred, static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { + unsigned long dma_merge_boundary = dma_get_merge_boundary(dev->dev); u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); u64 chunk_size; @@ -2180,7 +2181,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) * If there is an IOMMU that can merge pages, try a virtually * non-contiguous allocation for a single segment first. */ - if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) { + if (dma_merge_boundary && (PAGE_SIZE & dma_merge_boundary) == 0) { if (!nvme_alloc_host_mem_single(dev, preferred)) return 0; } From fec55c29e54d3ca6fe9d7d7d9266098b4514fd34 Mon Sep 17 00:00:00 2001 From: "Chunguang.xu" Date: Tue, 3 Dec 2024 11:34:40 +0800 Subject: [PATCH 09/15] nvme-tcp: fix the memleak while create new ctrl failed Now while we create new ctrl failed, we have not free the tagset occupied by admin_q, here try to fix it. Fixes: fd1418de10b9 ("nvme-tcp: avoid open-coding nvme_tcp_teardown_admin_queue()") Signed-off-by: Chunguang.xu Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3e416af2659f..55abfe5e1d25 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2278,7 +2278,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) } destroy_admin: nvme_stop_keep_alive(ctrl); - nvme_tcp_teardown_admin_queue(ctrl, false); + nvme_tcp_teardown_admin_queue(ctrl, new); return ret; } From 5858b687559809f05393af745cbadf06dee61295 Mon Sep 17 00:00:00 2001 From: "Chunguang.xu" Date: Tue, 3 Dec 2024 11:34:41 +0800 Subject: [PATCH 10/15] nvme-rdma: unquiesce admin_q before destroy it Kernel will hang on destroy admin_q while we create ctrl failed, such as following calltrace: PID: 23644 TASK: ff2d52b40f439fc0 CPU: 2 COMMAND: "nvme" #0 [ff61d23de260fb78] __schedule at ffffffff8323bc15 #1 [ff61d23de260fc08] schedule at ffffffff8323c014 #2 [ff61d23de260fc28] blk_mq_freeze_queue_wait at ffffffff82a3dba1 #3 [ff61d23de260fc78] blk_freeze_queue at ffffffff82a4113a #4 [ff61d23de260fc90] blk_cleanup_queue at ffffffff82a33006 #5 [ff61d23de260fcb0] nvme_rdma_destroy_admin_queue at ffffffffc12686ce #6 [ff61d23de260fcc8] nvme_rdma_setup_ctrl at ffffffffc1268ced #7 [ff61d23de260fd28] nvme_rdma_create_ctrl at ffffffffc126919b #8 [ff61d23de260fd68] nvmf_dev_write at ffffffffc024f362 #9 [ff61d23de260fe38] vfs_write at ffffffff827d5f25 RIP: 00007fda7891d574 RSP: 00007ffe2ef06958 RFLAGS: 00000202 RAX: ffffffffffffffda RBX: 000055e8122a4d90 RCX: 00007fda7891d574 RDX: 000000000000012b RSI: 000055e8122a4d90 RDI: 0000000000000004 RBP: 00007ffe2ef079c0 R8: 000000000000012b R9: 000055e8122a4d90 R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000004 R13: 000055e8122923c0 R14: 000000000000012b R15: 00007fda78a54500 ORIG_RAX: 0000000000000001 CS: 0033 SS: 002b This due to we have quiesced admi_q before cancel requests, but forgot to unquiesce before destroy it, as a result we fail to drain the pending requests, and hang on blk_mq_freeze_queue_wait() forever. Here try to reuse nvme_rdma_teardown_admin_queue() to fix this issue and simplify the code. Fixes: 958dc1d32c80 ("nvme-rdma: add clean action for failed reconnection") Reported-by: Yingfu.zhou Signed-off-by: Chunguang.xu Signed-off-by: Yue.zhao Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index baf7d2490152..86a2891d9bcc 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1091,13 +1091,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) } destroy_admin: nvme_stop_keep_alive(&ctrl->ctrl); - nvme_quiesce_admin_queue(&ctrl->ctrl); - blk_sync_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - nvme_cancel_admin_tagset(&ctrl->ctrl); - if (new) - nvme_remove_admin_tag_set(&ctrl->ctrl); - nvme_rdma_destroy_admin_queue(ctrl); + nvme_rdma_teardown_admin_queue(ctrl, new); return ret; } From fdc5664c690f6e0ee235dbc6b1f17ec59aca55bd Mon Sep 17 00:00:00 2001 From: "Chunguang.xu" Date: Tue, 3 Dec 2024 11:34:42 +0800 Subject: [PATCH 11/15] nvme-tcp: no need to quiesce admin_q in nvme_tcp_teardown_io_queues() As we quiesce admin_q in nvme_tcp_teardown_admin_queue(), so we should no need to quiesce it in nvme_tcp_reaardown_io_queues(), make things simple. Signed-off-by: Chunguang.xu Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 55abfe5e1d25..98bf758dc6fc 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2178,7 +2178,6 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - nvme_quiesce_admin_queue(ctrl); nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); From b4e12f5728ff963ca5590c48b85a20d076bf517d Mon Sep 17 00:00:00 2001 From: "Chunguang.xu" Date: Tue, 3 Dec 2024 11:34:43 +0800 Subject: [PATCH 12/15] nvme-tcp: simplify nvme_tcp_teardown_io_queues() As nvme_tcp_teardown_io_queues() is the only one caller of nvme_tcp_destroy_admin_queue(), so we can merge it into nvme_tcp_teardown_io_queues() to simplify the code. Signed-off-by: Chunguang.xu Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 98bf758dc6fc..28c76a3e1bd2 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2101,14 +2101,6 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) return ret; } -static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_queue(ctrl, 0); - if (remove) - nvme_remove_admin_tag_set(ctrl); - nvme_tcp_free_admin_queue(ctrl); -} - static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) { int error; @@ -2163,9 +2155,11 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); - if (remove) + if (remove) { nvme_unquiesce_admin_queue(ctrl); - nvme_tcp_destroy_admin_queue(ctrl, remove); + nvme_remove_admin_tag_set(ctrl); + } + nvme_tcp_free_admin_queue(ctrl); if (ctrl->tls_pskid) { dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n", ctrl->tls_pskid); From 7678abee0867e6b7fb89aa40f6e9f575f755fb37 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Nov 2024 20:58:21 +0800 Subject: [PATCH 13/15] virtio-blk: don't keep queue frozen during system suspend Commit 4ce6e2db00de ("virtio-blk: Ensure no requests in virtqueues before deleting vqs.") replaces queue quiesce with queue freeze in virtio-blk's PM callbacks. And the motivation is to drain inflight IOs before suspending. block layer's queue freeze looks very handy, but it is also easy to cause deadlock, such as, any attempt to call into bio_queue_enter() may run into deadlock if the queue is frozen in current context. There are all kinds of ->suspend() called in suspend context, so keeping queue frozen in the whole suspend context isn't one good idea. And Marek reported lockdep warning[1] caused by virtio-blk's freeze queue in virtblk_freeze(). [1] https://lore.kernel.org/linux-block/ca16370e-d646-4eee-b9cc-87277c89c43c@samsung.com/ Given the motivation is to drain in-flight IOs, it can be done by calling freeze & unfreeze, meantime restore to previous behavior by keeping queue quiesced during suspend. Cc: Yi Sun Cc: Michael S. Tsirkin Cc: Jason Wang Cc: Stefan Hajnoczi Cc: virtualization@lists.linux.dev Reported-by: Marek Szyprowski Signed-off-by: Ming Lei Acked-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20241112125821.1475793-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c0cdba71f436..3efe378f1386 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1586,9 +1586,12 @@ static void virtblk_remove(struct virtio_device *vdev) static int virtblk_freeze(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; + struct request_queue *q = vblk->disk->queue; /* Ensure no requests in virtqueues before deleting vqs. */ - blk_mq_freeze_queue(vblk->disk->queue); + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue_nowait(q); + blk_mq_unfreeze_queue(q); /* Ensure we don't receive any more interrupts */ virtio_reset_device(vdev); @@ -1612,8 +1615,8 @@ static int virtblk_restore(struct virtio_device *vdev) return ret; virtio_device_ready(vdev); + blk_mq_unquiesce_queue(vblk->disk->queue); - blk_mq_unfreeze_queue(vblk->disk->queue); return 0; } #endif From 4bf485a7db5d82ddd0f3ad2b299893199090375e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Dec 2024 19:16:06 +0800 Subject: [PATCH 14/15] blk-mq: register cpuhp callback after hctx is added to xarray table We need to retrieve 'hctx' from xarray table in the cpuhp callback, so the callback should be registered after this 'hctx' is added to xarray table. Cc: Reinette Chatre Cc: Fenghua Yu Cc: Peter Newman Cc: Babu Moger Cc: Luck Tony Signed-off-by: Ming Lei Tested-by: Tony Luck Link: https://lore.kernel.org/r/20241206111611.978870-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 424239c075e2..a404465036de 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3824,16 +3824,11 @@ static int blk_mq_init_hctx(struct request_queue *q, { hctx->queue_num = hctx_idx; - if (!(hctx->flags & BLK_MQ_F_STACKING)) - cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, - &hctx->cpuhp_online); - cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); - hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) - goto unregister_cpu_notifier; + goto fail; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) @@ -3842,6 +3837,11 @@ static int blk_mq_init_hctx(struct request_queue *q, if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) goto exit_flush_rq; + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); + return 0; exit_flush_rq: @@ -3850,8 +3850,7 @@ static int blk_mq_init_hctx(struct request_queue *q, exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - unregister_cpu_notifier: - blk_mq_remove_cpuhp(hctx); + fail: return -1; } From 22465bbac53c821319089016f268a2437de9b00a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 6 Dec 2024 19:16:07 +0800 Subject: [PATCH 15/15] blk-mq: move cpuhp callback registering out of q->sysfs_lock Registering and unregistering cpuhp callback requires global cpu hotplug lock, which is used everywhere. Meantime q->sysfs_lock is used in block layer almost everywhere. It is easy to trigger lockdep warning[1] by connecting the two locks. Fix the warning by moving blk-mq's cpuhp callback registering out of q->sysfs_lock. Add one dedicated global lock for covering registering & unregistering hctx's cpuhp, and it is safe to do so because hctx is guaranteed to be live if our request_queue is live. [1] https://lore.kernel.org/lkml/Z04pz3AlvI4o0Mr8@agluck-desk3/ Cc: Reinette Chatre Cc: Fenghua Yu Cc: Peter Newman Cc: Babu Moger Reported-by: Luck Tony Signed-off-by: Ming Lei Tested-by: Tony Luck Link: https://lore.kernel.org/r/20241206111611.978870-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 103 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a404465036de..aa340b097b6e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -43,6 +43,7 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); +static DEFINE_MUTEX(blk_mq_cpuhp_lock); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, @@ -3739,13 +3740,91 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) return 0; } -static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) +static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { - if (!(hctx->flags & BLK_MQ_F_STACKING)) + lockdep_assert_held(&blk_mq_cpuhp_lock); + + if (!(hctx->flags & BLK_MQ_F_STACKING) && + !hlist_unhashed(&hctx->cpuhp_online)) { cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); - cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, - &hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_online); + } + + if (!hlist_unhashed(&hctx->cpuhp_dead)) { + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_dead); + } +} + +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) +{ + mutex_lock(&blk_mq_cpuhp_lock); + __blk_mq_remove_cpuhp(hctx); + mutex_unlock(&blk_mq_cpuhp_lock); +} + +static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx) +{ + lockdep_assert_held(&blk_mq_cpuhp_lock); + + if (!(hctx->flags & BLK_MQ_F_STACKING) && + hlist_unhashed(&hctx->cpuhp_online)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); + + if (hlist_unhashed(&hctx->cpuhp_dead)) + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); +} + +static void __blk_mq_remove_cpuhp_list(struct list_head *head) +{ + struct blk_mq_hw_ctx *hctx; + + lockdep_assert_held(&blk_mq_cpuhp_lock); + + list_for_each_entry(hctx, head, hctx_list) + __blk_mq_remove_cpuhp(hctx); +} + +/* + * Unregister cpuhp callbacks from exited hw queues + * + * Safe to call if this `request_queue` is live + */ +static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q) +{ + LIST_HEAD(hctx_list); + + spin_lock(&q->unused_hctx_lock); + list_splice_init(&q->unused_hctx_list, &hctx_list); + spin_unlock(&q->unused_hctx_lock); + + mutex_lock(&blk_mq_cpuhp_lock); + __blk_mq_remove_cpuhp_list(&hctx_list); + mutex_unlock(&blk_mq_cpuhp_lock); + + spin_lock(&q->unused_hctx_lock); + list_splice(&hctx_list, &q->unused_hctx_list); + spin_unlock(&q->unused_hctx_lock); +} + +/* + * Register cpuhp callbacks from all hw queues + * + * Safe to call if this `request_queue` is live + */ +static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + mutex_lock(&blk_mq_cpuhp_lock); + queue_for_each_hw_ctx(q, hctx, i) + __blk_mq_add_cpuhp(hctx); + mutex_unlock(&blk_mq_cpuhp_lock); } /* @@ -3796,8 +3875,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); - blk_mq_remove_cpuhp(hctx); - xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); @@ -3814,6 +3891,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; + blk_mq_remove_cpuhp(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } @@ -3837,11 +3915,6 @@ static int blk_mq_init_hctx(struct request_queue *q, if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) goto exit_flush_rq; - if (!(hctx->flags & BLK_MQ_F_STACKING)) - cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, - &hctx->cpuhp_online); - cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); - return 0; exit_flush_rq: @@ -3876,6 +3949,8 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); + INIT_HLIST_NODE(&hctx->cpuhp_dead); + INIT_HLIST_NODE(&hctx->cpuhp_online); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; @@ -4414,6 +4489,12 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); + + /* unregister cpuhp callbacks for exited hctxs */ + blk_mq_remove_hw_queues_cpuhp(q); + + /* register cpuhp for new initialized hctxs */ + blk_mq_add_hw_queues_cpuhp(q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,