From efa56305908ba20de2104f1b8508c6a7401833be Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 22 Dec 2023 16:17:48 +0100 Subject: [PATCH 01/45] nvmet-tcp: Fix a kernel panic when host sends an invalid H2C PDU length If the host sends an H2CData command with an invalid DATAL, the kernel may crash in nvmet_tcp_build_pdu_iovec(). Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 lr : nvmet_tcp_io_work+0x6ac/0x718 [nvmet_tcp] Call trace: process_one_work+0x174/0x3c8 worker_thread+0x2d0/0x3e8 kthread+0x104/0x110 Fix the bug by raising a fatal error if DATAL isn't consistent with the packet size. Also, the PDU length should never exceed the MAXH2CDATA parameter which has been communicated to the host in nvmet_tcp_handle_icreq(). Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4cc27856aa8f..ad16795934b8 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -24,6 +24,7 @@ #include "nvmet.h" #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) +#define NVMET_TCP_MAXH2CDATA 0x400000 /* 16M arbitrary limit */ static int param_store_val(const char *str, int *val, int min, int max) { @@ -923,7 +924,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) icresp->hdr.pdo = 0; icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen); icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0); - icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */ + icresp->maxdata = cpu_to_le32(NVMET_TCP_MAXH2CDATA); icresp->cpda = 0; if (queue->hdr_digest) icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE; @@ -978,6 +979,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; + unsigned int plen; if (likely(queue->nr_cmds)) { if (unlikely(data->ttag >= queue->nr_cmds)) { @@ -1001,7 +1003,16 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) return -EPROTO; } + plen = le32_to_cpu(data->hdr.plen); cmd->pdu_len = le32_to_cpu(data->data_length); + if (unlikely(cmd->pdu_len != (plen - sizeof(*data)) || + cmd->pdu_len == 0 || + cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) { + pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len); + /* FIXME: use proper transport errors */ + nvmet_tcp_fatal_error(queue); + return -EPROTO; + } cmd->pdu_recv = 0; nvmet_tcp_build_pdu_iovec(cmd); queue->cmd = cmd; From 0849a5441358cef02586fb2d60f707c0db195628 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 22 Dec 2023 16:17:49 +0100 Subject: [PATCH 02/45] nvmet-tcp: fix a crash in nvmet_req_complete() In nvmet_tcp_handle_h2c_data_pdu(), if the host sends a data_offset different from rbytes_done, the driver ends up calling nvmet_req_complete() passing an error status. The problem is that at this point cmd->req is not yet initialized, so the kernel will crash after dereferencing a NULL pointer. Fix the bug by replacing the call to nvmet_req_complete() with nvmet_tcp_fatal_error().
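Both of these fixes funnel protocol violations into nvmet_tcp_fatal_error(). For reference, the DATAL validation from patch 01 boils down to a single predicate; a minimal standalone sketch (the wrapper function is illustrative, the three conditions mirror the patch, and patch 22 later refines the first one to account for digests):

static bool h2c_data_len_is_valid(u32 pdu_len, u32 plen)
{
	/* DATAL must cover exactly the bytes following the PDU header */
	if (pdu_len != plen - sizeof(struct nvme_tcp_data_pdu))
		return false;
	/* a zero-length H2CData transfer makes no sense */
	if (pdu_len == 0)
		return false;
	/* the host must honor the MAXH2CDATA limit advertised in icresp */
	return pdu_len <= NVMET_TCP_MAXH2CDATA;
}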
Fixes: 872d26a391da ("nvmet-tcp: add NVMe over TCP target driver") Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Maurizio Lombardi Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index ad16795934b8..b4b6a8ac8089 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -998,8 +998,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) data->ttag, le32_to_cpu(data->data_offset), cmd->rbytes_done); /* FIXME: use path and transport errors */ - nvmet_req_complete(&cmd->req, - NVME_SC_INVALID_FIELD | NVME_SC_DNR); + nvmet_tcp_fatal_error(queue); return -EPROTO; } From 75011bd0f9c55db523242f9f9a0b0b826165f14b Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 22 Dec 2023 16:17:50 +0100 Subject: [PATCH 03/45] nvmet-tcp: remove boilerplate code Simplify the nvmet_tcp_handle_h2c_data_pdu() function by removing boilerplate code. Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index b4b6a8ac8089..3569c1255c5e 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -985,8 +985,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) if (unlikely(data->ttag >= queue->nr_cmds)) { pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n", queue->idx, data->ttag, queue->nr_cmds); - nvmet_tcp_fatal_error(queue); - return -EPROTO; + goto err_proto; } cmd = &queue->cmds[data->ttag]; } else { @@ -997,9 +996,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) pr_err("ttag %u unexpected data offset %u (expected %u)\n", data->ttag, le32_to_cpu(data->data_offset), cmd->rbytes_done); - /* FIXME: use path and transport errors */ - nvmet_tcp_fatal_error(queue); - return -EPROTO; + goto err_proto; } plen = le32_to_cpu(data->hdr.plen); @@ -1008,9 +1005,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) cmd->pdu_len == 0 || cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) { pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len); - /* FIXME: use proper transport errors */ - nvmet_tcp_fatal_error(queue); - return -EPROTO; + goto err_proto; } cmd->pdu_recv = 0; nvmet_tcp_build_pdu_iovec(cmd); @@ -1018,6 +1013,11 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) queue->rcv_state = NVMET_TCP_RECV_DATA; return 0; + +err_proto: + /* FIXME: use proper transport errors */ + nvmet_tcp_fatal_error(queue); + return -EPROTO; } static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) From ef184b8844bf98a2a80fab8eecda1489aed5d97f Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Sun, 31 Dec 2023 14:56:44 +0800 Subject: [PATCH 04/45] nvme: tcp: remove unnecessary goto statement There is no requirement to call nvme_tcp_free_queue() for queue deallocation if the pskid is null or the queue allocation fails, as the NVME_TCP_Q_ALLOCATED flag would not be set in such scenarios. 
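The rule this cleanup restores is the standard kernel unwind idiom: an error path should only undo work that has already succeeded, so before anything is set up a plain return is correct. A generic sketch of the shape (hypothetical names):

static int setup(void)
{
	int ret;

	ret = alloc_a();
	if (ret)
		return ret;		/* nothing to unwind yet */

	ret = alloc_b();
	if (ret)
		goto out_free_a;	/* only A exists at this point */

	return 0;

out_free_a:
	free_a();
	return ret;
}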
Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d79811cfa0ce..5056bcae2f39 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1922,14 +1922,13 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) ctrl->opts->subsysnqn); if (!pskid) { dev_err(ctrl->device, "no valid PSK found\n"); - ret = -ENOKEY; - goto out_free_queue; + return -ENOKEY; } } ret = nvme_tcp_alloc_queue(ctrl, 0, pskid); if (ret) - goto out_free_queue; + return ret; ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl)); if (ret) From 2ad28ce9b98f8b22feaecc0966c706a8ef59cbf0 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 1 Jan 2024 12:35:27 +0200 Subject: [PATCH 05/45] nvme: remove unused definition There are no users of the NVMF_AUTH_HASH_LEN macro. Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 44325c068b6a..462c21e0e417 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -20,7 +20,6 @@ #define NVMF_TRSVCID_SIZE 32 #define NVMF_TRADDR_SIZE 256 #define NVMF_TSAS_SIZE 256 -#define NVMF_AUTH_HASH_LEN 64 #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" From 2abd2c39ada8200ca5f02d483dccfa82799f51a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:14:12 +0000 Subject: [PATCH 06/45] nvme-common: mark nvme_tls_psk_prio static Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/common/keyring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/common/keyring.c b/drivers/nvme/common/keyring.c index ee341b83eeba..a5c0431c101c 100644 --- a/drivers/nvme/common/keyring.c +++ b/drivers/nvme/common/keyring.c @@ -111,7 +111,7 @@ static struct key *nvme_tls_psk_lookup(struct key *keyring, * should be preferred to 'generated' PSKs, * and SHA-384 should be preferred to SHA-256. */ -struct nvme_tls_psk_priority_list { +static struct nvme_tls_psk_priority_list { bool generated; enum nvme_tcp_tls_cipher cipher; } nvme_tls_psk_prio[] = { From 3a96bff229d6e3016805fd6c3dba0655ccba01eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:13:29 +0000 Subject: [PATCH 07/45] nvmet-tcp: fix a missing endianness conversion in nvmet_tcp_try_peek_pdu No, a __le32 cast doesn't magically byteswap on big-endian systems.
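To make the distinction concrete: a cast only relabels the value, while cpu_to_le32() produces the little-endian bit pattern, byteswapping on big-endian hosts. A minimal sketch (an icreq PDU is 128 bytes, hence the constant; illustrative only):

	__le32 wire = hdr->plen;	/* little-endian, as read off the wire */

	/*
	 * Wrong: on big-endian, (__le32)128 keeps the host-order bit
	 * pattern, so the comparison is against the wrong representation
	 * and never matches a well-formed icreq.
	 */
	bool broken = (wire == (__le32)128);

	/* Right: cpu_to_le32() matches the wire representation. */
	bool fixed = (wire == cpu_to_le32(128));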
Fixes: 70525e5d82f6 ("nvmet-tcp: peek icreq before starting TLS") Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 3569c1255c5e..792828fb91cc 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1778,7 +1778,7 @@ static int nvmet_tcp_try_peek_pdu(struct nvmet_tcp_queue *queue) (int)sizeof(struct nvme_tcp_icreq_pdu)); if (hdr->type == nvme_tcp_icreq && hdr->hlen == sizeof(struct nvme_tcp_icreq_pdu) && - hdr->plen == (__le32)sizeof(struct nvme_tcp_icreq_pdu)) { + hdr->plen == cpu_to_le32(sizeof(struct nvme_tcp_icreq_pdu))) { pr_debug("queue %d: icreq detected\n", queue->idx); return len; From d3074e9a73e3c0511f1033b15345e2feb9664b3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:41 +0000 Subject: [PATCH 08/45] nvme: update the explanation for not updating the limits in nvme_config_discard Expand the comment a bit to explain what is going on. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d144d1acb09a..56107cfc97b7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1743,7 +1743,13 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, queue->limits.discard_granularity = size; - /* If discard is already enabled, don't reset queue limits */ + /* + * If discard is already enabled, don't reset queue limits. + * + * This works around the fact that the block layer can't cope well with + * updating the hardware limits when overridden through sysfs. This is + * harmless because discard limits in NVMe are purely advisory. + */ if (queue->limits.max_discard_sectors) return; From a4be9679aa3e862adcab465122c7678c2b5d40e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:42 +0000 Subject: [PATCH 09/45] nvme: also skip discard granularity updates in nvme_config_discard Skip not just the discard sectors and segments but also the granularity if a value was already set before. Signed-off-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 56107cfc97b7..6c52b0ab382c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1727,7 +1727,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, struct nvme_ns_head *head) { struct request_queue *queue = disk->queue; - u32 size = queue_logical_block_size(queue); if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) ctrl->max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl); @@ -1741,8 +1740,6 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - queue->limits.discard_granularity = size; - /* * If discard is already enabled, don't reset queue limits.
* @@ -1755,6 +1752,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); + queue->limits.discard_granularity = queue_logical_block_size(queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); From f29886c249ec2ed566e423fd02f6071b8f0a3346 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:43 +0000 Subject: [PATCH 10/45] nvme: fix max_discard_sectors calculation ctrl->max_discard_sectors stores a value that is potentially based on the DMRSL field in Identify Controller, which is in units of LBAs and thus dependent on the Format of a namespace. Fix this by moving the calculation of max_discard_sectors entirely into nvme_config_discard and replacing the ctrl->max_discard_sectors value with a local variable so that the calculation is always namespace-specific. Fixes: 1a86924e4f46 ("nvme: fix interpretation of DMRSL") Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 20 +++++++++----------- drivers/nvme/host/nvme.h | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c52b0ab382c..86b9a1c4876f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1727,12 +1727,13 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, struct nvme_ns_head *head) { struct request_queue *queue = disk->queue; + u32 max_discard_sectors; - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) - ctrl->max_discard_sectors = - nvme_lba_to_sect(head, ctrl->dmrsl); - - if (ctrl->max_discard_sectors == 0) { + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) { + max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl); + } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { + max_discard_sectors = UINT_MAX; + } else { blk_queue_max_discard_sectors(queue, 0); return; } @@ -1750,7 +1751,7 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, if (queue->limits.max_discard_sectors) return; - blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors); + blk_queue_max_discard_sectors(queue, max_discard_sectors); blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); queue->limits.discard_granularity = queue_logical_block_size(queue); @@ -2911,13 +2912,10 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) struct nvme_id_ctrl_nvm *id; int ret; - if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { - ctrl->max_discard_sectors = UINT_MAX; + if (ctrl->oncs & NVME_CTRL_ONCS_DSM) ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; - } else { - ctrl->max_discard_sectors = 0; + else ctrl->max_discard_segments = 0; - } /* * Even though NVMe spec explicitly states that MDTS is not applicable diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 3dbd187896d8..9a698c49ea03 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -297,7 +297,6 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; - u32 max_discard_sectors; u32 max_discard_segments; u32 max_zeroes_sectors; #ifdef CONFIG_BLK_DEV_ZONED From 3b946fe1cc149b23dad3a233c77b1475834f4d6f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 26 Dec 2023 08:58:44 +0000 Subject: [PATCH 11/45] nvme: simplify the max_discard_segments calculation Just stash away
the DMRL value in the nvme_ctrl structure, and leave all interpretation to nvme_config_discard, where we know DSM is supported by the time we're configuring the number of segments. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 13 +++++-------- drivers/nvme/host/nvme.h | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 86b9a1c4876f..50818dbcfa1a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1752,7 +1752,10 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, return; blk_queue_max_discard_sectors(queue, max_discard_sectors); - blk_queue_max_discard_segments(queue, ctrl->max_discard_segments); + if (ctrl->dmrl) + blk_queue_max_discard_segments(queue, ctrl->dmrl); + else + blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); queue->limits.discard_granularity = queue_logical_block_size(queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) @@ -2912,11 +2915,6 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) struct nvme_id_ctrl_nvm *id; int ret; - if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - ctrl->max_discard_segments = NVME_DSM_MAX_RANGES; - else - ctrl->max_discard_segments = 0; - /* * Even though NVMe spec explicitly states that MDTS is not applicable * to the write-zeroes, we are cautious and limit the size to the @@ -2946,8 +2944,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) if (ret) goto free_data; - if (id->dmrl) - ctrl->max_discard_segments = id->dmrl; + ctrl->dmrl = id->dmrl; ctrl->dmrsl = le32_to_cpu(id->dmrsl); if (id->wzsl) ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9a698c49ea03..297b80430f1b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -297,13 +297,13 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; - u32 max_discard_segments; u32 max_zeroes_sectors; #ifdef CONFIG_BLK_DEV_ZONED u32 max_zone_append; #endif u16 crdt[3]; u16 oncs; + u8 dmrl; u32 dmrsl; u16 oacs; u16 sqsize; From 72e8c9379dbef2662c2479c3d142e4c44d598a5b Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 18 Dec 2023 16:30:50 +0100 Subject: [PATCH 12/45] nvmet-fc: remove unnecessary bracket There is no need for the bracket around the identifier. Remove it. Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index bd59990b5250..bda7a3009e85 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1031,7 +1031,7 @@ nvmet_fc_match_hostport(struct nvmet_fc_tgtport *tgtport, void *hosthandle) list_for_each_entry(host, &tgtport->host_list, host_list) { if (host->hosthandle == hosthandle && !host->invalid) { if (nvmet_fc_hostport_get(host)) - return (host); + return host; } } From 0e716cec6fb11a14c220ee17c404b67962e902f7 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 18 Dec 2023 16:30:51 +0100 Subject: [PATCH 13/45] nvmet-trace: avoid dereferencing pointer too early The first command issued from the host to the target is the fabrics connect command. At this point, neither the target queue nor the controller has been allocated. But we already try to trace this command in nvmet_req_init(). Reported by KASAN.
Reviewed-by: Hannes Reinecke Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/trace.c | 6 +++--- drivers/nvme/target/trace.h | 28 +++++++++++++++++----------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index bff454d46255..6ee1f3db81d0 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -211,7 +211,7 @@ const char *nvmet_trace_disk_name(struct trace_seq *p, char *name) return ret; } -const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl) +const char *nvmet_trace_ctrl_id(struct trace_seq *p, u16 ctrl_id) { const char *ret = trace_seq_buffer_ptr(p); @@ -224,8 +224,8 @@ const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl) * If we can know the extra data of the connect command in this stage, * we can update this print statement later. */ - if (ctrl) - trace_seq_printf(p, "%d", ctrl->cntlid); + if (ctrl_id) + trace_seq_printf(p, "%d", ctrl_id); else trace_seq_printf(p, "_"); trace_seq_putc(p, 0); diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index 6109b3806b12..2f15070ddc56 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -32,18 +32,24 @@ const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype, nvmet_trace_parse_nvm_cmd(p, opcode, cdw10) : \ nvmet_trace_parse_admin_cmd(p, opcode, cdw10))) -const char *nvmet_trace_ctrl_name(struct trace_seq *p, struct nvmet_ctrl *ctrl); -#define __print_ctrl_name(ctrl) \ - nvmet_trace_ctrl_name(p, ctrl) +const char *nvmet_trace_ctrl_id(struct trace_seq *p, u16 ctrl_id); +#define __print_ctrl_id(ctrl_id) \ + nvmet_trace_ctrl_id(p, ctrl_id) const char *nvmet_trace_disk_name(struct trace_seq *p, char *name); #define __print_disk_name(name) \ nvmet_trace_disk_name(p, name) #ifndef TRACE_HEADER_MULTI_READ -static inline struct nvmet_ctrl *nvmet_req_to_ctrl(struct nvmet_req *req) +static inline u16 nvmet_req_to_ctrl_id(struct nvmet_req *req) { - return req->sq->ctrl; + /* + * The queue and controller pointers are not valid until an association + * has been established. 
+ */ + if (!req->sq || !req->sq->ctrl) + return 0; + return req->sq->ctrl->cntlid; } static inline void __assign_req_name(char *name, struct nvmet_req *req) @@ -63,7 +69,7 @@ TRACE_EVENT(nvmet_req_init, TP_ARGS(req, cmd), TP_STRUCT__entry( __field(struct nvme_command *, cmd) - __field(struct nvmet_ctrl *, ctrl) + __field(u16, ctrl_id) __array(char, disk, DISK_NAME_LEN) __field(int, qid) __field(u16, cid) @@ -76,7 +82,7 @@ TRACE_EVENT(nvmet_req_init, ), TP_fast_assign( __entry->cmd = cmd; - __entry->ctrl = nvmet_req_to_ctrl(req); + __entry->ctrl_id = nvmet_req_to_ctrl_id(req); __assign_req_name(__entry->disk, req); __entry->qid = req->sq->qid; __entry->cid = cmd->common.command_id; @@ -90,7 +96,7 @@ TRACE_EVENT(nvmet_req_init, ), TP_printk("nvmet%s: %sqid=%d, cmdid=%u, nsid=%u, flags=%#x, " "meta=%#llx, cmd=(%s, %s)", - __print_ctrl_name(__entry->ctrl), + __print_ctrl_id(__entry->ctrl_id), __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->nsid, __entry->flags, __entry->metadata, @@ -104,7 +110,7 @@ TRACE_EVENT(nvmet_req_complete, TP_PROTO(struct nvmet_req *req), TP_ARGS(req), TP_STRUCT__entry( - __field(struct nvmet_ctrl *, ctrl) + __field(u16, ctrl_id) __array(char, disk, DISK_NAME_LEN) __field(int, qid) __field(int, cid) @@ -112,7 +118,7 @@ TRACE_EVENT(nvmet_req_complete, __field(u16, status) ), TP_fast_assign( - __entry->ctrl = nvmet_req_to_ctrl(req); + __entry->ctrl_id = nvmet_req_to_ctrl_id(req); __entry->qid = req->cq->qid; __entry->cid = req->cqe->command_id; __entry->result = le64_to_cpu(req->cqe->result.u64); @@ -120,7 +126,7 @@ TRACE_EVENT(nvmet_req_complete, __assign_req_name(__entry->disk, req); ), TP_printk("nvmet%s: %sqid=%d, cmdid=%u, res=%#llx, status=%#x", - __print_ctrl_name(__entry->ctrl), + __print_ctrl_id(__entry->ctrl_id), __print_disk_name(__entry->disk), __entry->qid, __entry->cid, __entry->result, __entry->status) From f644d21baab34c837d639e9639aa8204dba7f3cf Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 18 Dec 2023 16:30:53 +0100 Subject: [PATCH 14/45] nvmet-fcloop: Remove remote port from list when unlinking The remote port is removed too late from the fcloop_nports list. Remove it when the port is unregistered. This prevents a busy loop in fcloop_exit, because it is possible that the remote port is still found in the list and thus we will never progress.
The kernel log will be spammed with nvme_fcloop: fcloop_exit: Failed deleting remote port nvme_fcloop: fcloop_exit: Failed deleting target port Signed-off-by: Daniel Wagner Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Keith Busch --- drivers/nvme/target/fcloop.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index c65a73433c05..ead349af30f1 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -995,11 +995,6 @@ fcloop_nport_free(struct kref *ref) { struct fcloop_nport *nport = container_of(ref, struct fcloop_nport, ref); - unsigned long flags; - - spin_lock_irqsave(&fcloop_lock, flags); - list_del(&nport->nport_list); - spin_unlock_irqrestore(&fcloop_lock, flags); kfree(nport); } @@ -1357,6 +1352,8 @@ __unlink_remote_port(struct fcloop_nport *nport) nport->tport->remoteport = NULL; nport->rport = NULL; + list_del(&nport->nport_list); + return rport; } From bd029a02ce46e77e4d553140b039f93cf78ee8c1 Mon Sep 17 00:00:00 2001 From: "Jim.Lin" Date: Tue, 28 Nov 2023 10:57:37 +0800 Subject: [PATCH 15/45] nvme-pci: disable write zeroes for SK Hynix BC901 Write zeroes on the SK Hynix BC901 drive causes a Chromebook to take more than 20 minutes to switch to developer mode. Disabling write zeroes fixes this issue, and SK Hynix has verified the fix. Signed-off-by: Jim.Lin Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 507bc149046d..f27202680741 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3394,6 +3394,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1c5c, 0x174a), /* SK Hynix P31 SSD */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1c5c, 0x1D59), /* SK Hynix BC901 */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1d97, 0x2263), /* SPCC */ From bafd590910d00327decb3937e77f6f11c3e80e4b Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 27 Dec 2023 17:31:06 +0800 Subject: [PATCH 16/45] nvme: introduce nvme_disk_is_ns_head helper We currently rely on gendisk's file operations (fops) to distinguish between a namespace head (ns_head) and a regular namespace. To enhance code readability, introduce a helper function. Additionally, we must ensure that the device is not an ns_head before calling nvme_get_ns_from_dev(). To enforce this, add a WARN_ON check within nvme_get_ns_from_dev().
Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Liu Song [include fix: https://lore.kernel.org/oe-kbuild-all/202401031943.0N72Tkji-lkp@intel.com/] Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 13 ++++++++++++- drivers/nvme/host/pr.c | 2 +- drivers/nvme/host/sysfs.c | 8 ++++---- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 297b80430f1b..6092cc361837 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -920,6 +920,10 @@ extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; +static inline bool nvme_disk_is_ns_head(struct gendisk *disk) +{ + return disk->fops == &nvme_ns_head_ops; +} #else #define multipath false static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl) @@ -997,6 +1001,10 @@ static inline void nvme_mpath_start_request(struct request *rq) static inline void nvme_mpath_end_request(struct request *rq) { } +static inline bool nvme_disk_is_ns_head(struct gendisk *disk) +{ + return false; +} #endif /* CONFIG_NVME_MULTIPATH */ int nvme_revalidate_zones(struct nvme_ns *ns); @@ -1025,7 +1033,10 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { - return dev_to_disk(dev)->private_data; + struct gendisk *disk = dev_to_disk(dev); + + WARN_ON(nvme_disk_is_ns_head(disk)); + return disk->private_data; } #ifdef CONFIG_NVME_HWMON diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index 391b1465ebfd..fc3eed00f9ff 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -98,7 +98,7 @@ static int nvme_send_pr_command(struct block_device *bdev, struct nvme_command *c, void *data, unsigned int data_len) { if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && - bdev->bd_disk->fops == &nvme_ns_head_ops) + nvme_disk_is_ns_head(bdev->bd_disk)) return nvme_send_ns_head_pr_command(bdev, c, data, data_len); return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data, diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index ac24ad102380..754e91111042 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -39,10 +39,9 @@ static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); - if (disk->fops == &nvme_bdev_ops) - return nvme_get_ns_from_dev(dev)->head; - else + if (nvme_disk_is_ns_head(disk)) return disk->private_data; + return nvme_get_ns_from_dev(dev)->head; } static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, @@ -233,7 +232,8 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, } #ifdef CONFIG_NVME_MULTIPATH if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) { - if (dev_to_disk(dev)->fops != &nvme_bdev_ops) /* per-path attr */ + /* per-path attr */ + if (nvme_disk_is_ns_head(dev_to_disk(dev))) return 0; if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl)) return 0; From 4ee7ffeb4ce50c80bc4504db6f39b25a2df6bcf4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2024 16:56:55 +0100 Subject: [PATCH 17/45] nvmet: re-fix tracing strncpy() warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An earlier patch had tried to address a warning about a string copy with missing zero termination: drivers/nvme/target/trace.h:52:3: warning: 
‘strncpy’ specified bound 32 equals destination size [-Wstringop-truncation] The new version causes a different warning with some compiler versions, notably gcc-9 and gcc-10, and also misses the zero padding that was apparently done intentionally in the original code: drivers/nvme/target/trace.h:56:2: error: 'strncpy' specified bound depends on the length of the source argument [-Werror=stringop-overflow=] Change it to use strscpy_pad() with the original length, which will give a properly padded and zero-terminated string as well as avoiding the warning. Fixes: d86481e924a7 ("nvmet: use min of device_path and disk len") Signed-off-by: Arnd Bergmann Signed-off-by: Keith Busch --- drivers/nvme/target/trace.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index 2f15070ddc56..89020018a0e3 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -59,8 +59,7 @@ static inline void __assign_req_name(char *name, struct nvmet_req *req) return; } - strncpy(name, req->ns->device_path, - min_t(size_t, DISK_NAME_LEN, strlen(req->ns->device_path))); + strscpy_pad(name, req->ns->device_path, DISK_NAME_LEN); } #endif From a7de1dea76cd6a3707707af4ea2f8bc3cdeaeb11 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Jan 2024 16:56:56 +0100 Subject: [PATCH 18/45] nvme: trace: avoid memcpy overflow warning A previous patch introduced a struct_group() in nvme_common_command to help stringop fortification figure out the length of the fields, but one function is not currently using them: In file included from drivers/nvme/target/core.c:7: In file included from include/linux/string.h:254: include/linux/fortify-string.h:592:4: error: call to '__read_overflow2_field' declared with 'warning' attribute: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror,-Wattribute-warning] __read_overflow2_field(q_size_field, size); ^ Change this one to use the correct field name to avoid the warning. Fixes: 5c629dc9609dc ("nvme: use struct group for generic command dwords") Signed-off-by: Arnd Bergmann Signed-off-by: Keith Busch --- drivers/nvme/target/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h index 89020018a0e3..7f7ebf9558e5 100644 --- a/drivers/nvme/target/trace.h +++ b/drivers/nvme/target/trace.h @@ -90,7 +90,7 @@ TRACE_EVENT(nvmet_req_init, __entry->flags = cmd->common.flags; __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = le64_to_cpu(cmd->common.metadata); - memcpy(__entry->cdw10, &cmd->common.cdw10, + memcpy(__entry->cdw10, &cmd->common.cdws, sizeof(__entry->cdw10)); ), TP_printk("nvmet%s: %sqid=%d, cmdid=%u, nsid=%u, flags=%#x, " From 172fb49600c2b96a4ade255fbfc3fe7098b8afd3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 Dec 2023 10:48:30 -0800 Subject: [PATCH 19/45] nvme-pci: enhance timeout kernel log Kernel configs don't necessarily have opcode decoding, and some opcodes are not even decodable. It is still interesting for debugging SSD issues to know what opcode is timing out, what request type it came from, and the data size (if applicable). 
Also print the command_id alongside blk-mq's tag to help match commands with protocol wire traces and firmware logs. Reviewed-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f27202680741..75e763ce09aa 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1284,6 +1284,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) struct request *abort_req; struct nvme_command cmd = { }; u32 csts = readl(dev->bar + NVME_REG_CSTS); + u8 opcode; /* If PCI error recovery process is happening, we cannot reset or * the recovery mechanism will surely fail. @@ -1310,8 +1311,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT) { dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, completion polled\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) QID %d timeout, completion polled\n", + req->tag, nvme_cid(req), nvmeq->qid); return BLK_EH_DONE; } @@ -1327,8 +1328,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) fallthrough; case NVME_CTRL_DELETING: dev_warn_ratelimited(dev->ctrl.device, - "I/O %d QID %d timeout, disable controller\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) QID %d timeout, disable controller\n", + req->tag, nvme_cid(req), nvmeq->qid); nvme_req(req)->flags |= NVME_REQ_CANCELLED; nvme_dev_disable(dev, true); return BLK_EH_DONE; @@ -1343,10 +1344,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * command was already aborted once before and still hasn't been * returned to the driver, or if this is the admin queue. */ + opcode = nvme_req(req)->cmd->common.opcode; if (!nvmeq->qid || iod->aborted) { dev_warn(dev->ctrl.device, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n", + req->tag, nvme_cid(req), opcode, + nvme_opcode_str(nvmeq->qid, opcode, 0), nvmeq->qid); nvme_req(req)->flags |= NVME_REQ_CANCELLED; goto disable; } @@ -1362,10 +1365,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) cmd.abort.sqid = cpu_to_le16(nvmeq->qid); dev_warn(nvmeq->dev->ctrl.device, - "I/O %d (%s) QID %d timeout, aborting\n", - req->tag, - nvme_get_opcode_str(nvme_req(req)->cmd->common.opcode), - nvmeq->qid); + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, aborting req_op:%s(%u) size:%u\n", + req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode), + nvmeq->qid, blk_op_str(req_op(req)), req_op(req), + blk_rq_bytes(req)); abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd), BLK_MQ_REQ_NOWAIT); From a5c1a87ce0876192d4343cdf5f0a22d737eb0ec3 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 7 Jan 2024 02:29:49 +0200 Subject: [PATCH 20/45] nvme-rdma: enhance timeout kernel log Print the command_id alongside blk-mq's tag to help match commands with protocol wire traces and logs.
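For illustration only: with assumed values (tag 12, command_id 0x100c, a Read, I/O queue 3) and an assumed device prefix, the new timeout line derived from the format string above would render roughly as

	nvme nvme0: I/O tag 12 (100c) opcode 0x2 (Read) QID 3 timeout

making it easy to grep the same command_id out of a wire capture or controller firmware log.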
Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index bc90ec3c51b0..2e77c0f25f71 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1941,9 +1941,14 @@ static enum blk_eh_timer_return nvme_rdma_timeout(struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); struct nvme_rdma_queue *queue = req->queue; struct nvme_rdma_ctrl *ctrl = queue->ctrl; + u8 opcode = req->req.cmd->common.opcode; + u8 fctype = req->req.cmd->fabrics.fctype; + int qid = nvme_rdma_queue_idx(queue); - dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", - rq->tag, nvme_rdma_queue_idx(queue)); + dev_warn(ctrl->ctrl.device, + "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout\n", + rq->tag, nvme_cid(rq), opcode, + nvme_opcode_str(qid, opcode, fctype), qid); if (ctrl->ctrl.state != NVME_CTRL_LIVE) { /* From 45c36f04f1beb7c4c45f5910a1f69d6d9d5a19fb Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Sun, 7 Jan 2024 02:29:50 +0200 Subject: [PATCH 21/45] nvme-tcp: enhance timeout kernel log Print the command_id alongside blk-mq's tag to help match commands with protocol wire traces and logs. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 5056bcae2f39..b234f0674aeb 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2425,9 +2425,9 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) int qid = nvme_tcp_queue_id(req->queue); dev_warn(ctrl->device, - "queue %d: timeout cid %#x type %d opcode %#x (%s)\n", - nvme_tcp_queue_id(req->queue), nvme_cid(rq), pdu->hdr.type, - opc, nvme_opcode_str(qid, opc, fctype)); + "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n", + rq->tag, nvme_cid(rq), pdu->hdr.type, opc, + nvme_opcode_str(qid, opc, fctype), qid); if (ctrl->state != NVME_CTRL_LIVE) { /* From 9a1abc24850eb759e36a2f8869161c3b7254c904 Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Fri, 5 Jan 2024 09:14:44 +0100 Subject: [PATCH 22/45] nvmet-tcp: Fix the H2C expected PDU len calculation The nvmet_tcp_handle_h2c_data_pdu() function should take into consideration the possibility that the header digest and/or the data digests are enabled when calculating the expected PDU length, before comparing it to the value stored in cmd->pdu_len.
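A worked example of the corrected arithmetic (header and data digests are 4-byte CRC32C fields in NVMe/TCP, and sizeof(struct nvme_tcp_data_pdu) is 24 bytes; the 4096-byte transfer size is assumed for illustration): with both digests enabled, a 4096-byte H2CData PDU carries plen = 24 + 4 + 4096 + 4 = 4128, so exp_data_len = 4128 - 4 - 4 - 24 = 4096, which matches DATAL. The pre-patch check compared DATAL against plen - 24 = 4104 and would wrongly reject the PDU.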
Fixes: efa56305908b ("nvmet-tcp: Fix a kernel panic when host sends an invalid H2C PDU length") Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 792828fb91cc..4dc60cbcb205 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -979,7 +979,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) { struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvmet_tcp_cmd *cmd; - unsigned int plen; + unsigned int exp_data_len; if (likely(queue->nr_cmds)) { if (unlikely(data->ttag >= queue->nr_cmds)) { @@ -999,9 +999,13 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) goto err_proto; } - plen = le32_to_cpu(data->hdr.plen); + exp_data_len = le32_to_cpu(data->hdr.plen) - + nvmet_tcp_hdgst_len(queue) - + nvmet_tcp_ddgst_len(queue) - + sizeof(*data); + cmd->pdu_len = le32_to_cpu(data->data_length); - if (unlikely(cmd->pdu_len != (plen - sizeof(*data)) || + if (unlikely(cmd->pdu_len != exp_data_len || cmd->pdu_len == 0 || cmd->pdu_len > NVMET_TCP_MAXH2CDATA)) { pr_err("H2CData PDU len %u is invalid\n", cmd->pdu_len); From 3b7cb745473aec7255d66e3854abaa9c3f46f952 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Jan 2024 11:50:16 -0700 Subject: [PATCH 23/45] block: move __get_task_ioprio() into header file We call this once per IO, which can be millions of times per second. Since nobody really uses io priorities, or at least it isn't very common, this is all wasted time and can amount to as much as 3% of the total kernel time. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/ioprio.c | 26 -------------------------- include/linux/ioprio.h | 25 ++++++++++++++++++++++++- 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/block/ioprio.c b/block/ioprio.c index b5a942519a79..73301a261429 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -139,32 +139,6 @@ out: return ret; } -/* - * If the task has set an I/O priority, use that. Otherwise, return - * the default I/O priority. - * - * Expected to be called for current task or with task_lock() held to keep - * io_context stable. - */ -int __get_task_ioprio(struct task_struct *p) -{ - struct io_context *ioc = p->io_context; - int prio; - - if (p != current) - lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; - - if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) - prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), - task_nice_ioprio(p)); - return prio; -} -EXPORT_SYMBOL_GPL(__get_task_ioprio); - static int get_task_ioprio(struct task_struct *p) { int ret; diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 7578d4f6a969..d6a9b5b7ed16 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -47,7 +47,30 @@ static inline int task_nice_ioclass(struct task_struct *task) } #ifdef CONFIG_BLOCK -int __get_task_ioprio(struct task_struct *p); +/* + * If the task has set an I/O priority, use that. Otherwise, return + * the default I/O priority. + * + * Expected to be called for current task or with task_lock() held to keep + * io_context stable. 
+ */ +static inline int __get_task_ioprio(struct task_struct *p) +{ + struct io_context *ioc = p->io_context; + int prio; + + if (p != current) + lockdep_assert_held(&p->alloc_lock); + if (ioc) + prio = ioc->ioprio; + else + prio = IOPRIO_DEFAULT; + + if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) + prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), + task_nice_ioprio(p)); + return prio; +} #else static inline int __get_task_ioprio(struct task_struct *p) { From 53889bcaf536b3abedeaf104019877cee37dd08b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 Jan 2024 11:51:57 -0700 Subject: [PATCH 24/45] block: make __get_task_ioprio() easier to read We don't need to do any gymnastics if we don't have an io_context assigned at all, so just return early with our default priority. Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/ioprio.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index d6a9b5b7ed16..db1249cd9692 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -59,13 +59,13 @@ static inline int __get_task_ioprio(struct task_struct *p) struct io_context *ioc = p->io_context; int prio; + if (!ioc) + return IOPRIO_DEFAULT; + if (p != current) lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; + prio = ioc->ioprio; if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), task_nice_ioprio(p)); From f9cfe7e7f96a9414a17d596e288693c4f2325d49 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 9 Jan 2024 21:39:57 +0800 Subject: [PATCH 25/45] md: Fix md_seq_ops() regressions Commit cf1b6d4441ff ("md: simplify md_seq_ops") introduced the following regressions: 1) If the all_mddevs list is empty, personalities and unused devices won't be shown to the user anymore. 2) If the seq_file buffer overflowed from md_seq_show(), then md_seq_start() will be called again, hence personalities will be shown to the user again. 3) If the seq_file buffer overflowed from md_seq_stop(), seq_read_iter() doesn't handle this, hence unused devices won't be shown to the user. Fix the above problems by printing personalities and unused devices in md_seq_show().
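The seq_file contract behind these fixes, in sketch form (hypothetical names; the shape follows the md_seq_show() rework below): ->start() and ->stop() can run more than once per read when the output buffer overflows, so one-shot header/trailer output belongs in ->show(), keyed off the list-head sentinel from seq_list_start_head() and the last real entry:

static int my_seq_show(struct seq_file *seq, void *v)
{
	/* header: emitted exactly once, for the list-head sentinel */
	if (v == &my_list) {
		print_header(seq);
		return 0;
	}

	/* ... per-entry output for the real list element ... */

	/* trailer: emitted after the last real entry */
	if (v == list_last_entry(&my_list, struct my_item, node))
		print_trailer(seq);
	return 0;
}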
Fixes: cf1b6d4441ff ("md: simplify md_seq_ops") Cc: stable@vger.kernel.org # v6.7+ Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240109133957.2975272-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e351e6c51cc7..ff3057c787c1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8135,6 +8135,19 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } +static void status_personalities(struct seq_file *seq) +{ + struct md_personality *pers; + + seq_puts(seq, "Personalities : "); + spin_lock(&pers_lock); + list_for_each_entry(pers, &pers_list, list) + seq_printf(seq, "[%s] ", pers->name); + + spin_unlock(&pers_lock); + seq_puts(seq, "\n"); +} + static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; @@ -8276,20 +8289,10 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) static void *md_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&all_mddevs_lock) { - struct md_personality *pers; - - seq_puts(seq, "Personalities : "); - spin_lock(&pers_lock); - list_for_each_entry(pers, &pers_list, list) - seq_printf(seq, "[%s] ", pers->name); - - spin_unlock(&pers_lock); - seq_puts(seq, "\n"); seq->poll_event = atomic_read(&md_event_count); - spin_lock(&all_mddevs_lock); - return seq_list_start(&all_mddevs, *pos); + return seq_list_start_head(&all_mddevs, *pos); } static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -8300,16 +8303,23 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void md_seq_stop(struct seq_file *seq, void *v) __releases(&all_mddevs_lock) { - status_unused(seq); spin_unlock(&all_mddevs_lock); } static int md_seq_show(struct seq_file *seq, void *v) { - struct mddev *mddev = list_entry(v, struct mddev, all_mddevs); + struct mddev *mddev; sector_t sectors; struct md_rdev *rdev; + if (v == &all_mddevs) { + status_personalities(seq); + if (list_empty(&all_mddevs)) + status_unused(seq); + return 0; + } + + mddev = list_entry(v, struct mddev, all_mddevs); if (!mddev_get(mddev)) return 0; @@ -8385,6 +8395,10 @@ static int md_seq_show(struct seq_file *seq, void *v) } spin_unlock(&mddev->lock); spin_lock(&all_mddevs_lock); + + if (mddev == list_last_entry(&all_mddevs, struct mddev, all_mddevs)) + status_unused(seq); + if (atomic_dec_and_test(&mddev->active)) __mddev_put(mddev); From 7dab24554dedd4e6f408af8eb2d25c89997a6a1f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 7 Jan 2024 16:12:23 -0800 Subject: [PATCH 26/45] md/raid1: Use blk_opf_t for read and write operations Use the type blk_opf_t for read and write operations instead of int. 
This patch does not affect the generated code but fixes the following sparse warning: drivers/md/raid1.c:1993:60: sparse: sparse: incorrect type in argument 5 (different base types) expected restricted blk_opf_t [usertype] opf got int rw Cc: Song Liu Cc: Jens Axboe Fixes: 3c5e514db58f ("md/raid1: Use the new blk_opf_t type") Cc: stable@vger.kernel.org # v6.0+ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401080657.UjFnvQgX-lkp@intel.com/ Signed-off-by: Bart Van Assche Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240108001223.23835-1-bvanassche@acm.org --- drivers/md/raid1.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index aaa434f0c175..24f0d799fd98 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1968,12 +1968,12 @@ static void end_sync_write(struct bio *bio) } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, - int sectors, struct page *page, int rw) + int sectors, struct page *page, blk_opf_t rw) { if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) /* success */ return 1; - if (rw == WRITE) { + if (rw == REQ_OP_WRITE) { set_bit(WriteErrorSeen, &rdev->flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -2090,7 +2090,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) rdev = conf->mirrors[d].rdev; if (r1_sync_page_io(rdev, sect, s, pages[idx], - WRITE) == 0) { + REQ_OP_WRITE) == 0) { r1_bio->bios[d]->bi_end_io = NULL; rdev_dec_pending(rdev, mddev); } @@ -2105,7 +2105,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio) rdev = conf->mirrors[d].rdev; if (r1_sync_page_io(rdev, sect, s, pages[idx], - READ) != 0) + REQ_OP_READ) != 0) atomic_add(s, &rdev->corrected_errors); } sectors -= s; @@ -2321,7 +2321,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); r1_sync_page_io(rdev, sect, s, - conf->tmppage, WRITE); + conf->tmppage, REQ_OP_WRITE); rdev_dec_pending(rdev, mddev); } } @@ -2335,7 +2335,7 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) !test_bit(Faulty, &rdev->flags)) { atomic_inc(&rdev->nr_pending); if (r1_sync_page_io(rdev, sect, s, - conf->tmppage, READ)) { + conf->tmppage, REQ_OP_READ)) { atomic_add(s, &rdev->corrected_errors); pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %pg)\n", mdname(mddev), s, From 742e324a0679ce271f7475a40056bac6917a950c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2024 08:29:53 -0700 Subject: [PATCH 27/45] block/iocost: silence warning on 'last_period' potentially being unused If CONFIG_TRACEPOINTS isn't enabled, we assign this variable but then never use it. This can cause the compiler to complain about that: block/blk-iocost.c:1264:6: warning: variable 'last_period' set but not used [-Wunused-but-set-variable] 1264 | u64 last_period, cur_period; | ^ Rather than add ifdefs to guard this, just mark it __maybe_unused. 
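A condensed version of the pattern (names illustrative; this mirrors how blk-iocost routes the variables through a tracing macro that expands to nothing without CONFIG_TRACEPOINTS):

#ifdef CONFIG_TRACEPOINTS
#define TRACE_PATH(...)		trace_something(__VA_ARGS__)
#else
#define TRACE_PATH(...)		do { } while (0)	/* arguments vanish entirely */
#endif

static void f(void)
{
	/* the only consumer is the macro above, hence __maybe_unused */
	u64 __maybe_unused t = ktime_get_ns();

	TRACE_PATH(t);
}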
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401102335.GiWdeIo9-lkp@intel.com/ Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 089fcb9cfce3..c8beec6d7df0 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period; + u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; From 748dc0b65ec2b4b7b3dbd7befcc4a54fdcac7988 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 10 Jan 2024 18:29:42 +0900 Subject: [PATCH 28/45] block: fix partial zone append completion handling in req_bio_endio() Partial completion of zone append requests is not allowed, but if a zone append completion indicates a number of completed bytes different from the original BIO size, only the BIO status is set to error. This leads to bio_advance() not setting the BIO size to 0 and thus bio_endio() not being called at the end of req_bio_endio(). Make sure a partially completed zone append is failed and completed immediately by forcing the completed number of bytes (nbytes) to be equal to the BIO size, thus ensuring that bio_endio() is called. Fixes: 297db731847e ("block: fix req_bio_endio append error handling") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240110092942.442334-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index fb29ff5cc281..aa9a05fdd023 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -772,11 +772,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. + * For such case, force the completed nbytes to be equal to + * the BIO size so that bio_advance() sets the BIO remaining + * size to 0 and we end up calling bio_endio() before returning. */ - if (bio->bi_iter.bi_size != nbytes) + if (bio->bi_iter.bi_size != nbytes) { bio->bi_status = BLK_STS_IOERR; - else + nbytes = bio->bi_iter.bi_size; + } else { bio->bi_iter.bi_sector = rq->__sector; + } } bio_advance(bio, nbytes); From 06c59d427017fcde3107c236177fcc74c9db7909 Mon Sep 17 00:00:00 2001 From: William Butler Date: Wed, 10 Jan 2024 18:28:55 +0000 Subject: [PATCH 29/45] nvme-pci: set doorbell config before unquiescing During resets, if queues are unquiesced first, then the host can submit IOs to the controller using shadow doorbell logic but the controller won't be aware. This can prevent necessary MMIO doorbells from being issued, causing requests to be delayed and to time out. Signed-off-by: William Butler Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 75e763ce09aa..46d3897b8986 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2746,10 +2746,10 @@ static void nvme_reset_work(struct work_struct *work) * controller around but remove all namespaces.
*/ if (dev->online_queues > 1) { + nvme_dbbuf_set(dev); nvme_unquiesce_io_queues(&dev->ctrl); nvme_wait_freeze(&dev->ctrl); nvme_pci_update_nr_queues(dev); - nvme_dbbuf_set(dev); nvme_unfreeze(&dev->ctrl); } else { dev_warn(dev->ctrl.device, "IO queues lost\n"); From 07a29b134ce8e47aef15ea71eab8e6b3734a9720 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 8 Dec 2023 13:53:20 +0100 Subject: [PATCH 30/45] nvmet-tcp: avoid circular locking dependency on install_queue() nvmet_tcp_install_queue() is driven from the ->io_work workqueue function, but will call flush_workqueue() which might trigger ->release_work() which itself calls flush_work() on ->io_work. To avoid that, check for pending queues in disconnecting state and return 'controller busy' once a certain threshold is reached. Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 4dc60cbcb205..6a1e6bb80062 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -25,6 +25,7 @@ #define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE) #define NVMET_TCP_MAXH2CDATA 0x400000 /* 16M arbitrary limit */ +#define NVMET_TCP_BACKLOG 128 static int param_store_val(const char *str, int *val, int min, int max) { @@ -2067,7 +2068,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - ret = kernel_listen(port->sock, 128); + ret = kernel_listen(port->sock, NVMET_TCP_BACKLOG); if (ret) { pr_err("failed to listen %d on port sock\n", ret); goto err_sock; @@ -2133,8 +2134,19 @@ static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq) container_of(sq, struct nvmet_tcp_queue, nvme_sq); if (sq->qid == 0) { - /* Let inflight controller teardown complete */ - flush_workqueue(nvmet_wq); + struct nvmet_tcp_queue *q; + int pending = 0; + + /* Check for pending controller teardown */ + mutex_lock(&nvmet_tcp_queue_mutex); + list_for_each_entry(q, &nvmet_tcp_queue_list, queue_list) { + if (q->nvme_sq.ctrl == sq->ctrl && + q->state == NVMET_TCP_Q_DISCONNECTING) + pending++; + } + mutex_unlock(&nvmet_tcp_queue_mutex); + if (pending > NVMET_TCP_BACKLOG) + return NVME_SC_CONNECT_CTRL_BUSY; } queue->nr_cmds = sq->size * 2; From 31deaeb11ba7a885116c9c30892b9f763c04d59c Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 8 Dec 2023 13:53:21 +0100 Subject: [PATCH 31/45] nvmet-rdma: avoid circular locking dependency on install_queue() nvmet_rdma_install_queue() is driven from the ->io_work workqueue function, but will call flush_workqueue() which might trigger ->release_work() which itself calls flush_work() on ->io_work. To avoid that, check for pending queues in disconnecting state and return 'controller busy' once a certain threshold is reached.
Signed-off-by: Hannes Reinecke Tested-by: Shin'ichiro Kawasaki Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 4597bca43a6d..667f9c04f35d 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -37,6 +37,8 @@ #define NVMET_RDMA_MAX_MDTS 8 #define NVMET_RDMA_MAX_METADATA_MDTS 5 +#define NVMET_RDMA_BACKLOG 128 + struct nvmet_rdma_srq; struct nvmet_rdma_cmd { @@ -1583,8 +1585,19 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, } if (queue->host_qid == 0) { - /* Let inflight controller teardown complete */ - flush_workqueue(nvmet_wq); + struct nvmet_rdma_queue *q; + int pending = 0; + + /* Check for pending controller teardown */ + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(q, &nvmet_rdma_queue_list, queue_list) { + if (q->nvme_sq.ctrl == queue->nvme_sq.ctrl && + q->state == NVMET_RDMA_Q_DISCONNECTING) + pending++; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + if (pending > NVMET_RDMA_BACKLOG) + return NVME_SC_CONNECT_CTRL_BUSY; } ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); @@ -1880,7 +1893,7 @@ static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) goto out_destroy_id; } - ret = rdma_listen(cm_id, 128); + ret = rdma_listen(cm_id, NVMET_RDMA_BACKLOG); if (ret) { pr_err("listening to %pISpcs failed (%d)\n", addr, ret); goto out_destroy_id; From 5266caaf5660529e3da53004b8b7174cab6374ed Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 12 Jan 2024 20:26:26 +0800 Subject: [PATCH 32/45] blk-mq: fix IO hang from sbitmap wakeup race In blk_mq_mark_tag_wait(), __add_wait_queue() may be reordered with the following blk_mq_get_driver_tag() when the driver tag allocation fails. Then, in __sbitmap_queue_wake_up(), waitqueue_active() may not observe the waiter added in blk_mq_mark_tag_wait() and wakes up nothing; meanwhile, blk_mq_mark_tag_wait() can't get a driver tag successfully either. This issue can be reproduced by running the following test in a loop; a fio hang can be observed in < 30min when running it on my test VM on a laptop. modprobe -r scsi_debug modprobe scsi_debug delay=0 dev_size_mb=4096 max_queue=1 host_max_queue=1 submit_queues=4 dev=`ls -d /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/* | head -1 | xargs basename` fio --filename=/dev/"$dev" --direct=1 --rw=randrw --bs=4k --iodepth=1 \ --runtime=100 --numjobs=40 --time_based --name=test \ --ioengine=libaio Fix the issue by adding an explicit barrier in blk_mq_mark_tag_wait(), which is an acceptable cost given that we have already run out of tags at this point. Cc: Jan Kara Cc: Kemeng Shi Reported-by: Changhui Zhong Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240112122626.4181044-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index aa9a05fdd023..a4c54c5895a1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1852,6 +1852,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); + /* + * Add one explicit barrier since blk_mq_get_driver_tag() may + * not imply barrier in case of failure. + * + * Order adding us to wait queue and allocating driver tag.
+ * + * The pair is the one implied in sbitmap_queue_wake_up() which + * orders clearing sbitmap tag bits and waitqueue_active() in + * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless + * + * Otherwise, re-order of adding wait queue and getting driver tag + * may cause __sbitmap_queue_wake_up() to wake up nothing because + * the waitqueue_active() may not observe us in wait queue. + */ + smp_mb(); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait From 25c1772a0493463408489b1fae65cf77fe46cac1 Mon Sep 17 00:00:00 2001 From: Christian Heusel Date: Fri, 12 Jan 2024 00:15:18 +0100 Subject: [PATCH 33/45] block: print symbolic error name instead of error code Utilize the %pe print specifier to get the symbolic error name as a string (e.g. "-ENOMEM") in the log message instead of the error code to increase its readability. This change was suggested in https://lore.kernel.org/all/92972476-0b1f-4d0a-9951-af3fc8bc6e65@suswa.mountain/ Signed-off-by: Christian Heusel Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20240111231521.1596838-1-christian@heusel.eu Signed-off-by: Jens Axboe --- block/partitions/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index e6ac73617f3e..cab0d76a828e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -562,8 +562,8 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); + printk(KERN_ERR " %s: p%d could not be added: %pe\n", - disk->disk_name, p, part); return true; } From 309ce6741430b5a7b5e85cd1a7569647f8d9dfa6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jan 2024 14:57:04 +0100 Subject: [PATCH 34/45] blk-mq: rename blk_mq_can_use_cached_rq blk_mq_can_use_cached_rq doesn't just check if we can use the request, but also performs the work to actually use it. Remove the _can in the naming, and improve the comment describing the function. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240111135705.2155518-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a4c54c5895a1..f57b86d6de6a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2900,8 +2900,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, return NULL; } -/* return true if this @rq can be used for @bio */ -static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, +/* + * Check if we can use the passed on request for submitting the passed in bio, + * and remove it from the request list if it can be used.
+ */ +static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); @@ -2979,7 +2982,7 @@ void blk_mq_submit_bio(struct bio *bio) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; - if (blk_mq_can_use_cached_rq(rq, plug, bio)) + if (blk_mq_use_cached_rq(rq, plug, bio)) goto done; percpu_ref_get(&q->q_usage_counter); } else { From 7b4f36cd22a65b750b4cb6ac14804fb7d6e6c67d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Jan 2024 09:12:20 -0700 Subject: [PATCH 35/45] block: ensure we hold a queue reference when using queue limits q_usage_counter is the only thing preventing the limits from changing under us in __bio_split_to_limits, but blk_mq_submit_bio doesn't hold it while calling into it. Move the splitting inside the region where we know we've got a queue reference. Ideally this could still remain a shared section of code, but let's keep the fix simple and defer any refactoring here to later. Reported-by: Christoph Hellwig Fixes: 900e08075202 ("block: move queue enter logic into blk_mq_submit_bio()") Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f57b86d6de6a..e02c4b1af8c5 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2964,12 +2964,6 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t ret; bio = blk_queue_bounce(bio, q); - if (bio_may_exceed_limits(bio, &q->limits)) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - return; - } - bio_set_ioprio(bio); if (plug) { @@ -2978,6 +2972,11 @@ void blk_mq_submit_bio(struct bio *bio) rq = NULL; } if (rq) { + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + return; + } if (!bio_integrity_prep(bio)) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; @@ -2988,6 +2987,11 @@ void blk_mq_submit_bio(struct bio *bio) } else { if (unlikely(bio_queue_enter(bio))) return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto fail; + } if (!bio_integrity_prep(bio)) goto fail; } From 95931a245b44ee04f3359ec432e73614d44d8b38 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Jan 2024 10:00:59 +0100 Subject: [PATCH 36/45] null_blk: Remove usage of the deprecated ida_simple_xx() API ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose.
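For reference, a minimal sketch of the replacement API; the ida and function names here are illustrative, not taken from null_blk:

#include <linux/idr.h>

static DEFINE_IDA(example_ida);

static int example_get_index(void)
{
	/* ida_simple_get(&ida, 0, 0, GFP_KERNEL) used end == 0 to mean "no
	 * upper bound"; ida_alloc() expresses the same thing directly and
	 * returns the smallest free ID, or -ENOMEM/-ENOSPC on failure. */
	return ida_alloc(&example_ida, GFP_KERNEL);
}

static void example_put_index(int index)
{
	/* Direct replacement for ida_simple_remove(). */
	ida_free(&example_ida, index);
}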
Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/bf257b1078475a415cdc3344c6a750842946e367.1705222845.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 9f7695f00c2d..36755f263e8e 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1840,7 +1840,7 @@ static void null_del_dev(struct nullb *nullb) dev = nullb->dev; - ida_simple_remove(&nullb_indexes, nullb->index); + ida_free(&nullb_indexes, nullb->index); list_del_init(&nullb->list); @@ -2174,7 +2174,7 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); mutex_lock(&lock); - rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); + rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) { mutex_unlock(&lock); goto out_cleanup_zone; From 521277d12b5a75982d4f642d2ee22db8d7f986dd Mon Sep 17 00:00:00 2001 From: Nicky Chorley Date: Sun, 14 Jan 2024 19:10:56 +0000 Subject: [PATCH 37/45] block: Correct a documentation comment in blk-cgroup.c Commit 99e603874366 ("blk-cgroup: pass a gendisk to the blkg allocation helpers") changed blkg_alloc() to take a struct gendisk instead of a struct request_queue, but the documentation comment still referred to q. So, update that comment to refer to disk instead and fix a typo. Signed-off-by: Nicky Chorley Link: https://lore.kernel.org/r/20240114191056.6992-1-ndchorley@gmail.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e303fd317313..ff93c385ba5a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) From 5c7fa5c8ad79a1d7cc9f59636e2f99e8b5471248 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 15 Jan 2024 22:56:26 +0800 Subject: [PATCH 38/45] sbitmap: remove stale comment in sbq_calc_wake_batch After commit 106397376c036 ("sbitmap: fix batching wakeup"), we may wake up more than one queue for each batch. Just remove the stale comment that we wake up only one queue for each batch. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240115145626.665562-1-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- lib/sbitmap.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/sbitmap.c b/lib/sbitmap.c index d0a5081dfd12..92c6b1fd8989 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -388,11 +388,6 @@ static unsigned int sbq_calc_wake_batch(struct sbitmap_queue *sbq, unsigned int shallow_depth; /* - * For each batch, we wake up one queue. We need to make sure that our - * batch size is small enough that the full depth of the bitmap, - * potentially limited by a shallow depth, is enough to wake up all of - * the queues. - * * Each full word of the bitmap has bits_per_word bits, and there might * be a partial word. There are depth / bits_per_word full words and * depth % bits_per_word bits left over.
In bitwise arithmetic: From 04036d49c44b1772d4158640f3ccd938a12a3cb8 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sat, 13 Jan 2024 12:09:47 +0800 Subject: [PATCH 39/45] virtio_blk: remove duplicate check if queue is broken in virtblk_done virtqueue_enable_cb() will call virtqueue_poll(), which checks whether the queue is broken at the beginning, so remove the redundant virtqueue_is_broken() call. Acked-by: Jason Wang Signed-off-by: Li RongQing Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 7d7a19b2b9a8..24963f445cfe 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -367,8 +367,6 @@ static void virtblk_done(struct virtqueue *vq) blk_mq_complete_request(req); req_done = true; } - if (unlikely(virtqueue_is_broken(vq))) - break; } while (!virtqueue_enable_cb(vq)); /* In case queue is stopped waiting for more buffers. */ From be50df31c4e2a69f961a3bb759346d299eaa2b23 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 16 Jan 2024 17:34:31 +0300 Subject: [PATCH 40/45] block: bio-integrity: fix kcalloc() arguments order When compiling with gcc version 14.0.1 20240116 (experimental) and W=1, I've noticed the following warning: block/bio-integrity.c: In function 'bio_integrity_map_user': block/bio-integrity.c:339:38: warning: 'kcalloc' sizes specified with 'sizeof' in the earlier argument and not in the later argument [-Wcalloc-transposed-args] 339 | bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL); | ^ block/bio-integrity.c:339:38: note: earlier argument should specify number of elements, later size of each element Since 'n' and 'size' arguments of 'kcalloc()' are multiplied to calculate the final size, their actual order doesn't affect the result and so this is not a bug. But it's still worth fixing. Fixes: 492c5d455969 ("block: bio-integrity: directly map user buffers") Signed-off-by: Dmitry Antipov Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20240116143437.89060-1-dmantipov@yandex.ru Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index feef615e2c9c..c9a16fba58b9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -336,7 +336,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { - bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL); + bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); if (!bvec) return -ENOMEM; pages = NULL; From 7bed6f3d08b7af27b7015da8dc3acf2b9c1f21d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Jan 2024 21:29:59 +0000 Subject: [PATCH 41/45] block: Fix iterating over an empty bio with bio_for_each_folio_all If the bio contains no data, bio_first_folio() calls page_folio() on a NULL pointer and oopses. Move the test that we've reached the end of the bio from bio_next_folio() to bio_first_folio().
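For context, a minimal sketch of a caller; the function is hypothetical, but the iterator macro is the one defined in include/linux/bio.h:

/* With the fix, an empty bio (bi_vcnt == 0) makes bio_first_folio() set
 * fi.folio to NULL, so the loop terminates immediately instead of
 * oopsing in page_folio() on a NULL page. */
static size_t example_folio_bytes(struct bio *bio)
{
	struct folio_iter fi;
	size_t bytes = 0;

	bio_for_each_folio_all(fi, bio)
		bytes += fi.length;

	return bytes;
}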
Reported-by: syzbot+8b23309d5788a79d3eea@syzkaller.appspotmail.com Reported-by: syzbot+004c1e0fced2b4bc3dcc@syzkaller.appspotmail.com Fixes: 640d1930bef4 ("block: Add bio_for_each_folio_all()") Cc: stable@vger.kernel.org Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240116212959.3413014-1-willy@infradead.org [axboe: add unlikely() to error case] Signed-off-by: Jens Axboe --- include/linux/bio.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index ec4db73e5f4e..875d792bffff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -286,6 +286,11 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, { struct bio_vec *bvec = bio_first_bvec_all(bio) + i; + if (unlikely(i >= bio->bi_vcnt)) { + fi->folio = NULL; + return; + } + fi->folio = page_folio(bvec->bv_page); fi->offset = bvec->bv_offset + PAGE_SIZE * (bvec->bv_page - &fi->folio->page); @@ -303,10 +308,8 @@ static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); fi->_next = folio_next(fi->folio); - } else if (fi->_i + 1 < bio->bi_vcnt) { - bio_first_folio(fi, bio, fi->_i + 1); } else { - fi->folio = NULL; + bio_first_folio(fi, bio, fi->_i + 1); } } From 78fbb92af27d0982634116c7a31065f24d092826 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 12 Jan 2024 13:26:57 +0000 Subject: [PATCH 42/45] nbd: always initialize struct msghdr completely syzbot complains that the msg->msg_get_inq value can be uninitialized [1]. struct msghdr got many new fields recently; we should always make sure their values are zero by default. [1] BUG: KMSAN: uninit-value in tcp_recvmsg+0x686/0xac0 net/ipv4/tcp.c:2571 tcp_recvmsg+0x686/0xac0 net/ipv4/tcp.c:2571 inet_recvmsg+0x131/0x580 net/ipv4/af_inet.c:879 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0x12b/0x1e0 net/socket.c:1066 __sock_xmit+0x236/0x5c0 drivers/block/nbd.c:538 nbd_read_reply drivers/block/nbd.c:732 [inline] recv_work+0x262/0x3100 drivers/block/nbd.c:863 process_one_work kernel/workqueue.c:2627 [inline] process_scheduled_works+0x104e/0x1e70 kernel/workqueue.c:2700 worker_thread+0xf45/0x1490 kernel/workqueue.c:2781 kthread+0x3ed/0x540 kernel/kthread.c:388 ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Local variable msg created at: __sock_xmit+0x4c/0x5c0 drivers/block/nbd.c:513 nbd_read_reply drivers/block/nbd.c:732 [inline] recv_work+0x262/0x3100 drivers/block/nbd.c:863 CPU: 1 PID: 7465 Comm: kworker/u5:1 Not tainted 6.7.0-rc7-syzkaller-00041-gf016f7547aee #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 Workqueue: nbd5-recv recv_work Fixes: f94fd25cb0aa ("tcp: pass back data left in socket after receive") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: stable@vger.kernel.org Cc: Josef Bacik Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: nbd@other.debian.org Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240112132657.647112-1-edumazet@google.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4e72ec4e25ac..33a8f37bb6a1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -508,7 +508,7 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, struct iov_iter *iter, int msg_flags, int *sent) { int
result; - struct msghdr msg; + struct msghdr msg = {}; unsigned int noreclaim_flag; if (unlikely(!sock)) { @@ -524,10 +524,6 @@ static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send, do { sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; sock->sk->sk_use_task_frag = false; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; if (send) From 49e60333d743ae32db3bdde2f93bc818482dd741 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 17 Jan 2024 12:36:09 -0800 Subject: [PATCH 43/45] blk-mq: Remove the hctx 'run' debugfs attribute Nobody uses the debugfs hctx 'run' attribute. Hence remove this attribute and also the code that updates the corresponding member variable. Suggested-by: Jens Axboe Cc: Gabriel Ryan Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20240117203609.4122520-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 18 ------------------ block/blk-mq-sched.c | 2 -- include/linux/blk-mq.h | 3 --- 3 files changed, 23 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5cbeb9344f2f..94668e72ab09 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -479,23 +479,6 @@ out: return res; } -static int hctx_run_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->run); - return 0; -} - -static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->run = 0; - return count; -} - static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 67c95f31b15b..451a2c1f1f32 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; - hctx->run++; - /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a676e116085f..7a8150a5f051 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -391,9 +391,6 @@ struct blk_mq_hw_ctx { */ struct blk_mq_tags *sched_tags; - /** @run: Number of dispatched requests. */ - unsigned long run; - /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ From baa7d536077dcdfe2b70c476a8873d1745d3de0f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 17 Jan 2024 18:59:01 +0100 Subject: [PATCH 44/45] loop: fix the direct I/O support check when used on top of block devices __loop_update_dio only checks the alignment requirement for block backed file systems, but misses it for the case where the loop device is created directly on top of another block device.
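To make the alignment rule concrete, here is a worked example with made-up numbers; the checks mirror those in lo_bdev_can_use_dio(), the helper introduced by the diff further down:

static bool example_can_use_dio(void)
{
	unsigned short sb_bsize = 4096;		/* backing device logical block size */
	unsigned int lo_bsize = 512;		/* loop device logical block size */
	unsigned long long lo_offset = 512;	/* offset into the backing device */

	/* The loop device must not use a smaller logical block size than the
	 * backing device: 512-byte direct I/O cannot be issued to a device
	 * with 4096-byte sectors. */
	if (lo_bsize < sb_bsize)
		return false;

	/* Even with lo_bsize == 4096, an offset of 512 shifts every request
	 * off a 4096-byte boundary, so direct I/O must be refused as well. */
	if (lo_offset & (sb_bsize - 1))
		return false;

	return true;
}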
Due to this, creating a loop device with default options plus the direct I/O flag on a > 512 byte sector size file system will lead to incorrect I/O being submitted to the lower block device and a lot of errors from the block layer. This can be seen with xfstests generic/563. Fix the code in __loop_update_dio by factoring the alignment check into a helper, and calling that also for the struct block_device of a block device inode. Also remove the TODO comment talking about dynamically switching between buffered and direct I/O, which would be a recipe for horrible performance and occasional data loss. Fixes: 2e5ab5f379f9 ("block: loop: prepare for supporing direct IO") Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20240117175901.871796-1-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 52 +++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2bd10f3bfcb2..01bb94362404 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -165,39 +165,37 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file) return get_size(lo->lo_offset, lo->lo_sizelimit, file); } +/* + * We support direct I/O only if lo_offset is aligned with the logical I/O size + * of backing device, and the logical block size of loop is bigger than that of + * the backing device. + */ +static bool lo_bdev_can_use_dio(struct loop_device *lo, + struct block_device *backing_bdev) +{ + unsigned short sb_bsize = bdev_logical_block_size(backing_bdev); + + if (queue_logical_block_size(lo->lo_queue) < sb_bsize) + return false; + if (lo->lo_offset & (sb_bsize - 1)) + return false; + return true; +} + static void __loop_update_dio(struct loop_device *lo, bool dio) { struct file *file = lo->lo_backing_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned short sb_bsize = 0; - unsigned dio_align = 0; + struct inode *inode = file->f_mapping->host; + struct block_device *backing_bdev = NULL; bool use_dio; - if (inode->i_sb->s_bdev) { - sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); - dio_align = sb_bsize - 1; - } + if (S_ISBLK(inode->i_mode)) + backing_bdev = I_BDEV(inode); + else if (inode->i_sb->s_bdev) + backing_bdev = inode->i_sb->s_bdev; - /* - * We support direct I/O only if lo_offset is aligned with the - * logical I/O size of backing device, and the logical block - * size of loop is bigger than the backing device's. - * - * TODO: the above condition may be loosed in the future, and - * direct I/O may be switched runtime at that time because most - * of requests in sane applications should be PAGE_SIZE aligned - */ - if (dio) { - if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && - !(lo->lo_offset & dio_align) && - (file->f_mode & FMODE_CAN_ODIRECT)) - use_dio = true; - else - use_dio = false; - } else { - use_dio = false; - } + use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) && + (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev)); if (lo->use_dio == use_dio) return; From b2e792ae883a0aa976d4176dfa7dc933263440ea Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Thu, 18 Jan 2024 09:29:56 +0000 Subject: [PATCH 45/45] Documentation: block: ioprio: Update schedulers This doc hasn't been touched in a while; in the meantime, some new io schedulers were added (e.g. all of mq), some with ioprio support (a short userspace example follows below).
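As background for the reworded document, a minimal userspace sketch of selecting a priority with the ioprio_set() syscall; the constants mirror include/uapi/linux/ioprio.h and the sample program already shipped in ioprio.rst:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_BE		2	/* best-effort, the default class */
#define IOPRIO_WHO_PROCESS	1
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))

int main(void)
{
	/* Lowest best-effort level (7) for the calling process (pid 0). */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 7)) < 0) {
		perror("ioprio_set");
		return 1;
	}
	return 0;
}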
Also reword the introduction to remove reference to CFQ and the limitation that io priorities only work on reads, which is no longer true. Signed-off-by: Christian Loehle Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/a86cfdc8-016f-40f1-8b58-0cb15d2a792c@arm.com Signed-off-by: Jens Axboe --- Documentation/block/ioprio.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/Documentation/block/ioprio.rst b/Documentation/block/ioprio.rst index a25c6d5df87b..4662e1ff3d81 100644 --- a/Documentation/block/ioprio.rst +++ b/Documentation/block/ioprio.rst @@ -6,17 +6,16 @@ Block io priorities Intro ----- -With the introduction of cfq v3 (aka cfq-ts or time sliced cfq), basic io -priorities are supported for reads on files. This enables users to io nice -processes or process groups, similar to what has been possible with cpu -scheduling for ages. This document mainly details the current possibilities -with cfq; other io schedulers do not support io priorities thus far. +The io priority feature enables users to io nice processes or process groups, +similar to what has been possible with cpu scheduling for ages. Support for io +priorities is io scheduler dependent and currently supported by bfq and +mq-deadline. Scheduling classes ------------------ -CFQ implements three generic scheduling classes that determine how io is -served for a process. +Three generic scheduling classes are implemented for io priorities that +determine how io is served for a process. IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given higher priority than any other in the system, processes from this class are