From 46a99e0cf6a4990b81bff1d6df0865a7b3a7cea2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 13 Oct 2020 12:30:48 +0200 Subject: [PATCH 01/27] block/rnbd-clt: remove nr argument from send_usr_msg The argument is not needed since all callers pass 1 for it. Signed-off-by: Guoqing Jiang Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index d7a69741c0f6..de73a14f3580 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -427,7 +427,7 @@ enum wait_type { }; static int send_usr_msg(struct rtrs_clt *rtrs, int dir, - struct rnbd_iu *iu, struct kvec *vec, size_t nr, + struct rnbd_iu *iu, struct kvec *vec, size_t len, struct scatterlist *sg, unsigned int sg_len, void (*conf)(struct work_struct *work), int *errno, enum wait_type wait) @@ -441,7 +441,7 @@ static int send_usr_msg(struct rtrs_clt *rtrs, int dir, .conf_fn = msg_conf, }; err = rtrs_clt_request(dir, &req_ops, rtrs, iu->permit, - vec, nr, len, sg, sg_len); + vec, 1, len, sg, sg_len); if (!err && wait) { wait_event(iu->comp.wait, iu->comp.errno != INT_MAX); *errno = iu->comp.errno; @@ -486,7 +486,7 @@ static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) msg.device_id = cpu_to_le32(device_id); WARN_ON(!rnbd_clt_get_dev(dev)); - err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 1, 0, NULL, 0, + err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 0, NULL, 0, msg_close_conf, &errno, wait); if (err) { rnbd_clt_put_dev(dev); @@ -575,7 +575,7 @@ static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) WARN_ON(!rnbd_clt_get_dev(dev)); err = send_usr_msg(sess->rtrs, READ, iu, - &vec, 1, sizeof(*rsp), iu->sglist, 1, + &vec, sizeof(*rsp), iu->sglist, 1, msg_open_conf, &errno, wait); if (err) { rnbd_clt_put_dev(dev); @@ -629,7 +629,7 @@ static int send_msg_sess_info(struct rnbd_clt_session *sess, 
bool wait) goto put_iu; } err = send_usr_msg(sess->rtrs, READ, iu, - &vec, 1, sizeof(*rsp), iu->sglist, 1, + &vec, sizeof(*rsp), iu->sglist, 1, msg_sess_info_conf, &errno, wait); if (err) { rnbd_clt_put_sess(sess); From 050b654b2a70a978873bd5885a615c6a47c6205a Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Tue, 13 Oct 2020 12:30:49 +0200 Subject: [PATCH 02/27] block/rnbd-clt: do not cap max_hw_sectors & max_segments with remote device The max_hw_sectors is limited only by the transport, not by the remote device; the block layer on the server side will split requests to the device limit if they are too big. The same applies to max_segments: the rtrs server will submit a single buffer, so there is no need to cap it. Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index de73a14f3580..519c7d003bf0 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -91,11 +91,6 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE; dev->max_segments = BMAX_SEGMENTS; - dev->max_hw_sectors = min_t(u32, dev->max_hw_sectors, - le32_to_cpu(rsp->max_hw_sectors)); - dev->max_segments = min_t(u16, dev->max_segments, - le16_to_cpu(rsp->max_segments)); - return 0; } From 47be77c2f80412f903134a57caea25fa3fc5f578 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Tue, 13 Oct 2020 12:30:50 +0200 Subject: [PATCH 03/27] block/rnbd-clt: send_msg_close if any error occurs after send_msg_open After send_msg_open is done, send_msg_close should be done if any error occurs and it is necessary to recover what has been done. 
Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 519c7d003bf0..8b2411ccbda9 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1509,7 +1509,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, "map_device: Failed to configure device, err: %d\n", ret); mutex_unlock(&dev->lock); - goto del_dev; + goto send_close; } rnbd_clt_info(dev, @@ -1528,6 +1528,8 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, return dev; +send_close: + send_msg_close(dev, dev->device_id, WAIT); del_dev: delete_dev(dev); put_dev: From 3b481d91356e5693d8358d4ef9c383bdb92c8da0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 24 Sep 2020 13:53:28 -0700 Subject: [PATCH 04/27] block: add zone specific block statuses A zoned device with limited resources to open or activate zones may return an error when the host exceeds those limits. The same command may be successful if retried later, but the host needs to wait for specific zone states before it should expect a retry to succeed. Have the block layer provide an appropriate status for these conditions so applications can distinguish this error for special handling. Cc: linux-api@vger.kernel.org Cc: Niklas Cassel Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. 
Petersen Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- Documentation/block/queue-sysfs.rst | 8 ++++++++ block/blk-core.c | 4 ++++ include/linux/blk_types.h | 18 ++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index f261a5c84170..2638d3446b79 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -124,6 +124,10 @@ For zoned block devices (zoned attribute indicating "host-managed" or EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value. If this value is 0, there is no limit. +If the host attempts to exceed this limit, the driver should report this error +with BLK_STS_ZONE_ACTIVE_RESOURCE, which user space may see as the EOVERFLOW +errno. + max_open_zones (RO) ------------------- For zoned block devices (zoned attribute indicating "host-managed" or @@ -131,6 +135,10 @@ For zoned block devices (zoned attribute indicating "host-managed" or EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value. If this value is 0, there is no limit. +If the host attempts to exceed this limit, the driver should report this error +with BLK_STS_ZONE_OPEN_RESOURCE, which user space may see as the ETOOMANYREFS +errno. 
+ max_sectors_kb (RW) ------------------- This is the maximum number of kilobytes that the block layer will allow diff --git a/block/blk-core.c b/block/blk-core.c index ac00d2fa4eb4..2db8bda43b6e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -186,6 +186,10 @@ static const struct { /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + /* zone device specific errors */ + [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, + [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 7d7c13238fdb..d9b69bbde5cc 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -104,6 +104,24 @@ typedef u8 __bitwise blk_status_t; */ #define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) +/* + * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion + * path if the device returns a status indicating that too many zone resources + * are currently open. The same command should be successful if resubmitted + * after the number of open zones decreases below the device's limits, which is + * reported in the request_queue's max_open_zones. + */ +#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15) + +/* + * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion + * path if the device returns a status indicating that too many zone resources + * are currently active. The same command should be successful if resubmitted + * after the number of active zones decreases below the device's limits, which + * is reported in the request_queue's max_active_zones. 
+ */ +#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with From afaf5c6c81d736d7a3376801f4af396b04292191 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 24 Sep 2020 13:53:29 -0700 Subject: [PATCH 05/27] nvme: translate zone resource errors Translate zoned resource errors to the appropriate blk_status_t. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 56e2a22e8a02..95ef4943d8bd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -248,6 +248,10 @@ static blk_status_t nvme_error_status(u16 status) return BLK_STS_NEXUS; case NVME_SC_HOST_PATH_ERROR: return BLK_STS_TRANSPORT; + case NVME_SC_ZONE_TOO_MANY_ACTIVE: + return BLK_STS_ZONE_ACTIVE_RESOURCE; + case NVME_SC_ZONE_TOO_MANY_OPEN: + return BLK_STS_ZONE_OPEN_RESOURCE; default: return BLK_STS_IOERR; } From d8f53b0ab0337762cc9e7b50d0c60b5bd091a0e1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 24 Sep 2020 13:53:30 -0700 Subject: [PATCH 06/27] scsi: handle zone resources errors ZBC or ZAC disks that have a limit on the number of open zones may fail a zone open command or a write to a zone that is not already implicitly or explicitly open if the total number of open zones is already at the maximum allowed. For these operations, instead of returning the generic BLK_STS_IOERR, return BLK_STS_ZONE_OPEN_RESOURCE which is returned as -ETOOMANYREFS to the I/O issuer, allowing the device user to act appropriately on these relatively benign zone resource errors. Acked-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index a89478a0c588..72b12102f777 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -758,6 +758,15 @@ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) /* See SSC3rXX or current. */ action = ACTION_FAIL; break; + case DATA_PROTECT: + action = ACTION_FAIL; + if ((sshdr.asc == 0x0C && sshdr.ascq == 0x12) || + (sshdr.asc == 0x55 && + (sshdr.ascq == 0x0E || sshdr.ascq == 0x0F))) { + /* Insufficient zone resources */ + blk_stat = BLK_STS_ZONE_OPEN_RESOURCE; + } + break; default: action = ACTION_FAIL; break; From 87aac3a80af5cbad93e63250e8a1e19095ba0d30 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 13 Oct 2020 22:45:14 -0400 Subject: [PATCH 07/27] nbd: make the config put is called before the notifying the waiter There is a race that affects ceph's rbd-nbd tool. When mapping, ioctl(nbd, NBD_DO_IT) may fail with EBUSY even though the nbd device has already been unmapped. This happens when recv_work() is scheduled out just after the wake_up() and therefore defers calling nbd_config_put(): although the map process has exited, "nbd->recv_task" is not yet cleared. 
Signed-off-by: Xiubo Li Reviewed-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 2dca0aab0a9a..90c2effb5ded 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -802,9 +802,9 @@ static void recv_work(struct work_struct *work) if (likely(!blk_should_fake_timeout(rq->q))) blk_mq_complete_request(rq); } + nbd_config_put(nbd); atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); - nbd_config_put(nbd); kfree(args); } From a48faebe65b0db55a73b9220c3d919eee849bb79 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 16 Oct 2020 15:33:51 +0100 Subject: [PATCH 08/27] lightnvm: fix out-of-bounds write to array devices->info[] There is an off-by-one array check that can lead to an out-of-bounds write to devices->info[i]. Fix this by using >= rather than > for the size check. Also replace the hard-coded array size limit with ARRAY_SIZE on the array. 
Addresses-Coverity: ("Out-of-bounds write") Fixes: cd9e9808d18f ("lightnvm: Support for Open-Channel SSDs") Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index fe78bf0fdce5..c1bcac71008c 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -1311,8 +1311,9 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg) strlcpy(info->bmname, "gennvm", sizeof(info->bmname)); i++; - if (i > 31) { - pr_err("max 31 devices can be reported.\n"); + if (i >= ARRAY_SIZE(devices->info)) { + pr_err("max %zd devices can be reported.\n", + ARRAY_SIZE(devices->info)); break; } } From b2a182a40278bc5849730e66bca01a762188ed86 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Thu, 15 Oct 2020 14:57:35 -0400 Subject: [PATCH 09/27] sgl_alloc_order: fix memory leak sgl_alloc_order() can fail when 'length' is large on a memory constrained system. When order > 0 it will potentially be making several multi-page allocations with the later ones more likely to fail than the earlier one. So it is important that sgl_alloc_order() frees up any pages it has obtained before returning NULL. In the case when order > 0 it calls the wrong free page function and leaks. In testing the leak was sufficient to bring down my 8 GiB laptop with OOM. 
Reviewed-by: Bart Van Assche Signed-off-by: Douglas Gilbert Signed-off-by: Jens Axboe --- lib/scatterlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 5d63a8857f36..c448642e0f78 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -514,7 +514,7 @@ struct scatterlist *sgl_alloc_order(unsigned long long length, elem_len = min_t(u64, length, PAGE_SIZE << order); page = alloc_pages(gfp, order); if (!page) { - sgl_free(sgl); + sgl_free_order(sgl, order); return NULL; } From db073272700fce69a9c41b27c62d0003dbb66488 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Sat, 17 Oct 2020 09:52:29 +0800 Subject: [PATCH 10/27] skd_main: remove unused including Remove including that don't need it. Signed-off-by: Tian Tao Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index ae6454c24594..a962b4551bed 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include From 0669d2b265d0f6f9e16f1abbf5c5d2e22b219a6b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Oct 2020 12:13:53 +0200 Subject: [PATCH 11/27] zram: Fix __zram_bvec_{read,write}() locking order Mikhail reported a lockdep spat detailing how __zram_bvec_read() and __zram_bvec_write() use zstrm->lock and zspage->lock in opposite order. 
Reported-by: Mikhail Gavrilov Signed-off-by: Peter Zijlstra (Intel) Tested-by: Mikhail Gavrilov Acked-by: Minchan Kim Acked-by: Sebastian Andrzej Siewior Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index bff3d4021c18..e13201424ba2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1218,10 +1218,11 @@ out: static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, struct bio *bio, bool partial_io) { - int ret; + struct zcomp_strm *zstrm; unsigned long handle; unsigned int size; void *src, *dst; + int ret; zram_slot_lock(zram, index); if (zram_test_flag(zram, index, ZRAM_WB)) { @@ -1252,6 +1253,9 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, size = zram_get_obj_size(zram, index); + if (size != PAGE_SIZE) + zstrm = zcomp_stream_get(zram->comp); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { dst = kmap_atomic(page); @@ -1259,8 +1263,6 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, kunmap_atomic(dst); ret = 0; } else { - struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); - dst = kmap_atomic(page); ret = zcomp_decompress(zstrm, src, size, dst); kunmap_atomic(dst); From 576e85c5e92486f1aa8be3cb1a30cb59d4415981 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Mon, 19 Oct 2020 16:20:47 +0800 Subject: [PATCH 12/27] blk-mq: remove the calling of local_memory_node() We don't need to check whether the node is memoryless numa node before calling allocator interface. SLUB(and SLAB,SLOB) relies on the page allocator to pick a node. Page allocator should deal with memoryless nodes just fine. It has zonelists constructed for each possible nodes. And it will automatically fall back into a node which is closest to the requested node. 
As long as __GFP_THISNODE is not enforced of course. The code comments of kmem_cache_alloc_node() of SLAB also showed this: * Fallback to other node is possible if __GFP_THISNODE is not set. blk-mq code doesn't set __GFP_THISNODE, so we can remove the calling of local_memory_node(). Signed-off-by: Xianting Tian Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 2 +- block/blk-mq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 0157f2b3485a..3db84d3197f1 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -89,7 +89,7 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) for_each_possible_cpu(i) { if (index == qmap->mq_map[i]) - return local_memory_node(cpu_to_node(i)); + return cpu_to_node(i); } return NUMA_NO_NODE; diff --git a/block/blk-mq.c b/block/blk-mq.c index deca157032c2..615da7de8855 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2744,7 +2744,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = local_memory_node(cpu_to_node(i)); + hctx->numa_node = cpu_to_node(i); } } } From cb3a92da231bcf55c243d00fa619ee36281b0001 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Tue, 20 Oct 2020 05:22:56 -0400 Subject: [PATCH 13/27] block: remove unused members for io_context After removing blk-sq code, there is no user of nr_batch_requests and last_waited in kernel. 
Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 1dcd9198beb7..0a9dc40b7be8 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -106,12 +106,6 @@ struct io_context { unsigned short ioprio; - /* - * For request batching - */ - int nr_batch_requests; /* Number of requests left in the batch */ - unsigned long last_waited; /* Time last woken after wait for request */ - struct radix_tree_root icq_tree; struct io_cq __rcu *icq_hint; struct hlist_head icq_list; From 43efdb8e870ee0f58633fd579aa5b5185bf5d39e Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Mon, 12 Oct 2020 16:10:40 +0800 Subject: [PATCH 14/27] nvme-rdma: fix crash when connect rejected A crash can happen when a connect is rejected. The host establishes the connection after receiving ConnectReply, and then continues to send the fabrics Connect command. If the controller does not receive the ReadyToUse capsule, the host may receive a ConnectReject reply. nvme_rdma_destroy_queue_ib is called after the host receives the RDMA_CM_EVENT_REJECTED event. Then, when the fabrics Connect command times out, nvme_rdma_timeout calls nvme_rdma_complete_rq to fail the request. A crash happens due to a use-after-free in nvme_rdma_complete_rq. nvme_rdma_destroy_queue_ib is redundant when handling the RDMA_CM_EVENT_REJECTED event, as nvme_rdma_destroy_queue_ib is already called in the connection failure handler. 
Signed-off-by: Chao Leng Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 9e378d0a0c01..116902b1b2c3 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1926,7 +1926,6 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id, complete(&queue->cm_done); return 0; case RDMA_CM_EVENT_REJECTED: - nvme_rdma_destroy_queue_ib(queue); cm_error = nvme_rdma_conn_rejected(queue, ev); break; case RDMA_CM_EVENT_ROUTE_ERROR: From a87da50f39d467f2ea4c1f98decb72ef6d87a31e Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Mon, 12 Oct 2020 16:55:37 +0800 Subject: [PATCH 15/27] nvme-rdma: fix crash due to incorrect cqe A crash happened due to injecting error test. When a CQE has incorrect command id due do an error injection, the host may find a request which is already freed. Dereferencing req->mr->rkey causes a crash in nvme_rdma_process_nvme_rsp because the mr is already freed. Add a check for the mr to fix it. Signed-off-by: Chao Leng Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 116902b1b2c3..aad829a2b50d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1730,10 +1730,11 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue, req->result = cqe->result; if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { - if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) { + if (unlikely(!req->mr || + wc->ex.invalidate_rkey != req->mr->rkey)) { dev_err(queue->ctrl->ctrl.device, "Bogus remote invalidation for rkey %#x\n", - req->mr->rkey); + req->mr ? 
req->mr->rkey : 0); nvme_rdma_error_recovery(queue->ctrl); } } else if (req->mr) { From 643c476d6f78cf0349fb8e07334962dd056a3c90 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 15 Oct 2020 11:36:29 -0700 Subject: [PATCH 16/27] nvme: use queuedata for nvme_req_qid The request's rq_disk isn't set for passthrough IO commands, so tracing uses qid 0 for these which incorrectly decodes as an admin command. Use the request_queue's queuedata instead since that value is always set for the IO queues, and never set for the admin queue. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e7c88b40f5bb..cc111136a981 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -176,7 +176,7 @@ static inline struct nvme_request *nvme_req(struct request *req) static inline u16 nvme_req_qid(struct request *req) { - if (!req->rq_disk) + if (!req->q->queuedata) return 0; return blk_mq_unique_tag_to_hwq(blk_mq_unique_tag(req)) + 1; } From 02ca079c99319c4308c6bb892613f29119c1a9f9 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 13 Oct 2020 16:34:45 +0800 Subject: [PATCH 17/27] nvme-pci: disable Write Zeroes on Sandisk Skyhawk Like commit 5611ec2b9814 ("nvme-pci: prevent SK hynix PC400 from using Write Zeroes command"), Sandisk Skyhawk has the same issue: [ 6305.633887] blk_update_request: operation not supported error, dev nvme0n1, sector 340812032 op 0x9:(WRITE_ZEROES) flags 0x0 phys_seg 0 prio class 0 So also disable Write Zeroes command on Sandisk Skyhawk. 
BugLink: https://bugs.launchpad.net/bugs/1899503 Signed-off-by: Kai-Heng Feng Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e5b02242f3ca..df8f3612107f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3185,6 +3185,8 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_IGNORE_DEV_SUBNQN, }, { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x15b7, 0x2001), /* Sandisk Skyhawk */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001), .driver_data = NVME_QUIRK_SINGLE_VECTOR }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, From 85bd23f3dc09a2ae9e56885420e52c54bf983713 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Thu, 15 Oct 2020 09:51:40 +0800 Subject: [PATCH 18/27] nvmet: fix uninitialized work for zero kato When connecting a controller with a zero kato value using the following command line nvme connect -t tcp -n NQN -a ADDR -s PORT --keep-alive-tmo=0 the warning below can be reproduced: WARNING: CPU: 1 PID: 241 at kernel/workqueue.c:1627 __queue_delayed_work+0x6d/0x90 with trace: mod_delayed_work_on+0x59/0x90 nvmet_update_cc+0xee/0x100 [nvmet] nvmet_execute_prop_set+0x72/0x80 [nvmet] nvmet_tcp_try_recv_pdu+0x2f7/0x770 [nvmet_tcp] nvmet_tcp_io_work+0x63f/0xb2d [nvmet_tcp] ... This is caused by queuing up an uninitialized work. Although the keep-alive timer is disabled while allocating the controller (fixed in 0d3b6a8d213a), ka_work still has a chance to run (called by nvmet_start_ctrl). 
Fixes: 0d3b6a8d213a ("nvmet: Disable keep-alive timer when kato is cleared to 0h") Signed-off-by: zhenwei pi Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 25d62d867563..aafcbc424b7a 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1126,7 +1126,8 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) * in case a host died before it enabled the controller. Hence, simply * reset the keep alive timer when the controller is enabled. */ - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + if (ctrl->kato) + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); } static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) From df06047d54276f73782c9d97882b305fca745d3f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 16 Oct 2020 16:19:04 -0600 Subject: [PATCH 19/27] nvmet: limit passthru MTDS by BIO_MAX_PAGES nvmet_passthru_map_sg() only supports mapping a single BIO, not a chain so the effective maximum transfer should also be limitted by BIO_MAX_PAGES (presently this works out to 1MB). For PCI passthru devices the max_sectors would typically be more limitting than BIO_MAX_PAGES, but this may not be true for all passthru devices. 
Fixes: c1fef73f793b ("nvmet: add passthru code to process commands") Suggested-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 56c571052216..323046e1d67f 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -26,7 +26,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl; u16 status = NVME_SC_SUCCESS; struct nvme_id_ctrl *id; - u32 max_hw_sectors; + int max_hw_sectors; int page_shift; id = kzalloc(sizeof(*id), GFP_KERNEL); @@ -48,6 +48,13 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9), pctrl->max_hw_sectors); + /* + * nvmet_passthru_map_sg is limitted to using a single bio so limit + * the mdts based on BIO_MAX_PAGES as well + */ + max_hw_sectors = min_not_zero(BIO_MAX_PAGES << (PAGE_SHIFT - 9), + max_hw_sectors); + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; id->mdts = ilog2(max_hw_sectors) + 9 - page_shift; From 5e063101ffacf7c14797f5185c58a967ca83c79f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 16 Oct 2020 16:19:05 -0600 Subject: [PATCH 20/27] nvmet: cleanup nvmet_passthru_map_sg() Clean up some confusing elements of nvmet_passthru_map_sg() by returning early if the request is greater than the maximum bio size. This allows us to drop the sg_cnt variable. This should not result in any functional change but makes the code clearer and more understandable. The original code allocated a truncated bio then would return EINVAL when bio_add_pc_page() filled that bio. The new code just returns EINVAL early if this would happen. 
Fixes: c1fef73f793b ("nvmet: add passthru code to process commands") Signed-off-by: Logan Gunthorpe Suggested-by: Douglas Gilbert Reviewed-by: Sagi Grimberg Cc: Christoph Hellwig Cc: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 323046e1d67f..0814cba8298a 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -187,18 +187,20 @@ static void nvmet_passthru_req_done(struct request *rq, static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { - int sg_cnt = req->sg_cnt; struct scatterlist *sg; int op_flags = 0; struct bio *bio; int i, ret; + if (req->sg_cnt > BIO_MAX_PAGES) + return -EINVAL; + if (req->cmd->common.opcode == nvme_cmd_flush) op_flags = REQ_FUA; else if (nvme_is_write(req->cmd)) op_flags = REQ_SYNC | REQ_IDLE; - bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, req->sg_cnt); bio->bi_end_io = bio_put; bio->bi_opf = req_op(rq) | op_flags; @@ -208,7 +210,6 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) bio_put(bio); return -EINVAL; } - sg_cnt--; } ret = blk_rq_append_bio(rq, &bio); From 150dfb6c834c9e0e92db7794530b09fd2b9f05c8 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 20 Oct 2020 16:14:04 -0700 Subject: [PATCH 21/27] nvmet: don't use BLK_MQ_REQ_NOWAIT for passthru MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By default, we set the passthru request allocation flag such that it returns the error in the following code path and we fail the I/O when BLK_MQ_REQ_NOWAIT is used for request allocation :- nvme_alloc_request()  blk_mq_alloc_request()   blk_mq_queue_enter()    if (flag & BLK_MQ_REQ_NOWAIT)         return -EBUSY; <-- return if busy. 
On some controllers using BLK_MQ_REQ_NOWAIT ends up in I/O error where the controller is perfectly healthy and not in a degraded state. Block layer request allocation does allow us to wait instead of immediately returning the error when we BLK_MQ_REQ_NOWAIT flag is not used. This has shown to fix the I/O error problem reported under heavy random write workload. Remove the BLK_MQ_REQ_NOWAIT parameter for passthru request allocation which resolves this issue. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig --- drivers/nvme/target/passthru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 0814cba8298a..8ee94f056898 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -244,7 +244,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) q = ns->queue; } - rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + rq = nvme_alloc_request(q, req->cmd, 0, NVME_QID_ANY); if (IS_ERR(rq)) { status = NVME_SC_INTERNAL; goto out_put_ns; From fd78874b710f42ea46feaefd7c918893c8567e8a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 22 Oct 2020 08:47:39 -0700 Subject: [PATCH 22/27] null_blk: use zone status for max active/open The block layer provides special status codes when requests go beyond the zone resource limits. Use these codes instead of the generic IOERR for requests that exceed the max active or open limits the null_blk device was configured with so that applications know how these special conditions should be handled. 
Signed-off-by: Keith Busch Reviewed-by: Niklas Cassel Cc: Damien Le Moal Cc: Niklas Cassel Signed-off-by: Jens Axboe --- drivers/block/null_blk_zoned.c | 69 +++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index fa0cc70f05e6..7d94f2d47a6a 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -220,29 +220,34 @@ static void null_close_first_imp_zone(struct nullb_device *dev) } } -static bool null_can_set_active(struct nullb_device *dev) +static blk_status_t null_check_active(struct nullb_device *dev) { if (!dev->zone_max_active) - return true; + return BLK_STS_OK; - return dev->nr_zones_exp_open + dev->nr_zones_imp_open + - dev->nr_zones_closed < dev->zone_max_active; + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open + + dev->nr_zones_closed < dev->zone_max_active) + return BLK_STS_OK; + + return BLK_STS_ZONE_ACTIVE_RESOURCE; } -static bool null_can_open(struct nullb_device *dev) +static blk_status_t null_check_open(struct nullb_device *dev) { if (!dev->zone_max_open) - return true; + return BLK_STS_OK; if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open) - return true; + return BLK_STS_OK; - if (dev->nr_zones_imp_open && null_can_set_active(dev)) { - null_close_first_imp_zone(dev); - return true; + if (dev->nr_zones_imp_open) { + if (null_check_active(dev) == BLK_STS_OK) { + null_close_first_imp_zone(dev); + return BLK_STS_OK; + } } - return false; + return BLK_STS_ZONE_OPEN_RESOURCE; } /* @@ -258,19 +263,22 @@ static bool null_can_open(struct nullb_device *dev) * it is not certain that closing an implicit open zone will allow a new zone * to be opened, since we might already be at the active limit capacity. 
*/ -static bool null_has_zone_resources(struct nullb_device *dev, struct blk_zone *zone) +static blk_status_t null_check_zone_resources(struct nullb_device *dev, struct blk_zone *zone) { + blk_status_t ret; + switch (zone->cond) { case BLK_ZONE_COND_EMPTY: - if (!null_can_set_active(dev)) - return false; + ret = null_check_active(dev); + if (ret != BLK_STS_OK) + return ret; fallthrough; case BLK_ZONE_COND_CLOSED: - return null_can_open(dev); + return null_check_open(dev); default: /* Should never be called for other states */ WARN_ON(1); - return false; + return BLK_STS_IOERR; } } @@ -293,8 +301,9 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return BLK_STS_IOERR; case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_CLOSED: - if (!null_has_zone_resources(dev, zone)) - return BLK_STS_IOERR; + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; break; case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: @@ -349,6 +358,8 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone) { + blk_status_t ret; + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; @@ -357,15 +368,17 @@ static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zo /* open operation on exp open is not an error */ return BLK_STS_OK; case BLK_ZONE_COND_EMPTY: - if (!null_has_zone_resources(dev, zone)) - return BLK_STS_IOERR; + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; break; case BLK_ZONE_COND_CLOSED: - if (!null_has_zone_resources(dev, zone)) - return BLK_STS_IOERR; + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: @@ -381,6 +394,8 @@ static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone 
*zo static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone) { + blk_status_t ret; + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; @@ -389,8 +404,9 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone * /* finish operation on full is not an error */ return BLK_STS_OK; case BLK_ZONE_COND_EMPTY: - if (!null_has_zone_resources(dev, zone)) - return BLK_STS_IOERR; + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; break; case BLK_ZONE_COND_IMP_OPEN: dev->nr_zones_imp_open--; @@ -399,8 +415,9 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone * dev->nr_zones_exp_open--; break; case BLK_ZONE_COND_CLOSED: - if (!null_has_zone_resources(dev, zone)) - return BLK_STS_IOERR; + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) + return ret; dev->nr_zones_closed--; break; default: From 52793d62a696e9188092eb0817fb1219ee5729ff Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 16 Oct 2020 14:06:27 -0700 Subject: [PATCH 23/27] nvme-fc: fix io timeout to abort I/O Currently, an I/O timeout unconditionally invokes nvme_fc_error_recovery() which checks for LIVE or CONNECTING state. If live, the routine resets the controller which initiates a reconnect - which is valid. If CONNECTING, err_work is scheduled. Err_work then calls the terminate_io routine, which also checks for CONNECTING and noops any further action on outstanding I/O. The result is nothing happened to the timed out io. As such, if the command was dropped on the wire, it will never timeout / complete, and the connect process will hang. Change the behavior of the io timeout routine to unconditionally abort the I/O. I/O completion handling will note that an io failed due to an abort and will terminate the connection / association as needed. If the abort was unable to happen, continue with a call to nvme_fc_error_recovery(). 
To ensure something different happens in nvme_fc_error_recovery() rework it so that it will abort all I/Os on the association to force a failure. As I/O aborts now may occur outside of delete_association, counting for completion must be wary and only count those aborted during delete_association when TERMIO is set on the controller. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 152 ++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 61 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e2e09e25c056..3e72b7d74df3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1837,8 +1837,10 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); if (opstate != FCPOP_STATE_ACTIVE) atomic_set(&op->state, opstate); - else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) + else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) { + op->flags |= FCOP_FLAGS_TERMIO; ctrl->iocnt++; + } spin_unlock_irqrestore(&ctrl->lock, flags); if (opstate != FCPOP_STATE_ACTIVE) @@ -1874,7 +1876,8 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, if (opstate == FCPOP_STATE_ABORTED) { spin_lock_irqsave(&ctrl->lock, flags); - if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) { + if (test_bit(FCCTRL_TERMIO, &ctrl->flags) && + op->flags & FCOP_FLAGS_TERMIO) { if (!--ctrl->iocnt) wake_up(&ctrl->ioabort_wait); } @@ -2446,15 +2449,20 @@ nvme_fc_timeout(struct request *rq, bool reserved) { struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); struct nvme_fc_ctrl *ctrl = op->ctrl; + struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; + struct nvme_command *sqe = &cmdiu->sqe; /* - * we can't individually ABTS an io without affecting the queue, - * thus killing the queue, and thus the association.
- * So resolve by performing a controller reset, which will stop - * the host/io stack, terminate the association on the link, - * and recreate an association on the link. + * Attempt to abort the offending command. Command completion + * will detect the aborted io and will fail the connection. */ - nvme_fc_error_recovery(ctrl, "io timeout error"); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d w10/11: " + "x%08x/x%08x\n", + ctrl->cnum, op->queue->qnum, sqe->common.opcode, + sqe->connect.fctype, sqe->common.cdw10, sqe->common.cdw11); + if (__nvme_fc_abort_op(ctrl, op)) + nvme_fc_error_recovery(ctrl, "io timeout abort failed"); /* * the io abort has been initiated. Have the reset timer @@ -2726,6 +2734,7 @@ nvme_fc_complete_rq(struct request *rq) struct nvme_fc_ctrl *ctrl = op->ctrl; atomic_set(&op->state, FCPOP_STATE_IDLE); + op->flags &= ~FCOP_FLAGS_TERMIO; nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); @@ -3090,6 +3099,61 @@ out_free_queue: return ret; } + +/* + * This routine runs through all outstanding commands on the association + * and aborts them. This routine is typically be called by the + * delete_association routine. It is also called due to an error during + * reconnect. In that scenario, it is most likely a command that initializes + * the controller, including fabric Connect commands on io queues, that + * may have timed out or failed thus the io must be killed for the connect + * thread to see the error. + */ +static void +__nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues) +{ + /* + * If io queues are present, stop them and terminate all outstanding + * ios on them. As FC allocates FC exchange for each io, the + * transport must contact the LLDD to terminate the exchange, + * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() + * to tell us what io's are busy and invoke a transport routine + * to kill them with the LLDD. 
After terminating the exchange + * the LLDD will call the transport's normal io done path, but it + * will have an aborted status. The done path will return the + * io requests back to the block layer as part of normal completions + * (but with error status). + */ + if (ctrl->ctrl.queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); + if (start_queues) + nvme_start_queues(&ctrl->ctrl); + } + + /* + * Other transports, which don't have link-level contexts bound + * to sqe's, would try to gracefully shutdown the controller by + * writing the registers for shutdown and polling (call + * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially + * just aborted and we will wait on those contexts, and given + * there was no indication of how live the controlelr is on the + * link, don't send more io to create more contexts for the + * shutdown. Let the controller fail via keepalive failure if + * its still present. + */ + + /* + * clean up the admin queue. Same thing as above. + */ + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); +} + /* * This routine stops operation of the controller on the host side. * On the host os stack side: Admin and IO queues are stopped, @@ -3110,46 +3174,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) ctrl->iocnt = 0; spin_unlock_irqrestore(&ctrl->lock, flags); - /* - * If io queues are present, stop them and terminate all outstanding - * ios on them. As FC allocates FC exchange for each io, the - * transport must contact the LLDD to terminate the exchange, - * thus releasing the FC exchange. We use blk_mq_tagset_busy_itr() - * to tell us what io's are busy and invoke a transport routine - * to kill them with the LLDD. 
After terminating the exchange - * the LLDD will call the transport's normal io done path, but it - * will have an aborted status. The done path will return the - * io requests back to the block layer as part of normal completions - * (but with error status). - */ - if (ctrl->ctrl.queue_count > 1) { - nvme_stop_queues(&ctrl->ctrl); - blk_mq_tagset_busy_iter(&ctrl->tag_set, - nvme_fc_terminate_exchange, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(&ctrl->tag_set); - } - - /* - * Other transports, which don't have link-level contexts bound - * to sqe's, would try to gracefully shutdown the controller by - * writing the registers for shutdown and polling (call - * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially - * just aborted and we will wait on those contexts, and given - * there was no indication of how live the controlelr is on the - * link, don't send more io to create more contexts for the - * shutdown. Let the controller fail via keepalive failure if - * its still present. - */ - - /* - * clean up the admin queue. Same thing as above. - * use blk_mq_tagset_busy_itr() and the transport routine to - * terminate the exchanges. - */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, - nvme_fc_terminate_exchange, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); + __nvme_fc_abort_outstanding_ios(ctrl, false); /* kill the aens as they are a separate path */ nvme_fc_abort_aen_ops(ctrl); @@ -3263,22 +3288,27 @@ static void __nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl) { /* - * if state is connecting - the error occurred as part of a - * reconnect attempt. The create_association error paths will - * clean up any outstanding io. - * - * if it's a different state - ensure all pending io is - * terminated. Given this can delay while waiting for the - * aborted io to return, we recheck adapter state below - * before changing state. 
+ * if state is CONNECTING - the error occurred as part of a + * reconnect attempt. Abort any ios on the association and + * let the create_association error paths resolve things. */ - if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) { - nvme_stop_keep_alive(&ctrl->ctrl); - - /* will block will waiting for io to terminate */ - nvme_fc_delete_association(ctrl); + if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { + __nvme_fc_abort_outstanding_ios(ctrl, true); + return; } + /* + * For any other state, kill the association. As this routine + * is a common io abort routine for resetting and such, after + * the association is terminated, ensure that the state is set + * to CONNECTING. + */ + + nvme_stop_keep_alive(&ctrl->ctrl); + + /* will block will waiting for io to terminate */ + nvme_fc_delete_association(ctrl); + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING && !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) dev_err(ctrl->ctrl.device, From 514a6dc9ecfd2fe4e1deebcb7a63e3de23e6c38b Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 16 Oct 2020 14:06:04 -0700 Subject: [PATCH 24/27] nvme-fc: fix error loop in create_hw_io_queues The loop that backs out of hw io queue creation continues through index 0, which corresponds to the admin queue as well. Fix the loop so it only proceeds through indexes 1..n which correspond to I/O queues. 
Signed-off-by: James Smart Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 3e72b7d74df3..108130f140d0 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2317,7 +2317,7 @@ nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) return 0; delete_queues: - for (; i >= 0; i--) + for (; i > 0; i--) __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[i], i); return ret; } @@ -2436,7 +2436,7 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) return; dev_warn(ctrl->ctrl.device, - "NVME-FC{%d}: transport association error detected: %s\n", + "NVME-FC{%d}: transport association event: %s\n", ctrl->cnum, errmsg); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); From 88e837ed0f1fddd34a19092aaa7098d579e6c506 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 16 Oct 2020 14:17:24 -0700 Subject: [PATCH 25/27] nvme-fc: wait for queues to freeze before calling update_nr_hw_queues On reconnect, the code currently does not freeze the controller before possibly updating the number of hw queues for the controller. Add the freeze before updating the number of hw queues. Note: the queues are already started and remain started through the reconnect.
Signed-off-by: James Smart Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 108130f140d0..5f1d09640c40 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2885,11 +2885,14 @@ nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; - if (prior_ioq_cnt != nr_io_queues) + if (prior_ioq_cnt != nr_io_queues) { dev_info(ctrl->ctrl.device, "reconnect: revising io queue count from %d to %d\n", prior_ioq_cnt, nr_io_queues); - blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); + nvme_wait_freeze(&ctrl->ctrl); + blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); + nvme_unfreeze(&ctrl->ctrl); + } return 0; From f673714a1247669bc90322dfb14a5cf553833796 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 16 Oct 2020 14:29:28 -0700 Subject: [PATCH 26/27] nvme-fc: shorten reconnect delay if possible for FC We've had several complaints about a 10s reconnect delay (the default) when there was an error while there is connectivity to a subsystem. The max_reconnects and reconnect_delay are set in common code prior to calling the transport to create the controller. This change checks if the default reconnect delay is being used, and if so, it adjusts it to a shorter period (2s) for the nvme-fc transport. It does so by calculating the controller loss tmo window, changing the value of the reconnect delay, and then recalculating the maximum number of reconnect attempts allowed. 
Signed-off-by: James Smart Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5f1d09640c40..3c002bdcace3 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -26,6 +26,10 @@ enum nvme_fc_queue_flags { }; #define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ +#define NVME_FC_DEFAULT_RECONNECT_TMO 2 /* delay between reconnects + * when connected and a + * connection failure. + */ struct nvme_fc_queue { struct nvme_fc_ctrl *ctrl; @@ -3436,7 +3440,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, { struct nvme_fc_ctrl *ctrl; unsigned long flags; - int ret, idx; + int ret, idx, ctrl_loss_tmo; if (!(rport->remoteport.port_role & (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) { @@ -3462,6 +3466,19 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_free_ctrl; } + /* + * if ctrl_loss_tmo is being enforced and the default reconnect delay + * is being used, change to a shorter reconnect delay for FC. 
+ */ + if (opts->max_reconnects != -1 && + opts->reconnect_delay == NVMF_DEF_RECONNECT_DELAY && + opts->reconnect_delay > NVME_FC_DEFAULT_RECONNECT_TMO) { + ctrl_loss_tmo = opts->max_reconnects * opts->reconnect_delay; + opts->reconnect_delay = NVME_FC_DEFAULT_RECONNECT_TMO; + opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo, + opts->reconnect_delay); + } + ctrl->ctrl.opts = opts; ctrl->ctrl.nr_reconnects = 0; if (lport->dev) From 24f7bb8863eb63b97ff7a83e6dd0d188a1c0575e Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:32:54 +0200 Subject: [PATCH 27/27] block: blk-mq: fix a kernel-doc markup Fix a typo: blk_mq_run_hw_queue -> blk_mq_run_hw_queues Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 615da7de8855..bd15f775ccad 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1664,7 +1664,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) EXPORT_SYMBOL(blk_mq_run_hw_queue); /** - * blk_mq_run_hw_queue - Run all hardware queues in a request queue. + * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */