io_uring-5.9-2020-08-23

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl9CwtMQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpsehEAC4ReB53LLbZxqcmoA2RNs9yz1I4DM2PU6z
 C+NSGGEnAFHQAhLbfCAzxbtQa6x/m64zoLd+8zHZNAeanJXarszcgSuqhXQFlEfX
 7Jz/vdXGdu7Q4zgkLuO3FxleDoPoUC5qOSFHWYtMu6KvHLOkmc9DvdSUsFMDSThX
 6RsoaQY2gDOD/pwtm8Cqmy89nLZdFoyxadXyk/lzxLodjeRZOwoVc+YM8YWmrXZ0
 mKEEuO4uBWxUUmoyAwUABNqWWAkwTDEhrYCiiG81DkAa1Cu0mRXodN0xycr72cLZ
 Ik2OlnTLCE6B0UXsBu2c0+qXGArWsvDyhEEkwF+O+Ump4IBIr72EmgZb+o2nnkXo
 Uu4X/r0qeQ6XD+vBTHcE6oPUjJhV6uEXXon5aesE+vh277ILmHgMyjJKaSiJcY/E
 efM5SuPRq2kuROKWLKiLJnpuJ/9ZTU/4nk4k1pOlWWOVGLHien0sWBBzQ+iWr6mm
 eRl5EkI3JoahqIrNFz0+qF3DwKPVfu+B02/EzA8OXoYHIRV9KMS5eWX5hK12aZ3i
 4AT3xuAanfcNs4qBAScOfHQxQu9U5Z7Mu4JQJ58xdsJd+UWBnbznUmSLob9KKk+c
 X8AvAcYhb684F87VCmaCzDlIPMb46OYxLBgI6sz7L0xdc7i8TCeeEDbQCN1HixZ3
 SNtKzalNXA==
 =fAwK
 -----END PGP SIGNATURE-----

Merge tag 'io_uring-5.9-2020-08-23' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

 - NVMe pull request from Sagi:
       - nvme completion rework from Christoph and Chao that mostly came
         from a bit of divergence of how we classify errors related to
         pathing/retry etc.
       - nvmet passthru fixes from Chaitanya
       - minor nvmet fixes from Amit and me
       - mpath round-robin path selection fix from Martin
       - ignore noiob for zoned devices from Keith
       - minor nvme-fc fix from Tianjia

 - BFQ cgroup leak fix (Dmitry)

 - block layer MAINTAINERS addition (Geert)

 - fix null_blk FUA checking (Hou)

 - get_max_io_size() size fix (Keith)

 - fix block page_is_mergeable() for compound pages (Matthew)

 - discard granularity fixes (Ming)

 - IO scheduler ordering fix (Ming)

 - misc fixes

* tag 'io_uring-5.9-2020-08-23' of git://git.kernel.dk/linux-block: (31 commits)
  null_blk: fix passing of REQ_FUA flag in null_handle_rq
  nvmet: Disable keep-alive timer when kato is cleared to 0h
  nvme: redirect commands on dying queue
  nvme: just check the status code type in nvme_is_path_error
  nvme: refactor command completion
  nvme: rename and document nvme_end_request
  nvme: skip noiob for zoned devices
  nvme-pci: fix PRP pool size
  nvme-pci: Use u32 for nvme_dev.q_depth and nvme_queue.q_depth
  nvme: Use spin_lock_irq() when taking the ctrl->lock
  nvmet: call blk_mq_free_request() directly
  nvmet: fix oops in pt cmd execution
  nvmet: add ns tear down label for pt-cmd handling
  nvme: multipath: round-robin: eliminate "fallback" variable
  nvme: multipath: round-robin: fix single non-optimized path case
  nvme-fc: Fix wrong return value in __nvme_fc_init_request()
  nvmet-passthru: Reject commands with non-sgl flags set
  nvmet: fix a memory leak
  blkcg: fix memleak for iolatency
  MAINTAINERS: Add missing header files to BLOCK LAYER section
  ...
This commit is contained in:
Linus Torvalds 2020-08-24 11:53:15 -07:00
commit c41c3ec4a2
26 changed files with 239 additions and 154 deletions

View File

@ -3,7 +3,7 @@ NVMe Fault Injection
Linux's fault injection framework provides a systematic way to support Linux's fault injection framework provides a systematic way to support
error injection via debugfs in the /sys/kernel/debug directory. When error injection via debugfs in the /sys/kernel/debug directory. When
enabled, the default NVME_SC_INVALID_OPCODE with no retry will be enabled, the default NVME_SC_INVALID_OPCODE with no retry will be
injected into the nvme_end_request. Users can change the default status injected into the nvme_try_complete_req. Users can change the default status
code and no retry flag via the debugfs. The list of Generic Command code and no retry flag via the debugfs. The list of Generic Command
Status can be found in include/linux/nvme.h Status can be found in include/linux/nvme.h

View File

@ -3205,6 +3205,7 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git
F: block/ F: block/
F: drivers/block/ F: drivers/block/
F: include/linux/blk*
F: kernel/trace/blktrace.c F: kernel/trace/blktrace.c
F: lib/sbitmap.c F: lib/sbitmap.c

View File

@ -332,7 +332,7 @@ static void bfqg_put(struct bfq_group *bfqg)
kfree(bfqg); kfree(bfqg);
} }
void bfqg_and_blkg_get(struct bfq_group *bfqg) static void bfqg_and_blkg_get(struct bfq_group *bfqg)
{ {
/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
bfqg_get(bfqg); bfqg_get(bfqg);

View File

@ -986,7 +986,6 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
void bfqg_and_blkg_get(struct bfq_group *bfqg);
void bfqg_and_blkg_put(struct bfq_group *bfqg); void bfqg_and_blkg_put(struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_GROUP_IOSCHED #ifdef CONFIG_BFQ_GROUP_IOSCHED

View File

@ -533,9 +533,7 @@ static void bfq_get_entity(struct bfq_entity *entity)
bfqq->ref++; bfqq->ref++;
bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
bfqq, bfqq->ref); bfqq, bfqq->ref);
} else }
bfqg_and_blkg_get(container_of(entity, struct bfq_group,
entity));
} }
/** /**
@ -649,14 +647,8 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
entity->on_st_or_in_serv = false; entity->on_st_or_in_serv = false;
st->wsum -= entity->weight; st->wsum -= entity->weight;
if (is_in_service) if (bfqq && !is_in_service)
return;
if (bfqq)
bfq_put_queue(bfqq); bfq_put_queue(bfqq);
else
bfqg_and_blkg_put(container_of(entity, struct bfq_group,
entity));
} }
/** /**

View File

@ -740,8 +740,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off, struct page *page, unsigned int len, unsigned int off,
bool *same_page) bool *same_page)
{ {
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + size_t bv_end = bv->bv_offset + bv->bv_len;
bv->bv_offset + bv->bv_len - 1; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
phys_addr_t page_addr = page_to_phys(page); phys_addr_t page_addr = page_to_phys(page);
if (vec_end_addr + 1 != page_addr + off) if (vec_end_addr + 1 != page_addr + off)
@ -750,9 +750,9 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return false; return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr); *same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
if (!*same_page && pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page) if (*same_page)
return false;
return true; return true;
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
} }
/* /*

View File

@ -1152,13 +1152,15 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded) if (preloaded)
radix_tree_preload_end(); radix_tree_preload_end();
ret = blk_iolatency_init(q);
if (ret)
goto err_destroy_all;
ret = blk_throtl_init(q); ret = blk_throtl_init(q);
if (ret) if (ret)
goto err_destroy_all; goto err_destroy_all;
ret = blk_iolatency_init(q);
if (ret) {
blk_throtl_exit(q);
goto err_destroy_all;
}
return 0; return 0;
err_destroy_all: err_destroy_all:

View File

@ -154,7 +154,7 @@ static inline unsigned get_max_io_size(struct request_queue *q,
if (max_sectors > start_offset) if (max_sectors > start_offset)
return max_sectors - start_offset; return max_sectors - start_offset;
return sectors & (lbs - 1); return sectors & ~(lbs - 1);
} }
static inline unsigned get_max_segment_size(const struct request_queue *q, static inline unsigned get_max_segment_size(const struct request_queue *q,
@ -533,10 +533,17 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
} }
EXPORT_SYMBOL(__blk_rq_map_sg); EXPORT_SYMBOL(__blk_rq_map_sg);
static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
if (req_op(rq) == REQ_OP_DISCARD)
return queue_max_discard_segments(rq->q);
return queue_max_segments(rq->q);
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio, static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs) unsigned int nr_phys_segs)
{ {
if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q)) if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
goto no_merge; goto no_merge;
if (blk_integrity_merge_bio(req->q, req, bio) == false) if (blk_integrity_merge_bio(req->q, req, bio) == false)
@ -624,7 +631,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
return 0; return 0;
total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
if (total_phys_segments > queue_max_segments(q)) if (total_phys_segments > blk_rq_get_max_segments(req))
return 0; return 0;
if (blk_integrity_merge_rq(q, req, next) == false) if (blk_integrity_merge_rq(q, req, next) == false)

View File

@ -78,6 +78,15 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
return; return;
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
* Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch)
* in blk_mq_run_hw_queue(). Its pair is the barrier in
* blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART,
* meantime new request added to hctx->dispatch is missed to check in
* blk_mq_run_hw_queue().
*/
smp_mb();
blk_mq_run_hw_queue(hctx, true); blk_mq_run_hw_queue(hctx, true);
} }

View File

@ -1437,6 +1437,15 @@ out:
list_splice_tail_init(list, &hctx->dispatch); list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock); spin_unlock(&hctx->lock);
/*
* Order adding requests to hctx->dispatch and checking
* SCHED_RESTART flag. The pair of this smp_mb() is the one
* in blk_mq_sched_restart(). It keeps the restart code path from
* missing requests newly added to hctx->dispatch while
* SCHED_RESTART is still observed as set here.
*/
smp_mb();
/* /*
* If SCHED_RESTART was set by the caller of this function and * If SCHED_RESTART was set by the caller of this function and
* it is no longer set that means that it was cleared by another * it is no longer set that means that it was cleared by another
@ -1834,6 +1843,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
/** /**
* blk_mq_request_bypass_insert - Insert a request at dispatch list. * blk_mq_request_bypass_insert - Insert a request at dispatch list.
* @rq: Pointer to request to be inserted. * @rq: Pointer to request to be inserted.
* @at_head: true if the request should be inserted at the head of the list.
* @run_queue: If we should run the hardware queue after inserting the request. * @run_queue: If we should run the hardware queue after inserting the request.
* *
* Should only be used carefully, when the caller knows we want to * Should only be used carefully, when the caller knows we want to
@ -2016,7 +2026,8 @@ insert:
if (bypass_insert) if (bypass_insert)
return BLK_STS_RESOURCE; return BLK_STS_RESOURCE;
blk_mq_request_bypass_insert(rq, false, run_queue); blk_mq_sched_insert_request(rq, false, run_queue, false);
return BLK_STS_OK; return BLK_STS_OK;
} }

View File

@ -378,7 +378,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
bset->timeout_fn = timeout; bset->timeout_fn = timeout;
set = &bset->tag_set; set = &bset->tag_set;
set->ops = &bsg_mq_ops, set->ops = &bsg_mq_ops;
set->nr_hw_queues = 1; set->nr_hw_queues = 1;
set->queue_depth = 128; set->queue_depth = 128;
set->numa_node = NUMA_NO_NODE; set->numa_node = NUMA_NO_NODE;

View File

@ -878,6 +878,7 @@ static void loop_config_discard(struct loop_device *lo)
struct file *file = lo->lo_backing_file; struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host; struct inode *inode = file->f_mapping->host;
struct request_queue *q = lo->lo_queue; struct request_queue *q = lo->lo_queue;
u32 granularity, max_discard_sectors;
/* /*
* If the backing device is a block device, mirror its zeroing * If the backing device is a block device, mirror its zeroing
@ -890,11 +891,10 @@ static void loop_config_discard(struct loop_device *lo)
struct request_queue *backingq; struct request_queue *backingq;
backingq = bdev_get_queue(inode->i_bdev); backingq = bdev_get_queue(inode->i_bdev);
blk_queue_max_discard_sectors(q,
backingq->limits.max_write_zeroes_sectors);
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
backingq->limits.max_write_zeroes_sectors); granularity = backingq->limits.discard_granularity ?:
queue_physical_block_size(backingq);
/* /*
* We use punch hole to reclaim the free space used by the * We use punch hole to reclaim the free space used by the
@ -903,24 +903,27 @@ static void loop_config_discard(struct loop_device *lo)
* useful information. * useful information.
*/ */
} else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) { } else if (!file->f_op->fallocate || lo->lo_encrypt_key_size) {
q->limits.discard_granularity = 0; max_discard_sectors = 0;
q->limits.discard_alignment = 0; granularity = 0;
blk_queue_max_discard_sectors(q, 0);
blk_queue_max_write_zeroes_sectors(q, 0);
} else { } else {
q->limits.discard_granularity = inode->i_sb->s_blocksize; max_discard_sectors = UINT_MAX >> 9;
q->limits.discard_alignment = 0; granularity = inode->i_sb->s_blocksize;
blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
} }
if (q->limits.max_write_zeroes_sectors) if (max_discard_sectors) {
q->limits.discard_granularity = granularity;
blk_queue_max_discard_sectors(q, max_discard_sectors);
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
else } else {
q->limits.discard_granularity = 0;
blk_queue_max_discard_sectors(q, 0);
blk_queue_max_write_zeroes_sectors(q, 0);
blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q); blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
} }
q->limits.discard_alignment = 0;
}
static void loop_unprepare_queue(struct loop_device *lo) static void loop_unprepare_queue(struct loop_device *lo)
{ {

View File

@ -1147,7 +1147,7 @@ static int null_handle_rq(struct nullb_cmd *cmd)
len = bvec.bv_len; len = bvec.bv_len;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(req_op(rq)), sector, op_is_write(req_op(rq)), sector,
req_op(rq) & REQ_FUA); rq->cmd_flags & REQ_FUA);
if (err) { if (err) {
spin_unlock_irq(&nullb->lock); spin_unlock_irq(&nullb->lock);
return err; return err;

View File

@ -148,7 +148,8 @@ static int process_rdma(struct rtrs_srv *sess,
/* Generate bio with pages pointing to the rdma buffer */ /* Generate bio with pages pointing to the rdma buffer */
bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL); bio = rnbd_bio_map_kern(data, sess_dev->rnbd_dev->ibd_bio_set, datalen, GFP_KERNEL);
if (IS_ERR(bio)) { if (IS_ERR(bio)) {
rnbd_srv_err(sess_dev, "Failed to generate bio, err: %ld\n", PTR_ERR(bio)); err = PTR_ERR(bio);
rnbd_srv_err(sess_dev, "Failed to generate bio, err: %d\n", err);
goto sess_dev_put; goto sess_dev_put;
} }

View File

@ -126,6 +126,18 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
if (!range) if (!range)
return -ENOMEM; return -ENOMEM;
/*
* Single max discard segment means multi-range discard isn't
* supported, and block layer only runs contiguity merge like
* normal RW request. So we can't rely on bio for retrieving
* each range info.
*/
if (queue_max_discard_segments(req->q) == 1) {
range[0].flags = cpu_to_le32(flags);
range[0].num_sectors = cpu_to_le32(blk_rq_sectors(req));
range[0].sector = cpu_to_le64(blk_rq_pos(req));
n = 1;
} else {
__rq_for_each_bio(bio, req) { __rq_for_each_bio(bio, req) {
u64 sector = bio->bi_iter.bi_sector; u64 sector = bio->bi_iter.bi_sector;
u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT; u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;
@ -135,6 +147,9 @@ static int virtblk_setup_discard_write_zeroes(struct request *req, bool unmap)
range[n].sector = cpu_to_le64(sector); range[n].sector = cpu_to_le64(sector);
n++; n++;
} }
}
WARN_ON_ONCE(n != segments);
req->special_vec.bv_page = virt_to_page(range); req->special_vec.bv_page = virt_to_page(range);
req->special_vec.bv_offset = offset_in_page(range); req->special_vec.bv_offset = offset_in_page(range);

View File

@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status)
} }
} }
static inline bool nvme_req_needs_retry(struct request *req)
{
if (blk_noretry_request(req))
return false;
if (nvme_req(req)->status & NVME_SC_DNR)
return false;
if (nvme_req(req)->retries >= nvme_max_retries)
return false;
return true;
}
static void nvme_retry_req(struct request *req) static void nvme_retry_req(struct request *req)
{ {
struct nvme_ns *ns = req->q->queuedata; struct nvme_ns *ns = req->q->queuedata;
@ -268,33 +257,66 @@ static void nvme_retry_req(struct request *req)
blk_mq_delay_kick_requeue_list(req->q, delay); blk_mq_delay_kick_requeue_list(req->q, delay);
} }
void nvme_complete_rq(struct request *req) enum nvme_disposition {
COMPLETE,
RETRY,
FAILOVER,
};
static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
{
if (likely(nvme_req(req)->status == 0))
return COMPLETE;
if (blk_noretry_request(req) ||
(nvme_req(req)->status & NVME_SC_DNR) ||
nvme_req(req)->retries >= nvme_max_retries)
return COMPLETE;
if (req->cmd_flags & REQ_NVME_MPATH) {
if (nvme_is_path_error(nvme_req(req)->status) ||
blk_queue_dying(req->q))
return FAILOVER;
} else {
if (blk_queue_dying(req->q))
return COMPLETE;
}
return RETRY;
}
static inline void nvme_end_req(struct request *req)
{ {
blk_status_t status = nvme_error_status(nvme_req(req)->status); blk_status_t status = nvme_error_status(nvme_req(req)->status);
trace_nvme_complete_rq(req); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
req_op(req) == REQ_OP_ZONE_APPEND)
req->__sector = nvme_lba_to_sect(req->q->queuedata,
le64_to_cpu(nvme_req(req)->result.u64));
nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
}
void nvme_complete_rq(struct request *req)
{
trace_nvme_complete_rq(req);
nvme_cleanup_cmd(req); nvme_cleanup_cmd(req);
if (nvme_req(req)->ctrl->kas) if (nvme_req(req)->ctrl->kas)
nvme_req(req)->ctrl->comp_seen = true; nvme_req(req)->ctrl->comp_seen = true;
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { switch (nvme_decide_disposition(req)) {
if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) case COMPLETE:
nvme_end_req(req);
return; return;
case RETRY:
if (!blk_queue_dying(req->q)) {
nvme_retry_req(req); nvme_retry_req(req);
return; return;
case FAILOVER:
nvme_failover_req(req);
return;
} }
} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
req_op(req) == REQ_OP_ZONE_APPEND) {
req->__sector = nvme_lba_to_sect(req->q->queuedata,
le64_to_cpu(nvme_req(req)->result.u64));
}
nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
} }
EXPORT_SYMBOL_GPL(nvme_complete_rq); EXPORT_SYMBOL_GPL(nvme_complete_rq);
@ -2075,7 +2097,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
} }
} }
if (iob) if (iob && !blk_queue_is_zoned(ns->queue))
blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
nvme_update_disk_info(disk, ns, id); nvme_update_disk_info(disk, ns, id);
#ifdef CONFIG_NVME_MULTIPATH #ifdef CONFIG_NVME_MULTIPATH
@ -2965,14 +2987,14 @@ static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
{ {
struct nvme_cel *cel, *ret = NULL; struct nvme_cel *cel, *ret = NULL;
spin_lock(&ctrl->lock); spin_lock_irq(&ctrl->lock);
list_for_each_entry(cel, &ctrl->cels, entry) { list_for_each_entry(cel, &ctrl->cels, entry) {
if (cel->csi == csi) { if (cel->csi == csi) {
ret = cel; ret = cel;
break; break;
} }
} }
spin_unlock(&ctrl->lock); spin_unlock_irq(&ctrl->lock);
return ret; return ret;
} }
@ -2999,9 +3021,9 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
cel->csi = csi; cel->csi = csi;
spin_lock(&ctrl->lock); spin_lock_irq(&ctrl->lock);
list_add_tail(&cel->entry, &ctrl->cels); list_add_tail(&cel->entry, &ctrl->cels);
spin_unlock(&ctrl->lock); spin_unlock_irq(&ctrl->lock);
out: out:
*log = &cel->log; *log = &cel->log;
return 0; return 0;

View File

@ -2035,7 +2035,7 @@ done:
} }
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
if (!nvme_end_request(rq, status, result)) if (!nvme_try_complete_req(rq, status, result))
nvme_fc_complete_rq(rq); nvme_fc_complete_rq(rq);
check_error: check_error:
@ -2078,7 +2078,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) { if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) {
dev_err(ctrl->dev, dev_err(ctrl->dev,
"FCP Op failed - cmdiu dma mapping failed.\n"); "FCP Op failed - cmdiu dma mapping failed.\n");
ret = EFAULT; ret = -EFAULT;
goto out_on_error; goto out_on_error;
} }
@ -2088,7 +2088,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) { if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) {
dev_err(ctrl->dev, dev_err(ctrl->dev,
"FCP Op failed - rspiu dma mapping failed.\n"); "FCP Op failed - rspiu dma mapping failed.\n");
ret = EFAULT; ret = -EFAULT;
} }
atomic_set(&op->state, FCPOP_STATE_IDLE); atomic_set(&op->state, FCPOP_STATE_IDLE);

View File

@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
} }
} }
bool nvme_failover_req(struct request *req) void nvme_failover_req(struct request *req)
{ {
struct nvme_ns *ns = req->q->queuedata; struct nvme_ns *ns = req->q->queuedata;
u16 status = nvme_req(req)->status; u16 status = nvme_req(req)->status & 0x7ff;
unsigned long flags; unsigned long flags;
switch (status & 0x7ff) {
case NVME_SC_ANA_TRANSITION:
case NVME_SC_ANA_INACCESSIBLE:
case NVME_SC_ANA_PERSISTENT_LOSS:
/*
* If we got back an ANA error we know the controller is alive,
* but not ready to serve this namespaces. The spec suggests
* we should update our general state here, but due to the fact
* that the admin and I/O queues are not serialized that is
* fundamentally racy. So instead just clear the current path,
* mark the the path as pending and kick of a re-read of the ANA
* log page ASAP.
*/
nvme_mpath_clear_current_path(ns); nvme_mpath_clear_current_path(ns);
if (ns->ctrl->ana_log_buf) {
/*
* If we got back an ANA error, we know the controller is alive but not
* ready to serve this namespace. Kick off a re-read of the ANA
* information page, and just try any other available path for now.
*/
if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
set_bit(NVME_NS_ANA_PENDING, &ns->flags); set_bit(NVME_NS_ANA_PENDING, &ns->flags);
queue_work(nvme_wq, &ns->ctrl->ana_work); queue_work(nvme_wq, &ns->ctrl->ana_work);
} }
break;
case NVME_SC_HOST_PATH_ERROR:
case NVME_SC_HOST_ABORTED_CMD:
/*
* Temporary transport disruption in talking to the controller.
* Try to send on a new path.
*/
nvme_mpath_clear_current_path(ns);
break;
default:
/* This was a non-ANA error so follow the normal error path. */
return false;
}
spin_lock_irqsave(&ns->head->requeue_lock, flags); spin_lock_irqsave(&ns->head->requeue_lock, flags);
blk_steal_bios(&ns->head->requeue_list, req); blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags); spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
blk_mq_end_request(req, 0);
kblockd_schedule_work(&ns->head->requeue_work); kblockd_schedule_work(&ns->head->requeue_work);
return true;
} }
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@ -233,7 +212,7 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
int node, struct nvme_ns *old) int node, struct nvme_ns *old)
{ {
struct nvme_ns *ns, *found, *fallback = NULL; struct nvme_ns *ns, *found = NULL;
if (list_is_singular(&head->list)) { if (list_is_singular(&head->list)) {
if (nvme_path_is_disabled(old)) if (nvme_path_is_disabled(old))
@ -252,18 +231,22 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
goto out; goto out;
} }
if (ns->ana_state == NVME_ANA_NONOPTIMIZED) if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
fallback = ns; found = ns;
} }
/* No optimized path found, re-check the current path */ /*
* The loop above skips the current path for round-robin semantics.
* Fall back to the current path if either:
* - no other optimized path found and current is optimized,
* - no other usable path found and current is usable.
*/
if (!nvme_path_is_disabled(old) && if (!nvme_path_is_disabled(old) &&
old->ana_state == NVME_ANA_OPTIMIZED) { (old->ana_state == NVME_ANA_OPTIMIZED ||
found = old; (!found && old->ana_state == NVME_ANA_NONOPTIMIZED)))
goto out; return old;
}
if (!fallback) if (!found)
return NULL; return NULL;
found = fallback;
out: out:
rcu_assign_pointer(head->current_path[node], found); rcu_assign_pointer(head->current_path[node], found);
return found; return found;

View File

@ -523,7 +523,31 @@ static inline u32 nvme_bytes_to_numd(size_t len)
return (len >> 2) - 1; return (len >> 2) - 1;
} }
static inline bool nvme_end_request(struct request *req, __le16 status, static inline bool nvme_is_ana_error(u16 status)
{
switch (status & 0x7ff) {
case NVME_SC_ANA_TRANSITION:
case NVME_SC_ANA_INACCESSIBLE:
case NVME_SC_ANA_PERSISTENT_LOSS:
return true;
default:
return false;
}
}
static inline bool nvme_is_path_error(u16 status)
{
/* check for a status code type of 'path related status' */
return (status & 0x700) == 0x300;
}
/*
* Fill in the status and result information from the CQE, and then figure out
* if blk-mq will need to use IPI magic to complete the request, and if yes do
* so. If not let the caller complete the request without an indirect function
* call.
*/
static inline bool nvme_try_complete_req(struct request *req, __le16 status,
union nvme_result result) union nvme_result result)
{ {
struct nvme_request *rq = nvme_req(req); struct nvme_request *rq = nvme_req(req);
@ -629,7 +653,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags); struct nvme_ctrl *ctrl, int *flags);
bool nvme_failover_req(struct request *req); void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@ -688,9 +712,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
} }
static inline bool nvme_failover_req(struct request *req) static inline void nvme_failover_req(struct request *req)
{ {
return false;
} }
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{ {

View File

@ -120,7 +120,7 @@ struct nvme_dev {
unsigned max_qid; unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES]; unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs; unsigned int num_vecs;
u16 q_depth; u32 q_depth;
int io_sqes; int io_sqes;
u32 db_stride; u32 db_stride;
void __iomem *bar; void __iomem *bar;
@ -157,13 +157,13 @@ struct nvme_dev {
static int io_queue_depth_set(const char *val, const struct kernel_param *kp) static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{ {
int ret; int ret;
u16 n; u32 n;
ret = kstrtou16(val, 10, &n); ret = kstrtou32(val, 10, &n);
if (ret != 0 || n < 2) if (ret != 0 || n < 2)
return -EINVAL; return -EINVAL;
return param_set_ushort(val, kp); return param_set_uint(val, kp);
} }
static inline unsigned int sq_idx(unsigned int qid, u32 stride) static inline unsigned int sq_idx(unsigned int qid, u32 stride)
@ -195,7 +195,7 @@ struct nvme_queue {
dma_addr_t sq_dma_addr; dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr; dma_addr_t cq_dma_addr;
u32 __iomem *q_db; u32 __iomem *q_db;
u16 q_depth; u32 q_depth;
u16 cq_vector; u16 cq_vector;
u16 sq_tail; u16 sq_tail;
u16 cq_head; u16 cq_head;
@ -961,7 +961,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id); req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail); trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_end_request(req, cqe->status, cqe->result)) if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req); nvme_pci_complete_rq(req);
} }
@ -2320,7 +2320,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth); io_queue_depth);
dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
@ -2460,7 +2460,8 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
static int nvme_setup_prp_pools(struct nvme_dev *dev) static int nvme_setup_prp_pools(struct nvme_dev *dev)
{ {
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
PAGE_SIZE, PAGE_SIZE, 0); NVME_CTRL_PAGE_SIZE,
NVME_CTRL_PAGE_SIZE, 0);
if (!dev->prp_page_pool) if (!dev->prp_page_pool)
return -ENOMEM; return -ENOMEM;

View File

@ -1189,7 +1189,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req)
if (!refcount_dec_and_test(&req->ref)) if (!refcount_dec_and_test(&req->ref))
return; return;
if (!nvme_end_request(rq, req->status, req->result)) if (!nvme_try_complete_req(rq, req->status, req->result))
nvme_rdma_complete_rq(rq); nvme_rdma_complete_rq(rq);
} }

View File

@ -481,7 +481,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
return -EINVAL; return -EINVAL;
} }
if (!nvme_end_request(rq, cqe->status, cqe->result)) if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
nvme_complete_rq(rq); nvme_complete_rq(rq);
queue->nr_cqe++; queue->nr_cqe++;
@ -672,7 +672,7 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
{ {
union nvme_result res = {}; union nvme_result res = {};
if (!nvme_end_request(rq, cpu_to_le16(status << 1), res)) if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
nvme_complete_rq(rq); nvme_complete_rq(rq);
} }

View File

@ -1136,6 +1136,7 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item,
up_write(&nvmet_config_sem); up_write(&nvmet_config_sem);
kfree_rcu(new_model, rcuhead); kfree_rcu(new_model, rcuhead);
kfree(new_model_number);
return count; return count;
} }

View File

@ -397,6 +397,9 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
{ {
if (unlikely(ctrl->kato == 0))
return;
pr_debug("ctrl %d start keep-alive timer for %d secs\n", pr_debug("ctrl %d start keep-alive timer for %d secs\n",
ctrl->cntlid, ctrl->kato); ctrl->cntlid, ctrl->kato);
@ -406,6 +409,9 @@ static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
{ {
if (unlikely(ctrl->kato == 0))
return;
pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
cancel_delayed_work_sync(&ctrl->ka_work); cancel_delayed_work_sync(&ctrl->ka_work);

View File

@ -115,7 +115,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
return; return;
} }
if (!nvme_end_request(rq, cqe->status, cqe->result)) if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
nvme_loop_complete_rq(rq); nvme_loop_complete_rq(rq);
} }
} }

View File

@ -165,7 +165,7 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
req->cqe->result = nvme_req(rq)->result; req->cqe->result = nvme_req(rq)->result;
nvmet_req_complete(req, status); nvmet_req_complete(req, status);
blk_put_request(rq); blk_mq_free_request(rq);
} }
static void nvmet_passthru_req_done(struct request *rq, static void nvmet_passthru_req_done(struct request *rq,
@ -175,7 +175,7 @@ static void nvmet_passthru_req_done(struct request *rq,
req->cqe->result = nvme_req(rq)->result; req->cqe->result = nvme_req(rq)->result;
nvmet_req_complete(req, nvme_req(rq)->status); nvmet_req_complete(req, nvme_req(rq)->status);
blk_put_request(rq); blk_mq_free_request(rq);
} }
static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq)
@ -230,7 +230,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
if (unlikely(!ns)) { if (unlikely(!ns)) {
pr_err("failed to get passthru ns nsid:%u\n", nsid); pr_err("failed to get passthru ns nsid:%u\n", nsid);
status = NVME_SC_INVALID_NS | NVME_SC_DNR; status = NVME_SC_INVALID_NS | NVME_SC_DNR;
goto fail_out; goto out;
} }
q = ns->queue; q = ns->queue;
@ -238,16 +238,15 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
if (IS_ERR(rq)) { if (IS_ERR(rq)) {
rq = NULL;
status = NVME_SC_INTERNAL; status = NVME_SC_INTERNAL;
goto fail_out; goto out_put_ns;
} }
if (req->sg_cnt) { if (req->sg_cnt) {
ret = nvmet_passthru_map_sg(req, rq); ret = nvmet_passthru_map_sg(req, rq);
if (unlikely(ret)) { if (unlikely(ret)) {
status = NVME_SC_INTERNAL; status = NVME_SC_INTERNAL;
goto fail_out; goto out_put_req;
} }
} }
@ -274,11 +273,13 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req)
return; return;
fail_out: out_put_req:
blk_mq_free_request(rq);
out_put_ns:
if (ns) if (ns)
nvme_put_ns(ns); nvme_put_ns(ns);
out:
nvmet_req_complete(req, status); nvmet_req_complete(req, status);
blk_put_request(rq);
} }
/* /*
@ -326,6 +327,10 @@ static u16 nvmet_setup_passthru_command(struct nvmet_req *req)
u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
{ {
/* Reject any commands with non-sgl flags set (ie. fused commands) */
if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL)
return NVME_SC_INVALID_FIELD;
switch (req->cmd->common.opcode) { switch (req->cmd->common.opcode) {
case nvme_cmd_resv_register: case nvme_cmd_resv_register:
case nvme_cmd_resv_report: case nvme_cmd_resv_report:
@ -396,6 +401,10 @@ static u16 nvmet_passthru_get_set_features(struct nvmet_req *req)
u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
{ {
/* Reject any commands with non-sgl flags set (ie. fused commands) */
if (req->cmd->common.flags & ~NVME_CMD_SGL_ALL)
return NVME_SC_INVALID_FIELD;
/* /*
* Passthru all vendor specific commands * Passthru all vendor specific commands
*/ */