From 943d0609d0571af092dc13456cbca70351e4d20e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:22 +0000 Subject: [PATCH 001/150] io_uring: rename ->resize_lock ->resize_lock is used for resizing rings, but it's a good idea to reuse it in other cases as well. Rename it into mmap_lock as it's protects from races with mmap. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/68f705306f3ac4d2fb999eb80ea1615015ce9f7f.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 2 +- io_uring/memmap.c | 6 +++--- io_uring/register.c | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fd4cdb0860a2..fafc1d779eb1 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -424,7 +424,7 @@ struct io_ring_ctx { * side will need to grab this lock, to prevent either side from * being run concurrently with the other. */ - struct mutex resize_lock; + struct mutex mmap_lock; /* * If IORING_SETUP_NO_MMAP is used, then the below holds diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d3403c8216db..539fecde0244 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -350,7 +350,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); - mutex_init(&ctx->resize_lock); + mutex_init(&ctx->mmap_lock); return ctx; diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 57de9bccbf50..a0d4151d11af 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -329,7 +329,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) unsigned int npages; void *ptr; - guard(mutex)(&ctx->resize_lock); + guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) @@ -365,7 +365,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) return -EINVAL; - guard(mutex)(&ctx->resize_lock); + guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) @@ -415,7 +415,7 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, struct io_ring_ctx *ctx = file->private_data; void *ptr; - guard(mutex)(&ctx->resize_lock); + guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) diff --git a/io_uring/register.c b/io_uring/register.c index fdd44914c39c..77743e3f6751 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -489,15 +489,15 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) } /* - * We'll do the swap. Grab the ctx->resize_lock, which will exclude + * We'll do the swap. Grab the ctx->mmap_lock, which will exclude * any new mmap's on the ring fd. Clear out existing mappings to prevent * mmap from seeing them, as we'll unmap them. Any attempt to mmap * existing rings beyond this point will fail. Not that it could proceed * at this point anyway, as the io_uring mmap side needs go grab the - * ctx->resize_lock as well. Likewise, hold the completion lock over the + * ctx->mmap_lock as well. Likewise, hold the completion lock over the * duration of the actual swap. 
*/ - mutex_lock(&ctx->resize_lock); + mutex_lock(&ctx->mmap_lock); spin_lock(&ctx->completion_lock); o.rings = ctx->rings; ctx->rings = NULL; @@ -564,7 +564,7 @@ overflow: ret = 0; out: spin_unlock(&ctx->completion_lock); - mutex_unlock(&ctx->resize_lock); + mutex_unlock(&ctx->mmap_lock); io_register_free_rings(&p, to_free); if (ctx->sq_data) From 7427b0b49ad51304c041ffb8c413f342e148cdea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:23 +0000 Subject: [PATCH 002/150] io_uring/rsrc: export io_check_coalesce_buffer io_try_coalesce_buffer() is a useful helper collecting useful info about a set of pages, I want to reuse it for analysing ring/etc. mappings. I don't need the entire thing and only interested if it can be coalesced into a single page, but that's better than duplicating the parsing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/353b447953cd5d34c454a7d909bb6024c391d6e2.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 22 ++++++++++++---------- io_uring/rsrc.h | 4 ++++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 077f84684c18..2d2333970eb5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -626,11 +626,12 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data, int nr_folios) +static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; int nr_pages_left = *nr_pages, i, j; + int nr_folios = data->nr_folios; /* Store head pages only*/ new_array = kvmalloc_array(nr_folios, sizeof(struct page *), @@ -667,15 +668,14 @@ static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, return true; } -static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data) +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data) { - struct page **page_array = *pages; struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; - if (*nr_pages <= 1) + if (nr_pages <= 1) return false; data->nr_pages_mid = folio_nr_pages(folio); @@ -687,7 +687,7 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. 
*/ - for (i = 1; i < *nr_pages; i++) { + for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; @@ -715,7 +715,8 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, if (nr_folios == 1) data->nr_pages_head = count; - return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); + data->nr_folios = nr_folios; + return true; } static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, @@ -729,7 +730,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; - bool coalesced; + bool coalesced = false; if (!iov->iov_base) return NULL; @@ -749,7 +750,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ - coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); + if (io_check_coalesce_buffer(pages, nr_pages, &data)) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7a4668deaa1a..c8b093584461 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -40,6 +40,7 @@ struct io_imu_folio_data { /* For non-head/tail folios, has to be fully included */ unsigned int nr_pages_mid; unsigned int folio_shift; + unsigned int nr_folios; }; struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); @@ -66,6 +67,9 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data); + static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, int index) { From a730d2047d4ef822262c37f51ff7267f2f0e7167 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:24 +0000 Subject: [PATCH 003/150] io_uring/memmap: flag vmap'ed regions Add internal flags for struct io_mapped_region. The first flag we need is IO_REGION_F_VMAPPED, that indicates that the pointer has to be unmapped on region destruction. For now all regions are vmap'ed, so it's set unconditionally. 
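As a rough userspace analogue of the flag-driven teardown this patch introduces (the names below are made up for illustration and are not kernel or liburing APIs), the owner of a region records how the memory was obtained and the destructor only consults those flags:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

enum {
    REGION_F_MMAPPED = 1,   /* ptr came from mmap(), release with munmap() */
};

struct region {
    void *ptr;
    size_t size;
    unsigned flags;
};

static int region_create(struct region *r, size_t size, int use_mmap)
{
    memset(r, 0, sizeof(*r));
    r->size = size;
    if (use_mmap) {
        r->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (r->ptr == MAP_FAILED)
            return -1;
        r->flags |= REGION_F_MMAPPED;
    } else {
        r->ptr = malloc(size);
        if (!r->ptr)
            return -1;
    }
    return 0;
}

/* Mirrors the shape of io_free_region(): teardown is driven by the flags. */
static void region_free(struct region *r)
{
    if (r->flags & REGION_F_MMAPPED)
        munmap(r->ptr, r->size);
    else
        free(r->ptr);
    memset(r, 0, sizeof(*r));
}

int main(void)
{
    struct region r;

    if (region_create(&r, 4096, 1) == 0) {
        printf("mapped at %p, flags=%u\n", r.ptr, r.flags);
        region_free(&r);
    }
    return 0;
}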
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5a3d8046a038da97c0f8a8c8f1733fa3fc689d31.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++-- io_uring/memmap.c | 14 ++++++++++---- io_uring/memmap.h | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fafc1d779eb1..0793a91b66a5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -78,8 +78,9 @@ struct io_hash_table { struct io_mapped_region { struct page **pages; - void *vmap_ptr; - size_t nr_pages; + void *ptr; + unsigned nr_pages; + unsigned flags; }; /* diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a0d4151d11af..31fb8c8ffe4e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -202,14 +202,19 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages, return ERR_PTR(-ENOMEM); } +enum { + /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ + IO_REGION_F_VMAP = 1, +}; + void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { unpin_user_pages(mr->pages, mr->nr_pages); kvfree(mr->pages); } - if (mr->vmap_ptr) - vunmap(mr->vmap_ptr); + if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) + vunmap(mr->ptr); if (mr->nr_pages && ctx->user) __io_unaccount_mem(ctx->user, mr->nr_pages); @@ -225,7 +230,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, void *vptr; u64 end; - if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages)) + if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) return -EFAULT; if (memchr_inv(®->__resv, 0, sizeof(reg->__resv))) return -EINVAL; @@ -260,8 +265,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } mr->pages = pages; - mr->vmap_ptr = vptr; + mr->ptr = vptr; mr->nr_pages = nr_pages; + mr->flags |= IO_REGION_F_VMAP; return 0; out_free: if (pages_accounted) diff --git a/io_uring/memmap.h b/io_uring/memmap.h index f361a635b6c7..2096a8427277 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -28,7 +28,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, static inline void *io_region_get_ptr(struct io_mapped_region *mr) { - return mr->vmap_ptr; + return mr->ptr; } static inline bool io_region_is_set(struct io_mapped_region *mr) From 16375af32d0fd5a6398c48d2a684b3f4fbb17a8e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:25 +0000 Subject: [PATCH 004/150] io_uring/memmap: flag regions with user pages In preparation to kernel allocated regions add a flag telling if the region contains user pinned pages or not. 
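For context, this is what a user-memory-backed region looks like from the application side: the app owns the allocation and the kernel only pins the pages. A minimal sketch, assuming io_uring uAPI headers recent enough to define struct io_uring_region_desc and IORING_MEM_REGION_TYPE_USER; the registration call itself is omitted:

#include <linux/io_uring.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static int prep_user_region(struct io_uring_region_desc *rd, size_t size)
{
    /* page-aligned memory keeps things simple; mmap guarantees that */
    void *mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (mem == MAP_FAILED)
        return -1;

    memset(rd, 0, sizeof(*rd));
    rd->user_addr = (uint64_t)(uintptr_t)mem;
    rd->size = size;
    rd->flags = IORING_MEM_REGION_TYPE_USER;
    return 0;
}

int main(void)
{
    struct io_uring_region_desc rd;

    if (prep_user_region(&rd, 4096) == 0)
        printf("user region at 0x%llx, %llu bytes\n",
               (unsigned long long)rd.user_addr,
               (unsigned long long)rd.size);
    return 0;
}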
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0dc91564642654405bab080b7ec911cb4a43ec6e.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 31fb8c8ffe4e..a0416733e921 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -205,12 +205,17 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages, enum { /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ IO_REGION_F_VMAP = 1, + /* memory is provided by user and pinned by the kernel */ + IO_REGION_F_USER_PROVIDED = 2, }; void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { - unpin_user_pages(mr->pages, mr->nr_pages); + if (mr->flags & IO_REGION_F_USER_PROVIDED) + unpin_user_pages(mr->pages, mr->nr_pages); + else + release_pages(mr->pages, mr->nr_pages); kvfree(mr->pages); } if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) @@ -267,7 +272,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, mr->pages = pages; mr->ptr = vptr; mr->nr_pages = nr_pages; - mr->flags |= IO_REGION_F_VMAP; + mr->flags |= IO_REGION_F_VMAP | IO_REGION_F_USER_PROVIDED; return 0; out_free: if (pages_accounted) From fc5f22a64649db7582f988d01651fa7d50054f90 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:26 +0000 Subject: [PATCH 005/150] io_uring/memmap: account memory before pinning Move memory accounting before page pinning. It shouldn't even try to pin pages if it's not allowed, and accounting is also relatively inexpensive. It also give a better code structure as we do generic accounting and then can branch for different mapping types. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e242b8038411a222e8b269d35e021fa5015289f.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a0416733e921..fca93bc4c6f1 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -252,17 +252,21 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, if (check_add_overflow(reg->user_addr, reg->size, &end)) return -EOVERFLOW; - pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - + nr_pages = reg->size >> PAGE_SHIFT; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) - goto out_free; + return ret; pages_accounted = nr_pages; } + pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto out_free; + } + vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!vptr) { ret = -ENOMEM; @@ -277,7 +281,8 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, out_free: if (pages_accounted) __io_unaccount_mem(ctx->user, pages_accounted); - io_pages_free(&pages, nr_pages); + if (pages) + io_pages_free(&pages, nr_pages); return ret; } From 226ae1b4d1111b0b0041677b58371af9b8cd31a9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:27 +0000 Subject: [PATCH 006/150] io_uring/memmap: reuse io_free_region for failure path Regions are going to become more complex with allocation options and optimisations, I want to split initialisation into steps and for that it needs a sane fail path. 
Reuse io_free_region(), it's smart enough to undo only what's needed and leaves the structure in a consistent state. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b853b4ec407cc80d033d021bdd2c14e22378fc78.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index fca93bc4c6f1..96c4f6b61171 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -229,7 +229,6 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { - int pages_accounted = 0; struct page **pages; int nr_pages, ret; void *vptr; @@ -257,32 +256,27 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; - pages_accounted = nr_pages; } + mr->nr_pages = nr_pages; pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); - pages = NULL; goto out_free; } + mr->pages = pages; + mr->flags |= IO_REGION_F_USER_PROVIDED; vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!vptr) { ret = -ENOMEM; goto out_free; } - - mr->pages = pages; mr->ptr = vptr; - mr->nr_pages = nr_pages; - mr->flags |= IO_REGION_F_VMAP | IO_REGION_F_USER_PROVIDED; + mr->flags |= IO_REGION_F_VMAP; return 0; out_free: - if (pages_accounted) - __io_unaccount_mem(ctx->user, pages_accounted); - if (pages) - io_pages_free(&pages, nr_pages); + io_free_region(ctx, mr); return ret; } From c4d0ac1c1567ee822529124a3dc10b384838c3bc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:28 +0000 Subject: [PATCH 007/150] io_uring/memmap: optimise single folio regions We don't need to vmap if memory is already physically contiguous. There are two important cases it covers: PAGE_SIZE regions and huge pages. Use io_check_coalesce_buffer() to get the number of contiguous folios. 
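The idea can be illustrated with a small userspace model (page frame numbers stand in for struct page pointers, and the folio bookkeeping of io_check_coalesce_buffer() is simplified away): if the backing pages are physically contiguous, a plain pointer to the first page is enough and no vmap is needed.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool pfns_contiguous(const unsigned long *pfn, size_t nr)
{
    for (size_t i = 1; i < nr; i++) {
        if (pfn[i] != pfn[i - 1] + 1)
            return false;
    }
    return true;
}

int main(void)
{
    unsigned long huge_page[] = { 4096, 4097, 4098, 4099 }; /* one folio */
    unsigned long scattered[] = { 4096, 9000, 9001, 12000 };

    printf("huge_page: %s\n",
           pfns_contiguous(huge_page, 4) ? "direct pointer" : "needs vmap");
    printf("scattered: %s\n",
           pfns_contiguous(scattered, 4) ? "direct pointer" : "needs vmap");
    return 0;
}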
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d5240af23064a824c29d14d2406f1ae764bf4505.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 96c4f6b61171..fd348c98f64f 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -226,12 +226,31 @@ void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) memset(mr, 0, sizeof(*mr)); } +static int io_region_init_ptr(struct io_mapped_region *mr) +{ + struct io_imu_folio_data ifd; + void *ptr; + + if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { + if (ifd.nr_folios == 1) { + mr->ptr = page_address(mr->pages[0]); + return 0; + } + } + ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL); + if (!ptr) + return -ENOMEM; + + mr->ptr = ptr; + mr->flags |= IO_REGION_F_VMAP; + return 0; +} + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { struct page **pages; int nr_pages, ret; - void *vptr; u64 end; if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) @@ -267,13 +286,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, mr->pages = pages; mr->flags |= IO_REGION_F_USER_PROVIDED; - vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!vptr) { - ret = -ENOMEM; + ret = io_region_init_ptr(mr); + if (ret) goto out_free; - } - mr->ptr = vptr; - mr->flags |= IO_REGION_F_VMAP; return 0; out_free: io_free_region(ctx, mr); From a90558b36cceeb546c19b3cd17ed94f9810f8eec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:29 +0000 Subject: [PATCH 008/150] io_uring/memmap: helper for pinning region pages In preparation to adding kernel allocated regions extract a new helper that pins user pages. 
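A simplified model of the bookkeeping around that helper: the expected page count is derived from the descriptor up front, and the pinning step can then cross-check that it really got that many pages. The overflow check mirrors the one visible in io_create_region(); the alignment requirement is assumed here, and this is illustrative userspace code (using a GCC/Clang overflow builtin), not the kernel implementation:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1UL << PAGE_SHIFT)

struct region_desc {
    uint64_t user_addr;
    uint64_t size;
};

/* Returns the expected page count, or 0 if the descriptor is invalid. */
static unsigned long region_nr_pages(const struct region_desc *rd)
{
    uint64_t end;

    if (!rd->size)
        return 0;
    /* user_addr + size must not wrap around */
    if (__builtin_add_overflow(rd->user_addr, rd->size, &end))
        return 0;
    /* assume both address and size have to be page aligned */
    if ((rd->user_addr | rd->size) & (PAGE_SIZE - 1))
        return 0;
    return (unsigned long)(rd->size >> PAGE_SHIFT);
}

int main(void)
{
    struct region_desc rd = { .user_addr = 0x10000, .size = 3 * PAGE_SIZE };

    /* The pinning helper then verifies that pinning returned exactly
     * this many pages before going further. */
    printf("expect %lu pages\n", region_nr_pages(&rd));
    return 0;
}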
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a17d7c39c3de4266b66b75b2dcf768150e1fc618.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index fd348c98f64f..5d261e07c2e3 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -246,10 +246,28 @@ static int io_region_init_ptr(struct io_mapped_region *mr) return 0; } +static int io_region_pin_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg) +{ + unsigned long size = mr->nr_pages << PAGE_SHIFT; + struct page **pages; + int nr_pages; + + pages = io_pin_pages(reg->user_addr, size, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + if (WARN_ON_ONCE(nr_pages != mr->nr_pages)) + return -EFAULT; + + mr->pages = pages; + mr->flags |= IO_REGION_F_USER_PROVIDED; + return 0; +} + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { - struct page **pages; int nr_pages, ret; u64 end; @@ -278,13 +296,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } mr->nr_pages = nr_pages; - pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); + ret = io_region_pin_pages(ctx, mr, reg); + if (ret) goto out_free; - } - mr->pages = pages; - mr->flags |= IO_REGION_F_USER_PROVIDED; ret = io_region_init_ptr(mr); if (ret) From 4b851d20d325dc59f9abdce55d42dc4b68179db0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:30 +0000 Subject: [PATCH 009/150] io_uring/memmap: add IO_REGION_F_SINGLE_REF Kernel allocated compound pages will have just one reference for the entire page array, add a flag telling io_free_region about that. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a7abfa7535e9728d5fcade29a1ea1605ec2c04ce.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 5d261e07c2e3..a37ccb167258 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -207,15 +207,23 @@ enum { IO_REGION_F_VMAP = 1, /* memory is provided by user and pinned by the kernel */ IO_REGION_F_USER_PROVIDED = 2, + /* only the first page in the array is ref'ed */ + IO_REGION_F_SINGLE_REF = 4, }; void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { + long nr_refs = mr->nr_pages; + + if (mr->flags & IO_REGION_F_SINGLE_REF) + nr_refs = 1; + if (mr->flags & IO_REGION_F_USER_PROVIDED) - unpin_user_pages(mr->pages, mr->nr_pages); + unpin_user_pages(mr->pages, nr_refs); else - release_pages(mr->pages, mr->nr_pages); + release_pages(mr->pages, nr_refs); + kvfree(mr->pages); } if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) From 1e21df691ffa2c277e0b1a4928c9da0e86e9a2be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:31 +0000 Subject: [PATCH 010/150] io_uring/memmap: implement kernel allocated regions Allow the kernel to allocate memory for a region. That's the classical way SQ/CQ are allocated. It's not yet useful to user space as there is no way to mmap it, which is why it's explicitly disabled in io_register_mem_region(). 
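A userspace analogue of the two-tier allocation strategy used here, and of the single-reference bookkeeping added in the previous patch: first try one contiguous chunk covering the whole region (the compound-page case), and only fall back to per-page allocations if that fails. All names are illustrative, not kernel APIs.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE   4096UL

struct toy_region {
    void **pages;           /* one entry per page-sized chunk */
    unsigned long nr_pages;
    int single_chunk;       /* analogue of IO_REGION_F_SINGLE_REF */
};

static int toy_region_alloc(struct toy_region *r, unsigned long nr_pages)
{
    r->nr_pages = nr_pages;
    r->single_chunk = 0;
    r->pages = calloc(nr_pages, sizeof(*r->pages));
    if (!r->pages)
        return -1;

    /* Tier 1: one contiguous allocation covering the whole region. */
    void *big = aligned_alloc(PAGE_SIZE, nr_pages * PAGE_SIZE);
    if (big) {
        for (unsigned long i = 0; i < nr_pages; i++)
            r->pages[i] = (char *)big + i * PAGE_SIZE;
        r->single_chunk = 1;
        return 0;
    }

    /* Tier 2: fall back to individual page-sized allocations. */
    for (unsigned long i = 0; i < nr_pages; i++) {
        r->pages[i] = aligned_alloc(PAGE_SIZE, PAGE_SIZE);
        if (!r->pages[i]) {
            while (i--)
                free(r->pages[i]);
            free(r->pages);
            return -1;
        }
    }
    return 0;
}

static void toy_region_free(struct toy_region *r)
{
    if (r->single_chunk) {
        free(r->pages[0]);  /* the whole region is one allocation */
    } else {
        for (unsigned long i = 0; i < r->nr_pages; i++)
            free(r->pages[i]);
    }
    free(r->pages);
}

int main(void)
{
    struct toy_region r;

    if (!toy_region_alloc(&r, 4)) {
        printf("allocated %lu pages, single_chunk=%d\n",
               r.nr_pages, r.single_chunk);
        toy_region_free(&r);
    }
    return 0;
}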
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7b8c40e6542546bbf93f4842a9a42a7373b81e0d.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 43 ++++++++++++++++++++++++++++++++++++++++--- io_uring/register.c | 2 ++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a37ccb167258..0908a71bf57e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -273,6 +273,39 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx, return 0; } +static int io_region_allocate_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + unsigned long size = mr->nr_pages << PAGE_SHIFT; + unsigned long nr_allocated; + struct page **pages; + void *p; + + pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); + if (!pages) + return -ENOMEM; + + p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); + if (!IS_ERR(p)) { + mr->flags |= IO_REGION_F_SINGLE_REF; + mr->pages = pages; + return 0; + } + + nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE, + mr->nr_pages, pages); + if (nr_allocated != mr->nr_pages) { + if (nr_allocated) + release_pages(pages, nr_allocated); + kvfree(pages); + return -ENOMEM; + } + mr->pages = pages; + return 0; +} + int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { @@ -283,9 +316,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, return -EFAULT; if (memchr_inv(®->__resv, 0, sizeof(reg->__resv))) return -EINVAL; - if (reg->flags != IORING_MEM_REGION_TYPE_USER) + if (reg->flags & ~IORING_MEM_REGION_TYPE_USER) return -EINVAL; - if (!reg->user_addr) + /* user_addr should be set IFF it's a user memory backed region */ + if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr) return -EFAULT; if (!reg->size || reg->mmap_offset || reg->id) return -EINVAL; @@ -304,7 +338,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } mr->nr_pages = nr_pages; - ret = io_region_pin_pages(ctx, mr, reg); + if (reg->flags & IORING_MEM_REGION_TYPE_USER) + ret = io_region_pin_pages(ctx, mr, reg); + else + ret = io_region_allocate_pages(ctx, mr, reg); if (ret) goto out_free; diff --git a/io_uring/register.c b/io_uring/register.c index 77743e3f6751..47992bc1ffcb 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -589,6 +589,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) if (copy_from_user(&rd, rd_uptr, sizeof(rd))) return -EFAULT; + if (!(rd.flags & IORING_MEM_REGION_TYPE_USER)) + return -EINVAL; if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) return -EINVAL; if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) From 087f997870a948820ec366701d178f402c6a23a3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:32 +0000 Subject: [PATCH 011/150] io_uring/memmap: implement mmap for regions The patch implements mmap for the param region and enables the kernel allocation mode. Internally it uses a fixed mmap offset, however the user has to use the offset returned in struct io_uring_region_desc::mmap_offset. Note, mmap doesn't and can't take ->uring_lock and the region / ring lookup is protected by ->mmap_lock, and it's directly peeking at ctx->param_region. 
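From the application's point of view the contract is simple: leave user_addr at zero when registering, then mmap() the ring fd at the mmap_offset the kernel wrote back into the descriptor. A hedged sketch of that userspace side, assuming uAPI headers that define struct io_uring_region_desc; the registration call itself is omitted:

#include <linux/io_uring.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/types.h>

static void *map_region(int ring_fd, const struct io_uring_region_desc *rd)
{
    /* rd->size and rd->mmap_offset were filled in by the kernel while
     * registering a kernel-allocated region. */
    void *p = mmap(NULL, rd->size, PROT_READ | PROT_WRITE, MAP_SHARED,
                   ring_fd, (off_t)rd->mmap_offset);

    return p == MAP_FAILED ? NULL : p;
}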
We can't protect io_create_region() with the mmap_lock as it'd deadlock, which is why io_create_region_mmap_safe() initialises it for us in a temporary variable and then publishes it with the lock taken. It's intentionally decoupled from main region helpers, and in the future we might want to have a list of active regions, which then could be protected by the ->mmap_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0f1212bd6af7fb39b63514b34fae8948014221d1.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 61 +++++++++++++++++++++++++++++++++++++++++---- io_uring/memmap.h | 10 +++++++- io_uring/register.c | 6 ++--- 3 files changed, 67 insertions(+), 10 deletions(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 0908a71bf57e..9a182c8a4be1 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -275,7 +275,8 @@ static int io_region_pin_pages(struct io_ring_ctx *ctx, static int io_region_allocate_pages(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg) + struct io_uring_region_desc *reg, + unsigned long mmap_offset) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; unsigned long size = mr->nr_pages << PAGE_SHIFT; @@ -290,8 +291,7 @@ static int io_region_allocate_pages(struct io_ring_ctx *ctx, p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); if (!IS_ERR(p)) { mr->flags |= IO_REGION_F_SINGLE_REF; - mr->pages = pages; - return 0; + goto done; } nr_allocated = alloc_pages_bulk_array_node(gfp, NUMA_NO_NODE, @@ -302,12 +302,15 @@ static int io_region_allocate_pages(struct io_ring_ctx *ctx, kvfree(pages); return -ENOMEM; } +done: + reg->mmap_offset = mmap_offset; mr->pages = pages; return 0; } int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg) + struct io_uring_region_desc *reg, + unsigned long mmap_offset) { int nr_pages, ret; u64 end; @@ -341,7 +344,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, if (reg->flags & IORING_MEM_REGION_TYPE_USER) ret = io_region_pin_pages(ctx, mr, reg); else - ret = io_region_allocate_pages(ctx, mr, reg); + ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); if (ret) goto out_free; @@ -354,6 +357,40 @@ out_free: return ret; } +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + struct io_mapped_region tmp_mr; + int ret; + + memcpy(&tmp_mr, mr, sizeof(tmp_mr)); + ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); + if (ret) + return ret; + + /* + * Once published mmap can find it without holding only the ->mmap_lock + * and not ->uring_lock. 
+ */ + guard(mutex)(&ctx->mmap_lock); + memcpy(mr, &tmp_mr, sizeof(tmp_mr)); + return 0; +} + +static void *io_region_validate_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr) +{ + lockdep_assert_held(&ctx->mmap_lock); + + if (!io_region_is_set(mr)) + return ERR_PTR(-EINVAL); + if (mr->flags & IO_REGION_F_USER_PROVIDED) + return ERR_PTR(-EINVAL); + + return io_region_get_ptr(mr); +} + static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { @@ -389,6 +426,8 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, io_put_bl(ctx, bl); return ptr; } + case IORING_MAP_OFF_PARAM_REGION: + return io_region_validate_mmap(ctx, &ctx->param_region); } return ERR_PTR(-EINVAL); @@ -405,6 +444,16 @@ int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, #ifdef CONFIG_MMU +static int io_region_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct vm_area_struct *vma) +{ + unsigned long nr_pages = mr->nr_pages; + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); +} + __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { struct io_ring_ctx *ctx = file->private_data; @@ -429,6 +478,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) ctx->n_sqe_pages); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); + case IORING_MAP_OFF_PARAM_REGION: + return io_region_mmap(ctx, &ctx->param_region, vma); } return -EINVAL; diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 2096a8427277..2402bca3d700 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -1,6 +1,8 @@ #ifndef IO_URING_MEMMAP_H #define IO_URING_MEMMAP_H +#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL + struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); void io_pages_free(struct page ***pages, int npages); int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, @@ -24,7 +26,13 @@ int io_uring_mmap(struct file *file, struct vm_area_struct *vma); void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct io_uring_region_desc *reg); + struct io_uring_region_desc *reg, + unsigned long mmap_offset); + +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset); static inline void *io_region_get_ptr(struct io_mapped_region *mr) { diff --git a/io_uring/register.c b/io_uring/register.c index 47992bc1ffcb..8c19fbe84808 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -588,9 +588,6 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) rd_uptr = u64_to_user_ptr(reg.region_uptr); if (copy_from_user(&rd, rd_uptr, sizeof(rd))) return -EFAULT; - - if (!(rd.flags & IORING_MEM_REGION_TYPE_USER)) - return -EINVAL; if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) return -EINVAL; if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) @@ -605,7 +602,8 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) !(ctx->flags & IORING_SETUP_R_DISABLED)) return -EINVAL; - ret = io_create_region(ctx, &ctx->param_region, &rd); + ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, + IORING_MAP_OFF_PARAM_REGION); if (ret) return ret; if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { From 02255d55260a6836423c8401628b58264e198973 Mon Sep 17 00:00:00 2001 From: Pavel 
Begunkov Date: Fri, 29 Nov 2024 13:34:33 +0000 Subject: [PATCH 012/150] io_uring: pass ctx to io_register_free_rings A preparation patch, pass the context to io_register_free_rings. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1865fd2b3d4db22d1a1aac7dd06ea22cb990834.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/register.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 8c19fbe84808..43dd6688a464 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -375,7 +375,8 @@ struct io_ring_ctx_rings { struct io_rings *rings; }; -static void io_register_free_rings(struct io_uring_params *p, +static void io_register_free_rings(struct io_ring_ctx *ctx, + struct io_uring_params *p, struct io_ring_ctx_rings *r) { if (!(p->flags & IORING_SETUP_NO_MMAP)) { @@ -455,7 +456,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) n.rings->cq_ring_entries = p.cq_entries; if (copy_to_user(arg, &p, sizeof(p))) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return -EFAULT; } @@ -464,7 +465,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) else size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); if (size == SIZE_MAX) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return -EOVERFLOW; } @@ -475,7 +476,7 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) p.sq_off.user_addr, size); if (IS_ERR(ptr)) { - io_register_free_rings(&p, &n); + io_register_free_rings(ctx, &p, &n); return PTR_ERR(ptr); } @@ -565,7 +566,7 @@ overflow: out: spin_unlock(&ctx->completion_lock); mutex_unlock(&ctx->mmap_lock); - io_register_free_rings(&p, to_free); + io_register_free_rings(ctx, &p, to_free); if (ctx->sq_data) io_sq_thread_unpark(ctx->sq_data); From 8078486e1d53591ed946c943177339e59e3089e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:34 +0000 Subject: [PATCH 013/150] io_uring: use region api for SQ Convert internal parts of the SQ managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1fb73ced6b835cb319ab0fe1dc0b2e982a9a5650.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- io_uring/io_uring.c | 38 ++++++++++++++-------------------- io_uring/memmap.c | 3 +-- io_uring/register.c | 37 +++++++++++++++------------------ 4 files changed, 34 insertions(+), 47 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0793a91b66a5..808accf1776b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -432,10 +432,9 @@ struct io_ring_ctx { * the gup'ed pages for the two rings, and the sqes. 
*/ unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; + struct io_mapped_region sq_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 539fecde0244..bdd4bc984bc8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2641,29 +2641,19 @@ static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size); } -static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, true); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, - true); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); - ctx->n_sqe_pages = 0; vunmap(ctx->rings); - vunmap(ctx->sq_sqes); } + io_free_region(ctx, &ctx->sq_region); + ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -3481,9 +3471,10 @@ bool io_is_uring_fops(struct file *file) static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_uring_region_desc rd; struct io_rings *rings; size_t size, sq_array_offset; - void *ptr; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3519,17 +3510,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); - else - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); - - if (IS_ERR(ptr)) { - io_rings_free(ctx); - return PTR_ERR(ptr); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } - - ctx->sq_sqes = ptr; + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_rings_free(ctx); + return ret; + } + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); return 0; } diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 9a182c8a4be1..b9aaa25182a5 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -474,8 +474,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: - return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, - ctx->n_sqe_pages); + return io_region_mmap(ctx, &ctx->sq_region, vma); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); case IORING_MAP_OFF_PARAM_REGION: diff --git a/io_uring/register.c b/io_uring/register.c index 43dd6688a464..1cf47ad40dc1 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -368,11 +368,11 @@ static int io_register_clock(struct io_ring_ctx *ctx, */ struct io_ring_ctx_rings { unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; - struct io_uring_sqe *sq_sqes; struct io_rings *rings; + + struct io_uring_sqe *sq_sqes; + struct io_mapped_region sq_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, @@ -382,14 +382,11 @@ static void io_register_free_rings(struct 
io_ring_ctx *ctx, if (!(p->flags & IORING_SETUP_NO_MMAP)) { io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, true); - io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, - true); } else { io_pages_free(&r->ring_pages, r->n_ring_pages); - io_pages_free(&r->sqe_pages, r->n_sqe_pages); vunmap(r->rings); - vunmap(r->sq_sqes); } + io_free_region(ctx, &r->sq_region); } #define swap_old(ctx, o, n, field) \ @@ -404,11 +401,11 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { + struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; struct io_uring_params p; unsigned i, tail; - void *ptr; int ret; /* for single issuer, must be owner resizing */ @@ -469,16 +466,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) return -EOVERFLOW; } - if (!(p.flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size); - else - ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, - p.sq_off.user_addr, - size); - if (IS_ERR(ptr)) { - io_register_free_rings(ctx, &p, &n); - return PTR_ERR(ptr); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.sq_sqes = io_region_get_ptr(&n.sq_region); /* * If using SQPOLL, park the thread @@ -509,7 +508,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) * Now copy SQ and CQ entries, if any. If either of the destination * rings can't hold what is already there, then fail the operation. */ - n.sq_sqes = ptr; tail = o.rings->sq.tail; if (tail - o.rings->sq.head > p.sq_entries) goto overflow; @@ -558,9 +556,8 @@ overflow: ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, n_ring_pages); - swap_old(ctx, o, n, n_sqe_pages); swap_old(ctx, o, n, ring_pages); - swap_old(ctx, o, n, sqe_pages); + swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; out: From 81a4058e0cd0f07139f088fbeb65bc488f687829 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:35 +0000 Subject: [PATCH 014/150] io_uring: use region api for CQ Convert internal parts of the CQ/SQ array managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46fc3c801290d6b1ac16023d78f6b8e685c87fd6.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 +---- io_uring/io_uring.c | 36 +++++++--------------- io_uring/memmap.c | 55 +++++----------------------------- io_uring/memmap.h | 4 --- io_uring/register.c | 35 ++++++++++------------ 5 files changed, 36 insertions(+), 102 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 808accf1776b..b63f44220e8b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -427,14 +427,8 @@ struct io_ring_ctx { */ struct mutex mmap_lock; - /* - * If IORING_SETUP_NO_MMAP is used, then the below holds - * the gup'ed pages for the two rings, and the sqes. 
- */ - unsigned short n_ring_pages; - struct page **ring_pages; - struct io_mapped_region sq_region; + struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bdd4bc984bc8..8dd3fa9be993 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2634,26 +2634,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, - true); - } else { - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); - ctx->n_ring_pages = 0; - vunmap(ctx->rings); - } - io_free_region(ctx, &ctx->sq_region); - + io_free_region(ctx, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -3485,15 +3469,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); - else - rings = io_rings_map(ctx, p->cq_off.user_addr, size); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - if (IS_ERR(rings)) - return PTR_ERR(rings); - - ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; diff --git a/io_uring/memmap.c b/io_uring/memmap.c index b9aaa25182a5..668b1c3579a2 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -120,18 +120,6 @@ void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, *npages = 0; } -void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array = *pages; - - if (!page_array) - return; - - unpin_user_pages(page_array, npages); - kvfree(page_array); - *pages = NULL; -} - struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; @@ -174,34 +162,6 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) return ERR_PTR(ret); } -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) -{ - struct page **page_array; - unsigned int nr_pages; - void *page_addr; - - *npages = 0; - - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); - - nr_pages = 0; - page_array = io_pin_pages(uaddr, size, &nr_pages); - if (IS_ERR(page_array)) - return page_array; - - page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); - if (page_addr) { - *pages = page_array; - *npages = nr_pages; - return page_addr; - } - - io_pages_free(&page_array, nr_pages); - return ERR_PTR(-ENOMEM); -} - enum { /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ IO_REGION_F_VMAP = 1, @@ -446,9 +406,10 @@ int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, static 
int io_region_mmap(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + unsigned max_pages) { - unsigned long nr_pages = mr->nr_pages; + unsigned long nr_pages = min(mr->nr_pages, max_pages); vm_flags_set(vma, VM_DONTEXPAND); return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); @@ -459,7 +420,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned int npages; + unsigned int page_limit; void *ptr; guard(mutex)(&ctx->mmap_lock); @@ -471,14 +432,14 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); + page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); case IORING_OFF_SQES: - return io_region_mmap(ctx, &ctx->sq_region, vma); + return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); case IORING_MAP_OFF_PARAM_REGION: - return io_region_mmap(ctx, &ctx->param_region, vma); + return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); } return -EINVAL; diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 2402bca3d700..7395996eb353 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,7 +4,6 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); -void io_pages_free(struct page ***pages, int npages); int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, struct page **pages, int npages); @@ -13,9 +12,6 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages, void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, bool put_pages); -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size); - #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); #endif diff --git a/io_uring/register.c b/io_uring/register.c index 1cf47ad40dc1..5e48413706ac 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -367,26 +367,19 @@ static int io_register_clock(struct io_ring_ctx *ctx, * either mapping or freeing. 
*/ struct io_ring_ctx_rings { - unsigned short n_ring_pages; - struct page **ring_pages; struct io_rings *rings; - struct io_uring_sqe *sq_sqes; + struct io_mapped_region sq_region; + struct io_mapped_region ring_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, struct io_uring_params *p, struct io_ring_ctx_rings *r) { - if (!(p->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, - true); - } else { - io_pages_free(&r->ring_pages, r->n_ring_pages); - vunmap(r->rings); - } io_free_region(ctx, &r->sq_region); + io_free_region(ctx, &r->ring_region); } #define swap_old(ctx, o, n, field) \ @@ -439,13 +432,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (size == SIZE_MAX) return -EOVERFLOW; - if (!(p.flags & IORING_SETUP_NO_MMAP)) - n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); - else - n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, - p.cq_off.user_addr, size); - if (IS_ERR(n.rings)) - return PTR_ERR(n.rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.rings = io_region_get_ptr(&n.ring_region); n.rings->sq_ring_mask = p.sq_entries - 1; n.rings->cq_ring_mask = p.cq_entries - 1; @@ -555,8 +553,7 @@ overflow: ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; - swap_old(ctx, o, n, n_ring_pages); - swap_old(ctx, o, n, ring_pages); + swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; From 78fda3d056417ccb9921663383b12f771aa0dd43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:36 +0000 Subject: [PATCH 015/150] io_uring/kbuf: use mmap_lock to sync with mmap A preparation / cleanup patch simplifying the buf ring - mmap synchronisation. Instead of relying on RCU, which is trickier, do it by grabbing the mmap_lock when when anyone tries to publish or remove a registered buffer to / from ->io_bl_xa. Modifications of the xarray should always be protected by both ->uring_lock and ->mmap_lock, while lookups should hold either of them. While a struct io_buffer_list is in the xarray, the mmap related fields like ->flags and ->buf_pages should stay stable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/af13bde56ee1a26bcaefaa9aad37a9ea318a590e.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++ io_uring/kbuf.c | 56 +++++++++++++++------------------- io_uring/kbuf.h | 1 - 3 files changed, 29 insertions(+), 33 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b63f44220e8b..73575d545d3c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,6 +294,11 @@ struct io_ring_ctx { struct io_submit_state submit_state; + /* + * Modifications are protected by ->uring_lock and ->mmap_lock. + * The flags, buf_pages and buf_nr_pages fields should be stable + * once published. 
+ */ struct xarray io_bl_xa; struct io_hash_table cancel_table; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb7..662e928cc3b0 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -45,10 +45,11 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, /* * Store buffer group ID and finally mark the list as visible. * The normal lookup doesn't care about the visibility as we're - * always under the ->uring_lock, but the RCU lookup from mmap does. + * always under the ->uring_lock, but lookups from mmap do. */ bl->bgid = bgid; atomic_set(&bl->refs, 1); + guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -388,7 +389,7 @@ void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { if (atomic_dec_and_test(&bl->refs)) { __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); + kfree(bl); } } @@ -397,10 +398,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer_list *bl; struct list_head *item, *tmp; struct io_buffer *buf; - unsigned long index; - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); + while (1) { + unsigned long index = 0; + + scoped_guard(mutex, &ctx->mmap_lock) { + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); + if (bl) + xa_erase(&ctx->io_bl_xa, bl->bgid); + } + if (!bl) + break; io_put_bl(ctx, bl); } @@ -589,11 +597,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) INIT_LIST_HEAD(&bl->buf_list); ret = io_buffer_add_list(ctx, bl, p->bgid); if (ret) { - /* - * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. - */ - kfree_rcu(bl, rcu); + kfree(bl); goto err; } } @@ -736,7 +740,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; } - kfree_rcu(free_bl, rcu); + kfree(free_bl); return ret; } @@ -760,7 +764,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; - xa_erase(&ctx->io_bl_xa, bl->bgid); + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->io_bl_xa, bl->bgid); + io_put_bl(ctx, bl); return 0; } @@ -795,29 +801,13 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; - bool ret; - /* - * We have to be a bit careful here - we're inside mmap and cannot grab - * the uring_lock. This means the buffer_list could be simultaneously - * going away, if someone is trying to be sneaky. Look it up under rcu - * so we know it's not going away, and attempt to grab a reference to - * it. If the ref is already zero, then fail the mapping. If successful, - * the caller will call io_put_bl() to drop the the reference at at the - * end. This may then safely free the buffer_list (and drop the pages) - * at that point, vm_insert_pages() would've already grabbed the - * necessary vma references. 
- */ - rcu_read_lock(); bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ - ret = false; - if (bl && bl->flags & IOBL_MMAP) - ret = atomic_inc_not_zero(&bl->refs); - rcu_read_unlock(); - - if (ret) - return bl; + if (bl && bl->flags & IOBL_MMAP) { + if (atomic_inc_not_zero(&bl->refs)) + return bl; + } return ERR_PTR(-EINVAL); } @@ -829,6 +819,8 @@ int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) struct io_buffer_list *bl; int bgid, ret; + lockdep_assert_held(&ctx->mmap_lock); + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = io_pbuf_get_bl(ctx, bgid); if (IS_ERR(bl)) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 36aadfe5ac00..d5e4afcbfbb3 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -25,7 +25,6 @@ struct io_buffer_list { struct page **buf_pages; struct io_uring_buf_ring *buf_ring; }; - struct rcu_head rcu; }; __u16 bgid; From 90175f3f503213903b00bc7ba9f8ae436fc5c00e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:37 +0000 Subject: [PATCH 016/150] io_uring/kbuf: remove pbuf ring refcounting struct io_buffer_list refcounting was needed for RCU based sync with mmap, now we can kill it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4a9cc54bf0077bb2bf2f3daf917549ddd41080da.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 21 +++++++-------------- io_uring/kbuf.h | 3 --- io_uring/memmap.c | 1 - 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 662e928cc3b0..644f61445ec9 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -48,7 +48,6 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, * always under the ->uring_lock, but lookups from mmap do. 
*/ bl->bgid = bgid; - atomic_set(&bl->refs, 1); guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -385,12 +384,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } -void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - if (atomic_dec_and_test(&bl->refs)) { - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } + __io_remove_buffers(ctx, bl, -1U); + kfree(bl); } void io_destroy_buffers(struct io_ring_ctx *ctx) @@ -804,10 +801,8 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ - if (bl && bl->flags & IOBL_MMAP) { - if (atomic_inc_not_zero(&bl->refs)) - return bl; - } + if (bl && bl->flags & IOBL_MMAP) + return bl; return ERR_PTR(-EINVAL); } @@ -817,7 +812,7 @@ int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; struct io_buffer_list *bl; - int bgid, ret; + int bgid; lockdep_assert_held(&ctx->mmap_lock); @@ -826,7 +821,5 @@ int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) if (IS_ERR(bl)) return PTR_ERR(bl); - ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); - io_put_bl(ctx, bl); - return ret; + return io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index d5e4afcbfbb3..dff7444026a6 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -35,8 +35,6 @@ struct io_buffer_list { __u16 mask; __u16 flags; - - atomic_t refs; }; struct io_buffer { @@ -83,7 +81,6 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid); int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 668b1c3579a2..73b73f4ea1bd 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -383,7 +383,6 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, if (IS_ERR(bl)) return bl; ptr = bl->buf_ring; - io_put_bl(ctx, bl); return ptr; } case IORING_MAP_OFF_PARAM_REGION: From ef62de3c4ad58fab4e37bc267177bc723e48e5e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:38 +0000 Subject: [PATCH 017/150] io_uring/kbuf: use region api for pbuf rings Convert internal parts of the provided buffer ring managment to the region API. It's the last non-region mapped ring we have, so it also kills a bunch of now unused memmap.c helpers. 
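For reference, the application-side mmap offset for a kernel-allocated (IOU_PBUF_RING_MMAP) buffer ring encodes the buffer group ID the same way the kernel derives mmap_offset in the diff below. A minimal sketch, assuming the io_uring uAPI headers; registering the ring itself is left out:

#include <linux/io_uring.h>
#include <stddef.h>
#include <sys/mman.h>
#include <sys/types.h>

static struct io_uring_buf_ring *map_buf_ring(int ring_fd, unsigned bgid,
                                              unsigned ring_entries)
{
    size_t sz = ring_entries * sizeof(struct io_uring_buf);
    off_t off = IORING_OFF_PBUF_RING |
                ((unsigned long long)bgid << IORING_OFF_PBUF_SHIFT);
    void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED,
                   ring_fd, off);

    return p == MAP_FAILED ? NULL : (struct io_uring_buf_ring *)p;
}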
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6c40cf7beaa648558acd4d84bc0fb3279a35d74b.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 172 ++++++++++++++-------------------------------- io_uring/kbuf.h | 18 ++--- io_uring/memmap.c | 118 +++++-------------------------- io_uring/memmap.h | 7 -- 4 files changed, 74 insertions(+), 241 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 644f61445ec9..2dfb9f9419a0 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -351,17 +351,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->flags & IOBL_BUF_RING) { i = bl->buf_ring->tail - bl->head; - if (bl->buf_nr_pages) { - int j; - - if (!(bl->flags & IOBL_MMAP)) { - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - } - io_pages_unmap(bl->buf_ring, &bl->buf_pages, - &bl->buf_nr_pages, bl->flags & IOBL_MMAP); - bl->flags &= ~IOBL_MMAP; - } + io_free_region(ctx, &bl->region); /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); bl->flags &= ~IOBL_BUF_RING; @@ -614,75 +604,14 @@ err: return IOU_OK; } -static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_uring_buf_ring *br = NULL; - struct page **pages; - int nr_pages, ret; - - pages = io_pin_pages(reg->ring_addr, - flex_array_size(br, bufs, reg->ring_entries), - &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!br) { - ret = -ENOMEM; - goto error_unpin; - } - -#ifdef SHM_COLOUR - /* - * On platforms that have specific aliasing requirements, SHM_COLOUR - * is set and we must guarantee that the kernel and user side align - * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and - * the application mmap's the provided ring buffer. Fail the request - * if we, by chance, don't end up with aligned addresses. The app - * should use IOU_PBUF_RING_MMAP instead, and liburing will handle - * this transparently. 
- */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { - ret = -EINVAL; - goto error_unpin; - } -#endif - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->buf_ring = br; - bl->flags |= IOBL_BUF_RING; - bl->flags &= ~IOBL_MMAP; - return 0; -error_unpin: - unpin_user_pages(pages, nr_pages); - kvfree(pages); - vunmap(br); - return ret; -} - -static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, - struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - size_t ring_size; - - ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - - bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); - if (IS_ERR(bl->buf_ring)) { - bl->buf_ring = NULL; - return -ENOMEM; - } - - bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); - return 0; -} - int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; struct io_buffer_list *bl, *free_bl = NULL; + struct io_uring_region_desc rd; + struct io_uring_buf_ring *br; + unsigned long mmap_offset; + unsigned long ring_size; int ret; lockdep_assert_held(&ctx->uring_lock); @@ -694,19 +623,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - } else { - if (reg.ring_addr) - return -EINVAL; - } - if (!is_power_of_2(reg.ring_entries)) return -EINVAL; - /* cannot disambiguate full vs empty due to head/tail size */ if (reg.ring_entries >= 65536) return -EINVAL; @@ -722,21 +640,47 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - if (!(reg.flags & IOU_PBUF_RING_MMAP)) - ret = io_pin_pbuf_ring(®, bl); - else - ret = io_alloc_pbuf_ring(ctx, ®, bl); + mmap_offset = reg.bgid << IORING_OFF_PBUF_SHIFT; + ring_size = flex_array_size(br, bufs, reg.ring_entries); - if (!ret) { - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; - if (reg.flags & IOU_PBUF_RING_INC) - bl->flags |= IOBL_INC; - - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(ring_size); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + rd.user_addr = reg.ring_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); + if (ret) + goto fail; + br = io_region_get_ptr(&bl->region); +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. 
+ */ + if (!(reg.flags & IOU_PBUF_RING_MMAP) && + ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { + ret = -EINVAL; + goto fail; + } +#endif + + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + bl->flags |= IOBL_BUF_RING; + bl->buf_ring = br; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +fail: + io_free_region(ctx, &bl->region); kfree(free_bl); return ret; } @@ -794,32 +738,18 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, - unsigned long bgid) +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid) { struct io_buffer_list *bl; - bl = xa_load(&ctx->io_bl_xa, bgid); - /* must be a mmap'able buffer ring and have pages */ - if (bl && bl->flags & IOBL_MMAP) - return bl; - - return ERR_PTR(-EINVAL); -} - -int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; - struct io_buffer_list *bl; - int bgid; - lockdep_assert_held(&ctx->mmap_lock); - bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return PTR_ERR(bl); + bl = xa_load(&ctx->io_bl_xa, bgid); + if (!bl || !(bl->flags & IOBL_BUF_RING)) + return NULL; + if (WARN_ON_ONCE(!io_region_is_set(&bl->region))) + return NULL; - return io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); + return &bl->region; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index dff7444026a6..bd80c44c5af1 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -3,15 +3,13 @@ #define IOU_KBUF_H #include +#include enum { /* ring mapped provided buffers */ IOBL_BUF_RING = 1, - /* ring mapped provided buffers, but mmap'ed by application */ - IOBL_MMAP = 2, /* buffers are consumed incrementally rather than always fully */ - IOBL_INC = 4, - + IOBL_INC = 2, }; struct io_buffer_list { @@ -21,10 +19,7 @@ struct io_buffer_list { */ union { struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; + struct io_uring_buf_ring *buf_ring; }; __u16 bgid; @@ -35,6 +30,8 @@ struct io_buffer_list { __u16 mask; __u16 flags; + + struct io_mapped_region region; }; struct io_buffer { @@ -81,9 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, - unsigned long bgid); -int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 73b73f4ea1bd..6d8a98bd9cac 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -36,90 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages, return page_address(page); } -static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, - gfp_t gfp) -{ - void *ret; - int i; - - for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(gfp); - if (!pages[i]) - goto err; - } - - ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (ret) - return ret; -err: - while (i--) - put_page(pages[i]); - return ERR_PTR(-ENOMEM); -} - -void *io_pages_map(struct page ***out_pages, unsigned 
short *npages, - size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; - struct page **pages; - int nr_pages; - void *ret; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); - if (!pages) - return ERR_PTR(-ENOMEM); - - ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) - goto done; - if (nr_pages == 1) - goto fail; - - ret = io_mem_alloc_single(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) { -done: - *out_pages = pages; - *npages = nr_pages; - return ret; - } -fail: - kvfree(pages); - *out_pages = NULL; - *npages = 0; - return ret; -} - -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages) -{ - bool do_vunmap = false; - - if (!ptr) - return; - - if (put_pages && *npages) { - struct page **to_free = *pages; - int i; - - /* - * Only did vmap for the non-compound multiple page case. - * For the compound page, we just need to put the head. - */ - if (PageCompound(to_free[0])) - *npages = 1; - else if (*npages > 1) - do_vunmap = true; - for (i = 0; i < *npages; i++) - put_page(to_free[i]); - } - if (do_vunmap) - vunmap(ptr); - kvfree(*pages); - *pages = NULL; - *npages = 0; -} - struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; @@ -374,16 +290,14 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, return ERR_PTR(-EFAULT); return ctx->sq_sqes; case IORING_OFF_PBUF_RING: { - struct io_buffer_list *bl; + struct io_mapped_region *region; unsigned int bgid; - void *ptr; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return bl; - ptr = bl->buf_ring; - return ptr; + region = io_pbuf_get_region(ctx, bgid); + if (!region) + return ERR_PTR(-EINVAL); + return io_region_validate_mmap(ctx, region); } case IORING_MAP_OFF_PARAM_REGION: return io_region_validate_mmap(ctx, &ctx->param_region); @@ -392,15 +306,6 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, return ERR_PTR(-EINVAL); } -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page **pages, int npages) -{ - unsigned long nr_pages = npages; - - vm_flags_set(vma, VM_DONTEXPAND); - return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -} - #ifdef CONFIG_MMU static int io_region_mmap(struct io_ring_ctx *ctx, @@ -435,8 +340,17 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); case IORING_OFF_SQES: return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); - case IORING_OFF_PBUF_RING: - return io_pbuf_mmap(file, vma); + case IORING_OFF_PBUF_RING: { + struct io_mapped_region *region; + unsigned int bgid; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + region = io_pbuf_get_region(ctx, bgid); + if (!region) + return -EINVAL; + + return io_region_mmap(ctx, region, vma, UINT_MAX); + } case IORING_MAP_OFF_PARAM_REGION: return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 7395996eb353..c898dcba2b4e 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,13 +4,6 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page 
**pages, int npages); - -void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size); -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); From 7cd7b9575270e4c4f80cc5ff71b13f4102b59b9e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:39 +0000 Subject: [PATCH 018/150] io_uring/memmap: unify io_uring mmap'ing code All mapped memory is now backed by regions and we can unify and clean up io_region_validate_mmap() and io_uring_mmap(). Extract a function looking up a region, the rest of the handling should be generic and just needs the region. There is one more ring type specific code, i.e. the mmaping size truncation quirk for IORING_OFF_[S,C]Q_RING, which is left as is. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f5e1eda1562bfd34276de07465525ae5f10e1e84.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 3 -- io_uring/memmap.c | 81 ++++++++++++++++++----------------------------- 2 files changed, 31 insertions(+), 53 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2dfb9f9419a0..e91260a6156b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -748,8 +748,5 @@ struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, bl = xa_load(&ctx->io_bl_xa, bgid); if (!bl || !(bl->flags & IOBL_BUF_RING)) return NULL; - if (WARN_ON_ONCE(!io_region_is_set(&bl->region))) - return NULL; - return &bl->region; } diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 6d8a98bd9cac..dda846190fbd 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -254,6 +254,27 @@ int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region return 0; } +static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, + loff_t pgoff) +{ + loff_t offset = pgoff << PAGE_SHIFT; + unsigned int bgid; + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + return &ctx->ring_region; + case IORING_OFF_SQES: + return &ctx->sq_region; + case IORING_OFF_PBUF_RING: + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, bgid); + case IORING_MAP_OFF_PARAM_REGION: + return &ctx->param_region; + } + return NULL; +} + static void *io_region_validate_mmap(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { @@ -271,39 +292,12 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; + struct io_mapped_region *region; - switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - if (!ctx->rings) - return ERR_PTR(-EFAULT); - return ctx->rings; - case IORING_OFF_SQES: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - if (!ctx->sq_sqes) - return ERR_PTR(-EFAULT); - return ctx->sq_sqes; - case IORING_OFF_PBUF_RING: { - struct io_mapped_region *region; - unsigned int bgid; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - region = io_pbuf_get_region(ctx, bgid); - if (!region) - return ERR_PTR(-EINVAL); - return io_region_validate_mmap(ctx, region); - } - case 
IORING_MAP_OFF_PARAM_REGION: - return io_region_validate_mmap(ctx, &ctx->param_region); - } - - return ERR_PTR(-EINVAL); + region = io_mmap_get_region(ctx, pgoff); + if (!region) + return ERR_PTR(-EINVAL); + return io_region_validate_mmap(ctx, region); } #ifdef CONFIG_MMU @@ -324,7 +318,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned int page_limit; + unsigned int page_limit = UINT_MAX; + struct io_mapped_region *region; void *ptr; guard(mutex)(&ctx->mmap_lock); @@ -337,25 +332,11 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; - return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); - case IORING_OFF_SQES: - return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); - case IORING_OFF_PBUF_RING: { - struct io_mapped_region *region; - unsigned int bgid; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - region = io_pbuf_get_region(ctx, bgid); - if (!region) - return -EINVAL; - - return io_region_mmap(ctx, region, vma, UINT_MAX); - } - case IORING_MAP_OFF_PARAM_REGION: - return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); + break; } - return -EINVAL; + region = io_mmap_get_region(ctx, vma->vm_pgoff); + return io_region_mmap(ctx, region, vma, page_limit); } unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, From 5dbb3cbd060aa86a722d7d44278e537ae3f63081 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:31 +0530 Subject: [PATCH 019/150] block: define set of integrity flags to be inherited by cloned bip Introduce BIP_CLONE_FLAGS describing integrity flags that should be inherited in the cloned bip from the parent. Suggested-by: Christoph Hellwig Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- include/linux/bio-integrity.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2a4bd6611692..a448a25d13de 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -559,7 +559,7 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; - bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; + bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; return 0; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index dbf0f74c1529..0f0cf10222e8 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -30,6 +30,9 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ From 031141976be0bd5f385775727a4ed3cc845eb7ba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Nov 2024 16:52:32 +0530 Subject: [PATCH 020/150] block: copy back bounce buffer to user-space correctly in case of split Copy back the bounce buffer to user-space in entirety when the parent bio completes. 
The existing code uses bip_iter.bi_size for sizing the copy, which can be modified. So move away from that and fetch it from the vector passed to the block layer. While at it, switch to using better variable names. Fixes: 492c5d455969f ("block: bio-integrity: directly map user buffers") Signed-off-by: Christoph Hellwig Signed-off-by: Anuj Gupta Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-3-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index a448a25d13de..4341b0d4efa1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -118,17 +118,18 @@ static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) { - unsigned short nr_vecs = bip->bip_max_vcnt - 1; - struct bio_vec *copy = &bip->bip_vec[1]; - size_t bytes = bip->bip_iter.bi_size; - struct iov_iter iter; + unsigned short orig_nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *orig_bvecs = &bip->bip_vec[1]; + struct bio_vec *bounce_bvec = &bip->bip_vec[0]; + size_t bytes = bounce_bvec->bv_len; + struct iov_iter orig_iter; int ret; - iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); - ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + iov_iter_bvec(&orig_iter, ITER_DEST, orig_bvecs, orig_nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bounce_bvec), bytes, &orig_iter); WARN_ON_ONCE(ret != bytes); - bio_integrity_unpin_bvec(copy, nr_vecs, true); + bio_integrity_unpin_bvec(orig_bvecs, orig_nr_vecs, true); } /** From fe8f4ca7107e968b0eb7328155c8811f2a19424a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:33 +0530 Subject: [PATCH 021/150] block: modify bio_integrity_map_user to accept iov_iter as argument This patch refactors bio_integrity_map_user to accept iov_iter as argument. This is a prep patch. 
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 12 +++++------- block/blk-integrity.c | 10 +++++++++- include/linux/bio-integrity.h | 5 ++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4341b0d4efa1..f56d01cec689 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -302,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, return nr_bvecs; } -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + size_t offset, bytes = iter->count; unsigned int direction, nr_bvecs; - struct iov_iter iter; int ret, nr_vecs; - size_t offset; bool copy; if (bio_integrity(bio)) @@ -324,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) else direction = ITER_SOURCE; - iov_iter_ubuf(&iter, direction, ubuf, bytes); - nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1); if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { @@ -335,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) pages = NULL; } - copy = !iov_iter_is_aligned(&iter, align, align); - ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + copy = !iov_iter_is_aligned(iter, align, align); + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); if (unlikely(ret < 0)) goto free_bvec; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b180cac61a9d..4a29754f1bc2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { - int ret = bio_integrity_map_user(rq->bio, ubuf, bytes); + int ret; + struct iov_iter iter; + unsigned int direction; + if (op_is_write(req_op(rq))) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + iov_iter_ubuf(&iter, direction, ubuf, bytes); + ret = bio_integrity_map_user(rq->bio, &iter); if (ret) return ret; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0f0cf10222e8..58ff9988433a 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -75,7 +75,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -101,8 +101,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len) +static int bio_integrity_map_user(struct bio *bio, struct 
iov_iter *iter) { return -EINVAL; } From 10783d0ba0d7731ec81d88c54f83cf0ff89d1c2a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:34 +0530 Subject: [PATCH 022/150] fs, iov_iter: define meta io descriptor Add flags to describe checks for integrity meta buffer. Also, introduce a new 'uio_meta' structure that upper layer can use to pass the meta/integrity information. Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-5-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 9 +++++++++ include/uapi/linux/fs.h | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/uio.h b/include/linux/uio.h index 853f9de5aa05..8ada84e85447 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -82,6 +82,15 @@ struct iov_iter { }; }; +typedef __u16 uio_meta_flags_t; + +struct uio_meta { + uio_meta_flags_t flags; + u16 app_tag; + u64 seed; + struct iov_iter iter; +}; + static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..9070ef19f0a3 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -40,6 +40,15 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< Date: Thu, 28 Nov 2024 16:52:35 +0530 Subject: [PATCH 023/150] fs: introduce IOCB_HAS_METADATA for metadata Introduce an IOCB_HAS_METADATA flag for the kiocb struct, for handling requests containing meta payload. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-6-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc3d45da7b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -348,6 +348,7 @@ struct readahead_control; #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) +#define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ From 59a7d12a7fb5ab24efa893e6980a00ffc090c777 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:36 +0530 Subject: [PATCH 024/150] io_uring: introduce attributes for read/write and PI support Add the ability to pass additional attributes along with read/write. Application can prepare attibute specific information and pass its address using the SQE field: __u64 attr_ptr; Along with setting a mask indicating attributes being passed: __u64 attr_type_mask; Overall 64 attributes are allowed and currently one attribute 'IORING_RW_ATTR_FLAG_PI' is supported. With PI attribute, userspace can pass following information: - flags: integrity check flags IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG} - len: length of PI/metadata buffer - addr: address of metadata buffer - seed: seed value for reftag remapping - app_tag: application defined 16b value Process this information to prepare uio_meta_descriptor and pass it down using kiocb->private. PI attribute is supported only for direct IO. 
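For illustration, a submission using the new attribute could look roughly like the sketch below. This is not part of the patch: the helper name and the data_buf/meta_buf/first_lba parameters are placeholders, fd must refer to a file opened with O_DIRECT, and the io_uring_attr_pi structure has to stay valid until the SQE is actually submitted, since the kernel copies it at prep time.

        #include <stdint.h>
        #include <string.h>
        #include <linux/fs.h>           /* IO_INTEGRITY_CHK_* flags */
        #include <linux/io_uring.h>

        static void prep_pi_read(struct io_uring_sqe *sqe, int fd,
                                 void *data_buf, __u32 data_len,
                                 void *meta_buf, __u32 meta_len,
                                 __u64 first_lba, struct io_uring_attr_pi *pi)
        {
                memset(pi, 0, sizeof(*pi));
                pi->flags = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG;
                pi->len   = meta_len;                   /* bytes of PI buffer */
                pi->addr  = (__u64)(uintptr_t)meta_buf; /* metadata buffer */
                pi->seed  = first_lba;                  /* initial reference tag */

                memset(sqe, 0, sizeof(*sqe));
                sqe->opcode         = IORING_OP_READ;
                sqe->fd             = fd;               /* opened with O_DIRECT */
                sqe->addr           = (__u64)(uintptr_t)data_buf;
                sqe->len            = data_len;
                sqe->attr_ptr       = (__u64)(uintptr_t)pi;
                sqe->attr_type_mask = IORING_RW_ATTR_FLAG_PI;
        }
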
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Link: https://lore.kernel.org/r/20241128112240.8867-7-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 16 +++++++ io_uring/io_uring.c | 2 + io_uring/rw.c | 83 ++++++++++++++++++++++++++++++++++- io_uring/rw.h | 14 +++++- 4 files changed, 112 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index aac9a4f8fa9a..38f0d6b10eaf 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -98,6 +98,10 @@ struct io_uring_sqe { __u64 addr3; __u64 __pad2[1]; }; + struct { + __u64 attr_ptr; /* pointer to attribute information */ + __u64 attr_type_mask; /* bit mask of attributes */ + }; __u64 optval; /* * If the ring is initialized with IORING_SETUP_SQE128, then @@ -107,6 +111,18 @@ struct io_uring_sqe { }; }; +/* sqe->attr_type_mask flags */ +#define IORING_RW_ATTR_FLAG_PI (1U << 0) +/* PI attribute information */ +struct io_uring_attr_pi { + __u16 flags; + __u16 app_tag; + __u32 len; + __u64 addr; + __u64 seed; + __u64 rsvd; +}; + /* * If sqe->file_index is set to this for opcodes that instantiate a new * direct descriptor (like openat/openat2/accept), then io_uring will allocate diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8dd3fa9be993..2e6891aadecc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3876,6 +3876,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != diff --git a/io_uring/rw.c b/io_uring/rw.c index 0bcb83e4ce3c..04e4467ab0ee 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -257,11 +257,53 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; } +static inline void io_meta_save_state(struct io_async_rw *io) +{ + io->meta_state.seed = io->meta.seed; + iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); +} + +static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + io->meta.seed = io->meta_state.seed; + iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); + } +} + +static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, + u64 attr_ptr, u64 attr_type_mask) +{ + struct io_uring_attr_pi pi_attr; + struct io_async_rw *io; + int ret; + + if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), + sizeof(pi_attr))) + return -EFAULT; + + if (pi_attr.rsvd) + return -EINVAL; + + io = req->async_data; + io->meta.flags = pi_attr.flags; + io->meta.app_tag = pi_attr.app_tag; + io->meta.seed = pi_attr.seed; + ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), + pi_attr.len, &io->meta.iter); + if (unlikely(ret < 0)) + return ret; + rw->kiocb.ki_flags |= IOCB_HAS_METADATA; + io_meta_save_state(io); + return ret; +} + static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; + u64 attr_type_mask; int ret; rw->kiocb.ki_pos = READ_ONCE(sqe->off); @@ -279,11 +321,28 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; + rw->kiocb.ki_flags = 0; rw->addr = 
READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return io_prep_rw_setup(req, ddir, do_import); + ret = io_prep_rw_setup(req, ddir, do_import); + + if (unlikely(ret)) + return ret; + + attr_type_mask = READ_ONCE(sqe->attr_type_mask); + if (attr_type_mask) { + u64 attr_ptr; + + /* only PI attribute is supported currently */ + if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) + return -EINVAL; + + attr_ptr = READ_ONCE(sqe->attr_ptr); + ret = io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + } + return ret; } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -409,7 +468,9 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + io_meta_restore(io, &rw->kiocb); iov_iter_restore(&io->iter, &io->iter_state); } @@ -744,6 +805,10 @@ static bool io_rw_should_retry(struct io_kiocb *req) if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; + /* never retry for meta io */ + if (kiocb->ki_flags & IOCB_HAS_METADATA) + return false; + /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -794,7 +859,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); - kiocb->ki_flags = file->f_iocb_flags; + kiocb->ki_flags |= file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; @@ -828,6 +893,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + struct io_async_rw *io = req->async_data; + + /* + * We have a union of meta fields with wpq used for buffered-io + * in io_async_rw, so fail it here. + */ + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + kiocb->private = &io->meta; + } + return 0; } @@ -902,6 +979,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. */ iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); do { /* @@ -1125,6 +1203,7 @@ done: } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; diff --git a/io_uring/rw.h b/io_uring/rw.h index 3f432dc75441..2d7656bd268d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,6 +2,11 @@ #include +struct io_meta_state { + u32 seed; + struct iov_iter_state iter_meta; +}; + struct io_async_rw { size_t bytes_done; struct iov_iter iter; @@ -9,7 +14,14 @@ struct io_async_rw { struct iovec fast_iov; struct iovec *free_iovec; int free_iov_nr; - struct wait_page_queue wpq; + /* wpq is for buffered io, while meta fields are used with direct io */ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct io_meta_state meta_state; + }; + }; }; int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); From 2c0487d8b1f1351d48a13b77b254a2bb6de49eb3 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:37 +0530 Subject: [PATCH 025/150] block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags This patch introduces BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags which indicate how the hardware should check the integrity payload. 
BIP_CHECK_GUARD/REFTAG are conversion of existing semantics, while BIP_CHECK_APPTAG is a new flag. The driver can now just rely on block layer flags, and doesn't need to know the integrity source. Submitter of PI decides which tags to check. This would also give us a unified interface for user and kernel generated integrity. Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-8-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 5 +++++ drivers/nvme/host/core.c | 11 +++-------- include/linux/bio-integrity.h | 6 +++++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f56d01cec689..3bee43b87001 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -434,6 +434,11 @@ bool bio_integrity_prep(struct bio *bio) if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; + /* describe what tags to check in payload */ + if (bi->csum_type) + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) { printk(KERN_ERR "could not attach integrity payload\n"); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a970168a3014..766df130cd82 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1017,18 +1017,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, control |= NVME_RW_PRINFO_PRACT; } - switch (ns->head->pi_type) { - case NVME_NS_DPS_PI_TYPE3: + if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD)) control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; + if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) { + control |= NVME_RW_PRINFO_PRCHK_REF; if (op == nvme_cmd_zone_append) control |= NVME_RW_APPEND_PIREMAP; nvme_set_ref_tag(ns, cmnd, req); - break; } } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 58ff9988433a..fe2bfe122db2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -11,6 +11,9 @@ enum bip_flags { BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 6, /* guard check */ + BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ }; struct bio_integrity_payload { @@ -31,7 +34,8 @@ struct bio_integrity_payload { }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ + BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY From 472292cd8cfcfb9f2e7731c3c54196c35b8d283d Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 28 Nov 2024 16:52:38 +0530 Subject: [PATCH 026/150] nvme: add support for passing on the application tag With user integrity buffer, there is a way to specify the app_tag. Set the corresponding protocol specific flags and send the app_tag down. 
Reviewed-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-9-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 766df130cd82..0983482b3ea2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -885,6 +885,12 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, return BLK_STS_OK; } +static void nvme_set_app_tag(struct request *req, struct nvme_command *cmnd) +{ + cmnd->rw.lbat = cpu_to_le16(bio_integrity(req->bio)->app_tag); + cmnd->rw.lbatm = cpu_to_le16(0xffff); +} + static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd, struct request *req) { @@ -1025,6 +1031,10 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, control |= NVME_RW_APPEND_PIREMAP; nvme_set_ref_tag(ns, cmnd, req); } + if (bio_integrity_flagged(req->bio, BIP_CHECK_APPTAG)) { + control |= NVME_RW_PRINFO_PRCHK_APP; + nvme_set_app_tag(req, cmnd); + } } cmnd->rw.control = cpu_to_le16(control); From 18623503a3a514780214850bf8ba8b03ea0f3a4b Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:39 +0530 Subject: [PATCH 027/150] scsi: add support for user-meta interface Add support for sending user-meta buffer. Set tags to be checked using flags specified by user/block-layer. With this change, BIP_CTRL_NOCHECK becomes unused. Remove it. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-10-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 4 ++-- include/linux/bio-integrity.h | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8947dab132d7..cb7ac8736b91 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -814,14 +814,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + if (bio_integrity_flagged(bio, BIP_CHECK_GUARD)) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG)) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index fe2bfe122db2..2195bc06dcde 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -7,13 +7,12 @@ enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ - BIP_CHECK_GUARD = 1 << 6, /* guard check */ - BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ - BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ + BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ + BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 5, /* guard check */ + 
BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ }; struct bio_integrity_payload { @@ -33,8 +32,7 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; -#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY From 3d8b5a22d40435b4a7e58f06ae2cd3506b222898 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 28 Nov 2024 16:52:40 +0530 Subject: [PATCH 028/150] block: add support to pass user meta buffer If an iocb contains metadata, extract that and prepare the bip. Based on flags specified by the user, set corresponding guard/app/ref tags to be checked in bip. Reviewed-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-11-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 50 +++++++++++++++++++++++++++++++++++ block/fops.c | 45 ++++++++++++++++++++++++------- include/linux/bio-integrity.h | 7 +++++ 3 files changed, 92 insertions(+), 10 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 3bee43b87001..5d81ad9a3d20 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -364,6 +364,55 @@ free_bvec: return ret; } +static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (meta->flags & IO_INTEGRITY_CHK_GUARD) + bip->bip_flags |= BIP_CHECK_GUARD; + if (meta->flags & IO_INTEGRITY_CHK_APPTAG) + bip->bip_flags |= BIP_CHECK_APPTAG; + if (meta->flags & IO_INTEGRITY_CHK_REFTAG) + bip->bip_flags |= BIP_CHECK_REFTAG; + + bip->app_tag = meta->app_tag; +} + +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned int integrity_bytes; + int ret; + struct iov_iter it; + + if (!bi) + return -EINVAL; + /* + * original meta iterator can be bigger. + * process integrity info corresponding to current data buffer only. 
+ */ + it = meta->iter; + integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio)); + if (it.count < integrity_bytes) + return -EINVAL; + + /* should fit into two bytes */ + BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16)); + + if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS)) + return -EINVAL; + + it.count = integrity_bytes; + ret = bio_integrity_map_user(bio, &it); + if (!ret) { + bio_uio_meta_to_bip(bio, meta); + bip_set_seed(bio_integrity(bio), meta->seed); + iov_iter_advance(&meta->iter, integrity_bytes); + meta->seed += bio_integrity_intervals(bi, bio_sectors(bio)); + } + return ret; +} + /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare @@ -564,6 +613,7 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; + bip->app_tag = bip_src->app_tag; return 0; } diff --git a/block/fops.c b/block/fops.c index 13a67940d040..6d5c4fc5a216 100644 --- a/block/fops.c +++ b/block/fops.c @@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; + WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; + bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; + if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + bio_integrity_unmap_user(bio); + if (atomic_dec_and_test(&dio->ref)) { - if (!(dio->flags & DIO_IS_SYNC)) { + if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * a retry of this from blocking context. 
*/ if (unlikely(iov_iter_count(iter))) { - bio_release_pages(bio, false); - bio_clear_flag(bio, BIO_REFFED); - bio_put(bio); - blk_finish_plug(&plug); - return -EAGAIN; + ret = -EAGAIN; + goto fail; } bio->bi_opf |= REQ_NOWAIT; } + if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + ret = bio_integrity_map_iter(bio, iocb->private); + if (unlikely(ret)) + goto fail; + } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) @@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio_put(&dio->bio); return ret; +fail: + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_REFFED); + bio_put(bio); + blk_finish_plug(&plug); + return ret; } static void blkdev_bio_end_io_async(struct bio *bio) @@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } + if (iocb->ki_flags & IOCB_HAS_METADATA) + bio_integrity_unmap_user(bio); + iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { @@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, bio_iov_bvec_set(bio, iter); } else { ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + if (unlikely(ret)) + goto out_bio_put; } dio->size = bio->bi_iter.bi_size; @@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_HAS_METADATA) { + ret = bio_integrity_map_iter(bio, iocb->private); + WRITE_ONCE(iocb->private, NULL); + if (unlikely(ret)) + goto out_bio_put; + } + if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; @@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, submit_bio(bio); } return -EIOCBQUEUED; + +out_bio_put: + bio_put(bio); + return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 2195bc06dcde..de0a6c9de4d1 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -23,6 +23,7 @@ struct bio_integrity_payload { unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ + u16 app_tag; /* application tag value */ struct bvec_iter bio_iter; /* for rewinding parent bio */ @@ -78,6 +79,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -108,6 +110,11 @@ static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) return -EINVAL; } +static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + return -EINVAL; +} + static inline void bio_integrity_unmap_user(struct bio *bio) { } From aff09dc1fd3a165289011ab23cc3b46978ec741c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:27 +0800 Subject: [PATCH 029/150] block: remove unnecessary check in blk_unfreeze_check_owner() The following check of 'q->mq_freeze_owner != current' covers the previous one, so remove the unnecessary check. 
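Spelled out: current is never NULL, so when mq_freeze_owner is NULL the inequality already fails the ownership test and the function returns false either way; the dedicated NULL check is dead code. For reference, this is the resulting check (the same one as in the hunk below):

        /* owner == NULL also takes this branch, since current != NULL */
        if (q->mq_freeze_owner != current)
                return false;
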
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8ac19d4ae3c0..60f457f62913 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -142,8 +142,6 @@ static bool blk_freeze_set_owner(struct request_queue *q, /* verify the last unfreeze in owner context */ static bool blk_unfreeze_check_owner(struct request_queue *q) { - if (!q->mq_freeze_owner) - return false; if (q->mq_freeze_owner != current) return false; if (--q->mq_freeze_owner_depth == 0) { From 6f491a8d4b92d1a840fd9209cba783c84437d0b7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:28 +0800 Subject: [PATCH 030/150] block: track disk DEAD state automatically for modeling queue freeze lockdep Now we only verify the outmost freeze & unfreeze in current context in case that !q->mq_freeze_depth, so it is reliable to save disk DEAD state when we want to lock the freeze queue since the state is one per-task variable now. Doing this way can kill lots of false positive when freeze queue is called before adding disk[1]. [1] https://lore.kernel.org/linux-block/6741f6b2.050a0220.1cc393.0017.GAE@google.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 +++++-- block/blk.h | 19 +++++++++++++------ block/elevator.c | 4 ++-- block/genhd.c | 4 ++-- include/linux/blkdev.h | 2 ++ 5 files changed, 24 insertions(+), 12 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 60f457f62913..0c6a319fb936 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -131,6 +131,9 @@ static bool blk_freeze_set_owner(struct request_queue *q, if (!q->mq_freeze_depth) { q->mq_freeze_owner = owner; q->mq_freeze_owner_depth = 1; + q->mq_freeze_disk_dead = !q->disk || + test_bit(GD_DEAD, &q->disk->state) || + !blk_queue_registered(q); return true; } @@ -187,7 +190,7 @@ bool __blk_freeze_queue_start(struct request_queue *q, void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) - blk_freeze_acquire_lock(q, false, false); + blk_freeze_acquire_lock(q, false); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -235,7 +238,7 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) void blk_mq_unfreeze_queue(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) - blk_unfreeze_release_lock(q, false, false); + blk_unfreeze_release_lock(q, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); diff --git a/block/blk.h b/block/blk.h index 2c26abf505b8..8708168d50e4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -720,22 +720,29 @@ void blk_integrity_verify(struct bio *bio); void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); -static inline void blk_freeze_acquire_lock(struct request_queue *q, bool - disk_dead, bool queue_dying) +#ifdef CONFIG_LOCKDEP +static inline void blk_freeze_acquire_lock(struct request_queue *q, bool queue_dying) { - if (!disk_dead) + if (!q->mq_freeze_disk_dead) rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); if (!queue_dying) rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); } -static inline void blk_unfreeze_release_lock(struct request_queue *q, bool - disk_dead, bool queue_dying) +static inline void blk_unfreeze_release_lock(struct request_queue *q, bool queue_dying) { if (!queue_dying) 
rwsem_release(&q->q_lockdep_map, _RET_IP_); - if (!disk_dead) + if (!q->mq_freeze_disk_dead) rwsem_release(&q->io_lockdep_map, _RET_IP_); } +#else +static inline void blk_freeze_acquire_lock(struct request_queue *q, bool queue_dying) +{ +} +static inline void blk_unfreeze_release_lock(struct request_queue *q, bool queue_dying) +{ +} +#endif #endif /* BLK_INTERNAL_H */ diff --git a/block/elevator.c b/block/elevator.c index 7c3ba80e5ff4..ca0a74369f1c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -602,14 +602,14 @@ void elevator_init_mq(struct request_queue *q) * Disk isn't added yet, so verifying queue lock only manually. */ blk_freeze_queue_start_non_owner(q); - blk_freeze_acquire_lock(q, true, false); + blk_freeze_acquire_lock(q, false); blk_mq_freeze_queue_wait(q); blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_unfreeze_release_lock(q, true, false); + blk_unfreeze_release_lock(q, false); blk_mq_unfreeze_queue_non_owner(q); if (err) { diff --git a/block/genhd.c b/block/genhd.c index 79230c109fca..59ac299909b3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -692,7 +692,7 @@ void del_gendisk(struct gendisk *disk) start_drain = __blk_mark_disk_dead(disk); queue_dying = blk_queue_dying(q); if (start_drain) - blk_freeze_acquire_lock(q, true, queue_dying); + blk_freeze_acquire_lock(q, queue_dying); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); @@ -748,7 +748,7 @@ void del_gendisk(struct gendisk *disk) blk_mq_exit_queue(q); if (start_drain) - blk_unfreeze_release_lock(q, true, queue_dying); + blk_unfreeze_release_lock(q, queue_dying); } EXPORT_SYMBOL(del_gendisk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 378d3a1a22fc..522cf8eef66c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -581,6 +581,8 @@ struct request_queue { #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; + /* Records disk state in current context, used in unfreeze queue */ + bool mq_freeze_disk_dead; #endif wait_queue_head_t mq_freeze_wq; /* From b9d4eee7e04b9cfb0b4bcd748fe6b3ec517171d9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:29 +0800 Subject: [PATCH 031/150] block: don't verify queue freeze manually in elevator_init_mq() Now blk_freeze_queue_start() can track disk state automatically, and it isn't necessary to verify queue freeze manually in elevator_init_mq() any more. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/elevator.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index ca0a74369f1c..a26b96662620 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -601,16 +601,13 @@ void elevator_init_mq(struct request_queue *q) * * Disk isn't added yet, so verifying queue lock only manually. 
*/ - blk_freeze_queue_start_non_owner(q); - blk_freeze_acquire_lock(q, false); - blk_mq_freeze_queue_wait(q); + blk_mq_freeze_queue(q); blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_unfreeze_release_lock(q, false); - blk_mq_unfreeze_queue_non_owner(q); + blk_mq_unfreeze_queue(q); if (err) { pr_warn("\"%s\" elevator initialization failed, " From f6661b1d0525f3764596a1b65eeed9e75aecafa7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 27 Nov 2024 21:51:30 +0800 Subject: [PATCH 032/150] block: track queue dying state automatically for modeling queue freeze lockdep Now we only verify the outmost freeze & unfreeze in current context in case that !q->mq_freeze_depth, so it is reliable to save queue lying state when we want to lock the freeze queue since the state is one per-task variable now. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241127135133.3952153-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- block/blk.h | 12 ++++++------ block/genhd.c | 7 +++---- include/linux/blkdev.h | 6 +++++- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 0c6a319fb936..fca2ec64a06b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -134,6 +134,7 @@ static bool blk_freeze_set_owner(struct request_queue *q, q->mq_freeze_disk_dead = !q->disk || test_bit(GD_DEAD, &q->disk->state) || !blk_queue_registered(q); + q->mq_freeze_queue_dying = blk_queue_dying(q); return true; } @@ -190,7 +191,7 @@ bool __blk_freeze_queue_start(struct request_queue *q, void blk_freeze_queue_start(struct request_queue *q) { if (__blk_freeze_queue_start(q, current)) - blk_freeze_acquire_lock(q, false); + blk_freeze_acquire_lock(q); } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); @@ -238,7 +239,7 @@ bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) void blk_mq_unfreeze_queue(struct request_queue *q) { if (__blk_mq_unfreeze_queue(q, false)) - blk_unfreeze_release_lock(q, false); + blk_unfreeze_release_lock(q); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); diff --git a/block/blk.h b/block/blk.h index 8708168d50e4..cbf6a676ffe9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -721,26 +721,26 @@ void blk_integrity_prepare(struct request *rq); void blk_integrity_complete(struct request *rq, unsigned int nr_bytes); #ifdef CONFIG_LOCKDEP -static inline void blk_freeze_acquire_lock(struct request_queue *q, bool queue_dying) +static inline void blk_freeze_acquire_lock(struct request_queue *q) { if (!q->mq_freeze_disk_dead) rwsem_acquire(&q->io_lockdep_map, 0, 1, _RET_IP_); - if (!queue_dying) + if (!q->mq_freeze_queue_dying) rwsem_acquire(&q->q_lockdep_map, 0, 1, _RET_IP_); } -static inline void blk_unfreeze_release_lock(struct request_queue *q, bool queue_dying) +static inline void blk_unfreeze_release_lock(struct request_queue *q) { - if (!queue_dying) + if (!q->mq_freeze_queue_dying) rwsem_release(&q->q_lockdep_map, _RET_IP_); if (!q->mq_freeze_disk_dead) rwsem_release(&q->io_lockdep_map, _RET_IP_); } #else -static inline void blk_freeze_acquire_lock(struct request_queue *q, bool queue_dying) +static inline void blk_freeze_acquire_lock(struct request_queue *q) { } -static inline void blk_unfreeze_release_lock(struct request_queue *q, bool queue_dying) +static inline void blk_unfreeze_release_lock(struct request_queue *q) { } #endif diff --git a/block/genhd.c b/block/genhd.c index 59ac299909b3..5678194b6b1a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -661,7 +661,7 @@ void del_gendisk(struct gendisk *disk) struct 
request_queue *q = disk->queue; struct block_device *part; unsigned long idx; - bool start_drain, queue_dying; + bool start_drain; might_sleep(); @@ -690,9 +690,8 @@ void del_gendisk(struct gendisk *disk) */ mutex_lock(&disk->open_mutex); start_drain = __blk_mark_disk_dead(disk); - queue_dying = blk_queue_dying(q); if (start_drain) - blk_freeze_acquire_lock(q, queue_dying); + blk_freeze_acquire_lock(q); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); @@ -748,7 +747,7 @@ void del_gendisk(struct gendisk *disk) blk_mq_exit_queue(q); if (start_drain) - blk_unfreeze_release_lock(q, queue_dying); + blk_unfreeze_release_lock(q); } EXPORT_SYMBOL(del_gendisk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 522cf8eef66c..5d40af2ef971 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -581,8 +581,12 @@ struct request_queue { #ifdef CONFIG_LOCKDEP struct task_struct *mq_freeze_owner; int mq_freeze_owner_depth; - /* Records disk state in current context, used in unfreeze queue */ + /* + * Records disk & queue state in current context, used in unfreeze + * queue + */ bool mq_freeze_disk_dead; + bool mq_freeze_queue_dying; #endif wait_queue_head_t mq_freeze_wq; /* From b56426bcf880d0f14a482c302ab7e37f3e6c3583 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 26 Nov 2024 09:09:56 +0900 Subject: [PATCH 033/150] null_blk: Add rotational feature support To facilitate testing of kernel functions related to the rotational feature (BLK_FEAT_ROTATIONAL) of a block device (e.g. NVMe rotational bit support), add the rotational boolean configfs attribute and module parameter to the null_blk driver. If set, a null block device will report being a rotational device through it queue limits features with the BLK_FEAT_ROTATIONAL flag. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20241126000956.95983-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 13 ++++++++++++- drivers/block/null_blk/null_blk.h | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 3c3d8d200abb..32bd232cceef 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -266,6 +266,10 @@ static bool g_zone_full; module_param_named(zone_full, g_zone_full, bool, S_IRUGO); MODULE_PARM_DESC(zone_full, "Initialize the sequential write required zones of a zoned device to be full. Default: false"); +static bool g_rotational; +module_param_named(rotational, g_rotational, bool, S_IRUGO); +MODULE_PARM_DESC(rotational, "Set the rotational feature for the device. 
Default: false"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -468,6 +472,7 @@ NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); NULLB_DEVICE_ATTR(fua, bool, NULL); +NULLB_DEVICE_ATTR(rotational, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -621,6 +626,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_shared_tags, &nullb_device_attr_shared_tag_bitmap, &nullb_device_attr_fua, + &nullb_device_attr_rotational, NULL, }; @@ -706,7 +712,8 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page) "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size,zone_append_max_sectors,zone_full\n"); + "zone_size,zone_append_max_sectors,zone_full," + "rotational\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -793,6 +800,7 @@ static struct nullb_device *null_alloc_dev(void) dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; dev->fua = g_fua; + dev->rotational = g_rotational; return dev; } @@ -1938,6 +1946,9 @@ static int null_add_dev(struct nullb_device *dev) lim.features |= BLK_FEAT_FUA; } + if (dev->rotational) + lim.features |= BLK_FEAT_ROTATIONAL; + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index a7bb32f73ec3..6f9fe6171087 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -107,6 +107,7 @@ struct nullb_device { bool shared_tags; /* share tag set between devices for blk-mq */ bool shared_tag_bitmap; /* use hostwide shared tags */ bool fua; /* Support FUA */ + bool rotational; /* Fake rotational device */ }; struct nullb { From fd9b0244f5c5f63461ca9752eebd2423ae02bb59 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 28 Nov 2024 20:50:26 +0800 Subject: [PATCH 034/150] blktrace: don't centralize grabbing q->debugfs_mutex in blk_trace_ioctl Call each handler directly and the handler do grab q->debugfs_mutex, prepare for killing dependency between ->debug_mutex and ->mmap_lock. 
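For readers following the locking rework, a rough standalone model of the pattern this change moves to may help: the dispatcher no longer wraps every handler in the mutex, and each handler locks only the work that needs it. This is ordinary userspace C with invented names (trace_ctx, trace_ioctl and friends), not the blktrace code itself.

/*
 * Model of pushing a lock from an ioctl-style dispatcher into the
 * individual handlers.  pthread_mutex_t stands in for q->debugfs_mutex;
 * everything else here is made up for the sketch.
 */
#include <pthread.h>
#include <stdio.h>

struct trace_ctx {
        pthread_mutex_t lock;           /* stands in for q->debugfs_mutex */
        int configured;
};

/* Each handler takes the lock itself, only around the state it touches. */
static int trace_setup(struct trace_ctx *ctx, int arg)
{
        pthread_mutex_lock(&ctx->lock);
        ctx->configured = arg;
        pthread_mutex_unlock(&ctx->lock);
        return 0;
}

static int trace_teardown(struct trace_ctx *ctx)
{
        pthread_mutex_lock(&ctx->lock);
        ctx->configured = 0;
        pthread_mutex_unlock(&ctx->lock);
        return 0;
}

/* The dispatcher no longer takes the lock around the whole switch. */
static int trace_ioctl(struct trace_ctx *ctx, unsigned int cmd, int arg)
{
        switch (cmd) {
        case 1:
                return trace_setup(ctx, arg);
        case 2:
                return trace_teardown(ctx);
        default:
                return -1;
        }
}

int main(void)
{
        struct trace_ctx ctx = { .lock = PTHREAD_MUTEX_INITIALIZER };

        trace_ioctl(&ctx, 1, 42);
        printf("configured=%d\n", ctx.configured);
        trace_ioctl(&ctx, 2, 0);
        return 0;
}

The point of pushing the lock down, as the following patch in the series shows, is that work such as copying a buffer from user space can then happen before the lock is taken at all.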
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241128125029.4152292-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8fd292d34d89..f01aae3a2f7b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -732,34 +732,32 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) int ret, start = 0; char b[BDEVNAME_SIZE]; - mutex_lock(&q->debugfs_mutex); - switch (cmd) { case BLKTRACESETUP: snprintf(b, sizeof(b), "%pg", bdev); - ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: snprintf(b, sizeof(b), "%pg", bdev); + mutex_lock(&q->debugfs_mutex); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + mutex_unlock(&q->debugfs_mutex); break; #endif case BLKTRACESTART: start = 1; fallthrough; case BLKTRACESTOP: - ret = __blk_trace_startstop(q, start); + ret = blk_trace_startstop(q, start); break; case BLKTRACETEARDOWN: - ret = __blk_trace_remove(q); + ret = blk_trace_remove(q); break; default: ret = -ENOTTY; break; } - - mutex_unlock(&q->debugfs_mutex); return ret; } From b769a2f409e7a356db852a1bb62a32f7809b3a3c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 28 Nov 2024 20:50:27 +0800 Subject: [PATCH 035/150] blktrace: move copy_[to|from]_user() out of ->debugfs_lock Move copy_[to|from]_user() out of ->debugfs_lock and cut the dependency between mm->mmap_lock and q->debugfs_lock, then we avoids lots of lockdep false positive warning. Obviously ->debug_lock isn't needed for copy_[to|from]_user(). The only behavior change is to call blk_trace_remove() in case of setup failure handling by re-grabbing ->debugfs_lock, and this way is just fine since we do cover concurrent setup() & remove(). 
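A compact userspace model of the resulting ordering: the user buffer is copied into a local structure before the mutex is taken and copied back out after it is dropped, so the copy never nests inside the lock that would otherwise couple to mm->mmap_lock through the page fault path. memcpy() stands in for copy_from_user()/copy_to_user(), and setup_args, setup_lock and do_setup() are invented for the sketch.

/*
 * Model of "copy outside the lock": only the state update runs under the
 * mutex, the copies happen before and after it.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct setup_args {
        int buf_size;
        int buf_nr;
};

static pthread_mutex_t setup_lock = PTHREAD_MUTEX_INITIALIZER;
static struct setup_args setup_state;

static int do_setup(const void *user_arg, void *user_result)
{
        struct setup_args args;

        /* "copy_from_user" before the lock */
        memcpy(&args, user_arg, sizeof(args));

        pthread_mutex_lock(&setup_lock);
        setup_state = args;             /* only the state change is locked */
        args.buf_nr *= 2;               /* pretend setup adjusted a field */
        pthread_mutex_unlock(&setup_lock);

        /* "copy_to_user" after the lock is dropped */
        memcpy(user_result, &args, sizeof(args));
        return 0;
}

int main(void)
{
        struct setup_args in = { .buf_size = 4096, .buf_nr = 4 }, out;

        do_setup(&in, &out);
        printf("buf_nr=%d\n", out.buf_nr);
        return 0;
}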
Reported-by: syzbot+91585b36b538053343e4@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-block/67450fd4.050a0220.1286eb.0007.GAE@google.com/ Closes: https://lore.kernel.org/linux-block/6742e584.050a0220.1cc393.0038.GAE@google.com/ Closes: https://lore.kernel.org/linux-block/6742a600.050a0220.1cc393.002e.GAE@google.com/ Closes: https://lore.kernel.org/linux-block/67420102.050a0220.1cc393.0019.GAE@google.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20241128125029.4152292-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f01aae3a2f7b..18c81e6aa496 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -617,8 +617,9 @@ err: return ret; } -static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, char __user *arg) +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) { struct blk_user_trace_setup buts; int ret; @@ -627,26 +628,17 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; + mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { - __blk_trace_remove(q); + blk_trace_remove(q); return -EFAULT; } return 0; -} - -int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct block_device *bdev, - char __user *arg) -{ - int ret; - - mutex_lock(&q->debugfs_mutex); - ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->debugfs_mutex); return ret; } @@ -673,12 +665,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, .pid = cbuts.pid, }; + mutex_lock(&q->debugfs_mutex); ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + mutex_unlock(&q->debugfs_mutex); if (ret) return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { - __blk_trace_remove(q); + blk_trace_remove(q); return -EFAULT; } @@ -740,9 +734,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) case BLKTRACESETUP32: snprintf(b, sizeof(b), "%pg", bdev); - mutex_lock(&q->debugfs_mutex); ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); - mutex_unlock(&q->debugfs_mutex); break; #endif case BLKTRACESTART: From 5c292ac6e69f390179b93dc104b40903cddce636 Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:19:56 +0000 Subject: [PATCH 036/150] block: Delete bio_prio() Since commit 43b62ce3ff0a ("block: move bio io prio to a new field"), macro bio_prio() does nothing but return the value in bio->bi_ioprio. Most other places just read bio->bi_ioprio directly, so replace bi_ioprio() callsites with reading bio->bi_ioprio directly and delete that macro. 
Signed-off-by: John Garry Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241202111957.2311683-2-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/md/dm-verity-fec.c | 6 +++--- drivers/md/dm-verity-target.c | 4 ++-- include/linux/bio.h | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 62b1a44b8dd2..b0ee199009fc 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -132,7 +132,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, u8 *par, *block; struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio->bi_ioprio); if (IS_ERR(par)) return PTR_ERR(par); @@ -160,7 +160,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, if (offset >= v->fec->io_size) { dm_bufio_release(buf); - par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio_prio(bio)); + par = fec_read_parity(v, rsb, block_offset, &offset, &buf, bio->bi_ioprio); if (IS_ERR(par)) return PTR_ERR(par); } @@ -250,7 +250,7 @@ static int fec_read_bufs(struct dm_verity *v, struct dm_verity_io *io, bufio = v->bufio; } - bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio_prio(bio)); + bbuf = dm_bufio_read_with_ioprio(bufio, block, &buf, bio->bi_ioprio); if (IS_ERR(bbuf)) { DMWARN_LIMIT("%s: FEC %llu: read failed (%llu): %ld", v->data_dev->name, diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 47d595f6a76e..e86c1431b108 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -321,7 +321,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } else { data = dm_bufio_read_with_ioprio(v->bufio, hash_block, - &buf, bio_prio(bio)); + &buf, bio->bi_ioprio); } if (IS_ERR(data)) @@ -789,7 +789,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) verity_fec_init_io(io); - verity_submit_prefetch(v, io, bio_prio(bio)); + verity_submit_prefetch(v, io, bio->bi_ioprio); submit_bio_noacct(bio); diff --git a/include/linux/bio.h b/include/linux/bio.h index 7a1b3b1a8fed..99676916f3db 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -19,7 +19,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) return min(nr_segs, BIO_MAX_VECS); } -#define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) #define bio_iter_iovec(bio, iter) \ From 19206d3f5ef7f051056d2fb49203a347e4844e6e Mon Sep 17 00:00:00 2001 From: John Garry Date: Mon, 2 Dec 2024 11:19:57 +0000 Subject: [PATCH 037/150] block: Delete bio_set_prio() Since commit 43b62ce3ff0a ("block: move bio io prio to a new field"), macro bio_set_prio() does nothing but set bio->bi_ioprio. All other places just set bio->bi_ioprio directly, so replace bio_set_prio() remaining callsites with setting bio->bi_ioprio directly and delete that macro. 
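For completeness, a tiny standalone example of why the macro carries no weight after the field move; struct bio below is a cut-down stand-in, not the real definition.

#include <stdio.h>

struct bio {
        unsigned short bi_ioprio;
};

/* What the helper amounts to once the priority lives in its own field. */
#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = (prio))

int main(void)
{
        struct bio a = { 0 }, b = { 0 };

        bio_set_prio(&a, 3);            /* old spelling */
        b.bi_ioprio = 3;                /* direct assignment used here */
        printf("%d %d\n", a.bi_ioprio, b.bi_ioprio);
        return 0;
}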
Signed-off-by: John Garry Acked-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20241202111957.2311683-3-john.g.garry@oracle.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 2 +- drivers/md/bcache/movinggc.c | 2 +- drivers/md/bcache/writeback.c | 2 +- fs/bcachefs/move.c | 6 +++--- include/linux/bio.h | 2 -- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 08ce6d96d04c..2ee6e9bd4e28 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -167,7 +167,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess, bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); - bio_set_prio(bio, prio); + bio->bi_ioprio = prio; submit_bio(bio); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index ef6abf33f926..45ca134cbf02 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -82,7 +82,7 @@ static void moving_init(struct moving_io *io) bio_init(bio, NULL, bio->bi_inline_vecs, DIV_ROUND_UP(KEY_SIZE(&io->w->key), PAGE_SECTORS), 0); bio_get(bio); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&io->w->key) << 9; bio->bi_private = &io->cl; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index c1d28e365910..453efbbdc8ee 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -334,7 +334,7 @@ static void dirty_init(struct keybuf_key *w) bio_init(bio, NULL, bio->bi_inline_vecs, DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), 0); if (!io->dc->writeback_percent) - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9; bio->bi_private = w; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0ef4a86850bb..67fb651f4af4 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -292,8 +292,8 @@ int bch2_move_extent(struct moving_context *ctxt, io->write_sectors = k.k->size; bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); - bio_set_prio(&io->write.op.wbio.bio, - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->write.op.wbio.bio.bi_ioprio = + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, GFP_KERNEL)) @@ -303,7 +303,7 @@ int bch2_move_extent(struct moving_context *ctxt, io->rbio.opts = io_opts; bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); io->rbio.bio.bi_vcnt = pages; - bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); io->rbio.bio.bi_iter.bi_size = sectors << 9; io->rbio.bio.bi_opf = REQ_OP_READ; diff --git a/include/linux/bio.h b/include/linux/bio.h index 99676916f3db..1eec59699100 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -19,8 +19,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) return min(nr_segs, BIO_MAX_VECS); } -#define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) - #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) From ccb9868ab7f4b253440b8723a3487b8b9a16d371 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 4 Dec 2024 15:04:50 +0000 Subject: [PATCH 038/150] blktrace: remove redundant 
return at end of function A recent change added return 0 before an existing return statement at the end of function blk_trace_setup. The final return is now redundant, so remove it. Fixes: 64d124798244 ("blktrace: move copy_[to|from]_user() out of ->debugfs_lock") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20241204150450.399005-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 18c81e6aa496..3679a6d18934 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -639,8 +639,6 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return -EFAULT; } return 0; - - return ret; } EXPORT_SYMBOL_GPL(blk_trace_setup); From 53328a3671e965051fdbd2be9be34a0bdc4e7a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20du=20Garreau?= Date: Wed, 4 Dec 2024 09:38:39 +0100 Subject: [PATCH 039/150] block: rnull: Initialize the module in place MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using `InPlaceModule` avoids an allocation and an indirection. Signed-off-by: Benoît du Garreau Acked-by: Andreas Hindborg Link: https://lore.kernel.org/r/20241204-rnull_in_place-v1-1-efe3eafac9fb@dugarreau.fr Signed-off-by: Jens Axboe --- drivers/block/rnull.rs | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/block/rnull.rs b/drivers/block/rnull.rs index 9cca05dcf772..ddf3629d8894 100644 --- a/drivers/block/rnull.rs +++ b/drivers/block/rnull.rs @@ -32,25 +32,31 @@ module! { license: "GPL v2", } +#[pin_data] struct NullBlkModule { - _disk: Pin>>>, + #[pin] + _disk: Mutex>, } -impl kernel::Module for NullBlkModule { - fn init(_module: &'static ThisModule) -> Result { +impl kernel::InPlaceModule for NullBlkModule { + fn init(_module: &'static ThisModule) -> impl PinInit { pr_info!("Rust null_blk loaded\n"); - let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; - let disk = gen_disk::GenDiskBuilder::new() - .capacity_sectors(4096 << 11) - .logical_block_size(4096)? - .physical_block_size(4096)? - .rotational(false) - .build(format_args!("rnullb{}", 0), tagset)?; + // Use a immediately-called closure as a stable `try` block + let disk = /* try */ (|| { + let tagset = Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?; - let disk = KBox::pin_init(new_mutex!(disk, "nullb:disk"), flags::GFP_KERNEL)?; + gen_disk::GenDiskBuilder::new() + .capacity_sectors(4096 << 11) + .logical_block_size(4096)? + .physical_block_size(4096)? + .rotational(false) + .build(format_args!("rnullb{}", 0), tagset) + })(); - Ok(Self { _disk: disk }) + try_pin_init!(Self { + _disk <- new_mutex!(disk?, "nullb:disk"), + }) } } From 0e20669a91306540ce76710c12201a73b1c3612a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Dec 2024 16:08:47 +0000 Subject: [PATCH 040/150] null_blk: Remove accesses to page->index Use page->private to store the index instead of page->index. 
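The underlying idea, sketched below with a stand-in structure rather than the real struct page, is that a driver that fully owns a page can keep its own bookkeeping in a driver-private field, presumably leaving ->index free for its page-cache meaning. The types and helpers are invented for the illustration.

#include <stdio.h>

struct fake_page {
        unsigned long private;          /* owner-controlled scratch space */
};

struct nullb_page_model {
        struct fake_page *page;
};

static void store_idx(struct nullb_page_model *p, unsigned long idx)
{
        p->page->private = idx;         /* was stored in ->index before */
}

static unsigned long load_idx(const struct nullb_page_model *p)
{
        return p->page->private;
}

int main(void)
{
        struct fake_page pg = { 0 };
        struct nullb_page_model t_page = { .page = &pg };

        store_idx(&t_page, 1234);
        printf("idx=%lu\n", load_idx(&t_page));
        return 0;
}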
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20241216160849.31739-1-willy@infradead.org Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 32bd232cceef..7b674187c096 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -907,7 +907,7 @@ static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx, if (radix_tree_insert(root, idx, t_page)) { null_free_page(t_page); t_page = radix_tree_lookup(root, idx); - WARN_ON(!t_page || t_page->page->index != idx); + WARN_ON(!t_page || t_page->page->private != idx); } else if (is_cache) nullb->dev->curr_cache += PAGE_SIZE; @@ -930,7 +930,7 @@ static void null_free_device_storage(struct nullb_device *dev, bool is_cache) (void **)t_pages, pos, FREE_BATCH); for (i = 0; i < nr_pages; i++) { - pos = t_pages[i]->page->index; + pos = t_pages[i]->page->private; ret = radix_tree_delete_item(root, pos, t_pages[i]); WARN_ON(ret != t_pages[i]); null_free_page(ret); @@ -956,7 +956,7 @@ static struct nullb_page *__null_lookup_page(struct nullb *nullb, root = is_cache ? &nullb->dev->cache : &nullb->dev->data; t_page = radix_tree_lookup(root, idx); - WARN_ON(t_page && t_page->page->index != idx); + WARN_ON(t_page && t_page->page->private != idx); if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap))) return t_page; @@ -999,7 +999,7 @@ static struct nullb_page *null_insert_page(struct nullb *nullb, spin_lock_irq(&nullb->lock); idx = sector >> PAGE_SECTORS_SHIFT; - t_page->page->index = idx; + t_page->page->private = idx; t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache); radix_tree_preload_end(); @@ -1019,7 +1019,7 @@ static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page) struct nullb_page *t_page, *ret; void *dst, *src; - idx = c_page->page->index; + idx = c_page->page->private; t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true); @@ -1078,7 +1078,7 @@ again: * avoid race, we don't allow page free */ for (i = 0; i < nr_pages; i++) { - nullb->cache_flush_pos = c_pages[i]->page->index; + nullb->cache_flush_pos = c_pages[i]->page->private; /* * We found the page which is being flushed to disk by other * threads From fea4952df0eeec4e1a295ebaac9f61c0065fae87 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:09 +0100 Subject: [PATCH 041/150] driver core: bus: add irq_get_affinity callback to bus_type Introducing a callback in struct bus_type so that a subsystem can hook up the getters directly. This approach avoids exposing random getters in any subsystems APIs. Acked-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Acked-by: Greg Kroah-Hartman Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-1-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- include/linux/device/bus.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index cdc4757217f9..b18658bce2c3 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -48,6 +48,7 @@ struct fwnode_handle; * will never get called until they do. * @remove: Called when a device removed from this bus. * @shutdown: Called at shut-down time to quiesce the device. 
+ * @irq_get_affinity: Get IRQ affinity mask for the device on this bus. * * @online: Called to put the device back online (after offlining it). * @offline: Called to put the device offline for hot-removal. May fail. @@ -87,6 +88,8 @@ struct bus_type { void (*sync_state)(struct device *dev); void (*remove)(struct device *dev); void (*shutdown)(struct device *dev); + const struct cpumask *(*irq_get_affinity)(struct device *dev, + unsigned int irq_vec); int (*online)(struct device *dev); int (*offline)(struct device *dev); From 22d813bf00ba7f7a2e027dfc26f60c6ff525ba85 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:10 +0100 Subject: [PATCH 042/150] PCI: hookup irq_get_affinity callback struct bus_type has a new callback for retrieving the IRQ affinity for a device. Hook this callback up for PCI based devices. Acked-by: Bjorn Helgaas Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-2-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- drivers/pci/pci-driver.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 35270172c833..f57ea36d125d 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1670,6 +1670,19 @@ static void pci_dma_cleanup(struct device *dev) iommu_device_unuse_default_domain(dev); } +/* + * pci_device_irq_get_affinity - get IRQ affinity mask for device + * @dev: ptr to dev structure + * @irq_vec: interrupt vector number + * + * Return the CPU affinity mask for @dev and @irq_vec. + */ +static const struct cpumask *pci_device_irq_get_affinity(struct device *dev, + unsigned int irq_vec) +{ + return pci_irq_get_affinity(to_pci_dev(dev), irq_vec); +} + const struct bus_type pci_bus_type = { .name = "pci", .match = pci_bus_match, @@ -1677,6 +1690,7 @@ const struct bus_type pci_bus_type = { .probe = pci_device_probe, .remove = pci_device_remove, .shutdown = pci_device_shutdown, + .irq_get_affinity = pci_device_irq_get_affinity, .dev_groups = pci_dev_groups, .bus_groups = pci_bus_groups, .drv_groups = pci_drv_groups, From c7f63c5d13925c97a4ae9908bd933ab197872161 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:11 +0100 Subject: [PATCH 043/150] virtio: hookup irq_get_affinity callback struct bus_type has a new callback for retrieving the IRQ affinity for a device. Hook this callback up for virtio based devices. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-3-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- drivers/virtio/virtio.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index b9095751e43b..b10ed9f5b543 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -377,6 +377,24 @@ static void virtio_dev_remove(struct device *_d) of_node_put(dev->dev.of_node); } +/* + * virtio_irq_get_affinity - get IRQ affinity mask for device + * @_d: ptr to dev structure + * @irq_vec: interrupt vector number + * + * Return the CPU affinity mask for @_d and @irq_vec. 
+ */ +static const struct cpumask *virtio_irq_get_affinity(struct device *_d, + unsigned int irq_vec) +{ + struct virtio_device *dev = dev_to_virtio(_d); + + if (!dev->config->get_vq_affinity) + return NULL; + + return dev->config->get_vq_affinity(dev, irq_vec); +} + static const struct bus_type virtio_bus = { .name = "virtio", .match = virtio_dev_match, @@ -384,6 +402,7 @@ static const struct bus_type virtio_bus = { .uevent = virtio_uevent, .probe = virtio_dev_probe, .remove = virtio_dev_remove, + .irq_get_affinity = virtio_irq_get_affinity, }; int __register_virtio_driver(struct virtio_driver *driver, struct module *owner) From 1452e9b470c903fc4137a448e9f5767e92d68229 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:12 +0100 Subject: [PATCH 044/150] blk-mq: introduce blk_mq_map_hw_queues blk_mq_pci_map_queues and blk_mq_virtio_map_queues will create a CPU to hardware queue mapping based on affinity information. These two function share common code and only differ on how the affinity information is retrieved. Also, those functions are located in the block subsystem where it doesn't really fit in. They are virtio and pci subsystem specific. Thus introduce provide a generic mapping function which uses the irq_get_affinity callback from bus_type. Originally idea from Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-4-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- block/blk-mq-cpumap.c | 37 +++++++++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 2 ++ 2 files changed, 39 insertions(+) diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 9638b25fd521..ad8d6a363f24 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -54,3 +55,39 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) return NUMA_NO_NODE; } + +/** + * blk_mq_map_hw_queues - Create CPU to hardware queue mapping + * @qmap: CPU to hardware queue map + * @dev: The device to map queues + * @offset: Queue offset to use for the device + * + * Create a CPU to hardware queue mapping in @qmap. The struct bus_type + * irq_get_affinity callback will be used to retrieve the affinity. 
+ */ +void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, + struct device *dev, unsigned int offset) + +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + if (!dev->bus->irq_get_affinity) + goto fallback; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = dev->bus->irq_get_affinity(dev, queue + offset); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return; + +fallback: + WARN_ON_ONCE(qmap->nr_queues > 1); + blk_mq_clear_mq_map(qmap); +} +EXPORT_SYMBOL_GPL(blk_mq_map_hw_queues); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index c596e0e4cb75..769eab6247d4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -921,6 +921,8 @@ void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); void blk_freeze_queue_start_non_owner(struct request_queue *q); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); +void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, + struct device *dev, unsigned int offset); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); From bd326a5ad6397ccfc67af862606be107c15a43e6 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:13 +0100 Subject: [PATCH 045/150] scsi: replace blk_mq_pci_map_queues with blk_mq_map_hw_queues Replace all users of blk_mq_pci_map_queues with the more generic blk_mq_map_hw_queues. This in preparation to retire blk_mq_pci_map_queues. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-5-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- drivers/scsi/fnic/fnic_main.c | 3 +-- drivers/scsi/hisi_sas/hisi_sas.h | 1 - drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 4 ++-- drivers/scsi/megaraid/megaraid_sas_base.c | 3 +-- drivers/scsi/mpi3mr/mpi3mr.h | 1 - drivers/scsi/mpi3mr/mpi3mr_os.c | 2 +- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 3 +-- drivers/scsi/pm8001/pm8001_init.c | 2 +- drivers/scsi/pm8001/pm8001_sas.h | 1 - drivers/scsi/qla2xxx/qla_nvme.c | 3 +-- drivers/scsi/qla2xxx/qla_os.c | 4 ++-- drivers/scsi/smartpqi/smartpqi_init.c | 7 +++---- 12 files changed, 13 insertions(+), 21 deletions(-) diff --git a/drivers/scsi/fnic/fnic_main.c b/drivers/scsi/fnic/fnic_main.c index adec0df24bc4..1cb517f731f4 100644 --- a/drivers/scsi/fnic/fnic_main.c +++ b/drivers/scsi/fnic/fnic_main.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -601,7 +600,7 @@ void fnic_mq_map_queues_cpus(struct Scsi_Host *host) return; } - blk_mq_pci_map_queues(qmap, l_pdev, FNIC_PCI_OFFSET); + blk_mq_map_hw_queues(qmap, &l_pdev->dev, FNIC_PCI_OFFSET); } static int fnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h index a44768bceb9a..4101447bb8eb 100644 --- a/drivers/scsi/hisi_sas/hisi_sas.h +++ b/drivers/scsi/hisi_sas/hisi_sas.h @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 5db931663ae4..79129c977704 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3328,8 +3328,8 @@ static void hisi_sas_map_queues(struct Scsi_Host *shost) if (i == HCTX_TYPE_POLL) 
blk_mq_map_queues(qmap); else - blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev, - BASE_VECTORS_V3_HW); + blk_mq_map_hw_queues(qmap, hisi_hba->dev, + BASE_VECTORS_V3_HW); qoff += qmap->nr_queues; } } diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 50f1dcb6d584..49abd7dd75a7 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -3193,7 +3192,7 @@ static void megasas_map_queues(struct Scsi_Host *shost) map = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; map->nr_queues = instance->msix_vectors - offset; map->queue_offset = 0; - blk_mq_pci_map_queues(map, instance->pdev, offset); + blk_mq_map_hw_queues(map, &instance->pdev->dev, offset); qoff += map->nr_queues; offset += map->nr_queues; diff --git a/drivers/scsi/mpi3mr/mpi3mr.h b/drivers/scsi/mpi3mr/mpi3mr.h index 0c3e1ac076b5..0d72b5f1b69d 100644 --- a/drivers/scsi/mpi3mr/mpi3mr.h +++ b/drivers/scsi/mpi3mr/mpi3mr.h @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/drivers/scsi/mpi3mr/mpi3mr_os.c b/drivers/scsi/mpi3mr/mpi3mr_os.c index 1bef88130d0c..1e8735538b23 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_os.c +++ b/drivers/scsi/mpi3mr/mpi3mr_os.c @@ -4042,7 +4042,7 @@ static void mpi3mr_map_queues(struct Scsi_Host *shost) */ map->queue_offset = qoff; if (i != HCTX_TYPE_POLL) - blk_mq_pci_map_queues(map, mrioc->pdev, offset); + blk_mq_map_hw_queues(map, &mrioc->pdev->dev, offset); else blk_mq_map_queues(map); diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index f2a55aa5fe65..9599d7a50028 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -53,7 +53,6 @@ #include #include #include -#include #include #include "mpt3sas_base.h" @@ -11890,7 +11889,7 @@ static void scsih_map_queues(struct Scsi_Host *shost) */ map->queue_offset = qoff; if (i != HCTX_TYPE_POLL) - blk_mq_pci_map_queues(map, ioc->pdev, offset); + blk_mq_map_hw_queues(map, &ioc->pdev->dev, offset); else blk_mq_map_queues(map); diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index f8c81e53e93f..2a7822fd613e 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -105,7 +105,7 @@ static void pm8001_map_queues(struct Scsi_Host *shost) struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT]; if (pm8001_ha->number_of_intr > 1) { - blk_mq_pci_map_queues(qmap, pm8001_ha->pdev, 1); + blk_mq_map_hw_queues(qmap, &pm8001_ha->pdev->dev, 1); return; } diff --git a/drivers/scsi/pm8001/pm8001_sas.h b/drivers/scsi/pm8001/pm8001_sas.h index 42c7b3f7afbf..d3bd8683f344 100644 --- a/drivers/scsi/pm8001/pm8001_sas.h +++ b/drivers/scsi/pm8001/pm8001_sas.h @@ -56,7 +56,6 @@ #include #include #include -#include #include "pm8001_defs.h" #define DRV_NAME "pm80xx" diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c index 8f4cc136a9c9..8ee2e337c9e1 100644 --- a/drivers/scsi/qla2xxx/qla_nvme.c +++ b/drivers/scsi/qla2xxx/qla_nvme.c @@ -8,7 +8,6 @@ #include #include #include -#include #include static struct nvme_fc_port_template qla_nvme_fc_transport; @@ -841,7 +840,7 @@ static void qla_nvme_map_queues(struct nvme_fc_local_port *lport, { struct scsi_qla_host *vha = lport->private; - blk_mq_pci_map_queues(map, vha->hw->pdev, vha->irq_offset); + blk_mq_map_hw_queues(map, &vha->hw->pdev->dev, vha->irq_offset); } static void 
qla_nvme_localport_delete(struct nvme_fc_local_port *lport) diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c index 7ab717ed7232..31535beaaa16 100644 --- a/drivers/scsi/qla2xxx/qla_os.c +++ b/drivers/scsi/qla2xxx/qla_os.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -8071,7 +8070,8 @@ static void qla2xxx_map_queues(struct Scsi_Host *shost) if (USER_CTRL_IRQ(vha->hw) || !vha->hw->mqiobase) blk_mq_map_queues(qmap); else - blk_mq_pci_map_queues(qmap, vha->hw->pdev, vha->irq_offset); + blk_mq_map_hw_queues(qmap, &vha->hw->pdev->dev, + vha->irq_offset); } struct scsi_host_template qla2xxx_driver_template = { diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c index 870f37b70546..04fb24d77e9b 100644 --- a/drivers/scsi/smartpqi/smartpqi_init.c +++ b/drivers/scsi/smartpqi/smartpqi_init.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -6547,10 +6546,10 @@ static void pqi_map_queues(struct Scsi_Host *shost) struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost); if (!ctrl_info->disable_managed_interrupts) - return blk_mq_pci_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], - ctrl_info->pci_dev, 0); + blk_mq_map_hw_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT], + &ctrl_info->pci_dev->dev, 0); else - return blk_mq_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&shost->tag_set.map[HCTX_TYPE_DEFAULT]); } static inline bool pqi_is_tape_changer_device(struct pqi_scsi_dev *device) From 4425f6492a511dd12ccad924ae8c8e802c172418 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:14 +0100 Subject: [PATCH 046/150] nvme: replace blk_mq_pci_map_queues with blk_mq_map_hw_queues Replace all users of blk_mq_pci_map_queues with the more generic blk_mq_map_hw_queues. This in preparation to retire blk_mq_pci_map_queues. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-6-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 1 - drivers/nvme/host/pci.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b81af7919e94..094be164ffdc 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -16,7 +16,6 @@ #include #include "fc.h" #include -#include /* *************************** Data Structures/Defines ****************** */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 1a5ba80f1811..709328a67f91 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -463,7 +462,7 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *set) */ map->queue_offset = qoff; if (i != HCTX_TYPE_POLL && offset) - blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); + blk_mq_map_hw_queues(map, dev->dev, offset); else blk_mq_map_queues(map); qoff += map->nr_queues; From a5665c3d150c9876981e9a31161f42dac6a0d95c Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:15 +0100 Subject: [PATCH 047/150] virtio: blk/scsi: replace blk_mq_virtio_map_queues with blk_mq_map_hw_queues Replace all users of blk_mq_virtio_map_queues with the more generic blk_mq_map_hw_queues. This in preparation to retire blk_mq_virtio_map_queues. 
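The shape of the mapping these helpers build (the generic version appears in full in the blk-mq-cpumap.c hunk earlier in the series) can be modelled in a few lines of plain C: for each hardware queue, take the affinity mask of its interrupt vector and point every CPU in that mask at the queue, falling back to a simple spread if a mask is unavailable. The fixed-size bitmask and the get_affinity() helper below are simplifications for the sketch, not kernel interfaces.

#include <stdio.h>

#define NR_CPUS         8
#define NR_QUEUES       2

/* One bit per CPU: queue 0 owns CPUs 0-3, queue 1 owns CPUs 4-7. */
static const unsigned long vec_affinity[NR_QUEUES] = { 0x0f, 0xf0 };

static const unsigned long *get_affinity(unsigned int queue)
{
        return &vec_affinity[queue];    /* stands in for bus->irq_get_affinity() */
}

static void map_hw_queues(unsigned int mq_map[NR_CPUS])
{
        unsigned int queue, cpu;

        for (queue = 0; queue < NR_QUEUES; queue++) {
                const unsigned long *mask = get_affinity(queue);

                if (!mask)
                        goto fallback;
                for (cpu = 0; cpu < NR_CPUS; cpu++)
                        if (*mask & (1UL << cpu))
                                mq_map[cpu] = queue;
        }
        return;

fallback:
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                mq_map[cpu] = cpu % NR_QUEUES;
}

int main(void)
{
        unsigned int mq_map[NR_CPUS];
        unsigned int cpu;

        map_hw_queues(mq_map);
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %u -> hw queue %u\n", cpu, mq_map[cpu]);
        return 0;
}

With the affinity lookup coming from the bus_type callback, the same loop serves PCI and virtio devices alike, which is what lets the two per-bus helpers be retired in the next patch.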
Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-7-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 4 ++-- drivers/scsi/virtio_scsi.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 3efe378f1386..ed514ff46dc8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include @@ -1181,7 +1180,8 @@ static void virtblk_map_queues(struct blk_mq_tag_set *set) if (i == HCTX_TYPE_POLL) blk_mq_map_queues(&set->map[i]); else - blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); + blk_mq_map_hw_queues(&set->map[i], + &vblk->vdev->dev, 0); } } diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 8471f38b730e..60be1a0c6183 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "sd.h" @@ -746,7 +745,7 @@ static void virtscsi_map_queues(struct Scsi_Host *shost) if (i == HCTX_TYPE_POLL) blk_mq_map_queues(map); else - blk_mq_virtio_map_queues(map, vscsi->vdev, 2); + blk_mq_map_hw_queues(map, &vscsi->vdev->dev, 2); } } From 9bc1e897a821f19ba3775bb013a8a6fb121c3ca1 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 2 Dec 2024 15:00:16 +0100 Subject: [PATCH 048/150] blk-mq: remove unused queue mapping helpers There are no users left of the pci and virtio queue mapping helpers. Thus remove them. Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: John Garry Signed-off-by: Daniel Wagner Link: https://lore.kernel.org/r/20241202-refactor-blk-affinity-helpers-v6-8-27211e9c2cd5@kernel.org Signed-off-by: Jens Axboe --- block/Makefile | 2 -- block/blk-mq-pci.c | 46 ----------------------------------- block/blk-mq-virtio.c | 46 ----------------------------------- include/linux/blk-mq-pci.h | 11 --------- include/linux/blk-mq-virtio.h | 11 --------- 5 files changed, 116 deletions(-) delete mode 100644 block/blk-mq-pci.c delete mode 100644 block/blk-mq-virtio.c delete mode 100644 include/linux/blk-mq-pci.h delete mode 100644 include/linux/blk-mq-virtio.h diff --git a/block/Makefile b/block/Makefile index ddfd21c1a9ff..33748123710b 100644 --- a/block/Makefile +++ b/block/Makefile @@ -27,8 +27,6 @@ bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o -obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o -obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c deleted file mode 100644 index d47b5c73c9eb..000000000000 --- a/block/blk-mq-pci.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2016 Christoph Hellwig. - */ -#include -#include -#include -#include -#include - -#include "blk-mq.h" - -/** - * blk_mq_pci_map_queues - provide a default queue mapping for PCI device - * @qmap: CPU to hardware queue map. - * @pdev: PCI device associated with @set. 
- * @offset: Offset to use for the pci irq vector - * - * This function assumes the PCI device @pdev has at least as many available - * interrupt vectors as @set has queues. It will then query the vector - * corresponding to each queue for it's affinity mask and built queue mapping - * that maps a queue to the CPUs that have irq affinity for the corresponding - * vector. - */ -void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - for (queue = 0; queue < qmap->nr_queues; queue++) { - mask = pci_irq_get_affinity(pdev, queue + offset); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - qmap->mq_map[cpu] = qmap->queue_offset + queue; - } - - return; - -fallback: - WARN_ON_ONCE(qmap->nr_queues > 1); - blk_mq_clear_mq_map(qmap); -} -EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c deleted file mode 100644 index 68d0945c0b08..000000000000 --- a/block/blk-mq-virtio.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2016 Christoph Hellwig. - */ -#include -#include -#include -#include -#include "blk-mq.h" - -/** - * blk_mq_virtio_map_queues - provide a default queue mapping for virtio device - * @qmap: CPU to hardware queue map. - * @vdev: virtio device to provide a mapping for. - * @first_vec: first interrupt vectors to use for queues (usually 0) - * - * This function assumes the virtio device @vdev has at least as many available - * interrupt vectors as @set has queues. It will then query the vector - * corresponding to each queue for it's affinity mask and built queue mapping - * that maps a queue to the CPUs that have irq affinity for the corresponding - * vector. 
- */ -void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, - struct virtio_device *vdev, int first_vec) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - if (!vdev->config->get_vq_affinity) - goto fallback; - - for (queue = 0; queue < qmap->nr_queues; queue++) { - mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - qmap->mq_map[cpu] = qmap->queue_offset + queue; - } - - return; - -fallback: - blk_mq_map_queues(qmap); -} -EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h deleted file mode 100644 index ca544e1d3508..000000000000 --- a/include/linux/blk-mq-pci.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BLK_MQ_PCI_H -#define _LINUX_BLK_MQ_PCI_H - -struct blk_mq_queue_map; -struct pci_dev; - -void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset); - -#endif /* _LINUX_BLK_MQ_PCI_H */ diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h deleted file mode 100644 index 13226e9b22dd..000000000000 --- a/include/linux/blk-mq-virtio.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BLK_MQ_VIRTIO_H -#define _LINUX_BLK_MQ_VIRTIO_H - -struct blk_mq_queue_map; -struct virtio_device; - -void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, - struct virtio_device *vdev, int first_vec); - -#endif /* _LINUX_BLK_MQ_VIRTIO_H */ From cc76ace465d6977b47daa427379b7be1e0976f12 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 Dec 2024 07:01:59 +0100 Subject: [PATCH 049/150] block: remove BLK_MQ_F_SHOULD_MERGE BLK_MQ_F_SHOULD_MERGE is set for all tag_sets except those that purely process passthrough commands (bsg-lib, ufs tmf, various nvme admin queues) and thus don't even check the flag. Remove it to simplify the driver interface. 
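The simplification can be seen in miniature below: once every tag set that can reach the merge path sets the flag anyway, the flag test in the gate is dead and only the real condition, a non-empty per-context list, remains. The types and names are stand-ins, not blk-mq code.

#include <stdbool.h>
#include <stdio.h>

#define F_SHOULD_MERGE  (1u << 0)

struct hw_ctx {
        unsigned int flags;
        int nr_pending;                 /* stands in for ctx->rq_lists[type] */
};

/* Before: gate on the flag and on having requests to merge against. */
static bool try_merge_old(const struct hw_ctx *hctx)
{
        return (hctx->flags & F_SHOULD_MERGE) && hctx->nr_pending > 0;
}

/* After: the flag is gone, only the meaningful check is left. */
static bool try_merge_new(const struct hw_ctx *hctx)
{
        return hctx->nr_pending > 0;
}

int main(void)
{
        struct hw_ctx hctx = { .flags = F_SHOULD_MERGE, .nr_pending = 1 };

        printf("old=%d new=%d\n", try_merge_old(&hctx), try_merge_new(&hctx));
        return 0;
}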
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241219060214.1928848-1-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 1 - block/blk-mq-debugfs.c | 1 - block/blk-mq-sched.c | 3 +-- drivers/block/amiflop.c | 1 - drivers/block/aoe/aoeblk.c | 1 - drivers/block/ataflop.c | 1 - drivers/block/floppy.c | 1 - drivers/block/loop.c | 3 +-- drivers/block/mtip32xx/mtip32xx.c | 1 - drivers/block/nbd.c | 3 +-- drivers/block/null_blk/main.c | 2 -- drivers/block/ps3disk.c | 3 +-- drivers/block/rbd.c | 1 - drivers/block/rnbd/rnbd-clt.c | 3 +-- drivers/block/sunvdc.c | 2 +- drivers/block/swim.c | 2 +- drivers/block/swim3.c | 3 +-- drivers/block/ublk_drv.c | 1 - drivers/block/virtio_blk.c | 1 - drivers/block/xen-blkfront.c | 1 - drivers/block/z2ram.c | 1 - drivers/cdrom/gdrom.c | 2 +- drivers/md/dm-rq.c | 2 +- drivers/memstick/core/ms_block.c | 3 +-- drivers/memstick/core/mspro_block.c | 3 +-- drivers/mmc/core/queue.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/mtd/ubi/block.c | 2 +- drivers/nvme/host/apple.c | 1 - drivers/nvme/host/core.c | 1 - drivers/s390/block/dasd_genhd.c | 1 - drivers/s390/block/scm_blk.c | 1 - drivers/scsi/scsi_lib.c | 1 - include/linux/blk-mq.h | 1 - 34 files changed, 15 insertions(+), 43 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 66c1a8835e36..0b1e61f72fb3 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -865,7 +865,6 @@ static int ubd_add(int n, char **error_out) ubd_dev->tag_set.ops = &ubd_mq_ops; ubd_dev->tag_set.queue_depth = 64; ubd_dev->tag_set.numa_node = NUMA_NO_NODE; - ubd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ubd_dev->tag_set.driver_data = ubd_dev; ubd_dev->tag_set.nr_hw_queues = 1; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5463697a8442..4b6b20ccdb53 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -181,7 +181,6 @@ static const char *const alloc_policy_name[] = { #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { - HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 451a2c1f1f32..7442ca27c2bf 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -351,8 +351,7 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; - if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || - list_empty_careful(&ctx->rq_lists[type])) + if (list_empty_careful(&ctx->rq_lists[type])) goto out_put; /* default per sw-queue merge */ diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 49ced65bef4c..9edd4468f755 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1819,7 +1819,6 @@ static int fd_alloc_drive(int drive) unit[drive].tag_set.nr_maps = 1; unit[drive].tag_set.queue_depth = 2; unit[drive].tag_set.numa_node = NUMA_NO_NODE; - unit[drive].tag_set.flags = BLK_MQ_F_SHOULD_MERGE; if (blk_mq_alloc_tag_set(&unit[drive].tag_set)) goto out_cleanup_trackbuf; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 2028795ec61c..00b74a845328 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -368,7 +368,6 @@ aoeblk_gdalloc(void *vp) set->nr_hw_queues = 1; set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; - set->flags = BLK_MQ_F_SHOULD_MERGE; err = 
blk_mq_alloc_tag_set(set); if (err) { pr_err("aoe: cannot allocate tag set for %ld.%d\n", diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 4ba98c6654be..110f9aca2667 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -2088,7 +2088,6 @@ static int __init atari_floppy_init (void) unit[i].tag_set.nr_maps = 1; unit[i].tag_set.queue_depth = 2; unit[i].tag_set.numa_node = NUMA_NO_NODE; - unit[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ret = blk_mq_alloc_tag_set(&unit[i].tag_set); if (ret) goto err; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 3affb538b989..abf0486f0d4f 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4596,7 +4596,6 @@ static int __init do_floppy_init(void) tag_sets[drive].nr_maps = 1; tag_sets[drive].queue_depth = 2; tag_sets[drive].numa_node = NUMA_NO_NODE; - tag_sets[drive].flags = BLK_MQ_F_SHOULD_MERGE; err = blk_mq_alloc_tag_set(&tag_sets[drive]); if (err) goto out_put_disk; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 8f6761c27c68..836a53eef4b4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2023,8 +2023,7 @@ static int loop_add(int i) lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); - lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | - BLK_MQ_F_NO_SCHED_BY_DEFAULT; + lo->tag_set.flags = BLK_MQ_F_STACKING | BLK_MQ_F_NO_SCHED_BY_DEFAULT; lo->tag_set.driver_data = lo; err = blk_mq_alloc_tag_set(&lo->tag_set); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 43701b7b10a7..95361099a2dc 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3416,7 +3416,6 @@ static int mtip_block_initialize(struct driver_data *dd) dd->tags.reserved_tags = 1; dd->tags.cmd_size = sizeof(struct mtip_cmd); dd->tags.numa_node = dd->numa_node; - dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; dd->tags.driver_data = dd; dd->tags.timeout = MTIP_NCQ_CMD_TIMEOUT_MS; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b852050d8a96..b1a5af69a66d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1841,8 +1841,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) nbd->tag_set.queue_depth = 128; nbd->tag_set.numa_node = NUMA_NO_NODE; nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); - nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_BLOCKING; + nbd->tag_set.flags = BLK_MQ_F_BLOCKING; nbd->tag_set.driver_data = nbd; INIT_WORK(&nbd->remove_work, nbd_dev_remove_work); nbd->backend = NULL; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 7b674187c096..178e62cd9a9f 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1791,7 +1791,6 @@ static int null_init_global_tag_set(void) tag_set.nr_hw_queues = g_submit_queues; tag_set.queue_depth = g_hw_queue_depth; tag_set.numa_node = g_home_node; - tag_set.flags = BLK_MQ_F_SHOULD_MERGE; if (g_no_sched) tag_set.flags |= BLK_MQ_F_NO_SCHED; if (g_shared_tag_bitmap) @@ -1817,7 +1816,6 @@ static int null_setup_tagset(struct nullb *nullb) nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues; nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; nullb->tag_set->numa_node = nullb->dev->home_node; - nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; if (nullb->dev->no_sched) nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED; if (nullb->dev->shared_tag_bitmap) diff --git a/drivers/block/ps3disk.c 
b/drivers/block/ps3disk.c index ff45ed766469..68fed46c463e 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -434,8 +434,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) ps3disk_identify(dev); - error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1, - BLK_MQ_F_SHOULD_MERGE); + error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1, 0); if (error) goto fail_teardown; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ac421dbeeb11..5b393e4a1ddf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4964,7 +4964,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; rbd_dev->tag_set.numa_node = NUMA_NO_NODE; - rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; rbd_dev->tag_set.nr_hw_queues = num_present_cpus(); rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index c34695d2eea7..82467ecde7ec 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1209,8 +1209,7 @@ static int setup_mq_tags(struct rnbd_clt_session *sess) tag_set->ops = &rnbd_mq_ops; tag_set->queue_depth = sess->queue_depth; tag_set->numa_node = NUMA_NO_NODE; - tag_set->flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_TAG_QUEUE_SHARED; + tag_set->flags = BLK_MQ_F_TAG_QUEUE_SHARED; tag_set->cmd_size = sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE; /* for HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL */ diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 2d38331ee667..88dcae6ec575 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -829,7 +829,7 @@ static int probe_disk(struct vdc_port *port) } err = blk_mq_alloc_sq_tag_set(&port->tag_set, &vdc_mq_ops, - VDC_TX_RING_SIZE, BLK_MQ_F_SHOULD_MERGE); + VDC_TX_RING_SIZE, 0); if (err) return err; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index be4ac58afe41..eda33c5eb5e2 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -818,7 +818,7 @@ static int swim_floppy_init(struct swim_priv *swd) for (drive = 0; drive < swd->floppy_count; drive++) { err = blk_mq_alloc_sq_tag_set(&swd->unit[drive].tag_set, - &swim_mq_ops, 2, BLK_MQ_F_SHOULD_MERGE); + &swim_mq_ops, 2, 0); if (err) goto exit_put_disks; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 90be1017f7bf..9914153b365b 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1208,8 +1208,7 @@ static int swim3_attach(struct macio_dev *mdev, fs = &floppy_states[floppy_count]; memset(fs, 0, sizeof(*fs)); - rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); + rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2, 0); if (rc) goto out_unregister; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index d4aed12dd436..6c16cb798fdd 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2205,7 +2205,6 @@ static int ublk_add_tag_set(struct ublk_device *ub) ub->tag_set.queue_depth = ub->dev_info.queue_depth; ub->tag_set.numa_node = NUMA_NO_NODE; ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); - ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ub->tag_set.driver_data = ub; return blk_mq_alloc_tag_set(&ub->tag_set); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index ed514ff46dc8..71a7ffeafb32 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1481,7 +1481,6 @@ static int 
virtblk_probe(struct virtio_device *vdev) vblk->tag_set.ops = &virtio_mq_ops; vblk->tag_set.queue_depth = queue_depth; vblk->tag_set.numa_node = NUMA_NO_NODE; - vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; vblk->tag_set.cmd_size = sizeof(struct virtblk_req) + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 59ce113b882a..edcd08a9dcef 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1131,7 +1131,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, } else info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; - info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; info->tag_set.cmd_size = sizeof(struct blkif_req); info->tag_set.driver_data = info; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 4b7219be1bb8..8c1c7f4211eb 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -354,7 +354,6 @@ static int __init z2_init(void) tag_set.nr_maps = 1; tag_set.queue_depth = 16; tag_set.numa_node = NUMA_NO_NODE; - tag_set.flags = BLK_MQ_F_SHOULD_MERGE; ret = blk_mq_alloc_tag_set(&tag_set); if (ret) goto out_unregister_blkdev; diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 64b097e830d4..85aceab5eac6 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -777,7 +777,7 @@ static int probe_gdrom(struct platform_device *devptr) probe_gdrom_setupcd(); err = blk_mq_alloc_sq_tag_set(&gd.tag_set, &gdrom_mq_ops, 1, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); + BLK_MQ_F_BLOCKING); if (err) goto probe_fail_free_cd_info; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 499f8cc8a39f..e23076f7ece2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) md->tag_set->ops = &dm_mq_ops; md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); md->tag_set->numa_node = md->numa_node_id; - md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING; + md->tag_set->flags = BLK_MQ_F_STACKING; md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); md->tag_set->driver_data = md; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 20a2466bec23..5b617c1f6789 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2094,8 +2094,7 @@ static int msb_init_disk(struct memstick_dev *card) if (msb->disk_id < 0) return msb->disk_id; - rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &msb_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); + rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &msb_mq_ops, 2, 0); if (rc) goto out_release_id; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 13b317c56069..634d343b6bdb 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1139,8 +1139,7 @@ static int mspro_block_init_disk(struct memstick_dev *card) if (disk_id < 0) return disk_id; - rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &mspro_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); + rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &mspro_mq_ops, 2, 0); if (rc) goto out_release_id; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 4d6844261912..ab662f502fe7 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -441,7 +441,7 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, else mq->tag_set.queue_depth = MMC_QUEUE_DEPTH; mq->tag_set.numa_node = NUMA_NO_NODE; 
- mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + mq->tag_set.flags = BLK_MQ_F_BLOCKING; mq->tag_set.nr_hw_queues = 1; mq->tag_set.cmd_size = sizeof(struct mmc_queue_req); mq->tag_set.driver_data = mq; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 47ead84407cd..ee7e1d908986 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -329,7 +329,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) goto out_list_del; ret = blk_mq_alloc_sq_tag_set(new->tag_set, &mtd_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); + BLK_MQ_F_BLOCKING); if (ret) goto out_kfree_tag_set; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 60d0155be869..2836905f0152 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -383,7 +383,7 @@ int ubiblock_create(struct ubi_volume_info *vi) dev->tag_set.ops = &ubiblock_mq_ops; dev->tag_set.queue_depth = 64; dev->tag_set.numa_node = NUMA_NO_NODE; - dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; + dev->tag_set.flags = BLK_MQ_F_BLOCKING; dev->tag_set.cmd_size = sizeof(struct ubiblock_pdu); dev->tag_set.driver_data = dev; dev->tag_set.nr_hw_queues = 1; diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 4319ab50c10d..83c60468542c 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1275,7 +1275,6 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv) anv->tagset.timeout = NVME_IO_TIMEOUT; anv->tagset.numa_node = NUMA_NO_NODE; anv->tagset.cmd_size = sizeof(struct apple_nvme_iod); - anv->tagset.flags = BLK_MQ_F_SHOULD_MERGE; anv->tagset.driver_data = &anv->ioq; ret = blk_mq_alloc_tag_set(&anv->tagset); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a970168a3014..42283d268500 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4639,7 +4639,6 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, /* Reserved for fabric connect */ set->reserved_tags = 1; set->numa_node = ctrl->numa_node; - set->flags = BLK_MQ_F_SHOULD_MERGE; if (ctrl->ops->flags & NVME_F_BLOCKING) set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 6da47a65af61..28e92fad0ca1 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -56,7 +56,6 @@ int dasd_gendisk_alloc(struct dasd_block *block) block->tag_set.cmd_size = sizeof(struct dasd_ccw_req); block->tag_set.nr_hw_queues = nr_hw_queues; block->tag_set.queue_depth = queue_depth; - block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; block->tag_set.numa_node = NUMA_NO_NODE; rc = blk_mq_alloc_tag_set(&block->tag_set); if (rc) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index 3fcfe029db1b..91bbe9d2e5ac 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -461,7 +461,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) bdev->tag_set.cmd_size = sizeof(blk_status_t); bdev->tag_set.nr_hw_queues = nr_requests; bdev->tag_set.queue_depth = nr_requests_per_io * nr_requests; - bdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; bdev->tag_set.numa_node = NUMA_NO_NODE; ret = blk_mq_alloc_tag_set(&bdev->tag_set); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index adee6f60c966..5cf124e13097 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2065,7 +2065,6 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) 
tag_set->queue_depth = shost->can_queue; tag_set->cmd_size = cmd_size; tag_set->numa_node = dev_to_node(shost->dma_dev); - tag_set->flags = BLK_MQ_F_SHOULD_MERGE; tag_set->flags |= BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); if (shost->queuecommand_may_block) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 769eab6247d4..7f6c482ebf54 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -668,7 +668,6 @@ struct blk_mq_ops { /* Keep hctx_flag_name[] in sync with the definitions below */ enum { - BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for From 31d813a3b8cbde2d09ba4dee282ca29096541006 Mon Sep 17 00:00:00 2001 From: Andreas Hindborg Date: Fri, 20 Dec 2024 10:37:57 +0100 Subject: [PATCH 050/150] rust: block: fix use of BLK_MQ_F_SHOULD_MERGE BLK_MQ_F_SHOULD_MERGE has was removed [1] and is now in effect by default. So remove the flag from tag sets of Rust block device drivers. Link: https://lore.kernel.org/r/20241219060214.1928848-1-hch@lst.de [1] Fixes: 9377b95cda73 ("block: remove BLK_MQ_F_SHOULD_MERGE") Signed-off-by: Andreas Hindborg Link: https://lore.kernel.org/r/20241220-merge-flag-fix-v1-1-41b7778dac06@kernel.org Signed-off-by: Jens Axboe --- rust/kernel/block/mq/tag_set.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs index d7f175a05d99..00ddcc71dfa2 100644 --- a/rust/kernel/block/mq/tag_set.rs +++ b/rust/kernel/block/mq/tag_set.rs @@ -52,7 +52,7 @@ impl TagSet { numa_node: bindings::NUMA_NO_NODE, queue_depth: num_tags, cmd_size, - flags: bindings::BLK_MQ_F_SHOULD_MERGE, + flags: 0, driver_data: core::ptr::null_mut::(), nr_maps: num_maps, ..tag_set From 48ea518d0072e29a7af304258a4cedb3e7eea308 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 17 Dec 2024 13:03:07 -0800 Subject: [PATCH 051/150] blk-zoned: Minimize #include directives Only include those header files that are necessary. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241217210310.645966-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 84da1eadff64..1575b887fa38 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -11,12 +11,8 @@ */ #include -#include #include #include -#include -#include -#include #include #include #include From cbac56e5237dbec9ae3d896e71f24e95e06eafa3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 17 Dec 2024 13:03:08 -0800 Subject: [PATCH 052/150] blk-zoned: Document locking assumptions Document which functions expect that their callers must hold a lock. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241217210310.645966-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 1575b887fa38..954724a2e3c6 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -459,6 +459,8 @@ static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, struct blk_zone_wplug *zwplug) { + lockdep_assert_held(&zwplug->lock); + /* If the zone write plug was already removed, we are done. 
*/ if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) return false; @@ -913,6 +915,8 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, { struct gendisk *disk = bio->bi_bdev->bd_disk; + lockdep_assert_held(&zwplug->lock); + /* * If we lost track of the zone write pointer due to a write error, * the user must either execute a report zones, reset the zone or finish From fa8555630b32da7c239a1e01e9eb1bb040be59ac Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 17 Dec 2024 13:03:09 -0800 Subject: [PATCH 053/150] blk-zoned: Improve the queue reference count strategy documentation For the blk_queue_exit() calls, document where the corresponding code can be found that increases q->q_usage_counter. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241217210310.645966-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 954724a2e3c6..7876a6458022 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -582,6 +582,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); bio_io_error(bio); disk_put_zone_wplug(zwplug); + /* Drop the reference taken by disk_zone_wplug_add_bio(() */ blk_queue_exit(q); } @@ -893,10 +894,7 @@ void blk_zone_write_plug_init_request(struct request *req) break; } - /* - * Drop the extra reference on the queue usage we got when - * plugging the BIO and advance the write pointer offset. - */ + /* Drop the reference taken by disk_zone_wplug_add_bio(). */ blk_queue_exit(q); zwplug->wp_offset += bio_sectors(bio); From cb01ecb79943367f9903b1f1ffb4afb6a3f4d715 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 17 Dec 2024 13:03:10 -0800 Subject: [PATCH 054/150] blk-zoned: Split queue_zone_wplugs_show() Reduce the indentation level of the code in queue_zone_wplugs_show() by moving the body of the loop in that function into a new function. 
Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20241217210310.645966-5-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7876a6458022..4b0be40a8ea7 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1774,37 +1774,41 @@ int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); #ifdef CONFIG_BLK_DEBUG_FS +static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, + struct seq_file *m) +{ + unsigned int zwp_wp_offset, zwp_flags; + unsigned int zwp_zone_no, zwp_ref; + unsigned int zwp_bio_list_size; + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + zwp_zone_no = zwplug->zone_no; + zwp_flags = zwplug->flags; + zwp_ref = refcount_read(&zwplug->ref); + zwp_wp_offset = zwplug->wp_offset; + zwp_bio_list_size = bio_list_size(&zwplug->bio_list); + spin_unlock_irqrestore(&zwplug->lock, flags); + + seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref, + zwp_wp_offset, zwp_bio_list_size); +} int queue_zone_wplugs_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct gendisk *disk = q->disk; struct blk_zone_wplug *zwplug; - unsigned int zwp_wp_offset, zwp_flags; - unsigned int zwp_zone_no, zwp_ref; - unsigned int zwp_bio_list_size, i; - unsigned long flags; + unsigned int i; if (!disk->zone_wplugs_hash) return 0; rcu_read_lock(); - for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { - hlist_for_each_entry_rcu(zwplug, - &disk->zone_wplugs_hash[i], node) { - spin_lock_irqsave(&zwplug->lock, flags); - zwp_zone_no = zwplug->zone_no; - zwp_flags = zwplug->flags; - zwp_ref = refcount_read(&zwplug->ref); - zwp_wp_offset = zwplug->wp_offset; - zwp_bio_list_size = bio_list_size(&zwplug->bio_list); - spin_unlock_irqrestore(&zwplug->lock, flags); - - seq_printf(m, "%u 0x%x %u %u %u\n", - zwp_zone_no, zwp_flags, zwp_ref, - zwp_wp_offset, zwp_bio_list_size); - } - } + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], + node) + queue_zone_wplug_show(zwplug, m); rcu_read_unlock(); return 0; From 546d191427cf5cf3215529744c2ea8558f0279db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Nov 2024 15:53:58 -0700 Subject: [PATCH 055/150] block: make bio_integrity_map_user() static inline If CONFIG_BLK_DEV_INTEGRITY isn't set, then the dummy helper must be static inline to avoid complaints about the function being unused. 
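The fix follows the standard pattern for config-gated stubs that live in headers: a plain "static" definition is reported as defined-but-unused in every file that includes the header without calling it, while "static inline" lets the compiler drop the unused body silently. A minimal, hypothetical illustration of the pattern (not the actual bio-integrity header):

    /* hypothetical feature.h, showing the stub pattern only */
    #include <linux/errno.h>

    struct bio;
    struct iov_iter;

    #ifdef CONFIG_FEATURE
    int feature_map_user(struct bio *bio, struct iov_iter *iter); /* real code in a .c file */
    #else
    /*
     * "static" alone would trip -Wunused-function in every includer that
     * never calls the stub; "static inline" suppresses that and costs nothing.
     */
    static inline int feature_map_user(struct bio *bio, struct iov_iter *iter)
    {
            return -EINVAL;
    }
    #endif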
Fixes: fe8f4ca7107e ("block: modify bio_integrity_map_user to accept iov_iter as argument") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411300229.y7h60mDg-lkp@intel.com/ Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index de0a6c9de4d1..802f52e38efd 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -105,7 +105,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) +static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } From febfbf767174b8a027efb7aa434f9a9416adc23d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 4 Dec 2024 15:39:23 +0000 Subject: [PATCH 056/150] io_uring/kbuf: fix unintentional sign extension on shift of reg.bgid Shifting reg.bgid << IORING_OFF_PBUF_SHIFT results in a promotion from __u16 to a 32 bit signed integer, this is then sign extended to a 64 bit unsigned long on 64 bit architectures. If reg.bgid is greater than 0x7fff then this leads to a sign extended result where all the upper 32 bits of mmap_offset are set to 1. Fix this by casting reg.bgid to the same type as mmap_offset before performing the shift. Fixes: ef62de3c4ad5 ("io_uring/kbuf: use region api for pbuf rings") Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20241204153923.401674-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e91260a6156b..15e5e6ec5968 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -640,7 +640,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - mmap_offset = reg.bgid << IORING_OFF_PBUF_SHIFT; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; ring_size = flex_array_size(br, bufs, reg.ring_entries); memset(&rd, 0, sizeof(rd)); From 2e6406a20a3999cc761a5a697a9afa7de40713e6 Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 6 Dec 2024 16:41:44 -0800 Subject: [PATCH 057/150] io_uring: clean up io_prep_rw_setup() Remove unnecessary call to iov_iter_save_state() in io_prep_rw_setup() as io_import_iovec() already does this. Then the result from io_import_iovec() can be returned directly. 
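The promotion/sign-extension hazard fixed in [PATCH 056/150] above is easy to reproduce in isolation. A standalone demonstration with hypothetical values; 16 stands in for IORING_OFF_PBUF_SHIFT and an LP64 target is assumed:

    #include <stdio.h>

    int main(void)
    {
            unsigned short bgid = 0x9000;   /* any group id above 0x7fff */

            /*
             * The shift promotes the 16-bit value to a signed int, the result
             * lands in the sign bit (formally an overflow for signed int), and
             * widening it to a 64-bit unsigned long then sign-extends, setting
             * all upper 32 bits.  Casting first keeps the arithmetic unsigned.
             */
            unsigned long bad  = bgid << 16;
            unsigned long good = (unsigned long)bgid << 16;

            printf("bad  = %#lx\n", bad);   /* 0xffffffff90000000 */
            printf("good = %#lx\n", good);  /* 0x90000000 */
            return 0;
    }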
Signed-off-by: David Wei Reviewed-by: Anuj Gupta Tested-by: Li Zetao Link: https://lore.kernel.org/r/20241207004144.783631-1-dw@davidwei.uk Signed-off-by: Jens Axboe --- io_uring/rw.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 04e4467ab0ee..5b24fd8b69f6 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -240,7 +240,6 @@ done: static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) { struct io_async_rw *rw; - int ret; if (io_rw_alloc_async(req)) return -ENOMEM; @@ -249,12 +248,7 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) return 0; rw = req->async_data; - ret = io_import_iovec(ddir, req, rw, 0); - if (unlikely(ret < 0)) - return ret; - - iov_iter_save_state(&rw->iter, &rw->iter_state); - return 0; + return io_import_iovec(ddir, req, rw, 0); } static inline void io_meta_save_state(struct io_async_rw *io) From de3b9e2e48190be28473ea84c384bc64931facf7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 8 Dec 2024 21:46:01 +0000 Subject: [PATCH 058/150] io_uring: don't vmap single page regions When io_check_coalesce_buffer() meets a single page buffer it bails out and tells that it can be coalesced. That's fine for registered buffers as io_coalesce_buffer() wouldn't change anything, but the region code now uses the function to decided on whether to vmap the buffer or not. Report that a single page buffer is trivially coalescable and let io_sqe_buffer_register() to filter them. Fixes: c4d0ac1c1567 ("io_uring/memmap: optimise single folio regions") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cb83e053f318857068447d40c95becebcd8aeced.1733689833.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 2d2333970eb5..f2ff108485c8 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -675,14 +675,9 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, unsigned int count = 1, nr_folios = 1; int i; - if (nr_pages <= 1) - return false; - data->nr_pages_mid = folio_nr_pages(folio); - if (data->nr_pages_mid == 1) - return false; - data->folio_shift = folio_shift(folio); + /* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. @@ -750,8 +745,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ - if (io_check_coalesce_buffer(pages, nr_pages, &data)) - coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); + if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { + if (data.nr_pages_mid != 1) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); + } imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) From 29b95ac917927ce9f95bf38797e16333ecb489b1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 8 Dec 2024 21:43:22 +0000 Subject: [PATCH 059/150] io_uring: prevent reg-wait speculations With *ENTER_EXT_ARG_REG instead of passing a user pointer with arguments for the waiting loop the user can specify an offset into a pre-mapped region of memory, in which case the [offset, offset + sizeof(io_uring_reg_wait)) will be intepreted as the argument. As we address a kernel array using a user given index, it'd be a subject to speculation type of exploits. Use array_index_nospec() to prevent that. 
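In general form the mitigation looks like the sketch below (made-up names 'tbl' and 'nr', not the io_uring code itself):

    #include <linux/nospec.h>

    static inline void *bounded_lookup(void **tbl, unsigned long idx,
                                       unsigned long nr)
    {
            if (idx >= nr)
                    return NULL;
            /*
             * The bounds check above is not honoured on speculatively executed
             * paths; array_index_nospec() clamps idx into [0, nr) without a
             * branch, so even a mispredicted path cannot index out of bounds.
             */
            idx = array_index_nospec(idx, nr);
            return tbl[idx];
    }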
Make sure to pass not the full region size but truncate by the maximum offset allowed considering the structure size. Fixes: d617b3147d54c ("io_uring: restore back registered wait arguments") Fixes: aa00f67adc2c0 ("io_uring: add support for fixed wait regions") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1e3d9da7c43d619de7bcf41d1cd277ab2688c443.1733694126.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2e6891aadecc..66a93170ad68 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3211,6 +3211,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, end > ctx->cq_wait_size)) return ERR_PTR(-EFAULT); + offset = array_index_nospec(offset, ctx->cq_wait_size - size); return ctx->cq_wait_arg + offset; } From 479b2f4590bebd4a5a5ac9cb4c4803f4edf86768 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:07 -0500 Subject: [PATCH 060/150] io_uring: Fold allocation into alloc_cache helper The allocation paths that use alloc_cache duplicate the same code pattern, sometimes in a quite convoluted way. Fold the allocation into the cache code itself, making it just an allocator function, and keeping the cache policy invisible to callers. Another justification for doing this, beyond code simplicity, is that it makes it trivial to test the impact of disabling the cache and using slab directly, which I've used for slab improvement experiments. One relevant detail is that we provide a callback to optionally initialize memory only when we actually reach slab. This allows us to avoid blindly executing the allocation with GFP_ZERO and only clean fields when they matter. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index b7a38a2069cf..a3a8cfec32ce 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -30,6 +30,19 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) return NULL; } +static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp, + void (*init_once)(void *obj)) +{ + if (unlikely(!cache->nr_cached)) { + void *obj = kmalloc(cache->elem_size, gfp); + + if (obj && init_once) + init_once(obj); + return obj; + } + return io_alloc_cache_get(cache); +} + /* returns false if the cache was initialized properly */ static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, size_t size) From 49f7a3098cc2406c9cf60d595f4a148d6780bce6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:08 -0500 Subject: [PATCH 061/150] io_uring: Add generic helper to allocate async data This helper replaces io_alloc_async_data by using the folded allocation. Do it in a header to allow the compiler to decide whether to inline. 
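Taken together with the previous patch, an opcode with a per-ring cache now allocates its async data roughly as sketched below (a hypothetical 'foo' opcode standing in for the real conversions in the following patches). The important detail is that init_once runs only for slab-fresh objects; an object handed back from the cache keeps whatever reusable state, such as a previously allocated iovec, it was recycled with, and the caller checks for that itself:

    static void io_foo_init_once(void *obj)
    {
            struct io_async_foo *f = obj;

            /* reached only for slab-fresh objects, never for cache hits */
            f->free_iov = NULL;
            f->free_iov_nr = 0;
    }

    static struct io_async_foo *io_foo_alloc_async(struct io_kiocb *req)
    {
            struct io_async_foo *f;

            f = io_uring_alloc_async_data(&req->ctx->foo_cache, req,
                                          io_foo_init_once);
            if (!f)
                    return NULL;
            /* a cached object may still own an iovec from its previous use */
            if (f->free_iov)
                    req->flags |= REQ_F_NEED_CLEANUP;
            return f;
    }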
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-3-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 12abee607e4a..e43e9194dd0a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -8,6 +8,7 @@ #include #include #include +#include "alloc_cache.h" #include "io-wq.h" #include "slist.h" #include "filetable.h" @@ -222,6 +223,16 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, + struct io_kiocb *req, + void (*init_once)(void *obj)) +{ + req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once); + if (req->async_data) + req->flags |= REQ_F_ASYNC_DATA; + return req->async_data; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; From b2846567060638a85724579781fc4a3ca6f137e4 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:09 -0500 Subject: [PATCH 062/150] io_uring/futex: Allocate ifd with generic alloc_cache helper Instead of open-coding the allocation, use the generic alloc_cache helper. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-4-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/futex.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/io_uring/futex.c b/io_uring/futex.c index e29662f039e1..30139cc150f2 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -251,17 +251,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) io_req_task_work_add(req); } -static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) -{ - struct io_futex_data *ifd; - - ifd = io_alloc_cache_get(&ctx->futex_cache); - if (ifd) - return ifd; - - return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); -} - int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); @@ -331,7 +320,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) } io_ring_submit_lock(ctx, issue_flags); - ifd = io_alloc_ifd(ctx); + ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT, NULL); if (!ifd) { ret = -ENOMEM; goto done_unlock; From 1210872918ef9feff60c9f5a4d3246372b732eae Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:10 -0500 Subject: [PATCH 063/150] io_uring/poll: Allocate apoll with generic alloc_cache helper This abstracts away the cache details to simplify the code. 
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-5-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/poll.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index bced9edd5233..cc01c40b43d3 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -648,15 +648,12 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - apoll = io_alloc_cache_get(&ctx->apoll_cache); - if (!apoll) - goto alloc_apoll; - apoll->poll.retries = APOLL_MAX_RETRY; } else { -alloc_apoll: - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) + if (!(issue_flags & IO_URING_F_UNLOCKED)) + apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC, NULL); + else + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (!apoll) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; } From e9447dc0b18db40f7c9807f8d0a218f0fa07517f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:11 -0500 Subject: [PATCH 064/150] io_uring/uring_cmd: Allocate async data through generic helper This abstracts away the cache details and simplify the code. Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-6-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index af842e9b4eb9..d6ff803dbbe1 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -16,22 +16,6 @@ #include "rsrc.h" #include "uring_cmd.h" -static struct uring_cache *io_uring_async_get(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct uring_cache *cache; - - cache = io_alloc_cache_get(&ctx->uring_cache); - if (cache) { - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = cache; - return cache; - } - if (!io_alloc_async_data(req)) - return req->async_data; - return NULL; -} - static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); @@ -185,8 +169,8 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct uring_cache *cache; - cache = io_uring_async_get(req); - if (unlikely(!cache)) + cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, NULL); + if (!cache) return -ENOMEM; if (!(req->flags & REQ_F_FORCE_ASYNC)) { From f49a85371d8c1b44650ce660652a2fe305d3ddf6 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:12 -0500 Subject: [PATCH 065/150] io_uring/net: Allocate msghdr async data through helper This abstracts away the cache details. 
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-7-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/net.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index df1f7dc6f1c8..8457408194e7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -155,30 +155,31 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } } +static void io_msg_async_data_init(void *obj) +{ + struct io_async_msghdr *hdr = (struct io_async_msghdr *)obj; + + hdr->free_iov = NULL; + hdr->free_iov_nr = 0; +} + static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_msghdr *hdr; - hdr = io_alloc_cache_get(&ctx->netmsg_cache); - if (hdr) { - if (hdr->free_iov) { - kasan_mempool_unpoison_object(hdr->free_iov, - hdr->free_iov_nr * sizeof(struct iovec)); - req->flags |= REQ_F_NEED_CLEANUP; - } - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; - } + hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req, + io_msg_async_data_init); + if (!hdr) + return NULL; - if (!io_alloc_async_data(req)) { - hdr = req->async_data; - hdr->free_iov_nr = 0; - hdr->free_iov = NULL; - return hdr; + /* If the async data was cached, we might have an iov cached inside. */ + if (hdr->free_iov) { + kasan_mempool_unpoison_object(hdr->free_iov, + hdr->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; } - return NULL; + return hdr; } /* assign new iovec to kmsg, if we need to */ From d7f11616edf59b255f1302040604f584535876c7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:13 -0500 Subject: [PATCH 066/150] io_uring/rw: Allocate async data through helper This abstract away the cache details. 
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-8-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/rw.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 5b24fd8b69f6..bdfc3faef85d 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -208,33 +208,29 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) } } +static void io_rw_async_data_init(void *obj) +{ + struct io_async_rw *rw = (struct io_async_rw *)obj; + + rw->free_iovec = 0; + rw->bytes_done = 0; +} + static int io_rw_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_async_rw *rw; - rw = io_alloc_cache_get(&ctx->rw_cache); - if (rw) { - if (rw->free_iovec) { - kasan_mempool_unpoison_object(rw->free_iovec, - rw->free_iov_nr * sizeof(struct iovec)); - req->flags |= REQ_F_NEED_CLEANUP; - } - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = rw; - goto done; - } - - if (!io_alloc_async_data(req)) { - rw = req->async_data; - rw->free_iovec = NULL; - rw->free_iov_nr = 0; -done: + rw = io_uring_alloc_async_data(&ctx->rw_cache, req, io_rw_async_data_init); + if (!rw) + return -ENOMEM; + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; rw->bytes_done = 0; - return 0; } - - return -ENOMEM; + return 0; } static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) From ef623a647f423c0d96aa75797cec182e3c5ba47d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:14 -0500 Subject: [PATCH 067/150] io_uring: Move old async data allocation helper to header There are two remaining uses of the old async data allocator that do not rely on the alloc cache. I don't want to make them use the new allocator helper because that would require a if(cache) check, which will result in dead code for the cached case (for callers passing a cache, gcc can't prove the cache isn't NULL, and will therefore preserve the check. Since this is an inline function and just a few lines long, keep a second helper to deal with cases where we don't have an async data cache. No functional change intended here. This is just moving the helper around and making it inline. 
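For clarity, the rejected alternative would have been a single helper taking an optional cache, roughly as sketched below (hypothetical, not part of the patch); per the reasoning above, callers that always pass a real cache would still carry the never-taken NULL test in their generated code:

    static inline void *io_uring_alloc_async_data_any(struct io_alloc_cache *cache,
                                                      struct io_kiocb *req,
                                                      void (*init_once)(void *obj))
    {
            if (cache)      /* the branch the split helpers avoid */
                    req->async_data = io_cache_alloc(cache, GFP_KERNEL, init_once);
            else
                    req->async_data = kmalloc(io_issue_defs[req->opcode].async_size,
                                              GFP_KERNEL);
            if (req->async_data)
                    req->flags |= REQ_F_ASYNC_DATA;
            return req->async_data;
    }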
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-9-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ------------- io_uring/io_uring.h | 12 ++++++++++++ io_uring/timeout.c | 5 ++--- io_uring/waitid.c | 4 ++-- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 66a93170ad68..e6d12104f8cb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1643,19 +1643,6 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -bool io_alloc_async_data(struct io_kiocb *req) -{ - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - WARN_ON_ONCE(!def->async_size); - req->async_data = kmalloc(def->async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - return false; - } - return true; -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e43e9194dd0a..032758b28d78 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -12,6 +12,7 @@ #include "io-wq.h" #include "slist.h" #include "filetable.h" +#include "opdef.h" #ifndef CREATE_TRACE_POINTS #include @@ -233,6 +234,17 @@ static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, return req->async_data; } +static inline void *io_uring_alloc_async_data_nocache(struct io_kiocb *req) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); + if (req->async_data) + req->flags |= REQ_F_ASYNC_DATA; + return req->async_data; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; diff --git a/io_uring/timeout.c b/io_uring/timeout.c index bbe58638eca7..a166fd90667a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -525,10 +525,9 @@ static int __io_timeout_prep(struct io_kiocb *req, if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (io_alloc_async_data(req)) + data = io_uring_alloc_async_data_nocache(req); + if (!data) return -ENOMEM; - - data = req->async_data; data->req = req; data->flags = flags; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index daef5dd644f0..6778c0ee76c4 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -303,10 +303,10 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) struct io_waitid_async *iwa; int ret; - if (io_alloc_async_data(req)) + iwa = io_uring_alloc_async_data_nocache(req); + if (!iwa) return -ENOMEM; - iwa = req->async_data; iwa->req = req; ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, From ce9464081d5168ee0f279d6932ba82260a5b97c4 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Dec 2024 15:46:15 -0500 Subject: [PATCH 068/150] io_uring/msg_ring: Drop custom destructor kfree can handle slab objects nowadays. Drop the extra callback and just use kfree. 
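Put differently, the callback only existed to route the request back to its kmem_cache, and kfree() now performs that lookup itself, so passing plain kfree as the cache's free routine is enough. A minimal sketch, with req_cachep being the io_uring request cache named above:

    struct io_kiocb *req = kmem_cache_alloc(req_cachep, GFP_KERNEL);

    /*
     * kfree() resolves the originating kmem_cache (req_cachep here) on its
     * own; a dedicated kmem_cache_free() wrapper adds nothing.
     */
    kfree(req);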
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20241216204615.759089-10-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/msg_ring.c | 7 ------- io_uring/msg_ring.h | 1 - 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e6d12104f8cb..42d4cc5da73b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -361,7 +361,7 @@ err: io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); + io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); @@ -2696,7 +2696,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); + io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); io_free_region(ctx, &ctx->param_region); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 333c220d322a..bd3cd78d2dba 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -354,10 +354,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe) return __io_msg_ring_data(fd_file(f)->private_data, &io_msg, IO_URING_F_UNLOCKED); } - -void io_msg_cache_free(const void *entry) -{ - struct io_kiocb *req = (struct io_kiocb *) entry; - - kmem_cache_free(req_cachep, req); -} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index 38e7f8f0c944..32236d2fb778 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -4,4 +4,3 @@ int io_uring_sync_msg_ring(struct io_uring_sqe *sqe); int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); void io_msg_ring_cleanup(struct io_kiocb *req); -void io_msg_cache_free(const void *entry); From 1143be17d7acb02a7c4dba6169a33534983f4960 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 07:44:25 -0700 Subject: [PATCH 069/150] io_uring/rw: don't mask in f_iocb_flags A previous commit changed overwriting kiocb->ki_flags with ->f_iocb_flags with masking it in. This breaks for retry situations, where we don't necessarily want to retain previously set flags, like IOCB_NOWAIT. The use case needs IOCB_HAS_METADATA to be persistent, but the change makes all flags persistent, which is an issue. Add a request flag to track whether the request has metadata or not, as that is persistent across issues. 
Fixes: 59a7d12a7fb5 ("io_uring: introduce attributes for read/write and PI support") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/rw.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73575d545d3c..623d8e798a11 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -480,6 +480,7 @@ enum { REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, + REQ_F_HAS_METADATA_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -560,6 +561,8 @@ enum { REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), /* buf node is valid */ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), + /* request has read/write metadata assigned */ + REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index bdfc3faef85d..d0ac4a51420e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,7 +283,7 @@ static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, pi_attr.len, &io->meta.iter); if (unlikely(ret < 0)) return ret; - rw->kiocb.ki_flags |= IOCB_HAS_METADATA; + req->flags |= REQ_F_HAS_METADATA; io_meta_save_state(io); return ret; } @@ -787,18 +787,17 @@ static bool io_rw_should_retry(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) + /* + * Never retry for NOWAIT or a request with metadata, we just complete + * with -EAGAIN. + */ + if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) return false; /* Only for buffered IO */ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; - /* never retry for meta io */ - if (kiocb->ki_flags & IOCB_HAS_METADATA) - return false; - /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -849,7 +848,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); - kiocb->ki_flags |= file->f_iocb_flags; + kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; @@ -883,7 +882,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } - if (kiocb->ki_flags & IOCB_HAS_METADATA) { + if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; /* @@ -892,6 +891,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) */ if (!(req->file->f_flags & O_DIRECT)) return -EOPNOTSUPP; + kiocb->ki_flags |= IOCB_HAS_METADATA; kiocb->private = &io->meta; } From 21adbcaa8007f5e584d26bebb46ec46bfbbbd330 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 12:43:49 -0700 Subject: [PATCH 070/150] io_uring/rw: use NULL for rw->free_iovec assigment It's a pointer, don't use 0 for that. sparse throws a warning for that, as the kernel test robot noticed. 
Fixes: d7f11616edf5 ("io_uring/rw: Allocate async data through helper") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412180253.YML3qN4d-lkp@intel.com/ Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index d0ac4a51420e..75f70935ccf4 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -212,7 +212,7 @@ static void io_rw_async_data_init(void *obj) { struct io_async_rw *rw = (struct io_async_rw *)obj; - rw->free_iovec = 0; + rw->free_iovec = NULL; rw->bytes_done = 0; } From c5f71916146033f9aba108075ff7087022075fd6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 Dec 2024 09:49:32 -0700 Subject: [PATCH 071/150] io_uring/rw: always clear ->bytes_done on io_async_rw setup A previous commit mistakenly moved the clearing of the in-progress byte count into the section that's dependent on having a cached iovec or not, but it should be cleared for any IO. If not, then extra bytes may be added at IO completion time, causing potentially weird behavior like over-reporting the amount of IO done. Fixes: d7f11616edf5 ("io_uring/rw: Allocate async data through helper") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202412271132.a09c3500-lkp@intel.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 75f70935ccf4..ca1b19d3d142 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -228,8 +228,8 @@ static int io_rw_alloc_async(struct io_kiocb *req) kasan_mempool_unpoison_object(rw->free_iovec, rw->free_iov_nr * sizeof(struct iovec)); req->flags |= REQ_F_NEED_CLEANUP; - rw->bytes_done = 0; } + rw->bytes_done = 0; return 0; } From d62c2f0d82753a05133411b1e242baf31f4ef68e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Dec 2024 17:34:41 -0700 Subject: [PATCH 072/150] io_uring: ensure io_queue_deferred() is out-of-line This is not the hot path, it's a slow path. Yet the locking for it is in the hot path, and __cold does not prevent it from being inlined. Move the locking to the function itself, and mark it noinline as well to avoid it polluting the icache of the hot path. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 42d4cc5da73b..db198bd435b5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -550,8 +550,9 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); @@ -562,6 +563,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) io_req_task_queue(de->req); kfree(de); } + spin_unlock(&ctx->completion_lock); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -570,11 +572,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) { - spin_lock(&ctx->completion_lock); + if (ctx->drain_active) io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } if (ctx->has_evfd) io_eventfd_flush_signal(ctx); } From 044792cda05a97ae1da330771ec2140ae86439ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 2 Jan 2025 13:01:31 +0100 Subject: [PATCH 073/150] elevator: Enable const sysfs attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The elevator core does not need to modify the sysfs attributes added by the elevators. Reflect this in the types, so the attributes can be moved into read-only memory. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250102-sysfs-const-attr-elevator-v1-1-9837d2058c60@weissschuh.net Signed-off-by: Jens Axboe --- block/elevator.c | 8 ++++---- block/elevator.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/elevator.c b/block/elevator.c index a26b96662620..be6e994256ac 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -405,12 +405,12 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) return NULL; } -#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) +#define to_elv(atr) container_of_const((atr), struct elv_fs_entry, attr) static ssize_t elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - struct elv_fs_entry *entry = to_elv(attr); + const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; @@ -428,7 +428,7 @@ static ssize_t elv_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { - struct elv_fs_entry *entry = to_elv(attr); + const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; @@ -461,7 +461,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { - struct elv_fs_entry *attr = e->type->elevator_attrs; + const struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { while (attr->attr.name) { if (sysfs_create_file(&e->kobj, &attr->attr)) diff --git a/block/elevator.h b/block/elevator.h index dbf357ef4fab..e526662c5dbb 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -71,7 +71,7 @@ struct elevator_type size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ - struct elv_fs_entry *elevator_attrs; + const struct elv_fs_entry *elevator_attrs; const char 
*elevator_name; const char *elevator_alias; struct module *elevator_owner; From 8686e1dedac7190d2f148b23e4f1ac69d2e37d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 2 Jan 2025 13:01:32 +0100 Subject: [PATCH 074/150] block: mq-deadline: Constify sysfs attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The elevator core now allows instances of 'struct elv_fs_entry' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250102-sysfs-const-attr-elevator-v1-2-9837d2058c60@weissschuh.net Signed-off-by: Jens Axboe --- block/mq-deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 5528347b5fcf..754f6b7415cd 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -834,7 +834,7 @@ STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) -static struct elv_fs_entry deadline_attrs[] = { +static const struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), From c40f9f6ac59f949b6cbf10903fa2aae76efffa20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 2 Jan 2025 13:01:33 +0100 Subject: [PATCH 075/150] block, bfq: constify sysfs attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The elevator core now allows instances of 'struct elv_fs_entry' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250102-sysfs-const-attr-elevator-v1-3-9837d2058c60@weissschuh.net Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 95dd7b795935..068c63e95738 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7614,7 +7614,7 @@ static ssize_t bfq_low_latency_store(struct elevator_queue *e, #define BFQ_ATTR(name) \ __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store) -static struct elv_fs_entry bfq_attrs[] = { +static const struct elv_fs_entry bfq_attrs[] = { BFQ_ATTR(fifo_expire_sync), BFQ_ATTR(fifo_expire_async), BFQ_ATTR(back_seek_max), From 00aab2f236f25f3dc3c88eee1b8ccb0cbcae3f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 2 Jan 2025 13:01:34 +0100 Subject: [PATCH 076/150] kyber: constify sysfs attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The elevator core now allows instances of 'struct elv_fs_entry' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20250102-sysfs-const-attr-elevator-v1-4-9837d2058c60@weissschuh.net Signed-off-by: Jens Axboe --- block/kyber-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 4155594aefc6..dc31f2dfa414 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -889,7 +889,7 @@ KYBER_LAT_SHOW_STORE(KYBER_WRITE, write); #undef KYBER_LAT_SHOW_STORE #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) -static struct elv_fs_entry kyber_sched_attrs[] = { +static const struct elv_fs_entry kyber_sched_attrs[] = { KYBER_LAT_ATTR(read), KYBER_LAT_ATTR(write), __ATTR_NULL From 2a51c327d4a4a2eb62d67f4ea13a17efd0f25c5c Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Fri, 3 Jan 2025 22:04:11 +0700 Subject: [PATCH 077/150] io_uring/rsrc: simplify the bvec iter count calculation As we don't use iov_iter_advance() but our own logic in io_import_fixed(), we can remove the logic that over-sets the iter's count to len + offset then adjusts it later to len. This helps to make the code cleaner. Signed-off-by: Bui Quang Minh Link: https://lore.kernel.org/r/20250103150412.12549-1-minhquangbui99@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2ff108485c8..964a47c8d85e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -882,7 +882,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len); if (offset) { /* @@ -904,7 +904,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, const struct bio_vec *bvec = imu->bvec; if (offset < bvec->bv_len) { - iter->count -= offset; iter->iov_offset = offset; } else { unsigned long seg_skip; @@ -915,7 +914,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter, iter->bvec += seg_skip; iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); } } From 457ef47c08d2979f3e59ce66267485c3faed70c8 Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Mon, 9 Dec 2024 19:04:35 +0800 Subject: [PATCH 078/150] block: retry call probe after request_module in blk_request_module Set kernel config: CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_LOOP_MIN_COUNT=0 Do latter: mknod loop0 b 7 0 exec 4<> loop0 Before commit e418de3abcda ("block: switch gendisk lookup to a simple xarray"), lookup_gendisk will first use base_probe to load module loop, and then the retry will call loop_probe to prepare the loop disk. Finally open for this disk will success. However, after this commit, we lose the retry logic, and open will fail with ENXIO. Block device autoloading is deprecated and will be removed soon, but maybe we should keep open success until we really remove it. So, give a retry to fix it. 
Fixes: e418de3abcda ("block: switch gendisk lookup to a simple xarray") Suggested-by: Christoph Hellwig Signed-off-by: Yang Erkun Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241209110435.3670985-1-yangerkun@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 5678194b6b1a..5da3c9a64e64 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -797,7 +797,7 @@ static ssize_t disk_badblocks_store(struct device *dev, } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD -void blk_request_module(dev_t devt) +static bool blk_probe_dev(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -807,14 +807,26 @@ void blk_request_module(dev_t devt) if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); - return; + return true; } } mutex_unlock(&major_names_lock); + return false; +} - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); +void blk_request_module(dev_t devt) +{ + int error; + + if (blk_probe_dev(devt)) + return; + + error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)); + /* Make old-style 2.4 aliases work */ + if (error > 0) + error = request_module("block-major-%d", MAJOR(devt)); + if (!error) + blk_probe_dev(devt); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ From c2398e6d5f16e15598d3a37e17107fea477e3f91 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 3 Jan 2025 09:51:25 +0100 Subject: [PATCH 079/150] ps3disk: Do not use dev->bounce_size before it is set dev->bounce_size is only initialized after it is used to set the queue limits. Fix this by using BOUNCE_SIZE instead. Fixes: a7f18b74dbe17162 ("ps3disk: pass queue_limits to blk_mq_alloc_disk") Reported-by: Philipp Hortmann Closes: https://lore.kernel.org/39256db9-3d73-4e86-a49b-300dfd670212@gmail.com Signed-off-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/06988f959ea6885b8bd7fb3b9059dd54bc6bbad7.1735894216.git.geert+renesas@glider.be Signed-off-by: Jens Axboe --- drivers/block/ps3disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 68fed46c463e..dc9e4a14b885 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -384,9 +384,9 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) unsigned int devidx; struct queue_limits lim = { .logical_block_size = dev->blk_size, - .max_hw_sectors = dev->bounce_size >> 9, + .max_hw_sectors = BOUNCE_SIZE >> 9, .max_segments = -1, - .max_segment_size = dev->bounce_size, + .max_segment_size = BOUNCE_SIZE, .dma_alignment = dev->blk_size - 1, .features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL, From 6aeb4f836480617be472de767c4cb09c1060a067 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jan 2025 08:33:57 +0100 Subject: [PATCH 080/150] block: remove bio_add_pc_page Lift bio_split_rw_at into blk_rq_append_bio so that it validates the hardware limits. With this all passthrough callers can simply add bio_add_page to build the bio and delay checking for exceeding of limits to this point instead of doing it for each page. While this looks like adding a new expensive loop over all bio_vecs, blk_rq_append_bio is already doing that just to counter the number of segments. 
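In practice a passthrough-style caller now builds its bio with the ordinary helpers and leaves the hardware-limit check to blk_rq_append_bio(). A condensed, hypothetical caller mapping a single page, with error handling trimmed to the essentials:

    static int map_one_page(struct request *rq, struct page *page,
                            unsigned int len, unsigned int off, gfp_t gfp)
    {
            struct bio *bio;
            int ret;

            bio = bio_kmalloc(1, gfp);
            if (!bio)
                    return -ENOMEM;
            bio_init(bio, NULL, bio->bi_inline_vecs, 1, req_op(rq));

            /* no per-page max_sectors/segment checks needed at this point */
            if (bio_add_page(bio, page, len, off) != len) {
                    ret = -EINVAL;
                    goto out_free;
            }

            /* the queue's hardware limits are validated here */
            ret = blk_rq_append_bio(rq, bio);
            if (!ret)
                    return 0;

    out_free:
            bio_uninit(bio);
            kfree(bio);
            return ret;
    }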
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20250103073417.459715-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 107 ++------------------------ block/blk-map.c | 118 +++++++---------------------- block/blk.h | 8 -- drivers/nvme/target/passthru.c | 18 +++-- drivers/nvme/target/zns.c | 3 +- drivers/target/target_core_pscsi.c | 6 +- include/linux/bio.h | 2 - 7 files changed, 48 insertions(+), 214 deletions(-) diff --git a/block/bio.c b/block/bio.c index d5bdc31d88d3..4e1a27d312c9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -946,8 +946,11 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, /* * Try to merge a page into a segment, while obeying the hardware segment - * size limit. This is not for normal read/write bios, but for passthrough - * or Zone Append operations that we can't split. + * size limit. + * + * This is kept around for the integrity metadata, which is still tries + * to build the initial bio to the hardware limit and doesn't have proper + * helpers to split. Hopefully this will go away soon. */ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset, @@ -964,106 +967,6 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, return bvec_try_merge_page(bv, page, len, offset, same_page); } -/** - * bio_add_hw_page - attempt to add a page to a bio with hw constraints - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * @max_sectors: maximum number of sectors that can be added - * @same_page: return if the segment has been merged inside the same page - * - * Add a page to a bio while respecting the hardware max_sectors, max_segment - * and gap limitations. - */ -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page) -{ - unsigned int max_size = max_sectors << SECTOR_SHIFT; - - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return 0; - - len = min3(len, max_size, queue_max_segment_size(q)); - if (len > max_size - bio->bi_iter.bi_size) - return 0; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (bvec_try_merge_hw_page(q, bv, page, len, offset, - same_page)) { - bio->bi_iter.bi_size += len; - return len; - } - - if (bio->bi_vcnt >= - min(bio->bi_max_vecs, queue_max_segments(q))) - return 0; - - /* - * If the queue doesn't support SG gaps and adding this segment - * would create a gap, disallow it. - */ - if (bvec_gap_to_prev(&q->limits, bv, offset)) - return 0; - } - - bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); - bio->bi_vcnt++; - bio->bi_iter.bi_size += len; - return len; -} - -/** - * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints - * @q: the target queue - * @bio: destination bio - * @folio: folio to add - * @len: vec entry length - * @offset: vec entry offset in the folio - * @max_sectors: maximum number of sectors that can be added - * @same_page: return if the segment has been merged inside the same folio - * - * Add a folio to a bio while respecting the hardware max_sectors, max_segment - * and gap limitations. 
- */ -int bio_add_hw_folio(struct request_queue *q, struct bio *bio, - struct folio *folio, size_t len, size_t offset, - unsigned int max_sectors, bool *same_page) -{ - if (len > UINT_MAX || offset > UINT_MAX) - return 0; - return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset, - max_sectors, same_page); -} - -/** - * bio_add_pc_page - attempt to add page to passthrough bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by passthrough bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset) -{ - bool same_page = false; - return bio_add_hw_page(q, bio, page, len, offset, - queue_max_hw_sectors(q), &same_page); -} -EXPORT_SYMBOL(bio_add_pc_page); - /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio diff --git a/block/blk-map.c b/block/blk-map.c index 894009b2d881..67a2da3b7ed9 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -189,7 +189,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, } } - if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { + if (bio_add_page(bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; @@ -272,86 +272,27 @@ static struct bio *blk_rq_map_bio_alloc(struct request *rq, static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { - iov_iter_extraction_t extraction_flags = 0; - unsigned int max_sectors = queue_max_hw_sectors(rq->q); unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; - int j; if (!iov_iter_count(iter)) return -EINVAL; bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); - if (bio == NULL) + if (!bio) return -ENOMEM; - - if (blk_queue_pci_p2pdma(rq->q)) - extraction_flags |= ITER_ALLOW_P2PDMA; - if (iov_iter_extract_will_pin(iter)) - bio_set_flag(bio, BIO_PAGE_PINNED); - - while (iov_iter_count(iter)) { - struct page *stack_pages[UIO_FASTIOV]; - struct page **pages = stack_pages; - ssize_t bytes; - size_t offs; - int npages; - - if (nr_vecs > ARRAY_SIZE(stack_pages)) - pages = NULL; - - bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, - nr_vecs, extraction_flags, &offs); - if (unlikely(bytes <= 0)) { - ret = bytes ? bytes : -EFAULT; - goto out_unmap; - } - - npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - - if (unlikely(offs & queue_dma_alignment(rq->q))) - j = 0; - else { - for (j = 0; j < npages; j++) { - struct page *page = pages[j]; - unsigned int n = PAGE_SIZE - offs; - bool same_page = false; - - if (n > bytes) - n = bytes; - - if (!bio_add_hw_page(rq->q, bio, page, n, offs, - max_sectors, &same_page)) - break; - - if (same_page) - bio_release_page(bio, page); - bytes -= n; - offs = 0; - } - } - /* - * release the pages we didn't map into the bio, if any - */ - while (j < npages) - bio_release_page(bio, pages[j++]); - if (pages != stack_pages) - kvfree(pages); - /* couldn't stuff something into bio? 
*/ - if (bytes) { - iov_iter_revert(iter, bytes); - break; - } - } - + ret = bio_iov_iter_get_pages(bio, iter); + if (ret) + goto out_put; ret = blk_rq_append_bio(rq, bio); if (ret) - goto out_unmap; + goto out_release; return 0; - out_unmap: +out_release: bio_release_pages(bio, false); +out_put: blk_mq_map_bio_put(bio); return ret; } @@ -422,8 +363,7 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, page = virt_to_page(data); else page = vmalloc_to_page(data); - if (bio_add_pc_page(q, bio, page, bytes, - offset) < bytes) { + if (bio_add_page(bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ bio_uninit(bio); kfree(bio); @@ -507,7 +447,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (!reading) memcpy(page_address(page), p, bytes); - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + if (bio_add_page(bio, page, bytes, 0) < bytes) break; len -= bytes; @@ -536,12 +476,19 @@ cleanup: */ int blk_rq_append_bio(struct request *rq, struct bio *bio) { - struct bvec_iter iter; - struct bio_vec bv; + const struct queue_limits *lim = &rq->q->limits; + unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT; unsigned int nr_segs = 0; + int ret; - bio_for_each_bvec(bv, bio, iter) - nr_segs++; + /* check that the data layout matches the hardware restrictions */ + ret = bio_split_rw_at(bio, lim, &nr_segs, max_bytes); + if (ret) { + /* if we would have to split the bio, copy instead */ + if (ret > 0) + ret = -EREMOTEIO; + return ret; + } if (!rq->bio) { blk_rq_bio_prep(rq, bio, nr_segs); @@ -561,9 +508,7 @@ EXPORT_SYMBOL(blk_rq_append_bio); /* Prepare bio for passthrough IO given ITER_BVEC iter */ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) { - const struct queue_limits *lim = &rq->q->limits; - unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT; - unsigned int nsegs; + unsigned int max_bytes = rq->q->limits.max_hw_sectors << SECTOR_SHIFT; struct bio *bio; int ret; @@ -576,18 +521,10 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) return -ENOMEM; bio_iov_bvec_set(bio, iter); - /* check that the data layout matches the hardware restrictions */ - ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes); - if (ret) { - /* if we would have to split the bio, copy instead */ - if (ret > 0) - ret = -EREMOTEIO; + ret = blk_rq_append_bio(rq, bio); + if (ret) blk_mq_map_bio_put(bio); - return ret; - } - - blk_rq_bio_prep(rq, bio, nsegs); - return 0; + return ret; } /** @@ -644,8 +581,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); else ret = bio_map_user_iov(rq, &i, gfp_mask); - if (ret) + if (ret) { + if (ret == -EREMOTEIO) + ret = -EINVAL; goto unmap_rq; + } if (!bio) bio = rq->bio; } while (iov_iter_count(&i)); diff --git a/block/blk.h b/block/blk.h index cbf6a676ffe9..4904b86d5fec 100644 --- a/block/blk.h +++ b/block/blk.h @@ -556,14 +556,6 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass); -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page); - -int bio_add_hw_folio(struct request_queue *q, struct bio *bio, - struct folio *folio, size_t len, size_t offset, - unsigned int max_sectors, bool *same_page); - /* * Clean up a page 
appropriately, where the page may be pinned, may have a * ref taken on it or neither. diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 30b21936b0c6..26e2907ce8bb 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -261,6 +261,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) { struct scatterlist *sg; struct bio *bio; + int ret = -EINVAL; int i; if (req->sg_cnt > BIO_MAX_VECS) @@ -277,16 +278,19 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) } for_each_sg(req->sg, sg, req->sg_cnt, i) { - if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, - sg->offset) < sg->length) { - nvmet_req_bio_put(req, bio); - return -EINVAL; - } + if (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) < + sg->length) + goto out_bio_put; } - blk_rq_bio_prep(rq, bio, req->sg_cnt); - + ret = blk_rq_append_bio(rq, bio); + if (ret) + goto out_bio_put; return 0; + +out_bio_put: + nvmet_req_bio_put(req, bio); + return ret; } static void nvmet_passthru_execute_cmd(struct nvmet_req *req) diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 3aef35b05111..29a60fabfcc8 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -586,8 +586,7 @@ void nvmet_bdev_execute_zone_append(struct nvmet_req *req) for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) { unsigned int len = sg->length; - if (bio_add_pc_page(bdev_get_queue(bio->bi_bdev), bio, - sg_page(sg), len, sg->offset) != len) { + if (bio_add_page(bio, sg_page(sg), len, sg->offset) != len) { status = NVME_SC_INTERNAL; goto out_put_bio; } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 287ac5b0495f..f991cf759836 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -823,7 +823,6 @@ static sense_reason_t pscsi_map_sg(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct request *req) { - struct pscsi_dev_virt *pdv = PSCSI_DEV(cmd->se_dev); struct bio *bio = NULL; struct page *page; struct scatterlist *sg; @@ -871,12 +870,11 @@ new_bio: (rw) ? "rw" : "r", nr_vecs); } - pr_debug("PSCSI: Calling bio_add_pc_page() i: %d" + pr_debug("PSCSI: Calling bio_add_page() i: %d" " bio: %p page: %p len: %d off: %d\n", i, bio, page, len, off); - rc = bio_add_pc_page(pdv->pdv_sd->request_queue, - bio, page, bytes, off); + rc = bio_add_page(bio, page, bytes, off); pr_debug("PSCSI: bio->bi_vcnt: %d nr_vecs: %d\n", bio_segments(bio), nr_vecs); if (rc != bytes) { diff --git a/include/linux/bio.h b/include/linux/bio.h index 1eec59699100..4b79bf50f4f0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -413,8 +413,6 @@ int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off); -extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, - unsigned int, unsigned int); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, From 02ee5d69e3baf2796ba75b928fcbc9cf7884c5e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jan 2025 08:33:58 +0100 Subject: [PATCH 081/150] block: remove blk_rq_bio_prep There is not real point in a helper just to assign three values to four fields, especially when the surrounding code is working on the neighbor fields directly. 
Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20250103073417.459715-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-map.c | 10 ++++++---- block/blk-mq.c | 4 +++- include/linux/blk-mq.h | 8 -------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 67a2da3b7ed9..d2f22744b3d1 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -490,17 +490,19 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio) return ret; } - if (!rq->bio) { - blk_rq_bio_prep(rq, bio, nr_segs); - } else { + if (rq->bio) { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; rq->biotail->bi_next = bio; rq->biotail = bio; - rq->__data_len += (bio)->bi_iter.bi_size; + rq->__data_len += bio->bi_iter.bi_size; bio_crypt_free_ctx(bio); + return 0; } + rq->nr_phys_segments = nr_segs; + rq->bio = rq->biotail = bio; + rq->__data_len = bio->bi_iter.bi_size; return 0; } EXPORT_SYMBOL(blk_rq_append_bio); diff --git a/block/blk-mq.c b/block/blk-mq.c index fca2ec64a06b..17f10683d640 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2658,8 +2658,10 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; + rq->bio = rq->biotail = bio; rq->__sector = bio->bi_iter.bi_sector; - blk_rq_bio_prep(rq, bio, nr_segs); + rq->__data_len = bio->bi_iter.bi_size; + rq->nr_phys_segments = nr_segs; if (bio_integrity(bio)) rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q, bio); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7f6c482ebf54..6340293511c9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -978,14 +978,6 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) -{ - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; -} - void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); From 2caca8fc7aad9ea9a6ea3ed26ed146b1e5f06fab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:14:37 +0100 Subject: [PATCH 082/150] block: use page_to_phys in bvec_phys Use page_to_phys instead of open coding it now that it is available in an architecture independent way. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106081437.798213-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/bvec.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index f41c7f0ef91e..ba8f52d48b94 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -286,12 +286,7 @@ static inline void *bvec_virt(struct bio_vec *bvec) */ static inline phys_addr_t bvec_phys(const struct bio_vec *bvec) { - /* - * Note this open codes page_to_phys because page_to_phys is defined in - * , which we don't want to pull in here. If it ever moves to - * a sensible place we should start using it. - */ - return PFN_PHYS(page_to_pfn(bvec->bv_page)) + bvec->bv_offset; + return page_to_phys(bvec->bv_page) + bvec->bv_offset; } #endif /* __LINUX_BVEC_H */ From b7175e24d6acf79d9f3af9ce9d3d50de1fa748ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:15:29 +0100 Subject: [PATCH 083/150] block: add a dma mapping iterator blk_rq_map_sg is maze of nested loops. 
Untangle it by creating an iterator that returns [paddr,len] tuples for DMA mapping, and then implement the DMA logic on top of this. This not only removes code at the source level, but also generates nicer binary code: $ size block/blk-merge.o.* text data bss dec hex filename 10001 432 0 10433 28c1 block/blk-merge.o.new 10317 468 0 10785 2a21 block/blk-merge.o.old Last but not least it will be used as a building block for a new DMA mapping helper that doesn't rely on struct scatterlist. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106081609.798289-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-merge.c | 177 ++++++++++++++++++---------------------------- 1 file changed, 70 insertions(+), 107 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index e01383c6e534..15cd231d560c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -473,6 +473,63 @@ unsigned int blk_recalc_rq_segments(struct request *rq) return nr_phys_segs; } +struct phys_vec { + phys_addr_t paddr; + u32 len; +}; + +static bool blk_map_iter_next(struct request *req, + struct req_iterator *iter, struct phys_vec *vec) +{ + unsigned int max_size; + struct bio_vec bv; + + if (req->rq_flags & RQF_SPECIAL_PAYLOAD) { + if (!iter->bio) + return false; + vec->paddr = bvec_phys(&req->special_vec); + vec->len = req->special_vec.bv_len; + iter->bio = NULL; + return true; + } + + if (!iter->iter.bi_size) + return false; + + bv = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + vec->paddr = bvec_phys(&bv); + max_size = get_max_segment_size(&req->q->limits, vec->paddr, UINT_MAX); + bv.bv_len = min(bv.bv_len, max_size); + bio_advance_iter_single(iter->bio, &iter->iter, bv.bv_len); + + /* + * If we are entirely done with this bi_io_vec entry, check if the next + * one could be merged into it. This typically happens when moving to + * the next bio, but some callers also don't pack bvecs tight. + */ + while (!iter->iter.bi_size || !iter->iter.bi_bvec_done) { + struct bio_vec next; + + if (!iter->iter.bi_size) { + if (!iter->bio->bi_next) + break; + iter->bio = iter->bio->bi_next; + iter->iter = iter->bio->bi_iter; + } + + next = mp_bvec_iter_bvec(iter->bio->bi_io_vec, iter->iter); + if (bv.bv_len + next.bv_len > max_size || + !biovec_phys_mergeable(req->q, &bv, &next)) + break; + + bv.bv_len += next.bv_len; + bio_advance_iter_single(iter->bio, &iter->iter, next.bv_len); + } + + vec->len = bv.bv_len; + return true; +} + static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist) { @@ -490,120 +547,26 @@ static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, return sg_next(*sg); } -static unsigned blk_bvec_map_sg(struct request_queue *q, - struct bio_vec *bvec, struct scatterlist *sglist, - struct scatterlist **sg) -{ - unsigned nbytes = bvec->bv_len; - unsigned nsegs = 0, total = 0; - - while (nbytes > 0) { - unsigned offset = bvec->bv_offset + total; - unsigned len = get_max_segment_size(&q->limits, - bvec_phys(bvec) + total, nbytes); - struct page *page = bvec->bv_page; - - /* - * Unfortunately a fair number of drivers barf on scatterlists - * that have an offset larger than PAGE_SIZE, despite other - * subsystems dealing with that invariant just fine. For now - * stick to the legacy format where we never present those from - * the block layer, but the code below should be removed once - * these offenders (mostly MMC/SD drivers) are fixed. 
- */ - page += (offset >> PAGE_SHIFT); - offset &= ~PAGE_MASK; - - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, page, len, offset); - - total += len; - nbytes -= len; - nsegs++; - } - - return nsegs; -} - -static inline int __blk_bvec_map_sg(struct bio_vec bv, - struct scatterlist *sglist, struct scatterlist **sg) -{ - *sg = blk_next_sg(sg, sglist); - sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); - return 1; -} - -/* only try to merge bvecs into one sg if they are from two bios */ -static inline bool -__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, - struct bio_vec *bvprv, struct scatterlist **sg) -{ - - int nbytes = bvec->bv_len; - - if (!*sg) - return false; - - if ((*sg)->length + nbytes > queue_max_segment_size(q)) - return false; - - if (!biovec_phys_mergeable(q, bvprv, bvec)) - return false; - - (*sg)->length += nbytes; - - return true; -} - -static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, - struct scatterlist *sglist, - struct scatterlist **sg) -{ - struct bio_vec bvec, bvprv = { NULL }; - struct bvec_iter iter; - int nsegs = 0; - bool new_bio = false; - - for_each_bio(bio) { - bio_for_each_bvec(bvec, bio, iter) { - /* - * Only try to merge bvecs from two bios given we - * have done bio internal merge when adding pages - * to bio - */ - if (new_bio && - __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg)) - goto next_bvec; - - if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) - nsegs += __blk_bvec_map_sg(bvec, sglist, sg); - else - nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg); - next_bvec: - new_bio = false; - } - if (likely(bio->bi_iter.bi_size)) { - bvprv = bvec; - new_bio = true; - } - } - - return nsegs; -} - /* - * map a request to scatterlist, return number of sg entries setup. Caller - * must make sure sg can hold rq->nr_phys_segments entries + * Map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries. */ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg) { + struct req_iterator iter = { + .bio = rq->bio, + .iter = rq->bio->bi_iter, + }; + struct phys_vec vec; int nsegs = 0; - if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) - nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); - else if (rq->bio) - nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); + while (blk_map_iter_next(rq, &iter, &vec)) { + *last_sg = blk_next_sg(last_sg, sglist); + sg_set_page(*last_sg, phys_to_page(vec.paddr), vec.len, + offset_in_page(vec.paddr)); + nsegs++; + } if (*last_sg) sg_mark_end(*last_sg); From 6783811569aef24b949992bd5c4e6eaac02a0c30 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:08 +0100 Subject: [PATCH 084/150] block: better split mq vs non-mq code in add_disk_fwnode Add a big conditional for blk-mq vs not mq at the beginning of add_disk_fwnode so that elevator_init_mq is only called for blk-mq disks, and add checks that the right methods or set or not set based on the queue type. 
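Illustrative sketch of the combinations that add_disk_fwnode() accepts
after this change (the "example_" names are invented and not taken from
any driver): a bio-based disk must provide ->submit_bio, while a blk-mq
disk must leave ->submit_bio and ->poll_bio unset.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/module.h>

static void example_submit_bio(struct bio *bio)
{
	/* a real bio-based driver would process @bio here */
	bio_endio(bio);
}

/* bio-based disk: ->submit_bio is mandatory */
static const struct block_device_operations example_bio_based_fops = {
	.owner		= THIS_MODULE,
	.submit_bio	= example_submit_bio,
};

/* blk-mq disk: ->submit_bio and ->poll_bio must not be set */
static const struct block_device_operations example_mq_fops = {
	.owner		= THIS_MODULE,
};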
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106083531.799976-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 5da3c9a64e64..befb7a516bcf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -400,21 +400,23 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, struct device *ddev = disk_to_dev(disk); int ret; - /* Only makes sense for bio-based to set ->poll_bio */ - if (queue_is_mq(disk->queue) && disk->fops->poll_bio) - return -EINVAL; + if (queue_is_mq(disk->queue)) { + /* + * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers. + */ + if (disk->fops->submit_bio || disk->fops->poll_bio) + return -EINVAL; - /* - * The disk queue should now be all set with enough information about - * the device for the elevator code to pick an adequate default - * elevator if one is needed, that is, for devices requesting queue - * registration. - */ - elevator_init_mq(disk->queue); - - /* Mark bdev as having a submit_bio, if needed */ - if (disk->fops->submit_bio) + /* + * Initialize the I/O scheduler code and pick a default one if + * needed. + */ + elevator_init_mq(disk->queue); + } else { + if (!disk->fops->submit_bio) + return -EINVAL; bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); + } /* * If the driver provides an explicit major number it also must provide From 68ed45122249083bf45593ed635474282583352c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:09 +0100 Subject: [PATCH 085/150] block: remove blk_mq_init_bitmaps The little work done in blk_mq_init_bitmaps is easier done in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250106083531.799976-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 38 ++++++++++++-------------------------- block/blk-mq.h | 3 --- 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2cafcf11ee8b..ab4a66791a20 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -544,30 +544,12 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, - unsigned int queue_depth, unsigned int reserved, - int node, int alloc_policy) -{ - unsigned int depth = queue_depth - reserved; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - - if (bt_alloc(bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(breserved_tags, reserved, round_robin, node)) - goto free_bitmap_tags; - - return 0; - -free_bitmap_tags: - sbitmap_queue_free(bitmap_tags); - return -ENOMEM; -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, int node, int alloc_policy) { + unsigned int depth = total_tags - reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -582,14 +564,18 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto out_free_tags; + if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) + goto out_free_bitmap_tags; - if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, - total_tags, 
reserved_tags, node, - alloc_policy) < 0) { - kfree(tags); - return NULL; - } return tags; + +out_free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +out_free_tags: + kfree(tags); + return NULL; } void blk_mq_free_tags(struct blk_mq_tags *tags) diff --git a/block/blk-mq.h b/block/blk-mq.h index 89a20fffa4b1..3bb9ea80f9b6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -165,9 +165,6 @@ struct blk_mq_alloc_data { struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); void blk_mq_free_tags(struct blk_mq_tags *tags); -int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, - struct sbitmap_queue *breserved_tags, unsigned int queue_depth, - unsigned int reserved, int node, int alloc_policy); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, From e7602bb4f3a1234df8b75728ac3260bcb8242612 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:10 +0100 Subject: [PATCH 086/150] block: remove BLK_MQ_F_NO_SCHED The only queues that really can't support a scheduler are those that do not have a gendisk associated with them, and thus can't be used for non-passthrough commands. In addition to those null_blk can optionally set the flag, which is a bad odd. Replace the null_blk usage with BLK_MQ_F_NO_SCHED_BY_DEFAULT to keep the expected semantics and then remove BLK_MQ_F_NO_SCHED as the non-disk queues never call into elevator_init_mq or blk_register_queue which adds the sysfs attributes. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250106083531.799976-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 - block/bsg-lib.c | 2 +- block/elevator.c | 20 -------------------- drivers/block/null_blk/main.c | 4 ++-- drivers/nvme/host/apple.c | 1 - drivers/nvme/host/core.c | 1 - drivers/ufs/core/ufshcd.c | 1 - include/linux/blk-mq.h | 2 -- 8 files changed, 3 insertions(+), 29 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 4b6b20ccdb53..64b3c333aa47 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -185,7 +185,6 @@ static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), HCTX_FLAG_NAME(BLOCKING), - HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT), }; #undef HCTX_FLAG_NAME diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 32da4a4429ce..93523d8f8195 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -381,7 +381,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct bsg_job) + dd_job_size; - set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING; + set->flags = BLK_MQ_F_BLOCKING; if (blk_mq_alloc_tag_set(set)) goto out_tag_set; diff --git a/block/elevator.c b/block/elevator.c index be6e994256ac..b81216c48b6b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -547,14 +547,6 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static inline bool elv_support_iosched(struct request_queue *q) -{ - if (!queue_is_mq(q) || - (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) - return false; - return true; -} - /* * For single queue devices, default to using mq-deadline. If we have multiple * queues or mq-deadline is not available, default to "none". 
@@ -580,9 +572,6 @@ void elevator_init_mq(struct request_queue *q) struct elevator_type *e; int err; - if (!elv_support_iosched(q)) - return; - WARN_ON_ONCE(blk_queue_registered(q)); if (unlikely(q->elevator)) @@ -714,9 +703,6 @@ void elv_iosched_load_module(struct gendisk *disk, const char *buf, struct elevator_type *found; const char *name; - if (!elv_support_iosched(disk->queue)) - return; - strscpy(elevator_name, buf, sizeof(elevator_name)); name = strstrip(elevator_name); @@ -734,9 +720,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, char elevator_name[ELV_NAME_MAX]; int ret; - if (!elv_support_iosched(disk->queue)) - return count; - strscpy(elevator_name, buf, sizeof(elevator_name)); ret = elevator_change(disk->queue, strstrip(elevator_name)); if (!ret) @@ -751,9 +734,6 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name) struct elevator_type *cur = NULL, *e; int len = 0; - if (!elv_support_iosched(q)) - return sprintf(name, "none\n"); - if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 178e62cd9a9f..d94ef37480bd 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1792,7 +1792,7 @@ static int null_init_global_tag_set(void) tag_set.queue_depth = g_hw_queue_depth; tag_set.numa_node = g_home_node; if (g_no_sched) - tag_set.flags |= BLK_MQ_F_NO_SCHED; + tag_set.flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (g_shared_tag_bitmap) tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (g_blocking) @@ -1817,7 +1817,7 @@ static int null_setup_tagset(struct nullb *nullb) nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; nullb->tag_set->numa_node = nullb->dev->home_node; if (nullb->dev->no_sched) - nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED; + nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED_BY_DEFAULT; if (nullb->dev->shared_tag_bitmap) nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; if (nullb->dev->blocking) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index 83c60468542c..1de11b722f04 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1251,7 +1251,6 @@ static int apple_nvme_alloc_tagsets(struct apple_nvme *anv) anv->admin_tagset.timeout = NVME_ADMIN_TIMEOUT; anv->admin_tagset.numa_node = NUMA_NO_NODE; anv->admin_tagset.cmd_size = sizeof(struct apple_nvme_iod); - anv->admin_tagset.flags = BLK_MQ_F_NO_SCHED; anv->admin_tagset.driver_data = &anv->adminq; ret = blk_mq_alloc_tag_set(&anv->admin_tagset); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 42283d268500..c2250ddef5a2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4564,7 +4564,6 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, /* Reserved for fabric connect and keep alive */ set->reserved_tags = 2; set->numa_node = ctrl->numa_node; - set->flags = BLK_MQ_F_NO_SCHED; if (ctrl->ops->flags & NVME_F_BLOCKING) set->flags |= BLK_MQ_F_BLOCKING; set->cmd_size = cmd_size; diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 8a01e4393159..fd53c9f402c3 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10412,7 +10412,6 @@ static int ufshcd_add_scsi_host(struct ufs_hba *hba) .nr_hw_queues = 1, .queue_depth = hba->nutmrs, .ops = &ufshcd_tmf_ops, - .flags = BLK_MQ_F_NO_SCHED, }; err = blk_mq_alloc_tag_set(&hba->tmf_tag_set); if (err < 0) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 6340293511c9..f2ff0ffa0535 
100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -676,8 +676,6 @@ enum { BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, - /* Do not allow an I/O scheduler to be configured. */ - BLK_MQ_F_NO_SCHED = 1 << 5, /* * Select 'none' during queue registration in case of a single hwq From ce32496ec1abe866225f2e2005ceda68cf4c7bf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 6 Jan 2025 09:35:11 +0100 Subject: [PATCH 087/150] block: simplify tag allocation policy selection Use a plain BLK_MQ_F_* flag to select the round robin tag selection instead of overlaying an enum with just two possible values into the flags space. Doing so allows adding a BLK_MQ_F_MAX sentinel for simplified overflow checking in the messy debugfs helpers. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250106083531.799976-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 25 ++++--------------------- block/blk-mq-tag.c | 5 ++--- block/blk-mq.c | 3 +-- block/blk-mq.h | 2 +- drivers/ata/ahci.h | 2 +- drivers/ata/pata_macio.c | 2 +- drivers/ata/sata_mv.c | 2 +- drivers/ata/sata_nv.c | 4 ++-- drivers/ata/sata_sil24.c | 1 - drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 2 +- drivers/scsi/scsi_lib.c | 4 ++-- include/linux/blk-mq.h | 22 +++++++--------------- include/linux/libata.h | 4 ++-- include/scsi/scsi_host.h | 6 ++++-- 14 files changed, 29 insertions(+), 55 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 64b3c333aa47..adf5f0697b6b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -172,19 +172,13 @@ static int hctx_state_show(void *data, struct seq_file *m) return 0; } -#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name -static const char *const alloc_policy_name[] = { - BLK_TAG_ALLOC_NAME(FIFO), - BLK_TAG_ALLOC_NAME(RR), -}; -#undef BLK_TAG_ALLOC_NAME - #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(TAG_HCTX_SHARED), HCTX_FLAG_NAME(BLOCKING), + HCTX_FLAG_NAME(TAG_RR), HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT), }; #undef HCTX_FLAG_NAME @@ -192,22 +186,11 @@ static const char *const hctx_flag_name[] = { static int hctx_flags_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; - const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags); - BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != - BLK_MQ_F_ALLOC_POLICY_START_BIT); - BUILD_BUG_ON(ARRAY_SIZE(alloc_policy_name) != BLK_TAG_ALLOC_MAX); + BUILD_BUG_ON(ARRAY_SIZE(hctx_flag_name) != ilog2(BLK_MQ_F_MAX)); - seq_puts(m, "alloc_policy="); - if (alloc_policy < ARRAY_SIZE(alloc_policy_name) && - alloc_policy_name[alloc_policy]) - seq_puts(m, alloc_policy_name[alloc_policy]); - else - seq_printf(m, "%d", alloc_policy); - seq_puts(m, " "); - blk_flags_show(m, - hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy), - hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); + blk_flags_show(m, hctx->flags, hctx_flag_name, + ARRAY_SIZE(hctx_flag_name)); seq_puts(m, "\n"); return 0; } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ab4a66791a20..b9f417d980b4 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -545,11 +545,10 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, - unsigned int reserved_tags, - int node, int alloc_policy) + unsigned int 
reserved_tags, unsigned int flags, int node) { unsigned int depth = total_tags - reserved_tags; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + bool round_robin = flags & BLK_MQ_F_TAG_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 17f10683d640..2e6132f778fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3476,8 +3476,7 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, - BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); + tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node); if (!tags) return NULL; diff --git a/block/blk-mq.h b/block/blk-mq.h index 3bb9ea80f9b6..c872bbbe6411 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -163,7 +163,7 @@ struct blk_mq_alloc_data { }; struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, - unsigned int reserved_tags, int node, int alloc_policy); + unsigned int reserved_tags, unsigned int flags, int node); void blk_mq_free_tags(struct blk_mq_tags *tags); unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 8f40f75ba08c..06781bdde0d2 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -396,7 +396,7 @@ extern const struct attribute_group *ahci_sdev_groups[]; .shost_groups = ahci_shost_groups, \ .sdev_groups = ahci_sdev_groups, \ .change_queue_depth = ata_scsi_change_queue_depth, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure extern struct ata_port_operations ahci_ops; diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c index f2f36e55a1f4..4b01bb6880b0 100644 --- a/drivers/ata/pata_macio.c +++ b/drivers/ata/pata_macio.c @@ -935,7 +935,7 @@ static const struct scsi_host_template pata_macio_sht = { .device_configure = pata_macio_device_configure, .sdev_groups = ata_common_sdev_groups, .can_queue = ATA_DEF_QUEUE, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, }; static struct ata_port_operations pata_macio_ops = { diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index b8f363370e1a..21c72650f9cc 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -672,7 +672,7 @@ static const struct scsi_host_template mv6_sht = { .dma_boundary = MV_DMA_BOUNDARY, .sdev_groups = ata_ncq_sdev_groups, .change_queue_depth = ata_scsi_change_queue_depth, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, .device_configure = ata_scsi_device_configure }; diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 36d99043ef50..823cce5ea1e9 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -385,7 +385,7 @@ static const struct scsi_host_template nv_adma_sht = { .device_configure = nv_adma_device_configure, .sdev_groups = ata_ncq_sdev_groups, .change_queue_depth = ata_scsi_change_queue_depth, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, }; static const struct scsi_host_template nv_swncq_sht = { @@ -396,7 +396,7 @@ static const struct scsi_host_template nv_swncq_sht = { .device_configure = nv_swncq_device_configure, .sdev_groups = ata_ncq_sdev_groups, .change_queue_depth = ata_scsi_change_queue_depth, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, }; /* diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index 72c03cbdaff4..935b13e79dec 100644 --- 
a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -378,7 +378,6 @@ static const struct scsi_host_template sil24_sht = { .can_queue = SIL24_MAX_CMDS, .sg_tablesize = SIL24_MAX_SGE, .dma_boundary = ATA_DMA_BOUNDARY, - .tag_alloc_policy = BLK_TAG_ALLOC_FIFO, .sdev_groups = ata_ncq_sdev_groups, .change_queue_depth = ata_scsi_change_queue_depth, .device_configure = ata_scsi_device_configure diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index 79129c977704..35501d0aa655 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -3345,7 +3345,7 @@ static const struct scsi_host_template sht_v3_hw = { .slave_alloc = hisi_sas_slave_alloc, .shost_groups = host_v3_hw_groups, .sdev_groups = sdev_groups_v3_hw, - .tag_alloc_policy = BLK_TAG_ALLOC_RR, + .tag_alloc_policy_rr = true, .host_reset = hisi_sas_host_reset, .host_tagset = 1, .mq_poll = queue_complete_v3_hw, diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5cf124e13097..51c496ca9380 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2065,8 +2065,8 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost) tag_set->queue_depth = shost->can_queue; tag_set->cmd_size = cmd_size; tag_set->numa_node = dev_to_node(shost->dma_dev); - tag_set->flags |= - BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); + if (shost->hostt->tag_alloc_policy_rr) + tag_set->flags |= BLK_MQ_F_TAG_RR; if (shost->queuecommand_may_block) tag_set->flags |= BLK_MQ_F_BLOCKING; tag_set->driver_data = shost; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f2ff0ffa0535..a0a9007cc1e3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -296,13 +296,6 @@ enum blk_eh_timer_return { BLK_EH_RESET_TIMER, }; -/* Keep alloc_policy_name[] in sync with the definitions below */ -enum { - BLK_TAG_ALLOC_FIFO, /* allocate starting from 0 */ - BLK_TAG_ALLOC_RR, /* allocate starting from last allocated tag */ - BLK_TAG_ALLOC_MAX -}; - /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device @@ -677,20 +670,19 @@ enum { BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, + /* + * Alloc tags on a round-robin base instead of the first available one. + */ + BLK_MQ_F_TAG_RR = 1 << 5, + /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. 
*/ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, - BLK_MQ_F_ALLOC_POLICY_START_BIT = 7, - BLK_MQ_F_ALLOC_POLICY_BITS = 1, + + BLK_MQ_F_MAX = 1 << 7, }; -#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ - ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ - ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) -#define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ - ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ - << BLK_MQ_F_ALLOC_POLICY_START_BIT) #define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) diff --git a/include/linux/libata.h b/include/linux/libata.h index c1a85d46eba6..be5183d75736 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1467,13 +1467,13 @@ extern const struct attribute_group *ata_common_sdev_groups[]; #define ATA_SUBBASE_SHT(drv_name) \ __ATA_BASE_SHT(drv_name), \ .can_queue = ATA_DEF_QUEUE, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure #define ATA_SUBBASE_SHT_QD(drv_name, drv_qd) \ __ATA_BASE_SHT(drv_name), \ .can_queue = drv_qd, \ - .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ + .tag_alloc_policy_rr = true, \ .device_configure = ata_scsi_device_configure #define ATA_BASE_SHT(drv_name) \ diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 2b4ab0369ffb..02823d6af37d 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -438,8 +438,10 @@ struct scsi_host_template { */ short cmd_per_lun; - /* If use block layer to manage tags, this is tag allocation policy */ - int tag_alloc_policy; + /* + * Allocate tags starting from last allocated tag. + */ + bool tag_alloc_policy_rr : 1; /* * Track QUEUE_FULL events and reduce queue depth on demand. From 844b8cdc681612ff24df62cdefddeab5772fadf1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 3 Jan 2025 17:28:59 +0800 Subject: [PATCH 088/150] nbd: don't allow reconnect after disconnect Following process can cause nbd_config UAF: 1) grab nbd_config temporarily; 2) nbd_genl_disconnect() flush all recv_work() and release the initial reference: nbd_genl_disconnect nbd_disconnect_and_put nbd_disconnect flush_workqueue(nbd->recv_workq) if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, ...)) nbd_config_put -> due to step 1), reference is still not zero 3) nbd_genl_reconfigure() queue recv_work() again; nbd_genl_reconfigure config = nbd_get_config_unlocked(nbd) if (!config) -> succeed if (!test_bit(NBD_RT_BOUND, ...)) -> succeed nbd_reconnect_socket queue_work(nbd->recv_workq, &args->work) 4) step 1) release the reference; 5) Finially, recv_work() will trigger UAF: recv_work nbd_config_put(nbd) -> nbd_config is freed atomic_dec(&config->recv_threads) -> UAF Fix the problem by clearing NBD_RT_BOUND in nbd_genl_disconnect(), so that nbd_genl_reconfigure() will fail. 
Fixes: b7aa3d39385d ("nbd: add a reconfigure netlink command") Reported-by: syzbot+6b0df248918b92c33e6a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/675bfb65.050a0220.1a2d0d.0006.GAE@google.com/ Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250103092859.3574648-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b1a5af69a66d..259bd57fc529 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -2179,6 +2179,7 @@ static void nbd_disconnect_and_put(struct nbd_device *nbd) flush_workqueue(nbd->recv_workq); nbd_clear_que(nbd); nbd->task_setup = NULL; + clear_bit(NBD_RT_BOUND, &nbd->config->runtime_flags); mutex_unlock(&nbd->config_lock); if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF, From 3ec5c62cfcf060e9ea533cd3901f5d03b26ddc24 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Mon, 9 Dec 2024 09:53:44 +0800 Subject: [PATCH 089/150] nvmet: handle rw's limited retry flag In some scenarios, some multipath software setup places the REQ_FAILFAST_DEV flag on I/O to prevent retries and immediately switch to other paths for issuing I/O commands. This will reflect on the NVMe read and write commands with the limited retry flag. However, the current NVMe target side does not handle the limited retry flag, and the target's underlying driver still retries the I/O. This will result in the I/O not being quickly switched to other paths, ultimately leading to increased I/O latency. When the nvme target receive an rw command with limited retry flag, handle it in block backend by setting the REQ_FAILFAST_DEV flag to bio. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/io-cmd-bdev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 0bda83d0fc3e..6380b60fd490 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -272,6 +272,9 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req) iter_flags = SG_MITER_FROM_SG; } + if (req->cmd->rw.control & NVME_RW_LR) + opf |= REQ_FAILFAST_DEV; + if (is_pci_p2pdma_page(sg_page(req->sg))) opf |= REQ_NOMERGE; From 9c96821b44f893fb63f021a28625d3b32c68e8b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:09 +0100 Subject: [PATCH 090/150] block: fix docs for freezing of queue limits updates queue_limits_commit_update is the function that needs to operate on a frozen queue, not queue_limits_start_update. Update the kerneldoc comments to reflect that. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250110054726.1499538-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 ++- include/linux/blkdev.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 8f09e33f41f6..89d8366fd43c 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -413,7 +413,8 @@ int blk_set_default_limits(struct queue_limits *lim) * @lim: limits to apply * * Apply the limits in @lim that were obtained from queue_limits_start_update() - * and updated by the caller to @q. 
+ * and updated by the caller to @q. The caller must have frozen the queue or + * ensure that there are no outstanding I/Os by other means. * * Returns 0 if successful, else a negative error code. */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5d40af2ef971..e781d4e6f92d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -944,8 +944,7 @@ static inline unsigned int blk_boundary_sectors_left(sector_t offset, * the caller can modify. The caller must call queue_limits_commit_update() * to finish the update. * - * Context: process context. The caller must have frozen the queue or ensured - * that there is outstanding I/O by other means. + * Context: process context. */ static inline struct queue_limits queue_limits_start_update(struct request_queue *q) From aa427d7b73b196f657d6d2cf0e94eff6b883fdef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:10 +0100 Subject: [PATCH 091/150] block: add a queue_limits_commit_update_frozen helper Add a helper that freezes the queue, updates the queue limits and unfreezes the queue and convert all open coded versions of that to the new helper. Signed-off-by: Christoph Hellwig Reviewed-by: John Garry Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-integrity.c | 4 +--- block/blk-settings.c | 24 ++++++++++++++++++++++++ block/blk-zoned.c | 7 +------ drivers/block/virtio_blk.c | 4 +--- drivers/scsi/sd.c | 17 +++++------------ drivers/scsi/sr.c | 5 +---- include/linux/blkdev.h | 2 ++ 7 files changed, 35 insertions(+), 28 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b180cac61a9d..013469faa5e7 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -218,9 +218,7 @@ static ssize_t flag_store(struct device *dev, const char *page, size_t count, else lim.integrity.flags |= flag; - blk_mq_freeze_queue(q); - err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); + err = queue_limits_commit_update_frozen(q, &lim); if (err) return err; return count; diff --git a/block/blk-settings.c b/block/blk-settings.c index 89d8366fd43c..6c96a73261d1 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -444,6 +444,30 @@ out_unlock: } EXPORT_SYMBOL_GPL(queue_limits_commit_update); +/** + * queue_limits_commit_update_frozen - commit an atomic update of queue limits + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were obtained from queue_limits_start_update() + * and updated with the new values by the caller to @q. Freezes the queue + * before the update and unfreezes it after. + * + * Returns 0 if successful, else a negative error code. 
+ */ +int queue_limits_commit_update_frozen(struct request_queue *q, + struct queue_limits *lim) +{ + int ret; + + blk_mq_freeze_queue(q); + ret = queue_limits_commit_update(q, lim); + blk_mq_unfreeze_queue(q); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_limits_commit_update_frozen); + /** * queue_limits_set - apply queue limits to queue * @q: queue to update diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 4b0be40a8ea7..9d08a54c201e 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1444,7 +1444,6 @@ static int disk_update_zone_resources(struct gendisk *disk, unsigned int nr_seq_zones, nr_conv_zones; unsigned int pool_size; struct queue_limits lim; - int ret; disk->nr_zones = args->nr_zones; disk->zone_capacity = args->zone_capacity; @@ -1495,11 +1494,7 @@ static int disk_update_zone_resources(struct gendisk *disk, } commit: - blk_mq_freeze_queue(q); - ret = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); - - return ret; + return queue_limits_commit_update_frozen(q, &lim); } static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 71a7ffeafb32..bbaa26b523b8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1105,9 +1105,7 @@ cache_type_store(struct device *dev, struct device_attribute *attr, lim.features |= BLK_FEAT_WRITE_CACHE; else lim.features &= ~BLK_FEAT_WRITE_CACHE; - blk_mq_freeze_queue(disk->queue); - i = queue_limits_commit_update(disk->queue, &lim); - blk_mq_unfreeze_queue(disk->queue); + i = queue_limits_commit_update_frozen(disk->queue, &lim); if (i) return i; return count; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8947dab132d7..af62a8ed8620 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -177,9 +177,8 @@ cache_type_store(struct device *dev, struct device_attribute *attr, lim = queue_limits_start_update(sdkp->disk->queue); sd_set_flush_flag(sdkp, &lim); - blk_mq_freeze_queue(sdkp->disk->queue); - ret = queue_limits_commit_update(sdkp->disk->queue, &lim); - blk_mq_unfreeze_queue(sdkp->disk->queue); + ret = queue_limits_commit_update_frozen(sdkp->disk->queue, + &lim); if (ret) return ret; return count; @@ -483,9 +482,7 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, lim = queue_limits_start_update(sdkp->disk->queue); sd_config_discard(sdkp, &lim, mode); - blk_mq_freeze_queue(sdkp->disk->queue); - err = queue_limits_commit_update(sdkp->disk->queue, &lim); - blk_mq_unfreeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (err) return err; return count; @@ -594,9 +591,7 @@ max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, lim = queue_limits_start_update(sdkp->disk->queue); sd_config_write_same(sdkp, &lim); - blk_mq_freeze_queue(sdkp->disk->queue); - err = queue_limits_commit_update(sdkp->disk->queue, &lim); - blk_mq_unfreeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (err) return err; return count; @@ -3803,9 +3798,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_config_write_same(sdkp, &lim); kfree(buffer); - blk_mq_freeze_queue(sdkp->disk->queue); - err = queue_limits_commit_update(sdkp->disk->queue, &lim); - blk_mq_unfreeze_queue(sdkp->disk->queue); + err = queue_limits_commit_update_frozen(sdkp->disk->queue, &lim); if (err) return err; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 198bec87bb8e..b17796d5ee66 100644 --- 
a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -797,10 +797,7 @@ static int get_sectorsize(struct scsi_cd *cd) lim = queue_limits_start_update(q); lim.logical_block_size = sector_size; - blk_mq_freeze_queue(q); - err = queue_limits_commit_update(q, &lim); - blk_mq_unfreeze_queue(q); - return err; + return queue_limits_commit_update_frozen(q, &lim); } static int get_capabilities(struct scsi_cd *cd) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e781d4e6f92d..13d353351c37 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -952,6 +952,8 @@ queue_limits_start_update(struct request_queue *q) mutex_lock(&q->limits_lock); return q->limits; } +int queue_limits_commit_update_frozen(struct request_queue *q, + struct queue_limits *lim); int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); int queue_limits_set(struct request_queue *q, struct queue_limits *lim); From 958148a6ac061a9a80a184ea678a5fa872d0c56f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:11 +0100 Subject: [PATCH 092/150] block: check BLK_FEAT_POLL under q_usage_count Otherwise feature reconfiguration can race with I/O submission. Also drop the bio_clear_polled in the error path, as the flag does not matter for instant error completions, it is a left over from when we allowed polled I/O to proceed unpolled in this case. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Nilay Shroff Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20250110054726.1499538-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 22 ++++++++++++---------- block/blk-mq.c | 12 ++++++++++-- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 666efe8fa202..6309b3f5a89d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -629,8 +629,14 @@ static void __submit_bio(struct bio *bio) blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; - - disk->fops->submit_bio(bio); + + if ((bio->bi_opf & REQ_POLLED) && + !(disk->queue->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + } else { + disk->fops->submit_bio(bio); + } blk_queue_exit(disk->queue); } @@ -805,12 +811,6 @@ void submit_bio_noacct(struct bio *bio) } } - if (!(q->limits.features & BLK_FEAT_POLL) && - (bio->bi_opf & REQ_POLLED)) { - bio_clear_polled(bio); - goto not_supported; - } - switch (bio_op(bio)) { case REQ_OP_READ: break; @@ -935,7 +935,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) return 0; q = bdev_get_queue(bdev); - if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL)) + if (cookie == BLK_QC_T_NONE) return 0; blk_flush_plug(current->plug, false); @@ -951,7 +951,9 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) */ if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; - if (queue_is_mq(q)) { + if (!(q->limits.features & BLK_FEAT_POLL)) { + ret = 0; + } else if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); } else { struct gendisk *disk = q->disk; diff --git a/block/blk-mq.c b/block/blk-mq.c index 2e6132f778fd..02c9232a8fff 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3096,14 +3096,22 @@ void blk_mq_submit_bio(struct bio *bio) } /* - * Device reconfiguration may change logical block size, so alignment - * check has to be done with queue usage counter held + * Device reconfiguration may change logical 
block size or reduce the + * number of poll queues, so the checks for alignment and poll support + * have to be done with queue usage counter held. */ if (unlikely(bio_unaligned(bio, q))) { bio_io_error(bio); goto queue_exit; } + if ((bio->bi_opf & REQ_POLLED) && + !(q->limits.features & BLK_FEAT_POLL)) { + bio->bi_status = BLK_STS_NOTSUPP; + bio_endio(bio); + goto queue_exit; + } + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) goto queue_exit; From d432c817c21a48c3baaa0d28e4d3e74b6aa238a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:12 +0100 Subject: [PATCH 093/150] block: don't update BLK_FEAT_POLL in __blk_mq_update_nr_hw_queues When __blk_mq_update_nr_hw_queues changes the number of tag sets, it might have to disable poll queues. Currently it does so by adjusting the BLK_FEAT_POLL, which is a bit against the intent of features that describe hardware / driver capabilities, but more importantly causes nasty lock order problems with the broadly held freeze when updating the number of hardware queues and the limits lock. Fix this by leaving BLK_FEAT_POLL alone, and instead check for the number of poll queues in the bio submission and poll handlers. While this adds extra work to the fast path, the variables are in cache lines used by these operations anyway, so it should be cheap enough. Fixes: 8023e144f9d6 ("block: move the poll flag to queue_limits") Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Nilay Shroff Link: https://lore.kernel.org/r/20250110054726.1499538-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++---- block/blk-mq.c | 26 +++++--------------------- block/blk-mq.h | 6 ++++++ block/blk-sysfs.c | 9 ++++++++- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 6309b3f5a89d..32fb28a6372c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -951,14 +951,13 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) */ if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; - if (!(q->limits.features & BLK_FEAT_POLL)) { - ret = 0; - } else if (queue_is_mq(q)) { + if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); } else { struct gendisk *disk = q->disk; - if (disk && disk->fops->poll_bio) + if ((q->limits.features & BLK_FEAT_POLL) && disk && + disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); diff --git a/block/blk-mq.c b/block/blk-mq.c index 02c9232a8fff..655dcc16db76 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3105,8 +3105,7 @@ void blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - if ((bio->bi_opf & REQ_POLLED) && - !(q->limits.features & BLK_FEAT_POLL)) { + if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); goto queue_exit; @@ -4328,12 +4327,6 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -static bool blk_mq_can_poll(struct blk_mq_tag_set *set) -{ - return set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues; -} - struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata) { @@ -4344,7 +4337,7 @@ struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, if (!lim) lim = &default_lim; lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT; - if (blk_mq_can_poll(set)) + if (set->nr_maps > 
HCTX_TYPE_POLL) lim->features |= BLK_FEAT_POLL; q = blk_alloc_queue(lim, set->numa_node); @@ -5032,8 +5025,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { - struct queue_limits lim; - blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { @@ -5047,13 +5038,6 @@ fallback: set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } - lim = queue_limits_start_update(q); - if (blk_mq_can_poll(set)) - lim.features |= BLK_FEAT_POLL; - else - lim.features &= ~BLK_FEAT_POLL; - if (queue_limits_commit_update(q, &lim) < 0) - pr_warn("updating the poll flag failed\n"); blk_mq_map_swqueue(q); } @@ -5113,9 +5097,9 @@ static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { - struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); - - return blk_hctx_poll(q, hctx, iob, flags); + if (!blk_mq_can_poll(q)) + return 0; + return blk_hctx_poll(q, xa_load(&q->hctx_table, cookie), iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, diff --git a/block/blk-mq.h b/block/blk-mq.h index c872bbbe6411..44979e92b79f 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -448,4 +448,10 @@ do { \ #define blk_mq_run_dispatch_ops(q, dispatch_ops) \ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ +static inline bool blk_mq_can_poll(struct request_queue *q) +{ + return (q->limits.features & BLK_FEAT_POLL) && + q->tag_set->map[HCTX_TYPE_POLL].nr_queues; +} + #endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 767598e719ab..e9f1c82b2f3e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -245,10 +245,17 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ !!(disk->queue->limits.features & _feature)); \ } -QUEUE_SYSFS_FEATURE_SHOW(poll, BLK_FEAT_POLL); QUEUE_SYSFS_FEATURE_SHOW(fua, BLK_FEAT_FUA); QUEUE_SYSFS_FEATURE_SHOW(dax, BLK_FEAT_DAX); +static ssize_t queue_poll_show(struct gendisk *disk, char *page) +{ + if (queue_is_mq(disk->queue)) + return sysfs_emit(page, "%u\n", blk_mq_can_poll(disk->queue)); + return sysfs_emit(page, "%u\n", + !!(disk->queue->limits.features & BLK_FEAT_POLL)); +} + static ssize_t queue_zoned_show(struct gendisk *disk, char *page) { if (blk_queue_is_zoned(disk->queue)) From a16230649ce27f8ac7dd8a5b079d9657aa96de16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:13 +0100 Subject: [PATCH 094/150] block: add a store_limit operations for sysfs entries De-duplicate the code for updating queue limits by adding a store_limit method that allows having common code handle the actual queue limits update. Note that this is a pure refactoring patch and does not address the existing freeze vs limits lock order problem in the refactored code, which will be addressed next. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250110054726.1499538-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 128 ++++++++++++++++++++++------------------------ 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e9f1c82b2f3e..d2aa2177e4ba 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -24,6 +24,8 @@ struct queue_sysfs_entry { struct attribute attr; ssize_t (*show)(struct gendisk *disk, char *page); ssize_t (*store)(struct gendisk *disk, const char *page, size_t count); + int (*store_limit)(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim); void (*load_module)(struct gendisk *disk, const char *page, size_t count); }; @@ -153,13 +155,11 @@ QUEUE_SYSFS_SHOW_CONST(discard_zeroes_data, 0) QUEUE_SYSFS_SHOW_CONST(write_same_max, 0) QUEUE_SYSFS_SHOW_CONST(poll_delay, -1) -static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, - const char *page, size_t count) +static int queue_max_discard_sectors_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) { unsigned long max_discard_bytes; - struct queue_limits lim; ssize_t ret; - int err; ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) @@ -171,38 +171,28 @@ static ssize_t queue_max_discard_sectors_store(struct gendisk *disk, if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - lim = queue_limits_start_update(disk->queue); - lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; - err = queue_limits_commit_update(disk->queue, &lim); - if (err) - return err; - return ret; + lim->max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + return 0; } -static ssize_t -queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count) +static int +queue_max_sectors_store(struct gendisk *disk, const char *page, size_t count, + struct queue_limits *lim) { unsigned long max_sectors_kb; - struct queue_limits lim; ssize_t ret; - int err; ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; - lim = queue_limits_start_update(disk->queue); - lim.max_user_sectors = max_sectors_kb << 1; - err = queue_limits_commit_update(disk->queue, &lim); - if (err) - return err; - return ret; + lim->max_user_sectors = max_sectors_kb << 1; + return 0; } static ssize_t queue_feature_store(struct gendisk *disk, const char *page, - size_t count, blk_features_t feature) + size_t count, struct queue_limits *lim, blk_features_t feature) { - struct queue_limits lim; unsigned long val; ssize_t ret; @@ -210,15 +200,11 @@ static ssize_t queue_feature_store(struct gendisk *disk, const char *page, if (ret < 0) return ret; - lim = queue_limits_start_update(disk->queue); if (val) - lim.features |= feature; + lim->features |= feature; else - lim.features &= ~feature; - ret = queue_limits_commit_update(disk->queue, &lim); - if (ret) - return ret; - return count; + lim->features &= ~feature; + return 0; } #define QUEUE_SYSFS_FEATURE(_name, _feature) \ @@ -227,10 +213,10 @@ static ssize_t queue_##_name##_show(struct gendisk *disk, char *page) \ return sysfs_emit(page, "%u\n", \ !!(disk->queue->limits.features & _feature)); \ } \ -static ssize_t queue_##_name##_store(struct gendisk *disk, \ - const char *page, size_t count) \ +static int queue_##_name##_store(struct gendisk *disk, \ + const char *page, size_t count, struct queue_limits *lim) \ { \ - return 
queue_feature_store(disk, page, count, _feature); \ + return queue_feature_store(disk, page, count, lim, _feature); \ } QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL) @@ -273,10 +259,9 @@ static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); } -static ssize_t queue_iostats_passthrough_store(struct gendisk *disk, - const char *page, size_t count) +static int queue_iostats_passthrough_store(struct gendisk *disk, + const char *page, size_t count, struct queue_limits *lim) { - struct queue_limits lim; unsigned long ios; ssize_t ret; @@ -284,18 +269,13 @@ static ssize_t queue_iostats_passthrough_store(struct gendisk *disk, if (ret < 0) return ret; - lim = queue_limits_start_update(disk->queue); if (ios) - lim.flags |= BLK_FLAG_IOSTATS_PASSTHROUGH; + lim->flags |= BLK_FLAG_IOSTATS_PASSTHROUGH; else - lim.flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; - - ret = queue_limits_commit_update(disk->queue, &lim); - if (ret) - return ret; - - return count; + lim->flags &= ~BLK_FLAG_IOSTATS_PASSTHROUGH; + return 0; } + static ssize_t queue_nomerges_show(struct gendisk *disk, char *page) { return queue_var_show((blk_queue_nomerges(disk->queue) << 1) | @@ -398,12 +378,10 @@ static ssize_t queue_wc_show(struct gendisk *disk, char *page) return sysfs_emit(page, "write through\n"); } -static ssize_t queue_wc_store(struct gendisk *disk, const char *page, - size_t count) +static int queue_wc_store(struct gendisk *disk, const char *page, + size_t count, struct queue_limits *lim) { - struct queue_limits lim; bool disable; - int err; if (!strncmp(page, "write back", 10)) { disable = false; @@ -414,15 +392,11 @@ static ssize_t queue_wc_store(struct gendisk *disk, const char *page, return -EINVAL; } - lim = queue_limits_start_update(disk->queue); if (disable) - lim.flags |= BLK_FLAG_WRITE_CACHE_DISABLED; + lim->flags |= BLK_FLAG_WRITE_CACHE_DISABLED; else - lim.flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; - err = queue_limits_commit_update(disk->queue, &lim); - if (err) - return err; - return count; + lim->flags &= ~BLK_FLAG_WRITE_CACHE_DISABLED; + return 0; } #define QUEUE_RO_ENTRY(_prefix, _name) \ @@ -438,6 +412,13 @@ static struct queue_sysfs_entry _prefix##_entry = { \ .store = _prefix##_store, \ }; +#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show = _prefix##_show, \ + .store_limit = _prefix##_store, \ +} + #define QUEUE_RW_LOAD_MODULE_ENTRY(_prefix, _name) \ static struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ @@ -448,7 +429,7 @@ static struct queue_sysfs_entry _prefix##_entry = { \ QUEUE_RW_ENTRY(queue_requests, "nr_requests"); QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); -QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); +QUEUE_LIM_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); @@ -464,7 +445,7 @@ QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); QUEUE_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes"); -QUEUE_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); +QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes"); 
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); QUEUE_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes"); @@ -484,11 +465,11 @@ QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); -QUEUE_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); +QUEUE_LIM_RW_ENTRY(queue_iostats_passthrough, "iostats_passthrough"); QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); QUEUE_RW_ENTRY(queue_poll, "io_poll"); QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); -QUEUE_RW_ENTRY(queue_wc, "write_cache"); +QUEUE_LIM_RW_ENTRY(queue_wc, "write_cache"); QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); @@ -501,10 +482,10 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .show = queue_logical_block_size_show, }; -QUEUE_RW_ENTRY(queue_rotational, "rotational"); -QUEUE_RW_ENTRY(queue_iostats, "iostats"); -QUEUE_RW_ENTRY(queue_add_random, "add_random"); -QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); +QUEUE_LIM_RW_ENTRY(queue_rotational, "rotational"); +QUEUE_LIM_RW_ENTRY(queue_iostats, "iostats"); +QUEUE_LIM_RW_ENTRY(queue_add_random, "add_random"); +QUEUE_LIM_RW_ENTRY(queue_stable_writes, "stable_writes"); #ifdef CONFIG_BLK_WBT static ssize_t queue_var_store64(s64 *var, const char *page) @@ -702,7 +683,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, struct request_queue *q = disk->queue; ssize_t res; - if (!entry->store) + if (!entry->store_limit && !entry->store) return -EIO; /* @@ -713,11 +694,24 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->load_module) entry->load_module(disk, page, length); - blk_mq_freeze_queue(q); mutex_lock(&q->sysfs_lock); - res = entry->store(disk, page, length); - mutex_unlock(&q->sysfs_lock); + blk_mq_freeze_queue(q); + if (entry->store_limit) { + struct queue_limits lim = queue_limits_start_update(q); + + res = entry->store_limit(disk, page, length, &lim); + if (res < 0) { + queue_limits_cancel_update(q); + } else { + res = queue_limits_commit_update(q, &lim); + if (!res) + res = length; + } + } else { + res = entry->store(disk, page, length); + } blk_mq_unfreeze_queue(q); + mutex_unlock(&q->sysfs_lock); return res; } From c99f66e4084a62a2cc401c4704a84328aeddc9ec Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:14 +0100 Subject: [PATCH 095/150] block: fix queue freeze vs limits lock order in sysfs store methods queue_attr_store() always freezes a device queue before calling the attribute store operation. For attributes that control queue limits, the store operation will also lock the queue limits with a call to queue_limits_start_update(). However, some drivers (e.g. SCSI sd) may need to issue commands to a device to obtain limit values from the hardware with the queue limits locked. This creates a potential ABBA deadlock situation if a user attempts to modify a limit (thus freezing the device queue) while the device driver starts a revalidation of the device queue limits. Avoid such deadlock by not freezing the queue before calling the ->store_limit() method in struct queue_sysfs_entry and instead use the queue_limits_commit_update_frozen helper to freeze the queue after taking the limits lock. This also removes taking the sysfs lock for the store_limit method as it doesn't protect anything here, but creates even more nesting. 
Hopefully it will go away from the actual sysfs methods entirely soon. (commit log adapted from a similar patch from Damien Le Moal) Fixes: ff956a3be95b ("block: use queue_limits_commit_update in queue_discard_max_store") Fixes: 0327ca9d53bf ("block: use queue_limits_commit_update in queue_max_sectors_store") Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d2aa2177e4ba..e828be777206 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -694,22 +694,24 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, if (entry->load_module) entry->load_module(disk, page, length); - mutex_lock(&q->sysfs_lock); - blk_mq_freeze_queue(q); if (entry->store_limit) { struct queue_limits lim = queue_limits_start_update(q); res = entry->store_limit(disk, page, length, &lim); if (res < 0) { queue_limits_cancel_update(q); - } else { - res = queue_limits_commit_update(q, &lim); - if (!res) - res = length; + return res; } - } else { - res = entry->store(disk, page, length); + + res = queue_limits_commit_update_frozen(q, &lim); + if (res) + return res; + return length; } + + mutex_lock(&q->sysfs_lock); + blk_mq_freeze_queue(q); + res = entry->store(disk, page, length); blk_mq_unfreeze_queue(q); mutex_unlock(&q->sysfs_lock); return res; From 473106dd3aa964a62314d858f6602c95e40e6270 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:15 +0100 Subject: [PATCH 096/150] nvme: fix queue freeze vs limits lock order Match the locking order used by the core block code by only freezing the queue after taking the limits lock. Unlike most queue updates this does not use the queue_limits_commit_update_frozen helper as the nvme driver want the queue frozen for more than just the limits update. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c2250ddef5a2..1ccf17f6ea7f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2128,9 +2128,10 @@ static int nvme_update_ns_info_generic(struct nvme_ns *ns, struct queue_limits lim; int ret; - blk_mq_freeze_queue(ns->disk->queue); lim = queue_limits_start_update(ns->disk->queue); nvme_set_ctrl_limits(ns->ctrl, &lim); + + blk_mq_freeze_queue(ns->disk->queue); ret = queue_limits_commit_update(ns->disk->queue, &lim); set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); blk_mq_unfreeze_queue(ns->disk->queue); @@ -2177,12 +2178,12 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, goto out; } + lim = queue_limits_start_update(ns->disk->queue); + blk_mq_freeze_queue(ns->disk->queue); ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->nuse = le64_to_cpu(id->nuse); capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); - - lim = queue_limits_start_update(ns->disk->queue); nvme_set_ctrl_limits(ns->ctrl, &lim); nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info); nvme_set_chunk_sectors(ns, id, &lim); @@ -2285,6 +2286,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) struct queue_limits *ns_lim = &ns->disk->queue->limits; struct queue_limits lim; + lim = queue_limits_start_update(ns->head->disk->queue); blk_mq_freeze_queue(ns->head->disk->queue); /* * queue_limits mixes values that are the hardware limitations @@ -2301,7 +2303,6 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) * the splitting limits in to make sure we still obey possibly * lower limitations of other controllers. */ - lim = queue_limits_start_update(ns->head->disk->queue); lim.logical_block_size = ns_lim->logical_block_size; lim.physical_block_size = ns_lim->physical_block_size; lim.io_min = ns_lim->io_min; From f3dec61d7544a90685f1dd9a87fd4afc751996d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:16 +0100 Subject: [PATCH 097/150] nbd: fix queue freeze vs limits lock order Match the locking order used by the core block code by only freezing the queue after taking the limits lock using the queue_limits_commit_update_frozen helper. This also allows removes the need for the separate __nbd_set_size helper, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 259bd57fc529..efa05c3c06bf 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -327,8 +327,7 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, nsock->sent = 0; } -static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, - loff_t blksize) +static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { struct queue_limits lim; int error; @@ -368,7 +367,7 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, lim.logical_block_size = blksize; lim.physical_block_size = blksize; - error = queue_limits_commit_update(nbd->disk->queue, &lim); + error = queue_limits_commit_update_frozen(nbd->disk->queue, &lim); if (error) return error; @@ -379,18 +378,6 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, return 0; } -static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, - loff_t blksize) -{ - int error; - - blk_mq_freeze_queue(nbd->disk->queue); - error = __nbd_set_size(nbd, bytesize, blksize); - blk_mq_unfreeze_queue(nbd->disk->queue); - - return error; -} - static void nbd_complete_rq(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); From 1233751f7df722435bb93e928d64334db260b90d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:17 +0100 Subject: [PATCH 098/150] usb-storage: fix queue freeze vs limits lock order Match the locking order used by the core block code by only freezing the queue after taking the limits lock using the queue_limits_commit_update_frozen helper. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/usb/storage/scsiglue.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index 8c8b5e6041cc..dc98ceecb724 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -592,12 +592,9 @@ static ssize_t max_sectors_store(struct device *dev, struct device_attribute *at if (sscanf(buf, "%hu", &ms) <= 0) return -EINVAL; - blk_mq_freeze_queue(sdev->request_queue); lim = queue_limits_start_update(sdev->request_queue); lim.max_hw_sectors = ms; - ret = queue_limits_commit_update(sdev->request_queue, &lim); - blk_mq_unfreeze_queue(sdev->request_queue); - + ret = queue_limits_commit_update_frozen(sdev->request_queue, &lim); if (ret) return ret; return count; From b38c8be255e89ffcdeb817407222d2de0b573a41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:18 +0100 Subject: [PATCH 099/150] loop: refactor queue limits updates Replace loop_reconfigure_limits with a slightly less encompassing loop_update_limits that expects the caller to acquire and commit the queue limits to prepare for sorting out the freeze vs limits lock ordering. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Nilay Shroff Link: https://lore.kernel.org/r/20250110054726.1499538-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 836a53eef4b4..560d6d5879d6 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -977,12 +977,12 @@ static unsigned int loop_default_blocksize(struct loop_device *lo, return SECTOR_SIZE; } -static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) +static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim, + unsigned int bsize) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct block_device *backing_bdev = NULL; - struct queue_limits lim; u32 granularity = 0, max_discard_sectors = 0; if (S_ISBLK(inode->i_mode)) @@ -995,22 +995,20 @@ static int loop_reconfigure_limits(struct loop_device *lo, unsigned int bsize) loop_get_discard_config(lo, &granularity, &max_discard_sectors); - lim = queue_limits_start_update(lo->lo_queue); - lim.logical_block_size = bsize; - lim.physical_block_size = bsize; - lim.io_min = bsize; - lim.features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); + lim->logical_block_size = bsize; + lim->physical_block_size = bsize; + lim->io_min = bsize; + lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL); if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY)) - lim.features |= BLK_FEAT_WRITE_CACHE; + lim->features |= BLK_FEAT_WRITE_CACHE; if (backing_bdev && !bdev_nonrot(backing_bdev)) - lim.features |= BLK_FEAT_ROTATIONAL; - lim.max_hw_discard_sectors = max_discard_sectors; - lim.max_write_zeroes_sectors = max_discard_sectors; + lim->features |= BLK_FEAT_ROTATIONAL; + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_write_zeroes_sectors = max_discard_sectors; if (max_discard_sectors) - lim.discard_granularity = granularity; + lim->discard_granularity = granularity; else - lim.discard_granularity = 0; - return queue_limits_commit_update(lo->lo_queue, &lim); + lim->discard_granularity = 0; } static int loop_configure(struct loop_device *lo, blk_mode_t mode, @@ -1019,6 +1017,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, { struct file *file = fget(config->fd); struct address_space *mapping; + struct queue_limits lim; int error; loff_t size; bool partscan; @@ -1090,7 +1089,9 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - error = loop_reconfigure_limits(lo, config->block_size); + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, config->block_size); + error = queue_limits_commit_update(lo->lo_queue, &lim); if (error) goto out_unlock; @@ -1458,6 +1459,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) static int loop_set_block_size(struct loop_device *lo, unsigned long arg) { + struct queue_limits lim; int err = 0; if (lo->lo_state != Lo_bound) @@ -1470,7 +1472,9 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - err = loop_reconfigure_limits(lo, arg); + lim = queue_limits_start_update(lo->lo_queue); + loop_update_limits(lo, &lim, arg); + err = queue_limits_commit_update(lo->lo_queue, &lim); 
loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); From b03732a9c0db91522914185739505d92d3b0d816 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 06:47:19 +0100 Subject: [PATCH 100/150] loop: fix queue freeze vs limits lock order Match the locking order used by the core block code by only freezing the queue after taking the limits lock using the queue_limits_commit_update_frozen helper and document the callers that do not freeze the queue at all. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Nilay Shroff Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20250110054726.1499538-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 560d6d5879d6..15e486baa223 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -311,6 +311,13 @@ static void loop_clear_limits(struct loop_device *lo, int mode) lim.discard_granularity = 0; } + /* + * XXX: this updates the queue limits without freezing the queue, which + * is against the locking protocol and dangerous. But we can't just + * freeze the queue as we're inside the ->queue_rq method here. So this + * should move out into a workqueue unless we get the file operations to + * advertise if they support specific fallocate operations. + */ queue_limits_commit_update(lo->lo_queue, &lim); } @@ -1091,6 +1098,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, lim = queue_limits_start_update(lo->lo_queue); loop_update_limits(lo, &lim, config->block_size); + /* No need to freeze the queue as the device isn't bound yet. */ error = queue_limits_commit_update(lo->lo_queue, &lim); if (error) goto out_unlock; @@ -1151,7 +1159,12 @@ static void __loop_clr_fd(struct loop_device *lo) lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); - /* reset the block size to the default */ + /* + * Reset the block size to the default. + * + * No queue freezing needed because this is called from the final + * ->release call only, so there can't be any outstanding I/O. + */ lim = queue_limits_start_update(lo->lo_queue); lim.logical_block_size = SECTOR_SIZE; lim.physical_block_size = SECTOR_SIZE; @@ -1471,9 +1484,10 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); - blk_mq_freeze_queue(lo->lo_queue); lim = queue_limits_start_update(lo->lo_queue); loop_update_limits(lo, &lim, arg); + + blk_mq_freeze_queue(lo->lo_queue); err = queue_limits_commit_update(lo->lo_queue, &lim); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); From ae074d07a0e5c05769f1a9a2faa260c36d69465e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:31 +0100 Subject: [PATCH 101/150] loop: move updating lo_flags out of loop_set_status_from_info While loop_configure simplify assigns the flags passed in by userspace, loop_set_status only looks at the two changeable flags, and currently has to do a complicate dance to implement that. Move assign lo->lo_flags out of loop_set_status_from_info into the callers and thus drastically simplify the lo_flags handling in loop_set_status. 
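For illustration, the resulting flag handling reads roughly like the sketch below. The helper name is made up; the masks are the driver's existing LOOP_SET_STATUS_SETTABLE_FLAGS / LOOP_SET_STATUS_CLEARABLE_FLAGS constants.

static unsigned int loop_merge_status_flags(unsigned int old_flags,
                                            unsigned int requested_flags)
{
        unsigned int flags = old_flags;

        /* Drop every flag that LOOP_SET_STATUS may legitimately change... */
        flags &= ~(LOOP_SET_STATUS_SETTABLE_FLAGS |
                   LOOP_SET_STATUS_CLEARABLE_FLAGS);
        /* ...then re-apply only the settable flags userspace asked for. */
        flags |= requested_flags & LOOP_SET_STATUS_SETTABLE_FLAGS;
        return flags;
}

All other flags simply keep their previous values, without the mask juggling the old code needed.
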
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 15e486baa223..6ea729cdce71 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -971,7 +971,6 @@ loop_set_status_from_info(struct loop_device *lo, memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; - lo->lo_flags = info->lo_flags; return 0; } @@ -1069,6 +1068,7 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, error = loop_set_status_from_info(lo, &config->info); if (error) goto out_unlock; + lo->lo_flags = config->info.lo_flags; if (!(file->f_mode & FMODE_WRITE) || !(mode & BLK_OPEN_WRITE) || !file->f_op->write_iter) @@ -1258,7 +1258,6 @@ static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; - int prev_lo_flags; bool partscan = false; bool size_changed = false; @@ -1280,18 +1279,16 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); - prev_lo_flags = lo->lo_flags; - err = loop_set_status_from_info(lo, info); if (err) goto out_unfreeze; - /* Mask out flags that can't be set using LOOP_SET_STATUS. */ - lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For those flags, use the previous values instead */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS; - /* For flags that can't be cleared, use previous values too */ - lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS; + partscan = !(lo->lo_flags & LO_FLAGS_PARTSCAN) && + (info->lo_flags & LO_FLAGS_PARTSCAN); + + lo->lo_flags &= ~(LOOP_SET_STATUS_SETTABLE_FLAGS | + LOOP_SET_STATUS_CLEARABLE_FLAGS); + lo->lo_flags |= (info->lo_flags & LOOP_SET_STATUS_SETTABLE_FLAGS); if (size_changed) { loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit, @@ -1304,12 +1301,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); - - if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) && - !(prev_lo_flags & LO_FLAGS_PARTSCAN)) { + if (partscan) clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state); - partscan = true; - } out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) From 4155adb01e7406653f6b01aaca916a59567cfbfa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:32 +0100 Subject: [PATCH 102/150] loop: update commands in loop_set_status still referring to transfers The concept of transfers is gone since commit 47e9624616c8 ("block: remove support for cryptoloop and the xor transfer"). 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6ea729cdce71..0c7dfc6eee12 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1276,7 +1276,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) invalidate_bdev(lo->lo_device); } - /* I/O need to be drained during transfer transition */ + /* I/O needs to be drained before changing lo_offset or lo_sizelimit */ blk_mq_freeze_queue(lo->lo_queue); err = loop_set_status_from_info(lo, info); @@ -1296,7 +1296,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) loop_set_size(lo, new_size); } - /* update dio if lo_offset or transfer is changed */ + /* update the direct I/O flag if lo_offset changed */ __loop_update_dio(lo, lo->use_dio); out_unfreeze: From 781fc49a0e5c111b1a210bd1b3499c89bb21cd81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:33 +0100 Subject: [PATCH 103/150] loop: create a lo_can_use_dio helper Factor out a part of __loop_update_dio in preparation for further refactoring. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 0c7dfc6eee12..55bea9c95b45 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -182,26 +182,29 @@ static bool lo_bdev_can_use_dio(struct loop_device *lo, return true; } -static void __loop_update_dio(struct loop_device *lo, bool dio) +static bool lo_can_use_dio(struct loop_device *lo) { - struct file *file = lo->lo_backing_file; - struct inode *inode = file->f_mapping->host; - struct block_device *backing_bdev = NULL; - bool use_dio; + struct inode *inode = lo->lo_backing_file->f_mapping->host; + + if (!(lo->lo_backing_file->f_mode & FMODE_CAN_ODIRECT)) + return false; if (S_ISBLK(inode->i_mode)) - backing_bdev = I_BDEV(inode); - else if (inode->i_sb->s_bdev) - backing_bdev = inode->i_sb->s_bdev; + return lo_bdev_can_use_dio(lo, I_BDEV(inode)); + if (inode->i_sb->s_bdev) + return lo_bdev_can_use_dio(lo, inode->i_sb->s_bdev); + return true; +} - use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) && - (!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev)); +static void __loop_update_dio(struct loop_device *lo, bool dio) +{ + bool use_dio = dio && lo_can_use_dio(lo); if (lo->use_dio == use_dio) return; /* flush dirty pages before changing direct IO */ - vfs_fsync(file, 0); + vfs_fsync(lo->lo_backing_file, 0); /* * The flag of LO_FLAGS_DIRECT_IO is handled similarly with From 09ccf5549d7809671af34774bb30c8f935d6ed2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:34 +0100 Subject: [PATCH 104/150] loop: only write back pagecache when starting to to use direct I/O There is no point in doing an fdatasync to write out pages when switching away from direct I/O, as there won't be any. The writeback is only needed when switching to direct I/O, which would have to invalidate the pagecache less efficiently from the I/O path. 
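The core of the change boils down to guarding the existing sync (sketch only, mirroring the hunk below):

        /* Only the buffered -> direct I/O transition can leave dirty pages
         * behind; switching direct I/O off has nothing to write back. */
        if (use_dio)
                vfs_fsync(lo->lo_backing_file, 0);
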
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 55bea9c95b45..cf80cdf5e440 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -203,8 +203,9 @@ static void __loop_update_dio(struct loop_device *lo, bool dio) if (lo->use_dio == use_dio) return; - /* flush dirty pages before changing direct IO */ - vfs_fsync(lo->lo_backing_file, 0); + /* flush dirty pages before starting to use direct I/O */ + if (use_dio) + vfs_fsync(lo->lo_backing_file, 0); /* * The flag of LO_FLAGS_DIRECT_IO is handled similarly with From dc909525daec7c7c5d628683c99d26e281c1a7bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:35 +0100 Subject: [PATCH 105/150] loop: open code the direct I/O flag update in loop_set_dio loop_set_dio is different from the other (__)loop_update_dio callers in that it doesn't take any implicit conditions into account and wants to update the direct I/O flag to the user passed in value and fail if that can't be done. Open code the logic here to prepare for simplifying the other direct I/O flag updates and to make the error handling less convoluted. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cf80cdf5e440..6eb6d901151c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1455,16 +1455,28 @@ static int loop_set_capacity(struct loop_device *lo) static int loop_set_dio(struct loop_device *lo, unsigned long arg) { - int error = -ENXIO; - if (lo->lo_state != Lo_bound) - goto out; + bool use_dio = !!arg; - __loop_update_dio(lo, !!arg); - if (lo->use_dio == !!arg) + if (lo->lo_state != Lo_bound) + return -ENXIO; + if (use_dio == lo->use_dio) return 0; - error = -EINVAL; - out: - return error; + + if (use_dio) { + if (!lo_can_use_dio(lo)) + return -EINVAL; + /* flush dirty pages before starting to use direct I/O */ + vfs_fsync(lo->lo_backing_file, 0); + } + + blk_mq_freeze_queue(lo->lo_queue); + lo->use_dio = use_dio; + if (use_dio) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + else + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + blk_mq_unfreeze_queue(lo->lo_queue); + return 0; } static int loop_set_block_size(struct loop_device *lo, unsigned long arg) From 3a693110afd7127400cc9f779c885f01cf16d0f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:36 +0100 Subject: [PATCH 106/150] loop: allow loop_set_status to re-enable direct I/O Unlike all other calls of (__)loop_update_dio, loop_set_status never looks at the O_DIRECT flag of the backing file, and thus doesn't re-enable direct I/O on an O_DIRECT backing file if e.g. the new block size would allow it. Fix that and remove the need for the separate __loop_update_dio flag. 
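For context, loop_set_dio() is the handler behind the LOOP_SET_DIRECT_IO ioctl. A minimal userspace sketch of exercising it follows; the device path and error handling are assumptions, not taken from the patch.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/loop.h>

int main(void)
{
        int fd = open("/dev/loop0", O_RDWR);    /* assumes an already bound loop device */

        if (fd < 0)
                return 1;
        /* Ask for direct I/O on the backing file; with this patch the ioctl
         * fails (EINVAL) when lo_can_use_dio() rules it out. */
        if (ioctl(fd, LOOP_SET_DIRECT_IO, 1UL) < 0) {
                close(fd);
                return 1;
        }
        close(fd);
        return 0;
}
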
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6eb6d901151c..2e1f8aa045a9 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -196,8 +196,9 @@ static bool lo_can_use_dio(struct loop_device *lo) return true; } -static void __loop_update_dio(struct loop_device *lo, bool dio) +static inline void loop_update_dio(struct loop_device *lo) { + bool dio = lo->use_dio || (lo->lo_backing_file->f_flags & O_DIRECT); bool use_dio = dio && lo_can_use_dio(lo); if (lo->use_dio == use_dio) @@ -531,12 +532,6 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) } } -static inline void loop_update_dio(struct loop_device *lo) -{ - __loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) | - lo->use_dio); -} - static void loop_reread_partitions(struct loop_device *lo) { int rc; @@ -1301,7 +1296,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) } /* update the direct I/O flag if lo_offset changed */ - __loop_update_dio(lo, lo->use_dio); + loop_update_dio(lo); out_unfreeze: blk_mq_unfreeze_queue(lo->lo_queue); From 0cd719aa63def1d57316e8e903f01f4af0641a46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:37 +0100 Subject: [PATCH 107/150] loop: don't freeze the queue in loop_update_dio All callers of loop_update_dio except for loop_configure already have the queue frozen, and loop_configure works on an unbound device. Remove the superfluous recursive freezing in loop_update_dio and add asserts for the locking and freezing state instead. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 2e1f8aa045a9..acb1a0cdfb27 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -201,6 +201,10 @@ static inline void loop_update_dio(struct loop_device *lo) bool dio = lo->use_dio || (lo->lo_backing_file->f_flags & O_DIRECT); bool use_dio = dio && lo_can_use_dio(lo); + lockdep_assert_held(&lo->lo_mutex); + WARN_ON_ONCE(lo->lo_state == Lo_bound && + lo->lo_queue->mq_freeze_depth == 0); + if (lo->use_dio == use_dio) return; @@ -213,15 +217,11 @@ static inline void loop_update_dio(struct loop_device *lo) * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup * will get updated by ioctl(LOOP_GET_STATUS) */ - if (lo->lo_state == Lo_bound) - blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; if (use_dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; else lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; - if (lo->lo_state == Lo_bound) - blk_mq_unfreeze_queue(lo->lo_queue); } /** From afd69d5c4a1049230fa91c9b54fdd8132f755503 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 10 Jan 2025 08:37:38 +0100 Subject: [PATCH 108/150] loop: remove the use_dio field in struct loop_device This field duplicate the LO_FLAGS_DIRECT_IO flag in lo_flags. Remove it to have a single source of truth about using direct I/O. 
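A hypothetical userspace sequence showing the behaviour this enables, continuing with the descriptor opened in the earlier sketch (the offset value is made up; ioctls and struct loop_info64 come from <linux/loop.h>):

        struct loop_info64 info = { 0 };

        /* The device was bound with an O_DIRECT backing file but an
         * unaligned lo_offset, so direct I/O had been silently disabled. */
        info.lo_offset = 4096;                  /* aligned again */
        if (ioctl(fd, LOOP_SET_STATUS64, &info) < 0)
                return 1;

        if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0)
                return 1;
        /* With this patch, info.lo_flags reports LO_FLAGS_DIRECT_IO again,
         * because loop_set_status now re-runs the direct I/O decision. */
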
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20250110073750.1582447-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index acb1a0cdfb27..1ec7417c7f00 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -68,7 +68,6 @@ struct loop_device { struct list_head idle_worker_list; struct rb_root worker_tree; struct timer_list timer; - bool use_dio; bool sysfs_inited; struct request_queue *lo_queue; @@ -196,32 +195,30 @@ static bool lo_can_use_dio(struct loop_device *lo) return true; } +/* + * Direct I/O can be enabled either by using an O_DIRECT file descriptor, or by + * passing in the LO_FLAGS_DIRECT_IO flag from userspace. It will be silently + * disabled when the device block size is too small or the offset is unaligned. + * + * loop_get_status will always report the effective LO_FLAGS_DIRECT_IO flag and + * not the originally passed in one. + */ static inline void loop_update_dio(struct loop_device *lo) { - bool dio = lo->use_dio || (lo->lo_backing_file->f_flags & O_DIRECT); - bool use_dio = dio && lo_can_use_dio(lo); + bool dio_in_use = lo->lo_flags & LO_FLAGS_DIRECT_IO; lockdep_assert_held(&lo->lo_mutex); WARN_ON_ONCE(lo->lo_state == Lo_bound && lo->lo_queue->mq_freeze_depth == 0); - if (lo->use_dio == use_dio) - return; - - /* flush dirty pages before starting to use direct I/O */ - if (use_dio) - vfs_fsync(lo->lo_backing_file, 0); - - /* - * The flag of LO_FLAGS_DIRECT_IO is handled similarly with - * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup - * will get updated by ioctl(LOOP_GET_STATUS) - */ - lo->use_dio = use_dio; - if (use_dio) + if (lo->lo_backing_file->f_flags & O_DIRECT) lo->lo_flags |= LO_FLAGS_DIRECT_IO; - else + if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !lo_can_use_dio(lo)) lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + + /* flush dirty pages before starting to issue direct I/O */ + if ((lo->lo_flags & LO_FLAGS_DIRECT_IO) && !dio_in_use) + vfs_fsync(lo->lo_backing_file, 0); } /** @@ -1089,7 +1086,6 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, disk_force_media_change(lo->lo_disk); set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0); - lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO; lo->lo_device = bdev; lo->lo_backing_file = file; lo->old_gfp_mask = mapping_gfp_mask(mapping); @@ -1454,7 +1450,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) if (lo->lo_state != Lo_bound) return -ENXIO; - if (use_dio == lo->use_dio) + if (use_dio == !!(lo->lo_flags & LO_FLAGS_DIRECT_IO)) return 0; if (use_dio) { @@ -1465,7 +1461,6 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg) } blk_mq_freeze_queue(lo->lo_queue); - lo->use_dio = use_dio; if (use_dio) lo->lo_flags |= LO_FLAGS_DIRECT_IO; else @@ -1876,7 +1871,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->use_aio = false; break; default: - cmd->use_aio = lo->use_dio; + cmd->use_aio = lo->lo_flags & LO_FLAGS_DIRECT_IO; break; } From 9ac273ae3dc296905b4d61e4c8e7a25592f6d183 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Jan 2025 09:52:54 -0700 Subject: [PATCH 109/150] io_uring/rw: use io_rw_recycle() from cleanup path Cleanup should always have the uring lock held, it's safe to recycle from here. 
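Condensed into a sketch (the helper is invented and locking/freezing requirements are ignored), the direct I/O decision after this series is:

static bool loop_should_use_dio(struct loop_device *lo)
{
        /* Requested either via an O_DIRECT backing file descriptor
         * or via the LO_FLAGS_DIRECT_IO flag from userspace... */
        if (!(lo->lo_backing_file->f_flags & O_DIRECT) &&
            !(lo->lo_flags & LO_FLAGS_DIRECT_IO))
                return false;
        /* ...and silently dropped when the block size, offset alignment
         * or the backing device rule it out. */
        return lo_can_use_dio(lo);
}

lo_flags is then updated to match the outcome, so LO_FLAGS_DIRECT_IO is the only state anyone has to consult afterwards.
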
Signed-off-by: Jens Axboe --- io_uring/rw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index ca1b19d3d142..afc669048c5d 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -434,7 +434,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - io_rw_iovec_free(req->async_data); + lockdep_assert_held(&req->ctx->uring_lock); + io_rw_recycle(req, 0); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) From d803d123948feffbd992213e144df224097f82b0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Jan 2025 10:59:35 -0700 Subject: [PATCH 110/150] io_uring/rw: handle -EAGAIN retry at IO completion time Rather than try and have io_read/io_write turn REQ_F_REISSUE into -EAGAIN, catch the REQ_F_REISSUE when the request is otherwise considered as done. This is saner as we know this isn't happening during an actual submission, and it removes the need to randomly check REQ_F_REISSUE after read/write submission. If REQ_F_REISSUE is set, __io_submit_flush_completions() will skip over this request in terms of posting a CQE, and the regular request cleaning will ensure that it gets reissued via io-wq. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +++++++-- io_uring/rw.c | 80 ++++++++++++++------------------------------- 2 files changed, 38 insertions(+), 57 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db198bd435b5..92ba2fdcd087 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -115,7 +115,7 @@ REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) + REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -1403,6 +1403,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REISSUE) { + node = req->comp_list.next; + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); + continue; + } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) @@ -1442,7 +1448,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP) && + /* + * Requests marked with REQUEUE should not post a CQE, they + * will go through the io-wq retry machinery and post one + * later. + */ + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); diff --git a/io_uring/rw.c b/io_uring/rw.c index afc669048c5d..c52c0515f0a2 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -202,7 +202,7 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) * mean that the underlying data can be gone at any time. But that * should be fixed seperately, and then this check could be killed. 
*/ - if (!(req->flags & REQ_F_REFCOUNT)) { + if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; io_rw_recycle(req, issue_flags); } @@ -455,19 +455,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -#ifdef CONFIG_BLOCK -static void io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - - io_meta_restore(io, &rw->kiocb); - iov_iter_restore(&io->iter, &io->iter_state); -} - static bool io_rw_should_reissue(struct io_kiocb *req) { +#ifdef CONFIG_BLOCK + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); umode_t mode = file_inode(req->file)->i_mode; + struct io_async_rw *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!S_ISBLK(mode) && !S_ISREG(mode)) @@ -488,17 +481,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (!same_thread_group(req->tctx->task, current) || !in_task()) return false; + + io_meta_restore(io, &rw->kiocb); + iov_iter_restore(&io->iter, &io->iter_state); return true; -} #else -static void io_resubmit_prep(struct io_kiocb *req) -{ -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ return false; -} #endif +} static void io_req_end_write(struct io_kiocb *req) { @@ -525,22 +515,16 @@ static void io_req_io_end(struct io_kiocb *req) } } -static bool __io_complete_rw_common(struct io_kiocb *req, long res) +static void __io_complete_rw_common(struct io_kiocb *req, long res) { - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - /* - * Reissue will start accounting again, finish the - * current cycle. - */ - io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - return true; - } + if (res == req->cqe.res) + return; + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + } else { req_set_fail(req); req->cqe.res = res; } - return false; } static inline int io_fixup_rw_res(struct io_kiocb *req, long res) @@ -583,8 +567,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_kiocb *req = cmd_to_io_kiocb(rw); if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { - if (__io_complete_rw_common(req, res)) - return; + __io_complete_rw_common(req, res); io_req_set_res(req, io_fixup_rw_res(req, res), 0); } req->io_task_work.func = io_req_rw_complete; @@ -646,26 +629,19 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { - if (!__io_complete_rw_common(req, ret)) { - /* - * Safe to call io_end from here as we're inline - * from the submission path. - */ - io_req_io_end(req); - io_req_set_res(req, final_ret, - io_put_kbuf(req, ret, issue_flags)); - io_req_rw_cleanup(req, issue_flags); - return IOU_OK; - } + __io_complete_rw_common(req, ret); + /* + * Safe to call io_end from here as we're inline + * from the submission path. 
+ */ + io_req_io_end(req); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); + io_req_rw_cleanup(req, issue_flags); + return IOU_OK; } else { io_rw_done(&rw->kiocb, ret); } - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - io_resubmit_prep(req); - return -EAGAIN; - } return IOU_ISSUE_SKIP_COMPLETE; } @@ -944,8 +920,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EOPNOTSUPP && force_nonblock) ret = -EAGAIN; - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; + if (ret == -EAGAIN) { /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; @@ -1154,11 +1129,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. From b08e02045002e0412441d5243b0ee1a9bfc3952b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Jan 2025 11:07:47 -0700 Subject: [PATCH 111/150] io_uring/rw: don't gate retry on completion context nvme multipath reports that they see spurious -EAGAIN bubbling back to userspace, which is caused by how they handle retries internally through a kworker. However, any data that needs preserving or importing for a read/write request has always been done so at prep time, and we can sanely skip this check. Reported-by: "Haeuptle, Michael" Link: https://lore.kernel.org/io-uring/DS7PR84MB31105C2C63CFA47BE8CBD6EE95102@DS7PR84MB3110.NAMPRD84.PROD.OUTLOOK.COM/ Signed-off-by: Jens Axboe --- io_uring/rw.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index c52c0515f0a2..ee5d38db9b48 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -475,12 +475,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). - */ - if (!same_thread_group(req->tctx->task, current) || !in_task()) - return false; io_meta_restore(io, &rw->kiocb); iov_iter_restore(&io->iter, &io->iter_state); From 94d57442e56d2ad2ca20d096040b8ae6f216a921 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 5 Dec 2024 11:51:09 +0530 Subject: [PATCH 112/150] io_uring: expose read/write attribute capability After commit 9a213d3b80c0, we can pass additional attributes along with read/write. However, userspace doesn't know that. Add a new feature flag IORING_FEAT_RW_ATTR, to notify the userspace that the kernel has this ability. Signed-off-by: Anuj Gupta Reviewed-by: Li Zetao Reviewed-by: Martin K. Petersen Tested-by: Martin K. 
Petersen Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20241205062109.1788-1-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/io_uring.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 38f0d6b10eaf..e11c82638527 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -577,6 +577,7 @@ struct io_uring_params { #define IORING_FEAT_REG_REG_RING (1U << 13) #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) #define IORING_FEAT_MIN_TIMEOUT (1U << 15) +#define IORING_FEAT_RW_ATTR (1U << 16) /* * io_uring_register(2) opcodes and arguments diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92ba2fdcd087..af03e9973b58 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3713,7 +3713,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | + IORING_FEAT_RW_ATTR; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; From 32193789878c259e39b97bd0c0f2f0ccbe5cb8a8 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sat, 4 Jan 2025 23:27:11 +0200 Subject: [PATCH 113/150] nvme-tcp: Fix I/O queue cpu spreading for multiple controllers Since day-1 we are assigning the queue io_cpu very naively. We always base the queue id (controller scope) and assign it its matching cpu from the online mask. This works fine when the number of queues match the number of cpu cores. The problem starts when we have less queues than cpu cores. First, we should take into account the mq_map and select a cpu within the cpus that are assigned to this queue by the mq_map in order to minimize cross numa cpu bouncing. Second, even worse is that we don't take into account multiple controllers may have assigned queues to a given cpu. As a result we may simply compund more and more queues on the same set of cpus, which is suboptimal. We fix this by introducing global per-cpu counters that tracks the number of queues assigned to each cpu, and we select the least used cpu based on the mq_map and the per-cpu counters, and assign it as the queue io_cpu. The behavior for a single controller is slightly optimized by selecting better cpu candidates by consulting with the mq_map, and multiple controllers are spreading queues among cpu cores much better, resulting in lower average cpu load, and less likelihood to hit hotspots. Note that the accounting is not 100% perfect, but we don't need to be, we're simply putting our best effort to select the best candidate cpu core that we find at any given point. Another byproduct is that every controller reset/reconnect may change the queues io_cpu mapping, based on the current LRU accounting scheme. 
Here is the baseline queue io_cpu assignment for 4 controllers, 2 queues per controller, and 4 cpus on the host: nvme1: queue 0: using cpu 0 nvme1: queue 1: using cpu 1 nvme2: queue 0: using cpu 0 nvme2: queue 1: using cpu 1 nvme3: queue 0: using cpu 0 nvme3: queue 1: using cpu 1 nvme4: queue 0: using cpu 0 nvme4: queue 1: using cpu 1 And this is the fixed io_cpu assignment: nvme1: queue 0: using cpu 0 nvme1: queue 1: using cpu 2 nvme2: queue 0: using cpu 1 nvme2: queue 1: using cpu 3 nvme3: queue 0: using cpu 0 nvme3: queue 1: using cpu 2 nvme4: queue 0: using cpu 1 nvme4: queue 1: using cpu 3 Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Suggested-by: Hannes Reinecke Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig [fixed kbuild reported errors] Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 70 +++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 28c76a3e1bd2..dc5bbca58c6d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -54,6 +54,8 @@ MODULE_PARM_DESC(tls_handshake_timeout, "nvme TLS handshake timeout in seconds (default 10)"); #endif +static atomic_t nvme_tcp_cpu_queues[NR_CPUS]; + #ifdef CONFIG_DEBUG_LOCK_ALLOC /* lockdep can detect a circular dependency of the form * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock @@ -127,6 +129,7 @@ enum nvme_tcp_queue_flags { NVME_TCP_Q_ALLOCATED = 0, NVME_TCP_Q_LIVE = 1, NVME_TCP_Q_POLLING = 2, + NVME_TCP_Q_IO_CPU_SET = 3, }; enum nvme_tcp_recv_state { @@ -1562,23 +1565,56 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) ctrl->io_queues[HCTX_TYPE_POLL]; } +/** + * Track the number of queues assigned to each cpu using a global per-cpu + * counter and select the least used cpu from the mq_map. Our goal is to spread + * different controllers I/O threads across different cpu cores. + * + * Note that the accounting is not 100% perfect, but we don't need to be, we're + * simply putting our best effort to select the best candidate cpu core that we + * find at any given point. 
+ */ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) { struct nvme_tcp_ctrl *ctrl = queue->ctrl; - int qid = nvme_tcp_queue_id(queue); - int n = 0; + struct blk_mq_tag_set *set = &ctrl->tag_set; + int qid = nvme_tcp_queue_id(queue) - 1; + unsigned int *mq_map = NULL; + int cpu, min_queues = INT_MAX, io_cpu; + + if (wq_unbound) + goto out; if (nvme_tcp_default_queue(queue)) - n = qid - 1; + mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map; else if (nvme_tcp_read_queue(queue)) - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; + mq_map = set->map[HCTX_TYPE_READ].mq_map; else if (nvme_tcp_poll_queue(queue)) - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - - ctrl->io_queues[HCTX_TYPE_READ] - 1; - if (wq_unbound) - queue->io_cpu = WORK_CPU_UNBOUND; - else - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + mq_map = set->map[HCTX_TYPE_POLL].mq_map; + + if (WARN_ON(!mq_map)) + goto out; + + /* Search for the least used cpu from the mq_map */ + io_cpu = WORK_CPU_UNBOUND; + for_each_online_cpu(cpu) { + int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]); + + if (mq_map[cpu] != qid) + continue; + if (num_queues < min_queues) { + io_cpu = cpu; + min_queues = num_queues; + } + } + if (io_cpu != WORK_CPU_UNBOUND) { + queue->io_cpu = io_cpu; + atomic_inc(&nvme_tcp_cpu_queues[io_cpu]); + set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags); + } +out: + dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n", + qid, queue->io_cpu); } static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) @@ -1722,7 +1758,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, queue->sock->sk->sk_allocation = GFP_ATOMIC; queue->sock->sk->sk_use_task_frag = false; - nvme_tcp_set_queue_io_cpu(queue); + queue->io_cpu = WORK_CPU_UNBOUND; queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; @@ -1844,6 +1880,9 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; + if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags)) + atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]); + mutex_lock(&queue->queue_lock); if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) __nvme_tcp_stop_queue(queue); @@ -1878,9 +1917,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) nvme_tcp_init_recv_ctx(queue); nvme_tcp_setup_sock_ops(queue); - if (idx) + if (idx) { + nvme_tcp_set_queue_io_cpu(queue); ret = nvmf_connect_io_queue(nctrl, idx); - else + } else ret = nvmf_connect_admin_queue(nctrl); if (!ret) { @@ -2849,6 +2889,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { static int __init nvme_tcp_init_module(void) { unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; + int cpu; BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); @@ -2866,6 +2907,9 @@ static int __init nvme_tcp_init_module(void) if (!nvme_tcp_wq) return -ENOMEM; + for_each_possible_cpu(cpu) + atomic_set(&nvme_tcp_cpu_queues[cpu], 0); + nvmf_register_transport(&nvme_tcp_transport); return 0; } From ac32057acc7f3d7a238dafaa9b2aa2bc9750080e Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Fri, 20 Dec 2024 13:00:47 +0100 Subject: [PATCH 114/150] nvme: Add error check for xa_store in nvme_get_effects_log The xa_store() may fail due to memory allocation failure because there is no guarantee that the index csi is already used. This fix adds an error check of the return value of xa_store() in nvme_get_effects_log(). 
Fixes: 1cf7a12e09aa ("nvme: use an xarray to lookup the Commands Supported and Effects log") Signed-off-by: Keisuke Nishimura Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c2250ddef5a2..4bdd5144af7c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3092,7 +3092,7 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, struct nvme_effects_log **log) { - struct nvme_effects_log *cel = xa_load(&ctrl->cels, csi); + struct nvme_effects_log *old, *cel = xa_load(&ctrl->cels, csi); int ret; if (cel) @@ -3109,7 +3109,11 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, return ret; } - xa_store(&ctrl->cels, csi, cel, GFP_KERNEL); + old = xa_store(&ctrl->cels, csi, cel, GFP_KERNEL); + if (xa_is_err(old)) { + kfree(cel); + return xa_err(old); + } out: *log = cel; return 0; From 002bb02729dc7f5a9b356e98e672262ca432b861 Mon Sep 17 00:00:00 2001 From: Yongsoo Joo Date: Mon, 23 Dec 2024 01:05:17 +0000 Subject: [PATCH 115/150] nvme: change return type of nvme_poll_cq() to bool The nvme_poll_cq() function currently returns the number of CQEs found, However, only one caller, nvme_poll(), requires a boolean value to check whether any CQE was completed. The other callers do not use the return value at all. To better reflect its usage, update the return type of nvme_poll_cq() from int to bool. Signed-off-by: Yongsoo Joo Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 709328a67f91..57e8e32c4529 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1147,13 +1147,13 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline int nvme_poll_cq(struct nvme_queue *nvmeq, - struct io_comp_batch *iob) +static inline bool nvme_poll_cq(struct nvme_queue *nvmeq, + struct io_comp_batch *iob) { - int found = 0; + bool found = false; while (nvme_cqe_pending(nvmeq)) { - found++; + found = true; /* * load-load control dependency between phase and the rest of * the cqe requires a full read memory barrier From 30e77e0fbec6940ecc5c79ffe0f076c54cf5a8d9 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:34 +0900 Subject: [PATCH 116/150] nvme: Move opcode string helper functions declarations Move the declaration of all helper functions converting NVMe command opcodes and status codes into strings from drivers/nvme/host/nvme.h into include/linux/nvme.h, together with the commands definitions. This allows NVMe target drivers to call these functions without having to include a host header file. 
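With the declarations now in include/linux/nvme.h, a target-side error path can print command names by including only that header. The snippet below is a hypothetical illustration of such a caller (the function name and message format are assumptions, not code from this series), in the same spirit as the host driver's verbose error logging:

#include <linux/nvme.h>
#include <linux/printk.h>

/* Hypothetical target-side helper: log a failed command by opcode name. */
static void example_report_failed_cmd(int qid, struct nvme_command *cmd, u16 status)
{
        pr_err("qid %d: %s failed, status %#x\n",
               qid, nvme_opcode_str(qid, cmd->common.opcode), status);
}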
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 39 --------------------------------------- include/linux/nvme.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 611b02c8a8b3..2c76afd00390 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -1182,43 +1182,4 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; } -#ifdef CONFIG_NVME_VERBOSE_ERRORS -const char *nvme_get_error_status_str(u16 status); -const char *nvme_get_opcode_str(u8 opcode); -const char *nvme_get_admin_opcode_str(u8 opcode); -const char *nvme_get_fabrics_opcode_str(u8 opcode); -#else /* CONFIG_NVME_VERBOSE_ERRORS */ -static inline const char *nvme_get_error_status_str(u16 status) -{ - return "I/O Error"; -} -static inline const char *nvme_get_opcode_str(u8 opcode) -{ - return "I/O Cmd"; -} -static inline const char *nvme_get_admin_opcode_str(u8 opcode) -{ - return "Admin Cmd"; -} - -static inline const char *nvme_get_fabrics_opcode_str(u8 opcode) -{ - return "Fabrics Cmd"; -} -#endif /* CONFIG_NVME_VERBOSE_ERRORS */ - -static inline const char *nvme_opcode_str(int qid, u8 opcode) -{ - return qid ? nvme_get_opcode_str(opcode) : - nvme_get_admin_opcode_str(opcode); -} - -static inline const char *nvme_fabrics_opcode_str( - int qid, const struct nvme_command *cmd) -{ - if (nvme_is_fabrics(cmd)) - return nvme_get_fabrics_opcode_str(cmd->fabrics.fctype); - - return nvme_opcode_str(qid, cmd->common.opcode); -} #endif /* _NVME_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 13377dde4527..a5a4ee56efcf 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1896,6 +1896,46 @@ static inline bool nvme_is_fabrics(const struct nvme_command *cmd) return cmd->common.opcode == nvme_fabrics_command; } +#ifdef CONFIG_NVME_VERBOSE_ERRORS +const char *nvme_get_error_status_str(u16 status); +const char *nvme_get_opcode_str(u8 opcode); +const char *nvme_get_admin_opcode_str(u8 opcode); +const char *nvme_get_fabrics_opcode_str(u8 opcode); +#else /* CONFIG_NVME_VERBOSE_ERRORS */ +static inline const char *nvme_get_error_status_str(u16 status) +{ + return "I/O Error"; +} +static inline const char *nvme_get_opcode_str(u8 opcode) +{ + return "I/O Cmd"; +} +static inline const char *nvme_get_admin_opcode_str(u8 opcode) +{ + return "Admin Cmd"; +} + +static inline const char *nvme_get_fabrics_opcode_str(u8 opcode) +{ + return "Fabrics Cmd"; +} +#endif /* CONFIG_NVME_VERBOSE_ERRORS */ + +static inline const char *nvme_opcode_str(int qid, u8 opcode) +{ + return qid ? nvme_get_opcode_str(opcode) : + nvme_get_admin_opcode_str(opcode); +} + +static inline const char *nvme_fabrics_opcode_str( + int qid, const struct nvme_command *cmd) +{ + if (nvme_is_fabrics(cmd)) + return nvme_get_fabrics_opcode_str(cmd->fabrics.fctype); + + return nvme_opcode_str(qid, cmd->common.opcode); +} + struct nvme_error_slot { __le64 error_count; __le16 sqid; From 5d4f4ea8fa2992eaf7040aab2f87e8dc849387fd Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:35 +0900 Subject: [PATCH 117/150] nvmet: Add vendor_id and subsys_vendor_id subsystem attributes Define the new vendor_id and subsys_vendor_id configfs attribute for target subsystems. 
These attributes are respectively reported as the vid field and as the ssvid field of the identify controller data of a target controllers using the subsystem for which these attributes are set. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 5 ++-- drivers/nvme/target/configfs.c | 45 +++++++++++++++++++++++++++++++++ drivers/nvme/target/nvmet.h | 2 ++ 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 2962794ce881..b73f5fde4d9e 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -522,9 +522,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) goto out; } - /* XXX: figure out how to assign real vendors IDs. */ - id->vid = 0; - id->ssvid = 0; + id->vid = cpu_to_le16(subsys->vendor_id); + id->ssvid = cpu_to_le16(subsys->subsys_vendor_id); memcpy(id->sn, ctrl->subsys->serial, NVMET_SN_MAX_SIZE); memcpy_and_pad(id->mn, sizeof(id->mn), subsys->model_number, diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index eeee9e9b854c..4b2b8e7d96f5 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1412,6 +1412,49 @@ out_unlock: } CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max); +static ssize_t nvmet_subsys_attr_vendor_id_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "0x%x\n", to_subsys(item)->vendor_id); +} + +static ssize_t nvmet_subsys_attr_vendor_id_store(struct config_item *item, + const char *page, size_t count) +{ + u16 vid; + + if (kstrtou16(page, 0, &vid)) + return -EINVAL; + + down_write(&nvmet_config_sem); + to_subsys(item)->vendor_id = vid; + up_write(&nvmet_config_sem); + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_vendor_id); + +static ssize_t nvmet_subsys_attr_subsys_vendor_id_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "0x%x\n", + to_subsys(item)->subsys_vendor_id); +} + +static ssize_t nvmet_subsys_attr_subsys_vendor_id_store(struct config_item *item, + const char *page, size_t count) +{ + u16 ssvid; + + if (kstrtou16(page, 0, &ssvid)) + return -EINVAL; + + down_write(&nvmet_config_sem); + to_subsys(item)->subsys_vendor_id = ssvid; + up_write(&nvmet_config_sem); + return count; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_subsys_vendor_id); + static ssize_t nvmet_subsys_attr_model_show(struct config_item *item, char *page) { @@ -1640,6 +1683,8 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_serial, &nvmet_subsys_attr_attr_cntlid_min, &nvmet_subsys_attr_attr_cntlid_max, + &nvmet_subsys_attr_attr_vendor_id, + &nvmet_subsys_attr_attr_subsys_vendor_id, &nvmet_subsys_attr_attr_model, &nvmet_subsys_attr_attr_qid_max, &nvmet_subsys_attr_attr_ieee_oui, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 58328b35dc96..e4a31a37c14b 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -324,6 +324,8 @@ struct nvmet_subsys { struct config_group namespaces_group; struct config_group allowed_hosts_group; + u16 vendor_id; + u16 subsys_vendor_id; char *model_number; u32 ieee_oui; char *firmware_rev; From 15e9d26445441a0c05ee14bef7ba752088f66a0c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:36 +0900 Subject: [PATCH 118/150] nvmet: Export nvmet_update_cc() and nvmet_cc_xxx() helpers Make the 
function nvmet_update_cc() available to target drivers by exporting it. To also facilitate the manipulation of the cc register bits, move the inline helper functions nvmet_cc_en(), nvmet_cc_css(), nvmet_cc_mps(), nvmet_cc_ams(), nvmet_cc_shn(), nvmet_cc_iosqes(), and nvmet_cc_iocqes() from core.c to nvmet.h so that these functions can be reused in target controller drivers. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 36 +----------------------------------- drivers/nvme/target/nvmet.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 1f4e9989663b..4b5594549ae6 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1166,41 +1166,6 @@ void nvmet_req_free_sgls(struct nvmet_req *req) } EXPORT_SYMBOL_GPL(nvmet_req_free_sgls); -static inline bool nvmet_cc_en(u32 cc) -{ - return (cc >> NVME_CC_EN_SHIFT) & 0x1; -} - -static inline u8 nvmet_cc_css(u32 cc) -{ - return (cc >> NVME_CC_CSS_SHIFT) & 0x7; -} - -static inline u8 nvmet_cc_mps(u32 cc) -{ - return (cc >> NVME_CC_MPS_SHIFT) & 0xf; -} - -static inline u8 nvmet_cc_ams(u32 cc) -{ - return (cc >> NVME_CC_AMS_SHIFT) & 0x7; -} - -static inline u8 nvmet_cc_shn(u32 cc) -{ - return (cc >> NVME_CC_SHN_SHIFT) & 0x3; -} - -static inline u8 nvmet_cc_iosqes(u32 cc) -{ - return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; -} - -static inline u8 nvmet_cc_iocqes(u32 cc) -{ - return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; -} - static inline bool nvmet_css_supported(u8 cc_css) { switch (cc_css << NVME_CC_CSS_SHIFT) { @@ -1277,6 +1242,7 @@ void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new) ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; mutex_unlock(&ctrl->lock); } +EXPORT_SYMBOL_GPL(nvmet_update_cc); static void nvmet_init_cap(struct nvmet_ctrl *ctrl) { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e4a31a37c14b..e68f1927339c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -732,6 +732,41 @@ void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl); u16 errno_to_nvme_status(struct nvmet_req *req, int errno); u16 nvmet_report_invalid_opcode(struct nvmet_req *req); +static inline bool nvmet_cc_en(u32 cc) +{ + return (cc >> NVME_CC_EN_SHIFT) & 0x1; +} + +static inline u8 nvmet_cc_css(u32 cc) +{ + return (cc >> NVME_CC_CSS_SHIFT) & 0x7; +} + +static inline u8 nvmet_cc_mps(u32 cc) +{ + return (cc >> NVME_CC_MPS_SHIFT) & 0xf; +} + +static inline u8 nvmet_cc_ams(u32 cc) +{ + return (cc >> NVME_CC_AMS_SHIFT) & 0x7; +} + +static inline u8 nvmet_cc_shn(u32 cc) +{ + return (cc >> NVME_CC_SHN_SHIFT) & 0x3; +} + +static inline u8 nvmet_cc_iosqes(u32 cc) +{ + return (cc >> NVME_CC_IOSQES_SHIFT) & 0xf; +} + +static inline u8 nvmet_cc_iocqes(u32 cc) +{ + return (cc >> NVME_CC_IOCQES_SHIFT) & 0xf; +} + /* Convert a 32-bit number to a 16-bit 0's based number */ static inline __le16 to0based(u32 a) { From 1ee4531054863eeba38cc0e94ac9c23560a83e96 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:37 +0900 Subject: [PATCH 119/150] nvmet: Introduce nvmet_get_cmd_effects_admin() In order to have a logically better organized implementation of the effects log page, split out reporting the supported admin commands from nvmet_get_cmd_effects_nvm() into the new function nvmet_get_cmd_effects_admin(). 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index b73f5fde4d9e..78478a4a2e4d 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -230,7 +230,7 @@ out: nvmet_req_complete(req, status); } -static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) +static void nvmet_get_cmd_effects_admin(struct nvme_effects_log *log) { log->acs[nvme_admin_get_log_page] = log->acs[nvme_admin_identify] = @@ -240,7 +240,10 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) log->acs[nvme_admin_async_event] = log->acs[nvme_admin_keep_alive] = cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); +} +static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) +{ log->iocs[nvme_cmd_read] = log->iocs[nvme_cmd_flush] = log->iocs[nvme_cmd_dsm] = @@ -276,6 +279,7 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) switch (req->cmd->get_log_page.csi) { case NVME_CSI_NVM: + nvmet_get_cmd_effects_admin(log); nvmet_get_cmd_effects_nvm(log); break; case NVME_CSI_ZNS: @@ -283,6 +287,7 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) status = NVME_SC_INVALID_IO_CMD_SET; goto free; } + nvmet_get_cmd_effects_admin(log); nvmet_get_cmd_effects_nvm(log); nvmet_get_cmd_effects_zns(log); break; From 35c593e5303c7f1f389728b4115cfd0f4d8c30ae Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:38 +0900 Subject: [PATCH 120/150] nvmet: Add drvdata field to struct nvmet_ctrl Allow a target driver to attach private data to a target controller by adding the new field drvdata to struct nvmet_ctrl. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e68f1927339c..abcc1f3828b7 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -238,6 +238,8 @@ struct nvmet_ctrl { struct nvmet_subsys *subsys; struct nvmet_sq **sqs; + void *drvdata; + bool reset_tbkas; struct mutex lock; From 200adac75888182c09027e9b7852507dabd87034 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:39 +0900 Subject: [PATCH 121/150] nvme: Add PCI transport type Define the transport type NVMF_TRTYPE_PCI for PCI endpoint targets. This transport type is defined using the value 0 which is reserved in the NVMe base specifications v2.1 (Figure 294). Given that struct nvmet_port are zeroed out on creation, to avoid having this transsport type becoming the new default, nvmet_referral_make() and nvmet_ports_make() are modified to initialize a port discovery address transport type field (disc_addr.trtype) to NVMF_TRTYPE_MAX. Any port using this transport type is also skipped and not reported in the discovery log page (nvmet_execute_disc_get_log_page()). The helper function nvmet_is_pci_ctrl() is also introduced to check if a target controller uses the PCI transport. 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 4 ++++ drivers/nvme/target/discovery.c | 3 +++ drivers/nvme/target/nvmet.h | 5 +++++ include/linux/nvme.h | 1 + 4 files changed, 13 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 4b2b8e7d96f5..20cad722c060 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -37,6 +37,7 @@ static struct nvmet_type_name_map nvmet_transport[] = { { NVMF_TRTYPE_RDMA, "rdma" }, { NVMF_TRTYPE_FC, "fc" }, { NVMF_TRTYPE_TCP, "tcp" }, + { NVMF_TRTYPE_PCI, "pci" }, { NVMF_TRTYPE_LOOP, "loop" }, }; @@ -46,6 +47,7 @@ static const struct nvmet_type_name_map nvmet_addr_family[] = { { NVMF_ADDR_FAMILY_IP6, "ipv6" }, { NVMF_ADDR_FAMILY_IB, "ib" }, { NVMF_ADDR_FAMILY_FC, "fc" }, + { NVMF_ADDR_FAMILY_PCI, "pci" }, { NVMF_ADDR_FAMILY_LOOP, "loop" }, }; @@ -1839,6 +1841,7 @@ static struct config_group *nvmet_referral_make( return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&port->entry); + port->disc_addr.trtype = NVMF_TRTYPE_MAX; config_group_init_type_name(&port->group, name, &nvmet_referral_type); return &port->group; @@ -2064,6 +2067,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, port->inline_data_size = -1; /* < 0 == let the transport choose */ port->max_queue_size = -1; /* < 0 == let the transport choose */ + port->disc_addr.trtype = NVMF_TRTYPE_MAX; port->disc_addr.portid = cpu_to_le16(portid); port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX; port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW; diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 28843df5fa7c..7a13f8e8d33d 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -224,6 +224,9 @@ static void nvmet_execute_disc_get_log_page(struct nvmet_req *req) } list_for_each_entry(r, &req->port->referrals, entry) { + if (r->disc_addr.trtype == NVMF_TRTYPE_PCI) + continue; + nvmet_format_discovery_entry(hdr, r, NVME_DISC_SUBSYS_NAME, r->disc_addr.traddr, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index abcc1f3828b7..4dad413e5fef 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -693,6 +693,11 @@ static inline bool nvmet_is_disc_subsys(struct nvmet_subsys *subsys) return subsys->type != NVME_NQN_NVME; } +static inline bool nvmet_is_pci_ctrl(struct nvmet_ctrl *ctrl) +{ + return ctrl->port->disc_addr.trtype == NVMF_TRTYPE_PCI; +} + #ifdef CONFIG_NVME_TARGET_PASSTHRU void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys); int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index a5a4ee56efcf..42fc00dc494e 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -64,6 +64,7 @@ enum { /* Transport Type codes for Discovery Log Page entry TRTYPE field */ enum { + NVMF_TRTYPE_PCI = 0, /* PCI */ NVMF_TRTYPE_RDMA = 1, /* RDMA */ NVMF_TRTYPE_FC = 2, /* Fibre Channel */ NVMF_TRTYPE_TCP = 3, /* TCP/IP */ From 6202783184bf063c57efb8d83ce0adaf8da11090 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:40 +0900 Subject: [PATCH 122/150] nvmet: Improve nvmet_alloc_ctrl() interface and implementation Introduce struct nvmet_alloc_ctrl_args to define the arguments for the function nvmet_alloc_ctrl() to avoid the need for passing a pointer to a struct nvmet_req as an argument. 
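As a sketch of the resulting calling convention, a hypothetical PCI target driver could allocate a controller as shown below (all driver-side names are illustrative assumptions; the individual argument fields are detailed in the remainder of this message):

#include <linux/uuid.h>
#include "nvmet.h"

/* Hypothetical driver-side allocation using the new argument structure. */
static struct nvmet_ctrl *example_pci_alloc_ctrl(struct nvmet_port *port,
                                                 const struct nvmet_fabrics_ops *ops,
                                                 char *subsysnqn, char *hostnqn,
                                                 uuid_t *hostid)
{
        struct nvmet_alloc_ctrl_args args = {
                .port           = port,
                .ops            = ops,
                .subsysnqn      = subsysnqn,
                .hostnqn        = hostnqn,
                .hostid         = hostid,
                .kato           = 0,    /* keep-alive disabled in this sketch */
        };
        struct nvmet_ctrl *ctrl;

        ctrl = nvmet_alloc_ctrl(&args);
        if (!ctrl)
                pr_err("controller allocation failed, status %#x\n", args.status);
        return ctrl;
}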
This new data structure aggregates together the arguments that were passed to nvmet_alloc_ctrl() (subsysnqn, hostnqn and kato), together with the struct nvmet_req fields used by nvmet_alloc_ctrl(), that is, the fields port, p2p_client, and ops as input and the result and error_loc fields as output, as well as a status field. nvmet_alloc_ctrl() is also changed to return a pointer to the allocated and initialized controller structure instead of a status code, as the status is now returned through the status field of struct nvmet_alloc_ctrl_args. The function nvmet_setup_p2p_ns_map() is changed to not take a pointer to a struct nvmet_req as argument, instead, directly specify the p2p_client device pointer needed as argument. The code in nvmet_execute_admin_connect() that initializes a new target controller after allocating it is moved into nvmet_alloc_ctrl(). The code that sets up an admin queue for the controller (and the call to nvmet_install_queue()) remains in nvmet_execute_admin_connect(). Finally, nvmet_alloc_ctrl() is also exported to allow target drivers to use this function directly to allocate and initialize a new controller structure without the need to rely on a fabrics connect command request. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 83 ++++++++++++++++++++----------- drivers/nvme/target/fabrics-cmd.c | 60 ++++++++++------------ drivers/nvme/target/nvmet.h | 18 +++++-- 3 files changed, 95 insertions(+), 66 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 4b5594549ae6..4909f3e5a552 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1350,15 +1350,15 @@ bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn) * Note: ctrl->subsys->lock should be held when calling this function */ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl, - struct nvmet_req *req) + struct device *p2p_client) { struct nvmet_ns *ns; unsigned long idx; - if (!req->p2p_client) + if (!p2p_client) return; - ctrl->p2p_client = get_device(req->p2p_client); + ctrl->p2p_client = get_device(p2p_client); xa_for_each(&ctrl->subsys->namespaces, idx, ns) nvmet_p2pmem_ns_add_p2p(ctrl, ns); @@ -1387,45 +1387,44 @@ static void nvmet_fatal_error_handler(struct work_struct *work) ctrl->ops->delete_ctrl(ctrl); } -u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, - struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp, - uuid_t *hostid) +struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) { struct nvmet_subsys *subsys; struct nvmet_ctrl *ctrl; + u32 kato = args->kato; + u8 dhchap_status; int ret; - u16 status; - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; - subsys = nvmet_find_get_subsys(req->port, subsysnqn); + args->status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; + subsys = nvmet_find_get_subsys(args->port, args->subsysnqn); if (!subsys) { pr_warn("connect request for invalid subsystem %s!\n", - subsysnqn); - req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(subsysnqn); - req->error_loc = offsetof(struct nvme_common_command, dptr); - goto out; + args->subsysnqn); + args->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + args->error_loc = offsetof(struct nvme_common_command, dptr); + return NULL; } down_read(&nvmet_config_sem); - if (!nvmet_host_allowed(subsys, hostnqn)) { + if (!nvmet_host_allowed(subsys, args->hostnqn)) { pr_info("connect by 
host %s for subsystem %s not allowed\n", - hostnqn, subsysnqn); - req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn); + args->hostnqn, args->subsysnqn); + args->result = IPO_IATTR_CONNECT_DATA(hostnqn); up_read(&nvmet_config_sem); - status = NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR; - req->error_loc = offsetof(struct nvme_common_command, dptr); + args->status = NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR; + args->error_loc = offsetof(struct nvme_common_command, dptr); goto out_put_subsystem; } up_read(&nvmet_config_sem); - status = NVME_SC_INTERNAL; + args->status = NVME_SC_INTERNAL; ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) goto out_put_subsystem; mutex_init(&ctrl->lock); - ctrl->port = req->port; - ctrl->ops = req->ops; + ctrl->port = args->port; + ctrl->ops = args->ops; #ifdef CONFIG_NVME_TARGET_PASSTHRU /* By default, set loop targets to clear IDS by default */ @@ -1439,8 +1438,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer); - memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); - memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + memcpy(ctrl->subsysnqn, args->subsysnqn, NVMF_NQN_SIZE); + memcpy(ctrl->hostnqn, args->hostnqn, NVMF_NQN_SIZE); kref_init(&ctrl->ref); ctrl->subsys = subsys; @@ -1463,12 +1462,12 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { - status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; + args->status = NVME_SC_CONNECT_CTRL_BUSY | NVME_STATUS_DNR; goto out_free_sqs; } ctrl->cntlid = ret; - uuid_copy(&ctrl->hostid, hostid); + uuid_copy(&ctrl->hostid, args->hostid); /* * Discovery controllers may use some arbitrary high value @@ -1490,12 +1489,35 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (ret) goto init_pr_fail; list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); - nvmet_setup_p2p_ns_map(ctrl, req); + nvmet_setup_p2p_ns_map(ctrl, args->p2p_client); nvmet_debugfs_ctrl_setup(ctrl); mutex_unlock(&subsys->lock); - *ctrlp = ctrl; - return 0; + if (args->hostid) + uuid_copy(&ctrl->hostid, args->hostid); + + dhchap_status = nvmet_setup_auth(ctrl); + if (dhchap_status) { + pr_err("Failed to setup authentication, dhchap status %u\n", + dhchap_status); + nvmet_ctrl_put(ctrl); + if (dhchap_status == NVME_AUTH_DHCHAP_FAILURE_FAILED) + args->status = + NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR; + else + args->status = NVME_SC_INTERNAL; + return NULL; + } + + args->status = NVME_SC_SUCCESS; + + pr_info("Created %s controller %d for subsystem %s for NQN %s%s%s.\n", + nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", + ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, + ctrl->pi_support ? " T10-PI is enabled" : "", + nvmet_has_auth(ctrl) ? 
" with DH-HMAC-CHAP" : ""); + + return ctrl; init_pr_fail: mutex_unlock(&subsys->lock); @@ -1509,9 +1531,9 @@ out_free_ctrl: kfree(ctrl); out_put_subsystem: nvmet_subsys_put(subsys); -out: - return status; + return NULL; } +EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl); static void nvmet_ctrl_free(struct kref *ref) { @@ -1547,6 +1569,7 @@ void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) { kref_put(&ctrl->ref, nvmet_ctrl_free); } +EXPORT_SYMBOL_GPL(nvmet_ctrl_put); void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) { diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index c49904ebb6c2..8dbd7df8c9a0 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -213,73 +213,67 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) struct nvmf_connect_command *c = &req->cmd->connect; struct nvmf_connect_data *d; struct nvmet_ctrl *ctrl = NULL; - u16 status; - u8 dhchap_status; + struct nvmet_alloc_ctrl_args args = { + .port = req->port, + .ops = req->ops, + .p2p_client = req->p2p_client, + .kato = le32_to_cpu(c->kato), + }; if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data))) return; d = kmalloc(sizeof(*d), GFP_KERNEL); if (!d) { - status = NVME_SC_INTERNAL; + args.status = NVME_SC_INTERNAL; goto complete; } - status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); - if (status) + args.status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d)); + if (args.status) goto out; if (c->recfmt != 0) { pr_warn("invalid connect version (%d).\n", le16_to_cpu(c->recfmt)); - req->error_loc = offsetof(struct nvmf_connect_command, recfmt); - status = NVME_SC_CONNECT_FORMAT | NVME_STATUS_DNR; + args.error_loc = offsetof(struct nvmf_connect_command, recfmt); + args.status = NVME_SC_CONNECT_FORMAT | NVME_STATUS_DNR; goto out; } if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { pr_warn("connect attempt for invalid controller ID %#x\n", d->cntlid); - status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; - req->cqe->result.u32 = IPO_IATTR_CONNECT_DATA(cntlid); + args.status = NVME_SC_CONNECT_INVALID_PARAM | NVME_STATUS_DNR; + args.result = IPO_IATTR_CONNECT_DATA(cntlid); goto out; } d->subsysnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; d->hostnqn[NVMF_NQN_FIELD_LEN - 1] = '\0'; - status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, - le32_to_cpu(c->kato), &ctrl, &d->hostid); - if (status) + + args.subsysnqn = d->subsysnqn; + args.hostnqn = d->hostnqn; + args.hostid = &d->hostid; + args.kato = c->kato; + + ctrl = nvmet_alloc_ctrl(&args); + if (!ctrl) goto out; - dhchap_status = nvmet_setup_auth(ctrl); - if (dhchap_status) { - pr_err("Failed to setup authentication, dhchap status %u\n", - dhchap_status); - nvmet_ctrl_put(ctrl); - if (dhchap_status == NVME_AUTH_DHCHAP_FAILURE_FAILED) - status = (NVME_SC_CONNECT_INVALID_HOST | NVME_STATUS_DNR); - else - status = NVME_SC_INTERNAL; - goto out; - } - - status = nvmet_install_queue(ctrl, req); - if (status) { + args.status = nvmet_install_queue(ctrl, req); + if (args.status) { nvmet_ctrl_put(ctrl); goto out; } - pr_info("creating %s controller %d for subsystem %s for NQN %s%s%s.\n", - nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", - ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, - ctrl->pi_support ? " T10-PI is enabled" : "", - nvmet_has_auth(ctrl) ? 
" with DH-HMAC-CHAP" : ""); - req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl)); + args.result = cpu_to_le32(nvmet_connect_result(ctrl)); out: kfree(d); complete: - nvmet_req_complete(req, status); + req->error_loc = args.error_loc; + req->cqe->result.u32 = args.result; + nvmet_req_complete(req, args.status); } static void nvmet_execute_io_connect(struct nvmet_req *req) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 4dad413e5fef..ed7e8cd890e4 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -549,9 +549,21 @@ int nvmet_sq_init(struct nvmet_sq *sq); void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); -u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, - struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp, - uuid_t *hostid); + +struct nvmet_alloc_ctrl_args { + struct nvmet_port *port; + char *subsysnqn; + char *hostnqn; + uuid_t *hostid; + const struct nvmet_fabrics_ops *ops; + struct device *p2p_client; + u32 kato; + u32 result; + u16 error_loc; + u16 status; +}; + +struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args); struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, struct nvmet_req *req); From 43043c9b97258a008b3402cfbbf1c5d82151c77f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:41 +0900 Subject: [PATCH 123/150] nvmet: Introduce nvmet_req_transfer_len() Add the new function nvmet_req_transfer_len() to parse a request command to extract the transfer length of the command. This function implementation relies on multiple helper functions for parsing I/O commands (nvmet_io_cmd_transfer_len()), admin commands (nvmet_admin_cmd_data_len()) and fabrics connect commands (nvmet_connect_cmd_data_len). 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 21 +++++++++++++ drivers/nvme/target/core.c | 37 ++++++++++++++++++++++ drivers/nvme/target/discovery.c | 14 +++++++++ drivers/nvme/target/fabrics-cmd-auth.c | 14 +++++++-- drivers/nvme/target/fabrics-cmd.c | 43 ++++++++++++++++++++++++++ drivers/nvme/target/nvmet.h | 8 +++++ 6 files changed, 135 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 78478a4a2e4d..6f7e5b0c91c7 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1296,6 +1296,27 @@ out: nvmet_req_complete(req, status); } +u32 nvmet_admin_cmd_data_len(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (nvme_is_fabrics(cmd)) + return nvmet_fabrics_admin_cmd_data_len(req); + if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) + return nvmet_discovery_cmd_data_len(req); + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + return nvmet_get_log_page_len(cmd); + case nvme_admin_identify: + return NVME_IDENTIFY_DATA_SIZE; + case nvme_admin_get_features: + return nvmet_feat_data_len(req, le32_to_cpu(cmd->common.cdw10)); + default: + return 0; + } +} + u16 nvmet_parse_admin_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 4909f3e5a552..9bca3e576893 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -911,6 +911,33 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req) return 0; } +static u32 nvmet_io_cmd_transfer_len(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + u32 metadata_len = 0; + + if (nvme_is_fabrics(cmd)) + return nvmet_fabrics_io_cmd_data_len(req); + + if (!req->ns) + return 0; + + switch (req->cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + case nvme_cmd_zone_append: + if (req->sq->ctrl->pi_support && nvmet_ns_has_pi(req->ns)) + metadata_len = nvmet_rw_metadata_len(req); + return nvmet_rw_data_len(req) + metadata_len; + case nvme_cmd_dsm: + return nvmet_dsm_len(req); + case nvme_cmd_zone_mgmt_recv: + return (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2; + default: + return 0; + } +} + static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -1059,6 +1086,16 @@ void nvmet_req_uninit(struct nvmet_req *req) } EXPORT_SYMBOL_GPL(nvmet_req_uninit); +size_t nvmet_req_transfer_len(struct nvmet_req *req) +{ + if (likely(req->sq->qid != 0)) + return nvmet_io_cmd_transfer_len(req); + if (unlikely(!req->sq->ctrl)) + return nvmet_connect_cmd_data_len(req); + return nvmet_admin_cmd_data_len(req); +} +EXPORT_SYMBOL_GPL(nvmet_req_transfer_len); + bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len) { if (unlikely(len != req->transfer_len)) { diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 7a13f8e8d33d..df7207640506 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -355,6 +355,20 @@ static void nvmet_execute_disc_get_features(struct nvmet_req *req) nvmet_req_complete(req, stat); } +u32 nvmet_discovery_cmd_data_len(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + return nvmet_get_log_page_len(req->cmd); + case nvme_admin_identify: + return NVME_IDENTIFY_DATA_SIZE; 
+ default: + return 0; + } +} + u16 nvmet_parse_discovery_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 3f2857c17d95..2022757f08dc 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -179,6 +179,11 @@ static u8 nvmet_auth_failure2(void *d) return data->rescode_exp; } +u32 nvmet_auth_send_data_len(struct nvmet_req *req) +{ + return le32_to_cpu(req->cmd->auth_send.tl); +} + void nvmet_execute_auth_send(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -206,7 +211,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req) offsetof(struct nvmf_auth_send_command, spsp1); goto done; } - tl = le32_to_cpu(req->cmd->auth_send.tl); + tl = nvmet_auth_send_data_len(req); if (!tl) { status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = @@ -429,6 +434,11 @@ static void nvmet_auth_failure1(struct nvmet_req *req, void *d, int al) data->rescode_exp = req->sq->dhchap_status; } +u32 nvmet_auth_receive_data_len(struct nvmet_req *req) +{ + return le32_to_cpu(req->cmd->auth_receive.al); +} + void nvmet_execute_auth_receive(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; @@ -454,7 +464,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req) offsetof(struct nvmf_auth_receive_command, spsp1); goto done; } - al = le32_to_cpu(req->cmd->auth_receive.al); + al = nvmet_auth_receive_data_len(req); if (!al) { status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; req->error_loc = diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 8dbd7df8c9a0..a7ff05b3be29 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -85,6 +85,22 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) nvmet_req_complete(req, status); } +u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->fabrics.fctype) { +#ifdef CONFIG_NVME_TARGET_AUTH + case nvme_fabrics_type_auth_send: + return nvmet_auth_send_data_len(req); + case nvme_fabrics_type_auth_receive: + return nvmet_auth_receive_data_len(req); +#endif + default: + return 0; + } +} + u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -114,6 +130,22 @@ u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req) return 0; } +u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->fabrics.fctype) { +#ifdef CONFIG_NVME_TARGET_AUTH + case nvme_fabrics_type_auth_send: + return nvmet_auth_send_data_len(req); + case nvme_fabrics_type_auth_receive: + return nvmet_auth_receive_data_len(req); +#endif + default: + return 0; + } +} + u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -337,6 +369,17 @@ out_ctrl_put: goto out; } +u32 nvmet_connect_cmd_data_len(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (!nvme_is_fabrics(cmd) || + cmd->fabrics.fctype != nvme_fabrics_type_connect) + return 0; + + return sizeof(struct nvmf_connect_data); +} + u16 nvmet_parse_connect_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index ed7e8cd890e4..96c4c2489be7 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -517,18 +517,24 @@ void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl); 
void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl); u16 nvmet_parse_connect_cmd(struct nvmet_req *req); +u32 nvmet_connect_cmd_data_len(struct nvmet_req *req); void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req); +u32 nvmet_admin_cmd_data_len(struct nvmet_req *req); u16 nvmet_parse_admin_cmd(struct nvmet_req *req); +u32 nvmet_discovery_cmd_data_len(struct nvmet_req *req); u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req); +u32 nvmet_fabrics_admin_cmd_data_len(struct nvmet_req *req); u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req); +u32 nvmet_fabrics_io_cmd_data_len(struct nvmet_req *req); bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); +size_t nvmet_req_transfer_len(struct nvmet_req *req); bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len); bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len); void nvmet_req_complete(struct nvmet_req *req, u16 status); @@ -822,7 +828,9 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio) } #ifdef CONFIG_NVME_TARGET_AUTH +u32 nvmet_auth_send_data_len(struct nvmet_req *req); void nvmet_execute_auth_send(struct nvmet_req *req); +u32 nvmet_auth_receive_data_len(struct nvmet_req *req); void nvmet_execute_auth_receive(struct nvmet_req *req); int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, bool set_ctrl); From 1eb380caf5275bba1d3d6182dde1fd740f331743 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:42 +0900 Subject: [PATCH 124/150] nvmet: Introduce nvmet_sq_create() and nvmet_cq_create() Introduce the new functions nvmet_sq_create() and nvmet_cq_create() to allow a target driver to initialize and setup admin and IO queues directly, without needing to execute connect fabrics commands. The helper functions nvmet_check_cqid() and nvmet_check_sqid() are implemented to check the correctness of SQ and CQ IDs when nvmet_sq_create() and nvmet_cq_create() are called. nvmet_sq_create() and nvmet_cq_create() are primarily intended for use with PCI target controller drivers and thus are not well integrated with the current queue creation of fabrics controllers using the connect command. These fabrices drivers are not modified to use these functions. This simple implementation of SQ and CQ management for PCI target controller drivers does not allow multiple SQs to share the same CQ, similarly to other fabrics transports. This is a specification violation. A more involved set of changes will follow to add support for this required completion queue sharing feature. 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 83 +++++++++++++++++++++++++++++++++++++ drivers/nvme/target/nvmet.h | 6 +++ 2 files changed, 89 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 9bca3e576893..3a92e3a81b46 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -818,6 +818,89 @@ static void nvmet_confirm_sq(struct percpu_ref *ref) complete(&sq->confirm_done); } +u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid) +{ + if (!ctrl->sqs) + return NVME_SC_INTERNAL | NVME_STATUS_DNR; + + if (cqid > ctrl->subsys->max_qid) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + /* + * Note: For PCI controllers, the NVMe specifications allows multiple + * SQs to share a single CQ. However, we do not support this yet, so + * check that there is no SQ defined for a CQ. If one exist, then the + * CQ ID is invalid for creation as well as when the CQ is being + * deleted (as that would mean that the SQ was not deleted before the + * CQ). + */ + if (ctrl->sqs[cqid]) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + return NVME_SC_SUCCESS; +} + +u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, + u16 qid, u16 size) +{ + u16 status; + + status = nvmet_check_cqid(ctrl, qid); + if (status != NVME_SC_SUCCESS) + return status; + + nvmet_cq_setup(ctrl, cq, qid, size); + + return NVME_SC_SUCCESS; +} +EXPORT_SYMBOL_GPL(nvmet_cq_create); + +u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, + bool create) +{ + if (!ctrl->sqs) + return NVME_SC_INTERNAL | NVME_STATUS_DNR; + + if (sqid > ctrl->subsys->max_qid) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + if ((create && ctrl->sqs[sqid]) || + (!create && !ctrl->sqs[sqid])) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + return NVME_SC_SUCCESS; +} + +u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + u16 sqid, u16 size) +{ + u16 status; + int ret; + + if (!kref_get_unless_zero(&ctrl->ref)) + return NVME_SC_INTERNAL | NVME_STATUS_DNR; + + status = nvmet_check_sqid(ctrl, sqid, true); + if (status != NVME_SC_SUCCESS) + return status; + + ret = nvmet_sq_init(sq); + if (ret) { + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + goto ctrl_put; + } + + nvmet_sq_setup(ctrl, sq, sqid, size); + sq->ctrl = ctrl; + + return NVME_SC_SUCCESS; + +ctrl_put: + nvmet_ctrl_put(ctrl); + return status; +} +EXPORT_SYMBOL_GPL(nvmet_sq_create); + void nvmet_sq_destroy(struct nvmet_sq *sq) { struct nvmet_ctrl *ctrl = sq->ctrl; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 96c4c2489be7..5c8ed8f93918 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -545,10 +545,16 @@ void nvmet_execute_set_features(struct nvmet_req *req); void nvmet_execute_get_features(struct nvmet_req *req); void nvmet_execute_keep_alive(struct nvmet_req *req); +u16 nvmet_check_cqid(struct nvmet_ctrl *ctrl, u16 cqid); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, u16 size); +u16 nvmet_cq_create(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, + u16 size); +u16 nvmet_check_sqid(struct nvmet_ctrl *ctrl, u16 sqid, bool create); void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, u16 size); +u16 nvmet_sq_create(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, + u16 size); void nvmet_sq_destroy(struct nvmet_sq *sq); int nvmet_sq_init(struct nvmet_sq *sq); 
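To illustrate how a PCI target driver is expected to pair these helpers when the host creates an I/O queue, here is a minimal sketch (the structure and function names are illustrative assumptions, and error unwinding is left to the caller):

#include "nvmet.h"

/* Hypothetical per-queue bookkeeping for a PCI target driver. */
struct example_pci_queue {
        struct nvmet_cq cq;
        struct nvmet_sq sq;
};

static u16 example_create_io_queue_pair(struct nvmet_ctrl *ctrl,
                                        struct example_pci_queue *q,
                                        u16 qid, u16 qsize)
{
        u16 status;

        /* Create the CQ first; the SQ with the same qid completes into it. */
        status = nvmet_cq_create(ctrl, &q->cq, qid, qsize);
        if (status != NVME_SC_SUCCESS)
                return status;

        return nvmet_sq_create(ctrl, &q->sq, qid, qsize);
}

The pair shares a single qid, matching the one-SQ-per-CQ restriction noted above; the queue management commands added in the next patch reject an SQ whose cqid differs from its sqid.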
From 60d3cd856114cb51f2f0ba6570d25085c55671f8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:43 +0900 Subject: [PATCH 125/150] nvmet: Add support for I/O queue management admin commands The I/O submission queue management admin commands (nvme_admin_delete_sq, nvme_admin_create_sq, nvme_admin_delete_cq, and nvme_admin_create_cq) are mandatory admin commands for I/O controllers using the PCI transport, that is, support for these commands is mandatory for a a PCI target I/O controller. Implement support for these commands by adding the functions nvmet_execute_delete_sq(), nvmet_execute_create_sq(), nvmet_execute_delete_cq() and nvmet_execute_create_cq() to set as the execute method of requests for these commands. These functions will return an invalid opcode error for any controller that is not a PCI target controller. Support for the I/O queue management commands is also reported in the command effect log of PCI target controllers (using nvmet_get_cmd_effects_admin()). Each management command is backed by a controller fabric operation that can be defined by a PCI target controller driver to setup I/O queues using nvmet_sq_create() and nvmet_cq_create() or delete I/O queues using nvmet_sq_destroy(). As noted in a comment in nvmet_execute_create_sq(), we do not yet support sharing a single CQ between multiple SQs. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 165 +++++++++++++++++++++++++++++++- drivers/nvme/target/nvmet.h | 8 ++ 2 files changed, 170 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 6f7e5b0c91c7..c91864c185fc 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -12,6 +12,142 @@ #include #include "nvmet.h" +static void nvmet_execute_delete_sq(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 sqid = le16_to_cpu(req->cmd->delete_queue.qid); + u16 status; + + if (!nvmet_is_pci_ctrl(ctrl)) { + status = nvmet_report_invalid_opcode(req); + goto complete; + } + + if (!sqid) { + status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; + goto complete; + } + + status = nvmet_check_sqid(ctrl, sqid, false); + if (status != NVME_SC_SUCCESS) + goto complete; + + status = ctrl->ops->delete_sq(ctrl, sqid); + +complete: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_create_sq(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_command *cmd = req->cmd; + u16 sqid = le16_to_cpu(cmd->create_sq.sqid); + u16 cqid = le16_to_cpu(cmd->create_sq.cqid); + u16 sq_flags = le16_to_cpu(cmd->create_sq.sq_flags); + u16 qsize = le16_to_cpu(cmd->create_sq.qsize); + u64 prp1 = le64_to_cpu(cmd->create_sq.prp1); + u16 status; + + if (!nvmet_is_pci_ctrl(ctrl)) { + status = nvmet_report_invalid_opcode(req); + goto complete; + } + + if (!sqid) { + status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; + goto complete; + } + + status = nvmet_check_sqid(ctrl, sqid, true); + if (status != NVME_SC_SUCCESS) + goto complete; + + /* + * Note: The NVMe specification allows multiple SQs to use the same CQ. + * However, the target code does not really support that. So for now, + * prevent this and fail the command if sqid and cqid are different. 
+ */ + if (!cqid || cqid != sqid) { + pr_err("SQ %u: Unsupported CQID %u\n", sqid, cqid); + status = NVME_SC_CQ_INVALID | NVME_STATUS_DNR; + goto complete; + } + + if (!qsize || qsize > NVME_CAP_MQES(ctrl->cap)) { + status = NVME_SC_QUEUE_SIZE | NVME_STATUS_DNR; + goto complete; + } + + status = ctrl->ops->create_sq(ctrl, sqid, sq_flags, qsize, prp1); + +complete: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_delete_cq(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 cqid = le16_to_cpu(req->cmd->delete_queue.qid); + u16 status; + + if (!nvmet_is_pci_ctrl(ctrl)) { + status = nvmet_report_invalid_opcode(req); + goto complete; + } + + if (!cqid) { + status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; + goto complete; + } + + status = nvmet_check_cqid(ctrl, cqid); + if (status != NVME_SC_SUCCESS) + goto complete; + + status = ctrl->ops->delete_cq(ctrl, cqid); + +complete: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_create_cq(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_command *cmd = req->cmd; + u16 cqid = le16_to_cpu(cmd->create_cq.cqid); + u16 cq_flags = le16_to_cpu(cmd->create_cq.cq_flags); + u16 qsize = le16_to_cpu(cmd->create_cq.qsize); + u16 irq_vector = le16_to_cpu(cmd->create_cq.irq_vector); + u64 prp1 = le64_to_cpu(cmd->create_cq.prp1); + u16 status; + + if (!nvmet_is_pci_ctrl(ctrl)) { + status = nvmet_report_invalid_opcode(req); + goto complete; + } + + if (!cqid) { + status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; + goto complete; + } + + status = nvmet_check_cqid(ctrl, cqid); + if (status != NVME_SC_SUCCESS) + goto complete; + + if (!qsize || qsize > NVME_CAP_MQES(ctrl->cap)) { + status = NVME_SC_QUEUE_SIZE | NVME_STATUS_DNR; + goto complete; + } + + status = ctrl->ops->create_cq(ctrl, cqid, cq_flags, qsize, + prp1, irq_vector); + +complete: + nvmet_req_complete(req, status); +} + u32 nvmet_get_log_page_len(struct nvme_command *cmd) { u32 len = le16_to_cpu(cmd->get_log_page.numdu); @@ -230,8 +366,18 @@ out: nvmet_req_complete(req, status); } -static void nvmet_get_cmd_effects_admin(struct nvme_effects_log *log) +static void nvmet_get_cmd_effects_admin(struct nvmet_ctrl *ctrl, + struct nvme_effects_log *log) { + /* For a PCI target controller, advertize support for the . 
*/ + if (nvmet_is_pci_ctrl(ctrl)) { + log->acs[nvme_admin_delete_sq] = + log->acs[nvme_admin_create_sq] = + log->acs[nvme_admin_delete_cq] = + log->acs[nvme_admin_create_cq] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); + } + log->acs[nvme_admin_get_log_page] = log->acs[nvme_admin_identify] = log->acs[nvme_admin_abort_cmd] = @@ -268,6 +414,7 @@ static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log) static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) { + struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvme_effects_log *log; u16 status = NVME_SC_SUCCESS; @@ -279,7 +426,7 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) switch (req->cmd->get_log_page.csi) { case NVME_CSI_NVM: - nvmet_get_cmd_effects_admin(log); + nvmet_get_cmd_effects_admin(ctrl, log); nvmet_get_cmd_effects_nvm(log); break; case NVME_CSI_ZNS: @@ -287,7 +434,7 @@ static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) status = NVME_SC_INVALID_IO_CMD_SET; goto free; } - nvmet_get_cmd_effects_admin(log); + nvmet_get_cmd_effects_admin(ctrl, log); nvmet_get_cmd_effects_nvm(log); nvmet_get_cmd_effects_zns(log); break; @@ -1335,9 +1482,21 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) return nvmet_parse_passthru_admin_cmd(req); switch (cmd->common.opcode) { + case nvme_admin_delete_sq: + req->execute = nvmet_execute_delete_sq; + return 0; + case nvme_admin_create_sq: + req->execute = nvmet_execute_create_sq; + return 0; case nvme_admin_get_log_page: req->execute = nvmet_execute_get_log_page; return 0; + case nvme_admin_delete_cq: + req->execute = nvmet_execute_delete_cq; + return 0; + case nvme_admin_create_cq: + req->execute = nvmet_execute_create_cq; + return 0; case nvme_admin_identify: req->execute = nvmet_execute_identify; return 0; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 5c8ed8f93918..86bb2852a63b 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -408,6 +408,14 @@ struct nvmet_fabrics_ops { void (*discovery_chg)(struct nvmet_port *port); u8 (*get_mdts)(const struct nvmet_ctrl *ctrl); u16 (*get_max_queue_size)(const struct nvmet_ctrl *ctrl); + + /* Operations mandatory for PCI target controllers */ + u16 (*create_sq)(struct nvmet_ctrl *ctrl, u16 sqid, u16 flags, + u16 qsize, u64 prp1); + u16 (*delete_sq)(struct nvmet_ctrl *ctrl, u16 sqid); + u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags, + u16 qsize, u64 prp1, u16 irq_vector); + u16 (*delete_cq)(struct nvmet_ctrl *ctrl, u16 cqid); }; #define NVMET_MAX_INLINE_BIOVEC 8 From 1ad8630ffa95ae48b2a9a079d124de452569f2f8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:44 +0900 Subject: [PATCH 126/150] nvmet: Do not require SGL for PCI target controller commands Support for SGL is optional for the PCI transport. Modify nvmet_req_init() to not require the NVME_CMD_SGL_METABUF command flag to be set if the target controller transport type is NVMF_TRTYPE_PCI. In addition to this, the NVMe base specification v2.1 mandate that all admin commands use PRP, that is, have CDW0.PSDT cleared to 0. Modify nvmet_parse_admin_cmd() to check this. Finally, modify nvmet_check_transfer_len() and nvmet_check_data_len_lte() to return the appropriate error status depending on the command using SGL or PRP. Since for fabrics nvmet_req_init() checks that a command uses SGL, always, this change affects only PCI target controllers. 
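Stated compactly, the status selection rule described above is: if the command described its data buffer with SGLs (PSDT bits of CDW0 set, i.e. NVME_CMD_SGL_ALL), report an SGL-specific error, otherwise the command used PRPs and a generic invalid-field error is more accurate. A minimal sketch of that rule as a standalone helper; the helper name is hypothetical, and the hunks below open-code this check in nvmet_check_transfer_len() and nvmet_check_data_len_lte():

/* Hypothetical helper: pick the error status for a transfer length mismatch. */
static u16 nvmet_data_len_err_status(struct nvmet_req *req)
{
	/* SGL-described transfer: report an SGL-specific error. */
	if (req->cmd->common.flags & NVME_CMD_SGL_ALL)
		return NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR;

	/* PRP-described transfer (PSDT cleared): generic invalid field error. */
	return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
}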
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 5 +++++ drivers/nvme/target/core.c | 27 +++++++++++++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c91864c185fc..0c5127a1d191 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1478,6 +1478,11 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + /* For PCI controllers, admin commands shall not use SGL. */ + if (nvmet_is_pci_ctrl(req->sq->ctrl) && !req->sq->qid && + cmd->common.flags & NVME_CMD_SGL_ALL) + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + if (nvmet_is_passthru_req(req)) return nvmet_parse_passthru_admin_cmd(req); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 3a92e3a81b46..43c9888eea90 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1122,12 +1122,15 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, /* * For fabrics, PSDT field shall describe metadata pointer (MPTR) that * contains an address of a single contiguous physical buffer that is - * byte aligned. + * byte aligned. For PCI controllers, this is optional so not enforced. */ if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) { - req->error_loc = offsetof(struct nvme_common_command, flags); - status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; - goto fail; + if (!req->sq->ctrl || !nvmet_is_pci_ctrl(req->sq->ctrl)) { + req->error_loc = + offsetof(struct nvme_common_command, flags); + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto fail; + } } if (unlikely(!req->sq->ctrl)) @@ -1182,8 +1185,14 @@ EXPORT_SYMBOL_GPL(nvmet_req_transfer_len); bool nvmet_check_transfer_len(struct nvmet_req *req, size_t len) { if (unlikely(len != req->transfer_len)) { + u16 status; + req->error_loc = offsetof(struct nvme_common_command, dptr); - nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR); + if (req->cmd->common.flags & NVME_CMD_SGL_ALL) + status = NVME_SC_SGL_INVALID_DATA; + else + status = NVME_SC_INVALID_FIELD; + nvmet_req_complete(req, status | NVME_STATUS_DNR); return false; } @@ -1194,8 +1203,14 @@ EXPORT_SYMBOL_GPL(nvmet_check_transfer_len); bool nvmet_check_data_len_lte(struct nvmet_req *req, size_t data_len) { if (unlikely(data_len > req->transfer_len)) { + u16 status; + req->error_loc = offsetof(struct nvme_common_command, dptr); - nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_STATUS_DNR); + if (req->cmd->common.flags & NVME_CMD_SGL_ALL) + status = NVME_SC_SGL_INVALID_DATA; + else + status = NVME_SC_INVALID_FIELD; + nvmet_req_complete(req, status | NVME_STATUS_DNR); return false; } From 08461535a9cd9757dadbae0ee3f3bbdd6e66ba09 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:45 +0900 Subject: [PATCH 127/150] nvmet: Introduce get/set_feature controller operations The implementation of some features cannot always be done generically by the target core code. Arbitraion and IRQ coalescing features are examples of such features: their implementation must be provided (at least partially) by the target controller driver. Introduce the set_feature() and get_feature() controller fabrics operations (in struct nvmet_fabrics_ops) to allow supporting such features. 
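As a rough illustration of how a controller driver is expected to plug into these operations, the sketch below wires hypothetical my_pci_* handlers into struct nvmet_fabrics_ops, matching the operation prototypes added in the hunk below. The feat_data pointer refers to a feature-specific structure owned by the core (such structures are introduced by later patches in this series); everything else here is invented for illustration.

static u16 my_pci_get_feature(const struct nvmet_ctrl *tctrl, u8 feat,
			      void *feat_data)
{
	switch (feat) {
	case NVME_FEAT_IRQ_COALESCE:
		/* Fill *feat_data from driver state (omitted in this sketch). */
		return NVME_SC_SUCCESS;
	default:
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}
}

static u16 my_pci_set_feature(const struct nvmet_ctrl *tctrl, u8 feat,
			      void *feat_data)
{
	switch (feat) {
	case NVME_FEAT_IRQ_COALESCE:
		/* Apply *feat_data to driver state (omitted in this sketch). */
		return NVME_SC_SUCCESS;
	default:
		return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR;
	}
}

static const struct nvmet_fabrics_ops my_pci_fabrics_ops = {
	/* ... queue_response() and the other mandatory operations ... */
	.get_feature	= my_pci_get_feature,
	.set_feature	= my_pci_set_feature,
};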
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/nvmet.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 86bb2852a63b..8325de3382ee 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -416,6 +416,10 @@ struct nvmet_fabrics_ops { u16 (*create_cq)(struct nvmet_ctrl *ctrl, u16 cqid, u16 flags, u16 qsize, u64 prp1, u16 irq_vector); u16 (*delete_cq)(struct nvmet_ctrl *ctrl, u16 cqid); + u16 (*set_feature)(const struct nvmet_ctrl *ctrl, u8 feat, + void *feat_data); + u16 (*get_feature)(const struct nvmet_ctrl *ctrl, u8 feat, + void *feat_data); }; #define NVMET_MAX_INLINE_BIOVEC 8 From 2f2b20fad973d00169d24f5338eb1bf0a42e8218 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:46 +0900 Subject: [PATCH 128/150] nvmet: Implement host identifier set feature support The NVMe specifications mandate support for the host identifier set_features for controllers that also supports reservations. Satisfy this requirement by implementing handling of the NVME_FEAT_HOST_ID feature for the nvme_set_features command. This implementation is for now effective only for PCI target controllers. For other controller types, the set features command is failed with a NVME_SC_CMD_SEQ_ERROR status as before. As noted in the code, 128 bits host identifiers are supported since the NVMe base specifications version 2.1 indicate in section 5.1.25.1.28.1 that "The controller may support a 64-bit Host Identifier...". The RHII (Reservations and Host Identifier Interaction) bit of the controller attribute (ctratt) field of the identify controller data is also set to indicate that a host ID of "0" is supported but that the host ID must be a non-zero value to use reservations. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 35 +++++++++++++++++++++++++++++---- include/linux/nvme.h | 1 + 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 0c5127a1d191..efef3acba9fb 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -659,7 +659,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmet_subsys *subsys = ctrl->subsys; struct nvme_id_ctrl *id; - u32 cmd_capsule_size; + u32 cmd_capsule_size, ctratt; u16 status = 0; if (!subsys->subsys_discovered) { @@ -707,8 +707,10 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) /* XXX: figure out what to do about RTD3R/RTD3 */ id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL); - id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT | - NVME_CTRL_ATTR_TBKAS); + ctratt = NVME_CTRL_ATTR_HID_128_BIT | NVME_CTRL_ATTR_TBKAS; + if (nvmet_is_pci_ctrl(ctrl)) + ctratt |= NVME_CTRL_ATTR_RHII; + id->ctratt = cpu_to_le32(ctratt); id->oacs = 0; @@ -1255,6 +1257,31 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) return 0; } +static u16 nvmet_set_feat_host_id(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + if (!nvmet_is_pci_ctrl(ctrl)) + return NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR; + + /* + * The NVMe base specifications v2.1 recommends supporting 128-bits host + * IDs (section 5.1.25.1.28.1). 
However, that same section also says + * that "The controller may support a 64-bit Host Identifier and/or an + * extended 128-bit Host Identifier". So simplify this support and do + * not support 64-bits host IDs to avoid needing to check that all + * controllers associated with the same subsystem all use the same host + * ID size. + */ + if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) { + req->error_loc = offsetof(struct nvme_common_command, cdw11); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + return nvmet_copy_from_sgl(req, 0, &req->sq->ctrl->hostid, + sizeof(req->sq->ctrl->hostid)); +} + void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = nvmet_req_subsys(req); @@ -1285,7 +1312,7 @@ void nvmet_execute_set_features(struct nvmet_req *req) status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL); break; case NVME_FEAT_HOST_ID: - status = NVME_SC_CMD_SEQ_ERROR | NVME_STATUS_DNR; + status = nvmet_set_feat_host_id(req); break; case NVME_FEAT_WRITE_PROTECT: status = nvmet_set_feat_write_protect(req); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 42fc00dc494e..fe3b60818fdc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -276,6 +276,7 @@ enum nvme_ctrl_attr { NVME_CTRL_ATTR_HID_128_BIT = (1 << 0), NVME_CTRL_ATTR_TBKAS = (1 << 6), NVME_CTRL_ATTR_ELBAS = (1 << 15), + NVME_CTRL_ATTR_RHII = (1 << 18), }; struct nvme_id_ctrl { From 89b94a6cbeff4f184fc1ec3b9563b371ee617511 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:47 +0900 Subject: [PATCH 129/150] nvmet: Implement interrupt coalescing feature support The NVMe base specifications v2.1 mandate Supporting the interrupt coalescing feature (NVME_FEAT_IRQ_COALESCE) for PCI controllers. Introduce the data structure struct nvmet_feat_irq_coalesce to define the time and threshold (thr) fields of this feature and implement the functions nvmet_get_feat_irq_coalesce() and nvmet_set_feat_irq_coalesce() to get and set this feature. These functions respectively use the controller get_feature() and set_feature() operations to fill and handle the fields of struct nvmet_feat_irq_coalesce. While the Linux kernel nvme driver does not use this feature and thus will not complain if it is not implemented, other major OSes fail initializing the NVMe device if this feature support is missing. Support for this feature is prohibited for fabrics controllers. If a get feature or set feature command for this feature is received for a fabrics controller, the command is failed with an invalid field error. 
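For reference, Dword 11 of this feature packs the Aggregation Time in bits 15:08 and the Aggregation Threshold in bits 07:00, which is exactly how the set handler below unpacks it into struct nvmet_feat_irq_coalesce and how the get handler re-encodes the completion result. A small round-trip sketch with hypothetical helper names:

/* cdw11 layout: Aggregation Time in [15:08], Aggregation Threshold in [07:00]. */
static inline void nvmet_irq_coalesce_from_cdw11(u32 cdw11,
				struct nvmet_feat_irq_coalesce *irqc)
{
	irqc->time = (cdw11 >> 8) & 0xff;
	irqc->thr = cdw11 & 0xff;
}

static inline u32 nvmet_irq_coalesce_to_result(const struct nvmet_feat_irq_coalesce *irqc)
{
	return ((u32)irqc->time << 8) | (u32)irqc->thr;
}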
Suggested-by: Rick Wertenbroek Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 53 +++++++++++++++++++++++++++++++-- drivers/nvme/target/nvmet.h | 10 +++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index efef3acba9fb..eff9fd2e81ed 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1282,6 +1282,27 @@ static u16 nvmet_set_feat_host_id(struct nvmet_req *req) sizeof(req->sq->ctrl->hostid)); } +static u16 nvmet_set_feat_irq_coalesce(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); + struct nvmet_feat_irq_coalesce irqc = { + .time = (cdw11 >> 8) & 0xff, + .thr = cdw11 & 0xff, + }; + + /* + * This feature is not supported for fabrics controllers and mandatory + * for PCI controllers. + */ + if (!nvmet_is_pci_ctrl(ctrl)) { + req->error_loc = offsetof(struct nvme_common_command, cdw10); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc); +} + void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = nvmet_req_subsys(req); @@ -1305,6 +1326,9 @@ void nvmet_execute_set_features(struct nvmet_req *req) nvmet_set_result(req, (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); break; + case NVME_FEAT_IRQ_COALESCE: + status = nvmet_set_feat_irq_coalesce(req); + break; case NVME_FEAT_KATO: status = nvmet_set_feat_kato(req); break; @@ -1349,6 +1373,30 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req) return 0; } +static u16 nvmet_get_feat_irq_coalesce(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_feat_irq_coalesce irqc = { }; + u16 status; + + /* + * This feature is not supported for fabrics controllers and mandatory + * for PCI controllers. + */ + if (!nvmet_is_pci_ctrl(ctrl)) { + req->error_loc = offsetof(struct nvme_common_command, cdw10); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + status = ctrl->ops->get_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc); + if (status != NVME_SC_SUCCESS) + return status; + + nvmet_set_result(req, ((u32)irqc.time << 8) | (u32)irqc.thr); + + return NVME_SC_SUCCESS; +} + void nvmet_get_feat_kato(struct nvmet_req *req) { nvmet_set_result(req, req->sq->ctrl->kato * 1000); @@ -1383,13 +1431,14 @@ void nvmet_execute_get_features(struct nvmet_req *req) break; case NVME_FEAT_ERR_RECOVERY: break; - case NVME_FEAT_IRQ_COALESCE: - break; case NVME_FEAT_IRQ_CONFIG: break; case NVME_FEAT_WRITE_ATOMIC: break; #endif + case NVME_FEAT_IRQ_COALESCE: + status = nvmet_get_feat_irq_coalesce(req); + break; case NVME_FEAT_ASYNC_EVENT: nvmet_get_feat_async_event(req); break; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 8325de3382ee..555c09b11dbe 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -906,4 +906,14 @@ static inline void nvmet_pr_put_ns_pc_ref(struct nvmet_pr_per_ctrl_ref *pc_ref) { percpu_ref_put(&pc_ref->ref); } + +/* + * Data for the get_feature() and set_feature() operations of PCI target + * controllers. 
+ */ +struct nvmet_feat_irq_coalesce { + u8 thr; + u8 time; +}; + #endif /* _NVMET_H */ From f1ecd491b6e71d598172f29d9c6c8735b81d2566 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:48 +0900 Subject: [PATCH 130/150] nvmet: Implement interrupt config feature support The NVMe base specifications v2.1 mandate supporting the interrupt config feature (NVME_FEAT_IRQ_CONFIG) for PCI controllers. Introduce the data structure struct nvmet_feat_irq_config to define the coalescing disabled (cd) and interrupt vector (iv) fields of this feature and implement the functions nvmet_get_feat_irq_config() and nvmet_set_feat_irq_config() functions to get and set these fields. These functions respectively use the controller get_feature() and set_feature() operations to fill and handle the fields of struct nvmet_feat_irq_config. Support for this feature is prohibited for fabrics controllers. If a get feature command or a set feature command for this feature is received for a fabrics controller, the command is failed with an invalid field error. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 54 +++++++++++++++++++++++++++++++-- drivers/nvme/target/nvmet.h | 5 +++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index eff9fd2e81ed..8b8ec33330b2 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1303,6 +1303,27 @@ static u16 nvmet_set_feat_irq_coalesce(struct nvmet_req *req) return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_COALESCE, &irqc); } +static u16 nvmet_set_feat_irq_config(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); + struct nvmet_feat_irq_config irqcfg = { + .iv = cdw11 & 0xffff, + .cd = (cdw11 >> 16) & 0x1, + }; + + /* + * This feature is not supported for fabrics controllers and mandatory + * for PCI controllers. + */ + if (!nvmet_is_pci_ctrl(ctrl)) { + req->error_loc = offsetof(struct nvme_common_command, cdw10); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg); +} + void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = nvmet_req_subsys(req); @@ -1329,6 +1350,9 @@ void nvmet_execute_set_features(struct nvmet_req *req) case NVME_FEAT_IRQ_COALESCE: status = nvmet_set_feat_irq_coalesce(req); break; + case NVME_FEAT_IRQ_CONFIG: + status = nvmet_set_feat_irq_config(req); + break; case NVME_FEAT_KATO: status = nvmet_set_feat_kato(req); break; @@ -1397,6 +1421,31 @@ static u16 nvmet_get_feat_irq_coalesce(struct nvmet_req *req) return NVME_SC_SUCCESS; } +static u16 nvmet_get_feat_irq_config(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u32 iv = le32_to_cpu(req->cmd->common.cdw11) & 0xffff; + struct nvmet_feat_irq_config irqcfg = { .iv = iv }; + u16 status; + + /* + * This feature is not supported for fabrics controllers and mandatory + * for PCI controllers. 
+ */ + if (!nvmet_is_pci_ctrl(ctrl)) { + req->error_loc = offsetof(struct nvme_common_command, cdw10); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + status = ctrl->ops->get_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg); + if (status != NVME_SC_SUCCESS) + return status; + + nvmet_set_result(req, ((u32)irqcfg.cd << 16) | iv); + + return NVME_SC_SUCCESS; +} + void nvmet_get_feat_kato(struct nvmet_req *req) { nvmet_set_result(req, req->sq->ctrl->kato * 1000); @@ -1431,14 +1480,15 @@ void nvmet_execute_get_features(struct nvmet_req *req) break; case NVME_FEAT_ERR_RECOVERY: break; - case NVME_FEAT_IRQ_CONFIG: - break; case NVME_FEAT_WRITE_ATOMIC: break; #endif case NVME_FEAT_IRQ_COALESCE: status = nvmet_get_feat_irq_coalesce(req); break; + case NVME_FEAT_IRQ_CONFIG: + status = nvmet_get_feat_irq_config(req); + break; case NVME_FEAT_ASYNC_EVENT: nvmet_get_feat_async_event(req); break; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 555c09b11dbe..999a4ebf597e 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -916,4 +916,9 @@ struct nvmet_feat_irq_coalesce { u8 time; }; +struct nvmet_feat_irq_config { + u16 iv; + bool cd; +}; + #endif /* _NVMET_H */ From a0ed77d4c9a7745ac5dca35d563d6096787ae942 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:49 +0900 Subject: [PATCH 131/150] nvmet: Implement arbitration feature support NVMe base specification v2.1 mandates support for the arbitration feature (NVME_FEAT_ARBITRATION). Introduce the data structure struct nvmet_feat_arbitration to define the high, medium and low priority weight fields and the arbitration burst field of this feature and implement the functions nvmet_get_feat_arbitration() and nvmet_set_feat_arbitration() functions to get and set these fields. Since there is no generic way to implement support for the arbitration feature, these functions respectively use the controller get_feature() and set_feature() operations to process the feature with the help of the controller driver. If the controller driver does not implement these operations and a get feature command or a set feature command for this feature is received, the command is failed with an invalid field error. 
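Dword 11 of the Arbitration feature packs the High, Medium and Low Priority Weights in bits 31:24, 23:16 and 15:08 and the Arbitration Burst in the low bits; the handlers below decode and re-encode it that way, masking the burst with 0x3 as in the hunks that follow. A small round-trip sketch with hypothetical helper names, using the struct nvmet_feat_arbitration introduced below:

static inline void nvmet_arbitration_from_cdw11(u32 cdw11,
				struct nvmet_feat_arbitration *arb)
{
	arb->hpw = (cdw11 >> 24) & 0xff;	/* High Priority Weight */
	arb->mpw = (cdw11 >> 16) & 0xff;	/* Medium Priority Weight */
	arb->lpw = (cdw11 >> 8) & 0xff;		/* Low Priority Weight */
	arb->ab = cdw11 & 0x3;			/* Arbitration Burst, masked as below */
}

static inline u32 nvmet_arbitration_to_result(const struct nvmet_feat_arbitration *arb)
{
	return ((u32)arb->hpw << 24) | ((u32)arb->mpw << 16) |
	       ((u32)arb->lpw << 8) | (arb->ab & 0x3);
}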
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Tested-by: Rick Wertenbroek Tested-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 51 +++++++++++++++++++++++++++++++-- drivers/nvme/target/nvmet.h | 7 +++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 8b8ec33330b2..3ddd8e44e148 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1324,6 +1324,25 @@ static u16 nvmet_set_feat_irq_config(struct nvmet_req *req) return ctrl->ops->set_feature(ctrl, NVME_FEAT_IRQ_CONFIG, &irqcfg); } +static u16 nvmet_set_feat_arbitration(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); + struct nvmet_feat_arbitration arb = { + .hpw = (cdw11 >> 24) & 0xff, + .mpw = (cdw11 >> 16) & 0xff, + .lpw = (cdw11 >> 8) & 0xff, + .ab = cdw11 & 0x3, + }; + + if (!ctrl->ops->set_feature) { + req->error_loc = offsetof(struct nvme_common_command, cdw10); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + return ctrl->ops->set_feature(ctrl, NVME_FEAT_ARBITRATION, &arb); +} + void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = nvmet_req_subsys(req); @@ -1337,6 +1356,9 @@ void nvmet_execute_set_features(struct nvmet_req *req) return; switch (cdw10 & 0xff) { + case NVME_FEAT_ARBITRATION: + status = nvmet_set_feat_arbitration(req); + break; case NVME_FEAT_NUM_QUEUES: ncqr = (cdw11 >> 16) & 0xffff; nsqr = cdw11 & 0xffff; @@ -1446,6 +1468,30 @@ static u16 nvmet_get_feat_irq_config(struct nvmet_req *req) return NVME_SC_SUCCESS; } +static u16 nvmet_get_feat_arbitration(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_feat_arbitration arb = { }; + u16 status; + + if (!ctrl->ops->get_feature) { + req->error_loc = offsetof(struct nvme_common_command, cdw10); + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + + status = ctrl->ops->get_feature(ctrl, NVME_FEAT_ARBITRATION, &arb); + if (status != NVME_SC_SUCCESS) + return status; + + nvmet_set_result(req, + ((u32)arb.hpw << 24) | + ((u32)arb.mpw << 16) | + ((u32)arb.lpw << 8) | + (arb.ab & 0x3)); + + return NVME_SC_SUCCESS; +} + void nvmet_get_feat_kato(struct nvmet_req *req) { nvmet_set_result(req, req->sq->ctrl->kato * 1000); @@ -1472,8 +1518,6 @@ void nvmet_execute_get_features(struct nvmet_req *req) * need to come up with some fake values for these. 
*/ #if 0 - case NVME_FEAT_ARBITRATION: - break; case NVME_FEAT_POWER_MGMT: break; case NVME_FEAT_TEMP_THRESH: @@ -1483,6 +1527,9 @@ void nvmet_execute_get_features(struct nvmet_req *req) case NVME_FEAT_WRITE_ATOMIC: break; #endif + case NVME_FEAT_ARBITRATION: + status = nvmet_get_feat_arbitration(req); + break; case NVME_FEAT_IRQ_COALESCE: status = nvmet_get_feat_irq_coalesce(req); break; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 999a4ebf597e..f4df458df9db 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -921,4 +921,11 @@ struct nvmet_feat_irq_config { bool cd; }; +struct nvmet_feat_arbitration { + u8 hpw; + u8 mpw; + u8 lpw; + u8 ab; +}; + #endif /* _NVMET_H */ From 0faa0fe6f90ea59b10d1b0f15ce0eb0c18eff186 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:50 +0900 Subject: [PATCH 132/150] nvmet: New NVMe PCI endpoint function target driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a PCI target driver using the PCI endpoint framework. This requires hardware with a PCI controller capable of executing in endpoint mode. The PCI endpoint framework is used to set up a PCI endpoint function and its BAR compatible with a NVMe PCI controller. The framework is also used to map local memory to the PCI address space to execute MMIO accesses for retrieving NVMe commands from submission queues and posting completion entries to completion queues. If supported, DMA is used for command retreival and command data transfers, based on the PCI address segments indicated by the command using either PRPs or SGLs. The NVMe target driver relies on the NVMe target core code to execute all commands isssued by the host. The PCI target driver is mainly responsible for the following: - Initialization and teardown of the endpoint device and its backend PCI target controller. The PCI target controller is created using a subsystem and a port defined through configfs. The port used must be initialized with the "pci" transport type. The target controller is allocated and initialized when the PCI endpoint is started by binding it to the endpoint PCI device (nvmet_pci_epf_epc_init() function). - Manage the endpoint controller state according to the PCI link state and the actions of the host (e.g. checking the CC.EN register) and propagate these actions to the PCI target controller. Polling of the controller enable/disable is done using a delayed work scheduled every 5ms (nvmet_pci_epf_poll_cc() function). This work is started whenever the PCI link comes up (nvmet_pci_epf_link_up() notifier function) and stopped when the PCI link comes down (nvmet_pci_epf_link_down() notifier function). nvmet_pci_epf_poll_cc() enables and disables the PCI controller using the functions nvmet_pci_epf_enable_ctrl() and nvmet_pci_epf_disable_ctrl(). The controller admin queue is created using nvmet_pci_epf_create_cq(), which calls nvmet_cq_create(), and nvmet_pci_epf_create_sq() which uses nvmet_sq_create(). nvmet_pci_epf_disable_ctrl() always resets the PCI controller to its initial state so that nvmet_pci_epf_enable_ctrl() can be called again. This ensures correct operation if, for instance, the host reboots causing the PCI link to be temporarily down. - Manage the controller admin and I/O submission queues using local memory. Commands are obtained from submission queues using a work item that constantly polls the doorbells of all submissions queues (nvmet_pci_epf_poll_sqs() function). 
This work is started whenever the controller is enabled (nvmet_pci_epf_enable_ctrl() function) and stopped when the controller is disabled (nvmet_pci_epf_disable_ctrl() function). When new commands are submitted by the host, DMA transfers are used to retrieve the commands. - Initiate the execution of all admin and I/O commands using the target core code, by calling a requests execute() function. All commands are individually handled using a per-command work item (nvmet_pci_epf_iod_work() function). A command overall execution includes: initializing a struct nvmet_req request for the command, using nvmet_req_transfer_len() to get a command data transfer length, parse the command PRPs or SGLs to get the PCI address segments of the command data buffer, retrieve data from the host (if the command is a write command), call req->execute() to execute the command and transfer data to the host (for read commands). - Handle the completions of commands as notified by the ->queue_response() operation of the PCI target controller (nvmet_pci_epf_queue_response() function). Completed commands are added to a list of completed command for their CQ. Each CQ list of completed command is processed using a work item (nvmet_pci_epf_cq_work() function) which posts entries for the completed commands in the CQ memory and raise an IRQ to the host to signal the completion. IRQ coalescing is supported as mandated by the NVMe base specification for PCI controllers. Of note is that completion entries are transmitted to the host using MMIO, after mapping the completion queue memory to the host PCI address space. Unlike for retrieving commands from SQs, DMA is not used as it degrades performance due to the transfer serialization needed (which delays completion entries transmission). The configuration of a NVMe PCI endpoint controller is done using configfs. First the NVMe PCI target controller configuration must be done to set up a subsystem and a port with the "pci" addr_trtype attribute. The subsystem can be setup using a file or block device backed namespace or using a passthrough NVMe device. After this, the PCI endpoint can be configured and bound to the PCI endpoint controller to start the NVMe endpoint controller. In order to not overcomplicate this initial implementation of an endpoint PCI target controller driver, protection information is not for now supported. If the PCI controller port and namespace are configured with protection information support, an error will be returned when the controller is created and initialized when the endpoint function is started. Protection information support will be added in a follow-up patch series. Using a Rock5B board (Rockchip RK3588 SoC, PCI Gen3x4 endpoint controller) with a target PCI controller setup with 4 I/O queues and a null_blk block device as a namespace, the maximum performance using fio was measured at 131 KIOPS for random 4K reads and up to 2.8 GB/S throughput. Some data points are: Rnd read, 4KB, QD=1, 1 job : IOPS=16.9k, BW=66.2MiB/s (69.4MB/s) Rnd read, 4KB, QD=32, 1 job : IOPS=78.5k, BW=307MiB/s (322MB/s) Rnd read, 4KB, QD=32, 4 jobs: IOPS=131k, BW=511MiB/s (536MB/s) Seq read, 512KB, QD=32, 1 job : IOPS=5381, BW=2691MiB/s (2821MB/s) The NVMe PCI endpoint target driver is not intended for production use. It is a tool for learning NVMe, exploring existing features and testing implementations of new NVMe features. 
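To make the CC polling flow described above more concrete, here is a heavily simplified sketch of the control loop: read the BAR copy of the CC register, compare CC.EN against the current controller state, enable or disable the controller accordingly, then re-arm the delayed work. The function carries a _sketch suffix because this only illustrates the flow; the actual nvmet_pci_epf_poll_cc() in the patch below also handles CSTS updates, shutdown processing, the controller's own workqueue and error paths that are omitted here, and the enable/disable return values are ignored in this sketch.

static void nvmet_pci_epf_poll_cc_sketch(struct work_struct *work)
{
	struct nvmet_pci_epf_ctrl *ctrl =
		container_of(to_delayed_work(work),
			     struct nvmet_pci_epf_ctrl, poll_cc);
	u32 cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC);

	if ((cc & NVME_CC_ENABLE) && !ctrl->enabled) {
		/* Host set CC.EN: set up the admin queues and start polling SQs. */
		nvmet_pci_epf_enable_ctrl(ctrl);
	} else if (!(cc & NVME_CC_ENABLE) && ctrl->enabled) {
		/* Host cleared CC.EN: reset the controller to its initial state. */
		nvmet_pci_epf_disable_ctrl(ctrl);
	}

	/* Re-arm the 5 ms poll while the PCI link is up. */
	schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL);
}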
Co-developed-by: Rick Wertenbroek Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Manivannan Sadhasivam Tested-by: Manivannan Sadhasivam Reviewed-by: Krzysztof Wilczyński Signed-off-by: Keith Busch --- drivers/nvme/target/Kconfig | 11 + drivers/nvme/target/Makefile | 2 + drivers/nvme/target/pci-epf.c | 2591 +++++++++++++++++++++++++++++++++ 3 files changed, 2604 insertions(+) create mode 100644 drivers/nvme/target/pci-epf.c diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 46be031f91b4..fb7446d6d682 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -115,3 +115,14 @@ config NVME_TARGET_AUTH target side. If unsure, say N. + +config NVME_TARGET_PCI_EPF + tristate "NVMe PCI Endpoint Function target support" + depends on NVME_TARGET && PCI_ENDPOINT + depends on NVME_CORE=y || NVME_CORE=NVME_TARGET + help + This enables the NVMe PCI Endpoint Function target driver support, + which allows creating a NVMe PCI controller using an endpoint mode + capable PCI controller. + + If unsure, say N. diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index f2b025bbe10c..ed8522911d1f 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o obj-$(CONFIG_NVME_TARGET_FC) += nvmet-fc.o obj-$(CONFIG_NVME_TARGET_FCLOOP) += nvme-fcloop.o obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o +obj-$(CONFIG_NVME_TARGET_PCI_EPF) += nvmet-pci-epf.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o pr.o @@ -20,4 +21,5 @@ nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o nvme-fcloop-y += fcloop.o nvmet-tcp-y += tcp.o +nvmet-pci-epf-y += pci-epf.o nvmet-$(CONFIG_TRACING) += trace.o diff --git a/drivers/nvme/target/pci-epf.c b/drivers/nvme/target/pci-epf.c new file mode 100644 index 000000000000..ac30b42cc622 --- /dev/null +++ b/drivers/nvme/target/pci-epf.c @@ -0,0 +1,2591 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe PCI Endpoint Function target driver. + * + * Copyright (c) 2024, Western Digital Corporation or its affiliates. + * Copyright (c) 2024, Rick Wertenbroek + * REDS Institute, HEIG-VD, HES-SO, Switzerland + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" + +static LIST_HEAD(nvmet_pci_epf_ports); +static DEFINE_MUTEX(nvmet_pci_epf_ports_mutex); + +/* + * Default and maximum allowed data transfer size. For the default, + * allow up to 128 page-sized segments. For the maximum allowed, + * use 4 times the default (which is completely arbitrary). + */ +#define NVMET_PCI_EPF_MAX_SEGS 128 +#define NVMET_PCI_EPF_MDTS_KB \ + (NVMET_PCI_EPF_MAX_SEGS << (PAGE_SHIFT - 10)) +#define NVMET_PCI_EPF_MAX_MDTS_KB (NVMET_PCI_EPF_MDTS_KB * 4) + +/* + * IRQ vector coalescing threshold: by default, post 8 CQEs before raising an + * interrupt vector to the host. This default 8 is completely arbitrary and can + * be changed by the host with a nvme_set_features command. + */ +#define NVMET_PCI_EPF_IV_THRESHOLD 8 + +/* + * BAR CC register and SQ polling intervals. + */ +#define NVMET_PCI_EPF_CC_POLL_INTERVAL msecs_to_jiffies(5) +#define NVMET_PCI_EPF_SQ_POLL_INTERVAL msecs_to_jiffies(5) +#define NVMET_PCI_EPF_SQ_POLL_IDLE msecs_to_jiffies(5000) + +/* + * SQ arbitration burst default: fetch at most 8 commands at a time from an SQ. 
+ */ +#define NVMET_PCI_EPF_SQ_AB 8 + +/* + * Handling of CQs is normally immediate, unless we fail to map a CQ or the CQ + * is full, in which case we retry the CQ processing after this interval. + */ +#define NVMET_PCI_EPF_CQ_RETRY_INTERVAL msecs_to_jiffies(1) + +enum nvmet_pci_epf_queue_flags { + NVMET_PCI_EPF_Q_IS_SQ = 0, /* The queue is a submission queue */ + NVMET_PCI_EPF_Q_LIVE, /* The queue is live */ + NVMET_PCI_EPF_Q_IRQ_ENABLED, /* IRQ is enabled for this queue */ +}; + +/* + * IRQ vector descriptor. + */ +struct nvmet_pci_epf_irq_vector { + unsigned int vector; + unsigned int ref; + bool cd; + int nr_irqs; +}; + +struct nvmet_pci_epf_queue { + union { + struct nvmet_sq nvme_sq; + struct nvmet_cq nvme_cq; + }; + struct nvmet_pci_epf_ctrl *ctrl; + unsigned long flags; + + u64 pci_addr; + size_t pci_size; + struct pci_epc_map pci_map; + + u16 qid; + u16 depth; + u16 vector; + u16 head; + u16 tail; + u16 phase; + u32 db; + + size_t qes; + + struct nvmet_pci_epf_irq_vector *iv; + struct workqueue_struct *iod_wq; + struct delayed_work work; + spinlock_t lock; + struct list_head list; +}; + +/* + * PCI Root Complex (RC) address data segment for mapping an admin or + * I/O command buffer @buf of @length bytes to the PCI address @pci_addr. + */ +struct nvmet_pci_epf_segment { + void *buf; + u64 pci_addr; + u32 length; +}; + +/* + * Command descriptors. + */ +struct nvmet_pci_epf_iod { + struct list_head link; + + struct nvmet_req req; + struct nvme_command cmd; + struct nvme_completion cqe; + unsigned int status; + + struct nvmet_pci_epf_ctrl *ctrl; + + struct nvmet_pci_epf_queue *sq; + struct nvmet_pci_epf_queue *cq; + + /* Data transfer size and direction for the command. */ + size_t data_len; + enum dma_data_direction dma_dir; + + /* + * PCI Root Complex (RC) address data segments: if nr_data_segs is 1, we + * use only @data_seg. Otherwise, the array of segments @data_segs is + * allocated to manage multiple PCI address data segments. @data_sgl and + * @data_sgt are used to setup the command request for execution by the + * target core. + */ + unsigned int nr_data_segs; + struct nvmet_pci_epf_segment data_seg; + struct nvmet_pci_epf_segment *data_segs; + struct scatterlist data_sgl; + struct sg_table data_sgt; + + struct work_struct work; + struct completion done; +}; + +/* + * PCI target controller private data. + */ +struct nvmet_pci_epf_ctrl { + struct nvmet_pci_epf *nvme_epf; + struct nvmet_port *port; + struct nvmet_ctrl *tctrl; + struct device *dev; + + unsigned int nr_queues; + struct nvmet_pci_epf_queue *sq; + struct nvmet_pci_epf_queue *cq; + unsigned int sq_ab; + + mempool_t iod_pool; + void *bar; + u64 cap; + u32 cc; + u32 csts; + + size_t io_sqes; + size_t io_cqes; + + size_t mps_shift; + size_t mps; + size_t mps_mask; + + unsigned int mdts; + + struct delayed_work poll_cc; + struct delayed_work poll_sqs; + + struct mutex irq_lock; + struct nvmet_pci_epf_irq_vector *irq_vectors; + unsigned int irq_vector_threshold; + + bool link_up; + bool enabled; +}; + +/* + * PCI EPF driver private data. + */ +struct nvmet_pci_epf { + struct pci_epf *epf; + + const struct pci_epc_features *epc_features; + + void *reg_bar; + size_t msix_table_offset; + + unsigned int irq_type; + unsigned int nr_vectors; + + struct nvmet_pci_epf_ctrl ctrl; + + bool dma_enabled; + struct dma_chan *dma_tx_chan; + struct mutex dma_tx_lock; + struct dma_chan *dma_rx_chan; + struct mutex dma_rx_lock; + + struct mutex mmio_lock; + + /* PCI endpoint function configfs attributes. 
*/ + struct config_group group; + __le16 portid; + char subsysnqn[NVMF_NQN_SIZE]; + unsigned int mdts_kb; +}; + +static inline u32 nvmet_pci_epf_bar_read32(struct nvmet_pci_epf_ctrl *ctrl, + u32 off) +{ + __le32 *bar_reg = ctrl->bar + off; + + return le32_to_cpu(READ_ONCE(*bar_reg)); +} + +static inline void nvmet_pci_epf_bar_write32(struct nvmet_pci_epf_ctrl *ctrl, + u32 off, u32 val) +{ + __le32 *bar_reg = ctrl->bar + off; + + WRITE_ONCE(*bar_reg, cpu_to_le32(val)); +} + +static inline u64 nvmet_pci_epf_bar_read64(struct nvmet_pci_epf_ctrl *ctrl, + u32 off) +{ + return (u64)nvmet_pci_epf_bar_read32(ctrl, off) | + ((u64)nvmet_pci_epf_bar_read32(ctrl, off + 4) << 32); +} + +static inline void nvmet_pci_epf_bar_write64(struct nvmet_pci_epf_ctrl *ctrl, + u32 off, u64 val) +{ + nvmet_pci_epf_bar_write32(ctrl, off, val & 0xFFFFFFFF); + nvmet_pci_epf_bar_write32(ctrl, off + 4, (val >> 32) & 0xFFFFFFFF); +} + +static inline int nvmet_pci_epf_mem_map(struct nvmet_pci_epf *nvme_epf, + u64 pci_addr, size_t size, struct pci_epc_map *map) +{ + struct pci_epf *epf = nvme_epf->epf; + + return pci_epc_mem_map(epf->epc, epf->func_no, epf->vfunc_no, + pci_addr, size, map); +} + +static inline void nvmet_pci_epf_mem_unmap(struct nvmet_pci_epf *nvme_epf, + struct pci_epc_map *map) +{ + struct pci_epf *epf = nvme_epf->epf; + + pci_epc_mem_unmap(epf->epc, epf->func_no, epf->vfunc_no, map); +} + +struct nvmet_pci_epf_dma_filter { + struct device *dev; + u32 dma_mask; +}; + +static bool nvmet_pci_epf_dma_filter(struct dma_chan *chan, void *arg) +{ + struct nvmet_pci_epf_dma_filter *filter = arg; + struct dma_slave_caps caps; + + memset(&caps, 0, sizeof(caps)); + dma_get_slave_caps(chan, &caps); + + return chan->device->dev == filter->dev && + (filter->dma_mask & caps.directions); +} + +static void nvmet_pci_epf_init_dma(struct nvmet_pci_epf *nvme_epf) +{ + struct pci_epf *epf = nvme_epf->epf; + struct device *dev = &epf->dev; + struct nvmet_pci_epf_dma_filter filter; + struct dma_chan *chan; + dma_cap_mask_t mask; + + mutex_init(&nvme_epf->dma_rx_lock); + mutex_init(&nvme_epf->dma_tx_lock); + + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + + filter.dev = epf->epc->dev.parent; + filter.dma_mask = BIT(DMA_DEV_TO_MEM); + + chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter); + if (!chan) + goto out_dma_no_rx; + + nvme_epf->dma_rx_chan = chan; + + filter.dma_mask = BIT(DMA_MEM_TO_DEV); + chan = dma_request_channel(mask, nvmet_pci_epf_dma_filter, &filter); + if (!chan) + goto out_dma_no_tx; + + nvme_epf->dma_tx_chan = chan; + + nvme_epf->dma_enabled = true; + + dev_dbg(dev, "Using DMA RX channel %s, maximum segment size %u B\n", + dma_chan_name(chan), + dma_get_max_seg_size(dmaengine_get_dma_device(chan))); + + dev_dbg(dev, "Using DMA TX channel %s, maximum segment size %u B\n", + dma_chan_name(chan), + dma_get_max_seg_size(dmaengine_get_dma_device(chan))); + + return; + +out_dma_no_tx: + dma_release_channel(nvme_epf->dma_rx_chan); + nvme_epf->dma_rx_chan = NULL; + +out_dma_no_rx: + mutex_destroy(&nvme_epf->dma_rx_lock); + mutex_destroy(&nvme_epf->dma_tx_lock); + nvme_epf->dma_enabled = false; + + dev_info(&epf->dev, "DMA not supported, falling back to MMIO\n"); +} + +static void nvmet_pci_epf_deinit_dma(struct nvmet_pci_epf *nvme_epf) +{ + if (!nvme_epf->dma_enabled) + return; + + dma_release_channel(nvme_epf->dma_tx_chan); + nvme_epf->dma_tx_chan = NULL; + dma_release_channel(nvme_epf->dma_rx_chan); + nvme_epf->dma_rx_chan = NULL; + mutex_destroy(&nvme_epf->dma_rx_lock); + 
mutex_destroy(&nvme_epf->dma_tx_lock); + nvme_epf->dma_enabled = false; +} + +static int nvmet_pci_epf_dma_transfer(struct nvmet_pci_epf *nvme_epf, + struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir) +{ + struct pci_epf *epf = nvme_epf->epf; + struct dma_async_tx_descriptor *desc; + struct dma_slave_config sconf = {}; + struct device *dev = &epf->dev; + struct device *dma_dev; + struct dma_chan *chan; + dma_cookie_t cookie; + dma_addr_t dma_addr; + struct mutex *lock; + int ret; + + switch (dir) { + case DMA_FROM_DEVICE: + lock = &nvme_epf->dma_rx_lock; + chan = nvme_epf->dma_rx_chan; + sconf.direction = DMA_DEV_TO_MEM; + sconf.src_addr = seg->pci_addr; + break; + case DMA_TO_DEVICE: + lock = &nvme_epf->dma_tx_lock; + chan = nvme_epf->dma_tx_chan; + sconf.direction = DMA_MEM_TO_DEV; + sconf.dst_addr = seg->pci_addr; + break; + default: + return -EINVAL; + } + + mutex_lock(lock); + + dma_dev = dmaengine_get_dma_device(chan); + dma_addr = dma_map_single(dma_dev, seg->buf, seg->length, dir); + ret = dma_mapping_error(dma_dev, dma_addr); + if (ret) + goto unlock; + + ret = dmaengine_slave_config(chan, &sconf); + if (ret) { + dev_err(dev, "Failed to configure DMA channel\n"); + goto unmap; + } + + desc = dmaengine_prep_slave_single(chan, dma_addr, seg->length, + sconf.direction, DMA_CTRL_ACK); + if (!desc) { + dev_err(dev, "Failed to prepare DMA\n"); + ret = -EIO; + goto unmap; + } + + cookie = dmaengine_submit(desc); + ret = dma_submit_error(cookie); + if (ret) { + dev_err(dev, "Failed to do DMA submit (err=%d)\n", ret); + goto unmap; + } + + if (dma_sync_wait(chan, cookie) != DMA_COMPLETE) { + dev_err(dev, "DMA transfer failed\n"); + ret = -EIO; + } + + dmaengine_terminate_sync(chan); + +unmap: + dma_unmap_single(dma_dev, dma_addr, seg->length, dir); + +unlock: + mutex_unlock(lock); + + return ret; +} + +static int nvmet_pci_epf_mmio_transfer(struct nvmet_pci_epf *nvme_epf, + struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir) +{ + u64 pci_addr = seg->pci_addr; + u32 length = seg->length; + void *buf = seg->buf; + struct pci_epc_map map; + int ret = -EINVAL; + + /* + * Note: MMIO transfers do not need serialization but this is a + * simple way to avoid using too many mapping windows. 
+ */ + mutex_lock(&nvme_epf->mmio_lock); + + while (length) { + ret = nvmet_pci_epf_mem_map(nvme_epf, pci_addr, length, &map); + if (ret) + break; + + switch (dir) { + case DMA_FROM_DEVICE: + memcpy_fromio(buf, map.virt_addr, map.pci_size); + break; + case DMA_TO_DEVICE: + memcpy_toio(map.virt_addr, buf, map.pci_size); + break; + default: + ret = -EINVAL; + goto unlock; + } + + pci_addr += map.pci_size; + buf += map.pci_size; + length -= map.pci_size; + + nvmet_pci_epf_mem_unmap(nvme_epf, &map); + } + +unlock: + mutex_unlock(&nvme_epf->mmio_lock); + + return ret; +} + +static inline int nvmet_pci_epf_transfer_seg(struct nvmet_pci_epf *nvme_epf, + struct nvmet_pci_epf_segment *seg, enum dma_data_direction dir) +{ + if (nvme_epf->dma_enabled) + return nvmet_pci_epf_dma_transfer(nvme_epf, seg, dir); + + return nvmet_pci_epf_mmio_transfer(nvme_epf, seg, dir); +} + +static inline int nvmet_pci_epf_transfer(struct nvmet_pci_epf_ctrl *ctrl, + void *buf, u64 pci_addr, u32 length, + enum dma_data_direction dir) +{ + struct nvmet_pci_epf_segment seg = { + .buf = buf, + .pci_addr = pci_addr, + .length = length, + }; + + return nvmet_pci_epf_transfer_seg(ctrl->nvme_epf, &seg, dir); +} + +static int nvmet_pci_epf_alloc_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl) +{ + ctrl->irq_vectors = kcalloc(ctrl->nr_queues, + sizeof(struct nvmet_pci_epf_irq_vector), + GFP_KERNEL); + if (!ctrl->irq_vectors) + return -ENOMEM; + + mutex_init(&ctrl->irq_lock); + + return 0; +} + +static void nvmet_pci_epf_free_irq_vectors(struct nvmet_pci_epf_ctrl *ctrl) +{ + if (ctrl->irq_vectors) { + mutex_destroy(&ctrl->irq_lock); + kfree(ctrl->irq_vectors); + ctrl->irq_vectors = NULL; + } +} + +static struct nvmet_pci_epf_irq_vector * +nvmet_pci_epf_find_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector) +{ + struct nvmet_pci_epf_irq_vector *iv; + int i; + + lockdep_assert_held(&ctrl->irq_lock); + + for (i = 0; i < ctrl->nr_queues; i++) { + iv = &ctrl->irq_vectors[i]; + if (iv->ref && iv->vector == vector) + return iv; + } + + return NULL; +} + +static struct nvmet_pci_epf_irq_vector * +nvmet_pci_epf_add_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, u16 vector) +{ + struct nvmet_pci_epf_irq_vector *iv; + int i; + + mutex_lock(&ctrl->irq_lock); + + iv = nvmet_pci_epf_find_irq_vector(ctrl, vector); + if (iv) { + iv->ref++; + goto unlock; + } + + for (i = 0; i < ctrl->nr_queues; i++) { + iv = &ctrl->irq_vectors[i]; + if (!iv->ref) + break; + } + + if (WARN_ON_ONCE(!iv)) + goto unlock; + + iv->ref = 1; + iv->vector = vector; + iv->nr_irqs = 0; + +unlock: + mutex_unlock(&ctrl->irq_lock); + + return iv; +} + +static void nvmet_pci_epf_remove_irq_vector(struct nvmet_pci_epf_ctrl *ctrl, + u16 vector) +{ + struct nvmet_pci_epf_irq_vector *iv; + + mutex_lock(&ctrl->irq_lock); + + iv = nvmet_pci_epf_find_irq_vector(ctrl, vector); + if (iv) { + iv->ref--; + if (!iv->ref) { + iv->vector = 0; + iv->nr_irqs = 0; + } + } + + mutex_unlock(&ctrl->irq_lock); +} + +static bool nvmet_pci_epf_should_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_queue *cq, bool force) +{ + struct nvmet_pci_epf_irq_vector *iv = cq->iv; + bool ret; + + if (!test_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags)) + return false; + + /* IRQ coalescing for the admin queue is not allowed. 
*/ + if (!cq->qid) + return true; + + if (iv->cd) + return true; + + if (force) { + ret = iv->nr_irqs > 0; + } else { + iv->nr_irqs++; + ret = iv->nr_irqs >= ctrl->irq_vector_threshold; + } + if (ret) + iv->nr_irqs = 0; + + return ret; +} + +static void nvmet_pci_epf_raise_irq(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_queue *cq, bool force) +{ + struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf; + struct pci_epf *epf = nvme_epf->epf; + int ret = 0; + + if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) + return; + + mutex_lock(&ctrl->irq_lock); + + if (!nvmet_pci_epf_should_raise_irq(ctrl, cq, force)) + goto unlock; + + switch (nvme_epf->irq_type) { + case PCI_IRQ_MSIX: + case PCI_IRQ_MSI: + ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no, + nvme_epf->irq_type, cq->vector + 1); + if (!ret) + break; + /* + * If we got an error, it is likely because the host is using + * legacy IRQs (e.g. BIOS, grub). + */ + fallthrough; + case PCI_IRQ_INTX: + ret = pci_epc_raise_irq(epf->epc, epf->func_no, epf->vfunc_no, + PCI_IRQ_INTX, 0); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + break; + } + + if (ret) + dev_err(ctrl->dev, "Failed to raise IRQ (err=%d)\n", ret); + +unlock: + mutex_unlock(&ctrl->irq_lock); +} + +static inline const char *nvmet_pci_epf_iod_name(struct nvmet_pci_epf_iod *iod) +{ + return nvme_opcode_str(iod->sq->qid, iod->cmd.common.opcode); +} + +static void nvmet_pci_epf_exec_iod_work(struct work_struct *work); + +static struct nvmet_pci_epf_iod * +nvmet_pci_epf_alloc_iod(struct nvmet_pci_epf_queue *sq) +{ + struct nvmet_pci_epf_ctrl *ctrl = sq->ctrl; + struct nvmet_pci_epf_iod *iod; + + iod = mempool_alloc(&ctrl->iod_pool, GFP_KERNEL); + if (unlikely(!iod)) + return NULL; + + memset(iod, 0, sizeof(*iod)); + iod->req.cmd = &iod->cmd; + iod->req.cqe = &iod->cqe; + iod->req.port = ctrl->port; + iod->ctrl = ctrl; + iod->sq = sq; + iod->cq = &ctrl->cq[sq->qid]; + INIT_LIST_HEAD(&iod->link); + iod->dma_dir = DMA_NONE; + INIT_WORK(&iod->work, nvmet_pci_epf_exec_iod_work); + init_completion(&iod->done); + + return iod; +} + +/* + * Allocate or grow a command table of PCI segments. + */ +static int nvmet_pci_epf_alloc_iod_data_segs(struct nvmet_pci_epf_iod *iod, + int nsegs) +{ + struct nvmet_pci_epf_segment *segs; + int nr_segs = iod->nr_data_segs + nsegs; + + segs = krealloc(iod->data_segs, + nr_segs * sizeof(struct nvmet_pci_epf_segment), + GFP_KERNEL | __GFP_ZERO); + if (!segs) + return -ENOMEM; + + iod->nr_data_segs = nr_segs; + iod->data_segs = segs; + + return 0; +} + +static void nvmet_pci_epf_free_iod(struct nvmet_pci_epf_iod *iod) +{ + int i; + + if (iod->data_segs) { + for (i = 0; i < iod->nr_data_segs; i++) + kfree(iod->data_segs[i].buf); + if (iod->data_segs != &iod->data_seg) + kfree(iod->data_segs); + } + if (iod->data_sgt.nents > 1) + sg_free_table(&iod->data_sgt); + mempool_free(iod, &iod->ctrl->iod_pool); +} + +static int nvmet_pci_epf_transfer_iod_data(struct nvmet_pci_epf_iod *iod) +{ + struct nvmet_pci_epf *nvme_epf = iod->ctrl->nvme_epf; + struct nvmet_pci_epf_segment *seg = &iod->data_segs[0]; + int i, ret; + + /* Split the data transfer according to the PCI segments. 
*/ + for (i = 0; i < iod->nr_data_segs; i++, seg++) { + ret = nvmet_pci_epf_transfer_seg(nvme_epf, seg, iod->dma_dir); + if (ret) { + iod->status = NVME_SC_DATA_XFER_ERROR | NVME_STATUS_DNR; + return ret; + } + } + + return 0; +} + +static inline u32 nvmet_pci_epf_prp_ofst(struct nvmet_pci_epf_ctrl *ctrl, + u64 prp) +{ + return prp & ctrl->mps_mask; +} + +static inline size_t nvmet_pci_epf_prp_size(struct nvmet_pci_epf_ctrl *ctrl, + u64 prp) +{ + return ctrl->mps - nvmet_pci_epf_prp_ofst(ctrl, prp); +} + +/* + * Transfer a PRP list from the host and return the number of prps. + */ +static int nvmet_pci_epf_get_prp_list(struct nvmet_pci_epf_ctrl *ctrl, u64 prp, + size_t xfer_len, __le64 *prps) +{ + size_t nr_prps = (xfer_len + ctrl->mps_mask) >> ctrl->mps_shift; + u32 length; + int ret; + + /* + * Compute the number of PRPs required for the number of bytes to + * transfer (xfer_len). If this number overflows the memory page size + * with the PRP list pointer specified, only return the space available + * in the memory page, the last PRP in there will be a PRP list pointer + * to the remaining PRPs. + */ + length = min(nvmet_pci_epf_prp_size(ctrl, prp), nr_prps << 3); + ret = nvmet_pci_epf_transfer(ctrl, prps, prp, length, DMA_FROM_DEVICE); + if (ret) + return ret; + + return length >> 3; +} + +static int nvmet_pci_epf_iod_parse_prp_list(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_iod *iod) +{ + struct nvme_command *cmd = &iod->cmd; + struct nvmet_pci_epf_segment *seg; + size_t size = 0, ofst, prp_size, xfer_len; + size_t transfer_len = iod->data_len; + int nr_segs, nr_prps = 0; + u64 pci_addr, prp; + int i = 0, ret; + __le64 *prps; + + prps = kzalloc(ctrl->mps, GFP_KERNEL); + if (!prps) + goto err_internal; + + /* + * Allocate PCI segments for the command: this considers the worst case + * scenario where all prps are discontiguous, so get as many segments + * as we can have prps. In practice, most of the time, we will have + * far less PCI segments than prps. + */ + prp = le64_to_cpu(cmd->common.dptr.prp1); + if (!prp) + goto err_invalid_field; + + ofst = nvmet_pci_epf_prp_ofst(ctrl, prp); + nr_segs = (transfer_len + ofst + ctrl->mps - 1) >> ctrl->mps_shift; + + ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_segs); + if (ret) + goto err_internal; + + /* Set the first segment using prp1. */ + seg = &iod->data_segs[0]; + seg->pci_addr = prp; + seg->length = nvmet_pci_epf_prp_size(ctrl, prp); + + size = seg->length; + pci_addr = prp + size; + nr_segs = 1; + + /* + * Now build the PCI address segments using the PRP lists, starting + * from prp2. + */ + prp = le64_to_cpu(cmd->common.dptr.prp2); + if (!prp) + goto err_invalid_field; + + while (size < transfer_len) { + xfer_len = transfer_len - size; + + if (!nr_prps) { + nr_prps = nvmet_pci_epf_get_prp_list(ctrl, prp, + xfer_len, prps); + if (nr_prps < 0) + goto err_internal; + + i = 0; + ofst = 0; + } + + /* Current entry */ + prp = le64_to_cpu(prps[i]); + if (!prp) + goto err_invalid_field; + + /* Did we reach the last PRP entry of the list? */ + if (xfer_len > ctrl->mps && i == nr_prps - 1) { + /* We need more PRPs: PRP is a list pointer. */ + nr_prps = 0; + continue; + } + + /* Only the first PRP is allowed to have an offset. */ + if (nvmet_pci_epf_prp_ofst(ctrl, prp)) + goto err_invalid_offset; + + if (prp != pci_addr) { + /* Discontiguous prp: new segment. 
*/ + nr_segs++; + if (WARN_ON_ONCE(nr_segs > iod->nr_data_segs)) + goto err_internal; + + seg++; + seg->pci_addr = prp; + seg->length = 0; + pci_addr = prp; + } + + prp_size = min_t(size_t, ctrl->mps, xfer_len); + seg->length += prp_size; + pci_addr += prp_size; + size += prp_size; + + i++; + } + + iod->nr_data_segs = nr_segs; + ret = 0; + + if (size != transfer_len) { + dev_err(ctrl->dev, + "PRPs transfer length mismatch: got %zu B, need %zu B\n", + size, transfer_len); + goto err_internal; + } + + kfree(prps); + + return 0; + +err_invalid_offset: + dev_err(ctrl->dev, "PRPs list invalid offset\n"); + iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR; + goto err; + +err_invalid_field: + dev_err(ctrl->dev, "PRPs list invalid field\n"); + iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + goto err; + +err_internal: + dev_err(ctrl->dev, "PRPs list internal error\n"); + iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + +err: + kfree(prps); + return -EINVAL; +} + +static int nvmet_pci_epf_iod_parse_prp_simple(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_iod *iod) +{ + struct nvme_command *cmd = &iod->cmd; + size_t transfer_len = iod->data_len; + int ret, nr_segs = 1; + u64 prp1, prp2 = 0; + size_t prp1_size; + + prp1 = le64_to_cpu(cmd->common.dptr.prp1); + prp1_size = nvmet_pci_epf_prp_size(ctrl, prp1); + + /* For commands crossing a page boundary, we should have prp2. */ + if (transfer_len > prp1_size) { + prp2 = le64_to_cpu(cmd->common.dptr.prp2); + if (!prp2) { + iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + return -EINVAL; + } + if (nvmet_pci_epf_prp_ofst(ctrl, prp2)) { + iod->status = + NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR; + return -EINVAL; + } + if (prp2 != prp1 + prp1_size) + nr_segs = 2; + } + + if (nr_segs == 1) { + iod->nr_data_segs = 1; + iod->data_segs = &iod->data_seg; + iod->data_segs[0].pci_addr = prp1; + iod->data_segs[0].length = transfer_len; + return 0; + } + + ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_segs); + if (ret) { + iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + return ret; + } + + iod->data_segs[0].pci_addr = prp1; + iod->data_segs[0].length = prp1_size; + iod->data_segs[1].pci_addr = prp2; + iod->data_segs[1].length = transfer_len - prp1_size; + + return 0; +} + +static int nvmet_pci_epf_iod_parse_prps(struct nvmet_pci_epf_iod *iod) +{ + struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl; + u64 prp1 = le64_to_cpu(iod->cmd.common.dptr.prp1); + size_t ofst; + + /* Get the PCI address segments for the command using its PRPs. */ + ofst = nvmet_pci_epf_prp_ofst(ctrl, prp1); + if (ofst & 0x3) { + iod->status = NVME_SC_PRP_INVALID_OFFSET | NVME_STATUS_DNR; + return -EINVAL; + } + + if (iod->data_len + ofst <= ctrl->mps * 2) + return nvmet_pci_epf_iod_parse_prp_simple(ctrl, iod); + + return nvmet_pci_epf_iod_parse_prp_list(ctrl, iod); +} + +/* + * Transfer an SGL segment from the host and return the number of data + * descriptors and the next segment descriptor, if any. 
+ */ +static struct nvme_sgl_desc * +nvmet_pci_epf_get_sgl_segment(struct nvmet_pci_epf_ctrl *ctrl, + struct nvme_sgl_desc *desc, unsigned int *nr_sgls) +{ + struct nvme_sgl_desc *sgls; + u32 length = le32_to_cpu(desc->length); + int nr_descs, ret; + void *buf; + + buf = kmalloc(length, GFP_KERNEL); + if (!buf) + return NULL; + + ret = nvmet_pci_epf_transfer(ctrl, buf, le64_to_cpu(desc->addr), length, + DMA_FROM_DEVICE); + if (ret) { + kfree(buf); + return NULL; + } + + sgls = buf; + nr_descs = length / sizeof(struct nvme_sgl_desc); + if (sgls[nr_descs - 1].type == (NVME_SGL_FMT_SEG_DESC << 4) || + sgls[nr_descs - 1].type == (NVME_SGL_FMT_LAST_SEG_DESC << 4)) { + /* + * We have another SGL segment following this one: do not count + * it as a regular data SGL descriptor and return it to the + * caller. + */ + *desc = sgls[nr_descs - 1]; + nr_descs--; + } else { + /* We do not have another SGL segment after this one. */ + desc->length = 0; + } + + *nr_sgls = nr_descs; + + return sgls; +} + +static int nvmet_pci_epf_iod_parse_sgl_segments(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_iod *iod) +{ + struct nvme_command *cmd = &iod->cmd; + struct nvme_sgl_desc seg = cmd->common.dptr.sgl; + struct nvme_sgl_desc *sgls = NULL; + int n = 0, i, nr_sgls; + int ret; + + /* + * We do not support inline data nor keyed SGLs, so we should be seeing + * only segment descriptors. + */ + if (seg.type != (NVME_SGL_FMT_SEG_DESC << 4) && + seg.type != (NVME_SGL_FMT_LAST_SEG_DESC << 4)) { + iod->status = NVME_SC_SGL_INVALID_TYPE | NVME_STATUS_DNR; + return -EIO; + } + + while (seg.length) { + sgls = nvmet_pci_epf_get_sgl_segment(ctrl, &seg, &nr_sgls); + if (!sgls) { + iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + return -EIO; + } + + /* Grow the PCI segment table as needed. */ + ret = nvmet_pci_epf_alloc_iod_data_segs(iod, nr_sgls); + if (ret) { + iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + goto out; + } + + /* + * Parse the SGL descriptors to build the PCI segment table, + * checking the descriptor type as we go. + */ + for (i = 0; i < nr_sgls; i++) { + if (sgls[i].type != (NVME_SGL_FMT_DATA_DESC << 4)) { + iod->status = NVME_SC_SGL_INVALID_TYPE | + NVME_STATUS_DNR; + goto out; + } + iod->data_segs[n].pci_addr = le64_to_cpu(sgls[i].addr); + iod->data_segs[n].length = le32_to_cpu(sgls[i].length); + n++; + } + + kfree(sgls); + } + + out: + if (iod->status != NVME_SC_SUCCESS) { + kfree(sgls); + return -EIO; + } + + return 0; +} + +static int nvmet_pci_epf_iod_parse_sgls(struct nvmet_pci_epf_iod *iod) +{ + struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl; + struct nvme_sgl_desc *sgl = &iod->cmd.common.dptr.sgl; + + if (sgl->type == (NVME_SGL_FMT_DATA_DESC << 4)) { + /* Single data descriptor case. */ + iod->nr_data_segs = 1; + iod->data_segs = &iod->data_seg; + iod->data_seg.pci_addr = le64_to_cpu(sgl->addr); + iod->data_seg.length = le32_to_cpu(sgl->length); + return 0; + } + + return nvmet_pci_epf_iod_parse_sgl_segments(ctrl, iod); +} + +static int nvmet_pci_epf_alloc_iod_data_buf(struct nvmet_pci_epf_iod *iod) +{ + struct nvmet_pci_epf_ctrl *ctrl = iod->ctrl; + struct nvmet_req *req = &iod->req; + struct nvmet_pci_epf_segment *seg; + struct scatterlist *sg; + int ret, i; + + if (iod->data_len > ctrl->mdts) { + iod->status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + return -EINVAL; + } + + /* + * Get the PCI address segments for the command data buffer using either + * its SGLs or PRPs. 
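+	 * A local data buffer is then allocated for each PCI segment and the
+	 * buffers are passed to the target core through a scatterlist.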
+ */ + if (iod->cmd.common.flags & NVME_CMD_SGL_ALL) + ret = nvmet_pci_epf_iod_parse_sgls(iod); + else + ret = nvmet_pci_epf_iod_parse_prps(iod); + if (ret) + return ret; + + /* Get a command buffer using SGLs matching the PCI segments. */ + if (iod->nr_data_segs == 1) { + sg_init_table(&iod->data_sgl, 1); + iod->data_sgt.sgl = &iod->data_sgl; + iod->data_sgt.nents = 1; + iod->data_sgt.orig_nents = 1; + } else { + ret = sg_alloc_table(&iod->data_sgt, iod->nr_data_segs, + GFP_KERNEL); + if (ret) + goto err_nomem; + } + + for_each_sgtable_sg(&iod->data_sgt, sg, i) { + seg = &iod->data_segs[i]; + seg->buf = kmalloc(seg->length, GFP_KERNEL); + if (!seg->buf) + goto err_nomem; + sg_set_buf(sg, seg->buf, seg->length); + } + + req->transfer_len = iod->data_len; + req->sg = iod->data_sgt.sgl; + req->sg_cnt = iod->data_sgt.nents; + + return 0; + +err_nomem: + iod->status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + return -ENOMEM; +} + +static void nvmet_pci_epf_complete_iod(struct nvmet_pci_epf_iod *iod) +{ + struct nvmet_pci_epf_queue *cq = iod->cq; + unsigned long flags; + + /* Print an error message for failed commands, except AENs. */ + iod->status = le16_to_cpu(iod->cqe.status) >> 1; + if (iod->status && iod->cmd.common.opcode != nvme_admin_async_event) + dev_err(iod->ctrl->dev, + "CQ[%d]: Command %s (0x%x) status 0x%0x\n", + iod->sq->qid, nvmet_pci_epf_iod_name(iod), + iod->cmd.common.opcode, iod->status); + + /* + * Add the command to the list of completed commands and schedule the + * CQ work. + */ + spin_lock_irqsave(&cq->lock, flags); + list_add_tail(&iod->link, &cq->list); + queue_delayed_work(system_highpri_wq, &cq->work, 0); + spin_unlock_irqrestore(&cq->lock, flags); +} + +static void nvmet_pci_epf_drain_queue(struct nvmet_pci_epf_queue *queue) +{ + struct nvmet_pci_epf_iod *iod; + unsigned long flags; + + spin_lock_irqsave(&queue->lock, flags); + while (!list_empty(&queue->list)) { + iod = list_first_entry(&queue->list, struct nvmet_pci_epf_iod, + link); + list_del_init(&iod->link); + nvmet_pci_epf_free_iod(iod); + } + spin_unlock_irqrestore(&queue->lock, flags); +} + +static int nvmet_pci_epf_add_port(struct nvmet_port *port) +{ + mutex_lock(&nvmet_pci_epf_ports_mutex); + list_add_tail(&port->entry, &nvmet_pci_epf_ports); + mutex_unlock(&nvmet_pci_epf_ports_mutex); + return 0; +} + +static void nvmet_pci_epf_remove_port(struct nvmet_port *port) +{ + mutex_lock(&nvmet_pci_epf_ports_mutex); + list_del_init(&port->entry); + mutex_unlock(&nvmet_pci_epf_ports_mutex); +} + +static struct nvmet_port * +nvmet_pci_epf_find_port(struct nvmet_pci_epf_ctrl *ctrl, __le16 portid) +{ + struct nvmet_port *p, *port = NULL; + + mutex_lock(&nvmet_pci_epf_ports_mutex); + list_for_each_entry(p, &nvmet_pci_epf_ports, entry) { + if (p->disc_addr.portid == portid) { + port = p; + break; + } + } + mutex_unlock(&nvmet_pci_epf_ports_mutex); + + return port; +} + +static void nvmet_pci_epf_queue_response(struct nvmet_req *req) +{ + struct nvmet_pci_epf_iod *iod = + container_of(req, struct nvmet_pci_epf_iod, req); + + iod->status = le16_to_cpu(req->cqe->status) >> 1; + + /* If we have no data to transfer, directly complete the command. 
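+	 * Otherwise, complete iod->done so that the command work item can
+	 * transfer the data to the host before completing the command.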
*/ + if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) { + nvmet_pci_epf_complete_iod(iod); + return; + } + + complete(&iod->done); +} + +static u8 nvmet_pci_epf_get_mdts(const struct nvmet_ctrl *tctrl) +{ + struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; + int page_shift = NVME_CAP_MPSMIN(tctrl->cap) + 12; + + return ilog2(ctrl->mdts) - page_shift; +} + +static u16 nvmet_pci_epf_create_cq(struct nvmet_ctrl *tctrl, + u16 cqid, u16 flags, u16 qsize, u64 pci_addr, u16 vector) +{ + struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; + struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid]; + u16 status; + + if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + if (!(flags & NVME_QUEUE_PHYS_CONTIG)) + return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR; + + if (flags & NVME_CQ_IRQ_ENABLED) + set_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags); + + cq->pci_addr = pci_addr; + cq->qid = cqid; + cq->depth = qsize + 1; + cq->vector = vector; + cq->head = 0; + cq->tail = 0; + cq->phase = 1; + cq->db = NVME_REG_DBS + (((cqid * 2) + 1) * sizeof(u32)); + nvmet_pci_epf_bar_write32(ctrl, cq->db, 0); + + if (!cqid) + cq->qes = sizeof(struct nvme_completion); + else + cq->qes = ctrl->io_cqes; + cq->pci_size = cq->qes * cq->depth; + + cq->iv = nvmet_pci_epf_add_irq_vector(ctrl, vector); + if (!cq->iv) { + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + goto err; + } + + status = nvmet_cq_create(tctrl, &cq->nvme_cq, cqid, cq->depth); + if (status != NVME_SC_SUCCESS) + goto err; + + dev_dbg(ctrl->dev, "CQ[%u]: %u entries of %zu B, IRQ vector %u\n", + cqid, qsize, cq->qes, cq->vector); + + return NVME_SC_SUCCESS; + +err: + clear_bit(NVMET_PCI_EPF_Q_IRQ_ENABLED, &cq->flags); + clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags); + return status; +} + +static u16 nvmet_pci_epf_delete_cq(struct nvmet_ctrl *tctrl, u16 cqid) +{ + struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; + struct nvmet_pci_epf_queue *cq = &ctrl->cq[cqid]; + + if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags)) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + cancel_delayed_work_sync(&cq->work); + nvmet_pci_epf_drain_queue(cq); + nvmet_pci_epf_remove_irq_vector(ctrl, cq->vector); + + return NVME_SC_SUCCESS; +} + +static u16 nvmet_pci_epf_create_sq(struct nvmet_ctrl *tctrl, + u16 sqid, u16 flags, u16 qsize, u64 pci_addr) +{ + struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; + struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid]; + u16 status; + + if (test_and_set_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + if (!(flags & NVME_QUEUE_PHYS_CONTIG)) + return NVME_SC_INVALID_QUEUE | NVME_STATUS_DNR; + + sq->pci_addr = pci_addr; + sq->qid = sqid; + sq->depth = qsize + 1; + sq->head = 0; + sq->tail = 0; + sq->phase = 0; + sq->db = NVME_REG_DBS + (sqid * 2 * sizeof(u32)); + nvmet_pci_epf_bar_write32(ctrl, sq->db, 0); + if (!sqid) + sq->qes = 1UL << NVME_ADM_SQES; + else + sq->qes = ctrl->io_sqes; + sq->pci_size = sq->qes * sq->depth; + + status = nvmet_sq_create(tctrl, &sq->nvme_sq, sqid, sq->depth); + if (status != NVME_SC_SUCCESS) + goto out_clear_bit; + + sq->iod_wq = alloc_workqueue("sq%d_wq", WQ_UNBOUND, + min_t(int, sq->depth, WQ_MAX_ACTIVE), sqid); + if (!sq->iod_wq) { + dev_err(ctrl->dev, "Failed to create SQ %d work queue\n", sqid); + status = NVME_SC_INTERNAL | NVME_STATUS_DNR; + goto out_destroy_sq; + } + + dev_dbg(ctrl->dev, "SQ[%u]: %u entries of %zu B\n", + sqid, qsize, sq->qes); + + return NVME_SC_SUCCESS; + +out_destroy_sq: + 
nvmet_sq_destroy(&sq->nvme_sq); +out_clear_bit: + clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags); + return status; +} + +static u16 nvmet_pci_epf_delete_sq(struct nvmet_ctrl *tctrl, u16 sqid) +{ + struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; + struct nvmet_pci_epf_queue *sq = &ctrl->sq[sqid]; + + if (!test_and_clear_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) + return NVME_SC_QID_INVALID | NVME_STATUS_DNR; + + flush_workqueue(sq->iod_wq); + destroy_workqueue(sq->iod_wq); + sq->iod_wq = NULL; + + nvmet_pci_epf_drain_queue(sq); + + if (sq->nvme_sq.ctrl) + nvmet_sq_destroy(&sq->nvme_sq); + + return NVME_SC_SUCCESS; +} + +static u16 nvmet_pci_epf_get_feat(const struct nvmet_ctrl *tctrl, + u8 feat, void *data) +{ + struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; + struct nvmet_feat_arbitration *arb; + struct nvmet_feat_irq_coalesce *irqc; + struct nvmet_feat_irq_config *irqcfg; + struct nvmet_pci_epf_irq_vector *iv; + u16 status; + + switch (feat) { + case NVME_FEAT_ARBITRATION: + arb = data; + if (!ctrl->sq_ab) + arb->ab = 0x7; + else + arb->ab = ilog2(ctrl->sq_ab); + return NVME_SC_SUCCESS; + + case NVME_FEAT_IRQ_COALESCE: + irqc = data; + irqc->thr = ctrl->irq_vector_threshold; + irqc->time = 0; + return NVME_SC_SUCCESS; + + case NVME_FEAT_IRQ_CONFIG: + irqcfg = data; + mutex_lock(&ctrl->irq_lock); + iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv); + if (iv) { + irqcfg->cd = iv->cd; + status = NVME_SC_SUCCESS; + } else { + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + mutex_unlock(&ctrl->irq_lock); + return status; + + default: + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } +} + +static u16 nvmet_pci_epf_set_feat(const struct nvmet_ctrl *tctrl, + u8 feat, void *data) +{ + struct nvmet_pci_epf_ctrl *ctrl = tctrl->drvdata; + struct nvmet_feat_arbitration *arb; + struct nvmet_feat_irq_coalesce *irqc; + struct nvmet_feat_irq_config *irqcfg; + struct nvmet_pci_epf_irq_vector *iv; + u16 status; + + switch (feat) { + case NVME_FEAT_ARBITRATION: + arb = data; + if (arb->ab == 0x7) + ctrl->sq_ab = 0; + else + ctrl->sq_ab = 1 << arb->ab; + return NVME_SC_SUCCESS; + + case NVME_FEAT_IRQ_COALESCE: + /* + * Since we do not implement precise IRQ coalescing timing, + * ignore the time field. 
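+		 * Only the aggregation threshold (thr) is used when deciding
+		 * whether to coalesce completion queue interrupts.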
+ */ + irqc = data; + ctrl->irq_vector_threshold = irqc->thr + 1; + return NVME_SC_SUCCESS; + + case NVME_FEAT_IRQ_CONFIG: + irqcfg = data; + mutex_lock(&ctrl->irq_lock); + iv = nvmet_pci_epf_find_irq_vector(ctrl, irqcfg->iv); + if (iv) { + iv->cd = irqcfg->cd; + status = NVME_SC_SUCCESS; + } else { + status = NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } + mutex_unlock(&ctrl->irq_lock); + return status; + + default: + return NVME_SC_INVALID_FIELD | NVME_STATUS_DNR; + } +} + +static const struct nvmet_fabrics_ops nvmet_pci_epf_fabrics_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_PCI, + .add_port = nvmet_pci_epf_add_port, + .remove_port = nvmet_pci_epf_remove_port, + .queue_response = nvmet_pci_epf_queue_response, + .get_mdts = nvmet_pci_epf_get_mdts, + .create_cq = nvmet_pci_epf_create_cq, + .delete_cq = nvmet_pci_epf_delete_cq, + .create_sq = nvmet_pci_epf_create_sq, + .delete_sq = nvmet_pci_epf_delete_sq, + .get_feature = nvmet_pci_epf_get_feat, + .set_feature = nvmet_pci_epf_set_feat, +}; + +static void nvmet_pci_epf_cq_work(struct work_struct *work); + +static void nvmet_pci_epf_init_queue(struct nvmet_pci_epf_ctrl *ctrl, + unsigned int qid, bool sq) +{ + struct nvmet_pci_epf_queue *queue; + + if (sq) { + queue = &ctrl->sq[qid]; + set_bit(NVMET_PCI_EPF_Q_IS_SQ, &queue->flags); + } else { + queue = &ctrl->cq[qid]; + INIT_DELAYED_WORK(&queue->work, nvmet_pci_epf_cq_work); + } + queue->ctrl = ctrl; + queue->qid = qid; + spin_lock_init(&queue->lock); + INIT_LIST_HEAD(&queue->list); +} + +static int nvmet_pci_epf_alloc_queues(struct nvmet_pci_epf_ctrl *ctrl) +{ + unsigned int qid; + + ctrl->sq = kcalloc(ctrl->nr_queues, + sizeof(struct nvmet_pci_epf_queue), GFP_KERNEL); + if (!ctrl->sq) + return -ENOMEM; + + ctrl->cq = kcalloc(ctrl->nr_queues, + sizeof(struct nvmet_pci_epf_queue), GFP_KERNEL); + if (!ctrl->cq) { + kfree(ctrl->sq); + ctrl->sq = NULL; + return -ENOMEM; + } + + for (qid = 0; qid < ctrl->nr_queues; qid++) { + nvmet_pci_epf_init_queue(ctrl, qid, true); + nvmet_pci_epf_init_queue(ctrl, qid, false); + } + + return 0; +} + +static void nvmet_pci_epf_free_queues(struct nvmet_pci_epf_ctrl *ctrl) +{ + kfree(ctrl->sq); + ctrl->sq = NULL; + kfree(ctrl->cq); + ctrl->cq = NULL; +} + +static int nvmet_pci_epf_map_queue(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_queue *queue) +{ + struct nvmet_pci_epf *nvme_epf = ctrl->nvme_epf; + int ret; + + ret = nvmet_pci_epf_mem_map(nvme_epf, queue->pci_addr, + queue->pci_size, &queue->pci_map); + if (ret) { + dev_err(ctrl->dev, "Failed to map queue %u (err=%d)\n", + queue->qid, ret); + return ret; + } + + if (queue->pci_map.pci_size < queue->pci_size) { + dev_err(ctrl->dev, "Invalid partial mapping of queue %u\n", + queue->qid); + nvmet_pci_epf_mem_unmap(nvme_epf, &queue->pci_map); + return -ENOMEM; + } + + return 0; +} + +static inline void nvmet_pci_epf_unmap_queue(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_queue *queue) +{ + nvmet_pci_epf_mem_unmap(ctrl->nvme_epf, &queue->pci_map); +} + +static void nvmet_pci_epf_exec_iod_work(struct work_struct *work) +{ + struct nvmet_pci_epf_iod *iod = + container_of(work, struct nvmet_pci_epf_iod, work); + struct nvmet_req *req = &iod->req; + int ret; + + if (!iod->ctrl->link_up) { + nvmet_pci_epf_free_iod(iod); + return; + } + + if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &iod->sq->flags)) { + iod->status = NVME_SC_QID_INVALID | NVME_STATUS_DNR; + goto complete; + } + + if (!nvmet_req_init(req, &iod->cq->nvme_cq, &iod->sq->nvme_sq, + &nvmet_pci_epf_fabrics_ops)) + goto complete; + + 
iod->data_len = nvmet_req_transfer_len(req); + if (iod->data_len) { + /* + * Get the data DMA transfer direction. Here "device" means the + * PCI root-complex host. + */ + if (nvme_is_write(&iod->cmd)) + iod->dma_dir = DMA_FROM_DEVICE; + else + iod->dma_dir = DMA_TO_DEVICE; + + /* + * Setup the command data buffer and get the command data from + * the host if needed. + */ + ret = nvmet_pci_epf_alloc_iod_data_buf(iod); + if (!ret && iod->dma_dir == DMA_FROM_DEVICE) + ret = nvmet_pci_epf_transfer_iod_data(iod); + if (ret) { + nvmet_req_uninit(req); + goto complete; + } + } + + req->execute(req); + + /* + * If we do not have data to transfer after the command execution + * finishes, nvmet_pci_epf_queue_response() will complete the command + * directly. No need to wait for the completion in this case. + */ + if (!iod->data_len || iod->dma_dir != DMA_TO_DEVICE) + return; + + wait_for_completion(&iod->done); + + if (iod->status == NVME_SC_SUCCESS) { + WARN_ON_ONCE(!iod->data_len || iod->dma_dir != DMA_TO_DEVICE); + nvmet_pci_epf_transfer_iod_data(iod); + } + +complete: + nvmet_pci_epf_complete_iod(iod); +} + +static int nvmet_pci_epf_process_sq(struct nvmet_pci_epf_ctrl *ctrl, + struct nvmet_pci_epf_queue *sq) +{ + struct nvmet_pci_epf_iod *iod; + int ret, n = 0; + + sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db); + while (sq->head != sq->tail && (!ctrl->sq_ab || n < ctrl->sq_ab)) { + iod = nvmet_pci_epf_alloc_iod(sq); + if (!iod) + break; + + /* Get the NVMe command submitted by the host. */ + ret = nvmet_pci_epf_transfer(ctrl, &iod->cmd, + sq->pci_addr + sq->head * sq->qes, + sq->qes, DMA_FROM_DEVICE); + if (ret) { + /* Not much we can do... */ + nvmet_pci_epf_free_iod(iod); + break; + } + + dev_dbg(ctrl->dev, "SQ[%u]: head %u, tail %u, command %s\n", + sq->qid, sq->head, sq->tail, + nvmet_pci_epf_iod_name(iod)); + + sq->head++; + if (sq->head == sq->depth) + sq->head = 0; + n++; + + queue_work_on(WORK_CPU_UNBOUND, sq->iod_wq, &iod->work); + + sq->tail = nvmet_pci_epf_bar_read32(ctrl, sq->db); + } + + return n; +} + +static void nvmet_pci_epf_poll_sqs_work(struct work_struct *work) +{ + struct nvmet_pci_epf_ctrl *ctrl = + container_of(work, struct nvmet_pci_epf_ctrl, poll_sqs.work); + struct nvmet_pci_epf_queue *sq; + unsigned long last = 0; + int i, nr_sqs; + + while (ctrl->link_up && ctrl->enabled) { + nr_sqs = 0; + /* Do round-robin arbitration. */ + for (i = 0; i < ctrl->nr_queues; i++) { + sq = &ctrl->sq[i]; + if (!test_bit(NVMET_PCI_EPF_Q_LIVE, &sq->flags)) + continue; + if (nvmet_pci_epf_process_sq(ctrl, sq)) + nr_sqs++; + } + + if (nr_sqs) { + last = jiffies; + continue; + } + + /* + * If we have not received any command on any queue for more + * than NVMET_PCI_EPF_SQ_POLL_IDLE, assume we are idle and + * reschedule. This avoids "burning" a CPU when the controller + * is idle for a long time. + */ + if (time_is_before_jiffies(last + NVMET_PCI_EPF_SQ_POLL_IDLE)) + break; + + cpu_relax(); + } + + schedule_delayed_work(&ctrl->poll_sqs, NVMET_PCI_EPF_SQ_POLL_INTERVAL); +} + +static void nvmet_pci_epf_cq_work(struct work_struct *work) +{ + struct nvmet_pci_epf_queue *cq = + container_of(work, struct nvmet_pci_epf_queue, work.work); + struct nvmet_pci_epf_ctrl *ctrl = cq->ctrl; + struct nvme_completion *cqe; + struct nvmet_pci_epf_iod *iod; + unsigned long flags; + int ret, n = 0; + + ret = nvmet_pci_epf_map_queue(ctrl, cq); + if (ret) + goto again; + + while (test_bit(NVMET_PCI_EPF_Q_LIVE, &cq->flags) && ctrl->link_up) { + + /* Check that the CQ is not full. 
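+		 * That is, stop if writing one more entry would make the tail
+		 * catch up with the head, and let the work retry later.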
*/ + cq->head = nvmet_pci_epf_bar_read32(ctrl, cq->db); + if (cq->head == cq->tail + 1) { + ret = -EAGAIN; + break; + } + + spin_lock_irqsave(&cq->lock, flags); + iod = list_first_entry_or_null(&cq->list, + struct nvmet_pci_epf_iod, link); + if (iod) + list_del_init(&iod->link); + spin_unlock_irqrestore(&cq->lock, flags); + + if (!iod) + break; + + /* Post the IOD completion entry. */ + cqe = &iod->cqe; + cqe->status = cpu_to_le16((iod->status << 1) | cq->phase); + + dev_dbg(ctrl->dev, + "CQ[%u]: %s status 0x%x, result 0x%llx, head %u, tail %u, phase %u\n", + cq->qid, nvmet_pci_epf_iod_name(iod), iod->status, + le64_to_cpu(cqe->result.u64), cq->head, cq->tail, + cq->phase); + + memcpy_toio(cq->pci_map.virt_addr + cq->tail * cq->qes, + cqe, cq->qes); + + cq->tail++; + if (cq->tail >= cq->depth) { + cq->tail = 0; + cq->phase ^= 1; + } + + nvmet_pci_epf_free_iod(iod); + + /* Signal the host. */ + nvmet_pci_epf_raise_irq(ctrl, cq, false); + n++; + } + + nvmet_pci_epf_unmap_queue(ctrl, cq); + + /* + * We do not support precise IRQ coalescing time (100ns units as per + * NVMe specifications). So if we have posted completion entries without + * reaching the interrupt coalescing threshold, raise an interrupt. + */ + if (n) + nvmet_pci_epf_raise_irq(ctrl, cq, true); + +again: + if (ret < 0) + queue_delayed_work(system_highpri_wq, &cq->work, + NVMET_PCI_EPF_CQ_RETRY_INTERVAL); +} + +static int nvmet_pci_epf_enable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) +{ + u64 pci_addr, asq, acq; + u32 aqa; + u16 status, qsize; + + if (ctrl->enabled) + return 0; + + dev_info(ctrl->dev, "Enabling controller\n"); + + ctrl->mps_shift = nvmet_cc_mps(ctrl->cc) + 12; + ctrl->mps = 1UL << ctrl->mps_shift; + ctrl->mps_mask = ctrl->mps - 1; + + ctrl->io_sqes = 1UL << nvmet_cc_iosqes(ctrl->cc); + if (ctrl->io_sqes < sizeof(struct nvme_command)) { + dev_err(ctrl->dev, "Unsupported I/O SQES %zu (need %zu)\n", + ctrl->io_sqes, sizeof(struct nvme_command)); + return -EINVAL; + } + + ctrl->io_cqes = 1UL << nvmet_cc_iocqes(ctrl->cc); + if (ctrl->io_cqes < sizeof(struct nvme_completion)) { + dev_err(ctrl->dev, "Unsupported I/O CQES %zu (need %zu)\n", + ctrl->io_sqes, sizeof(struct nvme_completion)); + return -EINVAL; + } + + /* Create the admin queue. */ + aqa = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_AQA); + asq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ASQ); + acq = nvmet_pci_epf_bar_read64(ctrl, NVME_REG_ACQ); + + qsize = (aqa & 0x0fff0000) >> 16; + pci_addr = acq & GENMASK_ULL(63, 12); + status = nvmet_pci_epf_create_cq(ctrl->tctrl, 0, + NVME_CQ_IRQ_ENABLED | NVME_QUEUE_PHYS_CONTIG, + qsize, pci_addr, 0); + if (status != NVME_SC_SUCCESS) { + dev_err(ctrl->dev, "Failed to create admin completion queue\n"); + return -EINVAL; + } + + qsize = aqa & 0x00000fff; + pci_addr = asq & GENMASK_ULL(63, 12); + status = nvmet_pci_epf_create_sq(ctrl->tctrl, 0, NVME_QUEUE_PHYS_CONTIG, + qsize, pci_addr); + if (status != NVME_SC_SUCCESS) { + dev_err(ctrl->dev, "Failed to create admin submission queue\n"); + nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); + return -EINVAL; + } + + ctrl->sq_ab = NVMET_PCI_EPF_SQ_AB; + ctrl->irq_vector_threshold = NVMET_PCI_EPF_IV_THRESHOLD; + ctrl->enabled = true; + + /* Start polling the controller SQs. 
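+	 * The polling work processes all live SQs and re-arms itself when it
+	 * returns.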
*/ + schedule_delayed_work(&ctrl->poll_sqs, 0); + + return 0; +} + +static void nvmet_pci_epf_disable_ctrl(struct nvmet_pci_epf_ctrl *ctrl) +{ + int qid; + + if (!ctrl->enabled) + return; + + dev_info(ctrl->dev, "Disabling controller\n"); + + ctrl->enabled = false; + cancel_delayed_work_sync(&ctrl->poll_sqs); + + /* Delete all I/O queues first. */ + for (qid = 1; qid < ctrl->nr_queues; qid++) + nvmet_pci_epf_delete_sq(ctrl->tctrl, qid); + + for (qid = 1; qid < ctrl->nr_queues; qid++) + nvmet_pci_epf_delete_cq(ctrl->tctrl, qid); + + /* Delete the admin queue last. */ + nvmet_pci_epf_delete_sq(ctrl->tctrl, 0); + nvmet_pci_epf_delete_cq(ctrl->tctrl, 0); +} + +static void nvmet_pci_epf_poll_cc_work(struct work_struct *work) +{ + struct nvmet_pci_epf_ctrl *ctrl = + container_of(work, struct nvmet_pci_epf_ctrl, poll_cc.work); + u32 old_cc, new_cc; + int ret; + + if (!ctrl->tctrl) + return; + + old_cc = ctrl->cc; + new_cc = nvmet_pci_epf_bar_read32(ctrl, NVME_REG_CC); + ctrl->cc = new_cc; + + if (nvmet_cc_en(new_cc) && !nvmet_cc_en(old_cc)) { + ret = nvmet_pci_epf_enable_ctrl(ctrl); + if (ret) + return; + ctrl->csts |= NVME_CSTS_RDY; + } + + if (!nvmet_cc_en(new_cc) && nvmet_cc_en(old_cc)) { + nvmet_pci_epf_disable_ctrl(ctrl); + ctrl->csts &= ~NVME_CSTS_RDY; + } + + if (nvmet_cc_shn(new_cc) && !nvmet_cc_shn(old_cc)) { + nvmet_pci_epf_disable_ctrl(ctrl); + ctrl->csts |= NVME_CSTS_SHST_CMPLT; + } + + if (!nvmet_cc_shn(new_cc) && nvmet_cc_shn(old_cc)) + ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; + + nvmet_update_cc(ctrl->tctrl, ctrl->cc); + nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts); + + schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL); +} + +static void nvmet_pci_epf_init_bar(struct nvmet_pci_epf_ctrl *ctrl) +{ + struct nvmet_ctrl *tctrl = ctrl->tctrl; + + ctrl->bar = ctrl->nvme_epf->reg_bar; + + /* Copy the target controller capabilities as a base. */ + ctrl->cap = tctrl->cap; + + /* Contiguous Queues Required (CQR). */ + ctrl->cap |= 0x1ULL << 16; + + /* Set Doorbell stride to 4B (DSTRB). */ + ctrl->cap &= ~GENMASK_ULL(35, 32); + + /* Clear NVM Subsystem Reset Supported (NSSRS). */ + ctrl->cap &= ~(0x1ULL << 36); + + /* Clear Boot Partition Support (BPS). */ + ctrl->cap &= ~(0x1ULL << 45); + + /* Clear Persistent Memory Region Supported (PMRS). */ + ctrl->cap &= ~(0x1ULL << 56); + + /* Clear Controller Memory Buffer Supported (CMBS). */ + ctrl->cap &= ~(0x1ULL << 57); + + /* Controller configuration. */ + ctrl->cc = tctrl->cc & (~NVME_CC_ENABLE); + + /* Controller status. 
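+	 * Start from the target controller CSTS value; it is written to BAR 0
+	 * below and updated later by the CC polling work.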
*/ + ctrl->csts = ctrl->tctrl->csts; + + nvmet_pci_epf_bar_write64(ctrl, NVME_REG_CAP, ctrl->cap); + nvmet_pci_epf_bar_write32(ctrl, NVME_REG_VS, tctrl->subsys->ver); + nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CSTS, ctrl->csts); + nvmet_pci_epf_bar_write32(ctrl, NVME_REG_CC, ctrl->cc); +} + +static int nvmet_pci_epf_create_ctrl(struct nvmet_pci_epf *nvme_epf, + unsigned int max_nr_queues) +{ + struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; + struct nvmet_alloc_ctrl_args args = {}; + char hostnqn[NVMF_NQN_SIZE]; + uuid_t id; + int ret; + + memset(ctrl, 0, sizeof(*ctrl)); + ctrl->dev = &nvme_epf->epf->dev; + mutex_init(&ctrl->irq_lock); + ctrl->nvme_epf = nvme_epf; + ctrl->mdts = nvme_epf->mdts_kb * SZ_1K; + INIT_DELAYED_WORK(&ctrl->poll_cc, nvmet_pci_epf_poll_cc_work); + INIT_DELAYED_WORK(&ctrl->poll_sqs, nvmet_pci_epf_poll_sqs_work); + + ret = mempool_init_kmalloc_pool(&ctrl->iod_pool, + max_nr_queues * NVMET_MAX_QUEUE_SIZE, + sizeof(struct nvmet_pci_epf_iod)); + if (ret) { + dev_err(ctrl->dev, "Failed to initialize IOD mempool\n"); + return ret; + } + + ctrl->port = nvmet_pci_epf_find_port(ctrl, nvme_epf->portid); + if (!ctrl->port) { + dev_err(ctrl->dev, "Port not found\n"); + ret = -EINVAL; + goto out_mempool_exit; + } + + /* Create the target controller. */ + uuid_gen(&id); + snprintf(hostnqn, NVMF_NQN_SIZE, + "nqn.2014-08.org.nvmexpress:uuid:%pUb", &id); + args.port = ctrl->port; + args.subsysnqn = nvme_epf->subsysnqn; + memset(&id, 0, sizeof(uuid_t)); + args.hostid = &id; + args.hostnqn = hostnqn; + args.ops = &nvmet_pci_epf_fabrics_ops; + + ctrl->tctrl = nvmet_alloc_ctrl(&args); + if (!ctrl->tctrl) { + dev_err(ctrl->dev, "Failed to create target controller\n"); + ret = -ENOMEM; + goto out_mempool_exit; + } + ctrl->tctrl->drvdata = ctrl; + + /* We do not support protection information for now. */ + if (ctrl->tctrl->pi_support) { + dev_err(ctrl->dev, + "Protection information (PI) is not supported\n"); + ret = -ENOTSUPP; + goto out_put_ctrl; + } + + /* Allocate our queues, up to the maximum number. */ + ctrl->nr_queues = min(ctrl->tctrl->subsys->max_qid + 1, max_nr_queues); + ret = nvmet_pci_epf_alloc_queues(ctrl); + if (ret) + goto out_put_ctrl; + + /* + * Allocate the IRQ vectors descriptors. We cannot have more than the + * maximum number of queues. + */ + ret = nvmet_pci_epf_alloc_irq_vectors(ctrl); + if (ret) + goto out_free_queues; + + dev_info(ctrl->dev, + "New PCI ctrl \"%s\", %u I/O queues, mdts %u B\n", + ctrl->tctrl->subsys->subsysnqn, ctrl->nr_queues - 1, + ctrl->mdts); + + /* Initialize BAR 0 using the target controller CAP. 
*/ + nvmet_pci_epf_init_bar(ctrl); + + return 0; + +out_free_queues: + nvmet_pci_epf_free_queues(ctrl); +out_put_ctrl: + nvmet_ctrl_put(ctrl->tctrl); + ctrl->tctrl = NULL; +out_mempool_exit: + mempool_exit(&ctrl->iod_pool); + return ret; +} + +static void nvmet_pci_epf_start_ctrl(struct nvmet_pci_epf_ctrl *ctrl) +{ + schedule_delayed_work(&ctrl->poll_cc, NVMET_PCI_EPF_CC_POLL_INTERVAL); +} + +static void nvmet_pci_epf_stop_ctrl(struct nvmet_pci_epf_ctrl *ctrl) +{ + cancel_delayed_work_sync(&ctrl->poll_cc); + + nvmet_pci_epf_disable_ctrl(ctrl); +} + +static void nvmet_pci_epf_destroy_ctrl(struct nvmet_pci_epf_ctrl *ctrl) +{ + if (!ctrl->tctrl) + return; + + dev_info(ctrl->dev, "Destroying PCI ctrl \"%s\"\n", + ctrl->tctrl->subsys->subsysnqn); + + nvmet_pci_epf_stop_ctrl(ctrl); + + nvmet_pci_epf_free_queues(ctrl); + nvmet_pci_epf_free_irq_vectors(ctrl); + + nvmet_ctrl_put(ctrl->tctrl); + ctrl->tctrl = NULL; + + mempool_exit(&ctrl->iod_pool); +} + +static int nvmet_pci_epf_configure_bar(struct nvmet_pci_epf *nvme_epf) +{ + struct pci_epf *epf = nvme_epf->epf; + const struct pci_epc_features *epc_features = nvme_epf->epc_features; + size_t reg_size, reg_bar_size; + size_t msix_table_size = 0; + + /* + * The first free BAR will be our register BAR and per NVMe + * specifications, it must be BAR 0. + */ + if (pci_epc_get_first_free_bar(epc_features) != BAR_0) { + dev_err(&epf->dev, "BAR 0 is not free\n"); + return -ENODEV; + } + + if (epc_features->bar[BAR_0].only_64bit) + epf->bar[BAR_0].flags |= PCI_BASE_ADDRESS_MEM_TYPE_64; + + /* + * Calculate the size of the register bar: NVMe registers first with + * enough space for the doorbells, followed by the MSI-X table + * if supported. + */ + reg_size = NVME_REG_DBS + (NVMET_NR_QUEUES * 2 * sizeof(u32)); + reg_size = ALIGN(reg_size, 8); + + if (epc_features->msix_capable) { + size_t pba_size; + + msix_table_size = PCI_MSIX_ENTRY_SIZE * epf->msix_interrupts; + nvme_epf->msix_table_offset = reg_size; + pba_size = ALIGN(DIV_ROUND_UP(epf->msix_interrupts, 8), 8); + + reg_size += msix_table_size + pba_size; + } + + if (epc_features->bar[BAR_0].type == BAR_FIXED) { + if (reg_size > epc_features->bar[BAR_0].fixed_size) { + dev_err(&epf->dev, + "BAR 0 size %llu B too small, need %zu B\n", + epc_features->bar[BAR_0].fixed_size, + reg_size); + return -ENOMEM; + } + reg_bar_size = epc_features->bar[BAR_0].fixed_size; + } else { + reg_bar_size = ALIGN(reg_size, max(epc_features->align, 4096)); + } + + nvme_epf->reg_bar = pci_epf_alloc_space(epf, reg_bar_size, BAR_0, + epc_features, PRIMARY_INTERFACE); + if (!nvme_epf->reg_bar) { + dev_err(&epf->dev, "Failed to allocate BAR 0\n"); + return -ENOMEM; + } + memset(nvme_epf->reg_bar, 0, reg_bar_size); + + return 0; +} + +static void nvmet_pci_epf_free_bar(struct nvmet_pci_epf *nvme_epf) +{ + struct pci_epf *epf = nvme_epf->epf; + + if (!nvme_epf->reg_bar) + return; + + pci_epf_free_space(epf, nvme_epf->reg_bar, BAR_0, PRIMARY_INTERFACE); + nvme_epf->reg_bar = NULL; +} + +static void nvmet_pci_epf_clear_bar(struct nvmet_pci_epf *nvme_epf) +{ + struct pci_epf *epf = nvme_epf->epf; + + pci_epc_clear_bar(epf->epc, epf->func_no, epf->vfunc_no, + &epf->bar[BAR_0]); +} + +static int nvmet_pci_epf_init_irq(struct nvmet_pci_epf *nvme_epf) +{ + const struct pci_epc_features *epc_features = nvme_epf->epc_features; + struct pci_epf *epf = nvme_epf->epf; + int ret; + + /* Enable MSI-X if supported, otherwise, use MSI. 
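+	 * If neither MSI-X nor MSI can be used, fall back to INTx.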
*/ + if (epc_features->msix_capable && epf->msix_interrupts) { + ret = pci_epc_set_msix(epf->epc, epf->func_no, epf->vfunc_no, + epf->msix_interrupts, BAR_0, + nvme_epf->msix_table_offset); + if (ret) { + dev_err(&epf->dev, "Failed to configure MSI-X\n"); + return ret; + } + + nvme_epf->nr_vectors = epf->msix_interrupts; + nvme_epf->irq_type = PCI_IRQ_MSIX; + + return 0; + } + + if (epc_features->msi_capable && epf->msi_interrupts) { + ret = pci_epc_set_msi(epf->epc, epf->func_no, epf->vfunc_no, + epf->msi_interrupts); + if (ret) { + dev_err(&epf->dev, "Failed to configure MSI\n"); + return ret; + } + + nvme_epf->nr_vectors = epf->msi_interrupts; + nvme_epf->irq_type = PCI_IRQ_MSI; + + return 0; + } + + /* MSI and MSI-X are not supported: fall back to INTx. */ + nvme_epf->nr_vectors = 1; + nvme_epf->irq_type = PCI_IRQ_INTX; + + return 0; +} + +static int nvmet_pci_epf_epc_init(struct pci_epf *epf) +{ + struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); + const struct pci_epc_features *epc_features = nvme_epf->epc_features; + struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; + unsigned int max_nr_queues = NVMET_NR_QUEUES; + int ret; + + /* For now, do not support virtual functions. */ + if (epf->vfunc_no > 0) { + dev_err(&epf->dev, "Virtual functions are not supported\n"); + return -EINVAL; + } + + /* + * Cap the maximum number of queues we can support on the controller + * with the number of IRQs we can use. + */ + if (epc_features->msix_capable && epf->msix_interrupts) { + dev_info(&epf->dev, + "PCI endpoint controller supports MSI-X, %u vectors\n", + epf->msix_interrupts); + max_nr_queues = min(max_nr_queues, epf->msix_interrupts); + } else if (epc_features->msi_capable && epf->msi_interrupts) { + dev_info(&epf->dev, + "PCI endpoint controller supports MSI, %u vectors\n", + epf->msi_interrupts); + max_nr_queues = min(max_nr_queues, epf->msi_interrupts); + } + + if (max_nr_queues < 2) { + dev_err(&epf->dev, "Invalid maximum number of queues %u\n", + max_nr_queues); + return -EINVAL; + } + + /* Create the target controller. */ + ret = nvmet_pci_epf_create_ctrl(nvme_epf, max_nr_queues); + if (ret) { + dev_err(&epf->dev, + "Failed to create NVMe PCI target controller (err=%d)\n", + ret); + return ret; + } + + /* Set device ID, class, etc. */ + epf->header->vendorid = ctrl->tctrl->subsys->vendor_id; + epf->header->subsys_vendor_id = ctrl->tctrl->subsys->subsys_vendor_id; + ret = pci_epc_write_header(epf->epc, epf->func_no, epf->vfunc_no, + epf->header); + if (ret) { + dev_err(&epf->dev, + "Failed to write configuration header (err=%d)\n", ret); + goto out_destroy_ctrl; + } + + ret = pci_epc_set_bar(epf->epc, epf->func_no, epf->vfunc_no, + &epf->bar[BAR_0]); + if (ret) { + dev_err(&epf->dev, "Failed to set BAR 0 (err=%d)\n", ret); + goto out_destroy_ctrl; + } + + /* + * Enable interrupts and start polling the controller BAR if we do not + * have a link up notifier. 
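+	 * If a link up notifier is available, polling is started from the
+	 * link_up() callback instead.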
+ */ + ret = nvmet_pci_epf_init_irq(nvme_epf); + if (ret) + goto out_clear_bar; + + if (!epc_features->linkup_notifier) { + ctrl->link_up = true; + nvmet_pci_epf_start_ctrl(&nvme_epf->ctrl); + } + + return 0; + +out_clear_bar: + nvmet_pci_epf_clear_bar(nvme_epf); +out_destroy_ctrl: + nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl); + return ret; +} + +static void nvmet_pci_epf_epc_deinit(struct pci_epf *epf) +{ + struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); + struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; + + ctrl->link_up = false; + nvmet_pci_epf_destroy_ctrl(ctrl); + + nvmet_pci_epf_deinit_dma(nvme_epf); + nvmet_pci_epf_clear_bar(nvme_epf); +} + +static int nvmet_pci_epf_link_up(struct pci_epf *epf) +{ + struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); + struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; + + ctrl->link_up = true; + nvmet_pci_epf_start_ctrl(ctrl); + + return 0; +} + +static int nvmet_pci_epf_link_down(struct pci_epf *epf) +{ + struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); + struct nvmet_pci_epf_ctrl *ctrl = &nvme_epf->ctrl; + + ctrl->link_up = false; + nvmet_pci_epf_stop_ctrl(ctrl); + + return 0; +} + +static const struct pci_epc_event_ops nvmet_pci_epf_event_ops = { + .epc_init = nvmet_pci_epf_epc_init, + .epc_deinit = nvmet_pci_epf_epc_deinit, + .link_up = nvmet_pci_epf_link_up, + .link_down = nvmet_pci_epf_link_down, +}; + +static int nvmet_pci_epf_bind(struct pci_epf *epf) +{ + struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); + const struct pci_epc_features *epc_features; + struct pci_epc *epc = epf->epc; + int ret; + + if (WARN_ON_ONCE(!epc)) + return -EINVAL; + + epc_features = pci_epc_get_features(epc, epf->func_no, epf->vfunc_no); + if (!epc_features) { + dev_err(&epf->dev, "epc_features not implemented\n"); + return -EOPNOTSUPP; + } + nvme_epf->epc_features = epc_features; + + ret = nvmet_pci_epf_configure_bar(nvme_epf); + if (ret) + return ret; + + nvmet_pci_epf_init_dma(nvme_epf); + + return 0; +} + +static void nvmet_pci_epf_unbind(struct pci_epf *epf) +{ + struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); + struct pci_epc *epc = epf->epc; + + nvmet_pci_epf_destroy_ctrl(&nvme_epf->ctrl); + + if (epc->init_complete) { + nvmet_pci_epf_deinit_dma(nvme_epf); + nvmet_pci_epf_clear_bar(nvme_epf); + } + + nvmet_pci_epf_free_bar(nvme_epf); +} + +static struct pci_epf_header nvme_epf_pci_header = { + .vendorid = PCI_ANY_ID, + .deviceid = PCI_ANY_ID, + .progif_code = 0x02, /* NVM Express */ + .baseclass_code = PCI_BASE_CLASS_STORAGE, + .subclass_code = 0x08, /* Non-Volatile Memory controller */ + .interrupt_pin = PCI_INTERRUPT_INTA, +}; + +static int nvmet_pci_epf_probe(struct pci_epf *epf, + const struct pci_epf_device_id *id) +{ + struct nvmet_pci_epf *nvme_epf; + int ret; + + nvme_epf = devm_kzalloc(&epf->dev, sizeof(*nvme_epf), GFP_KERNEL); + if (!nvme_epf) + return -ENOMEM; + + ret = devm_mutex_init(&epf->dev, &nvme_epf->mmio_lock); + if (ret) + return ret; + + nvme_epf->epf = epf; + nvme_epf->mdts_kb = NVMET_PCI_EPF_MDTS_KB; + + epf->event_ops = &nvmet_pci_epf_event_ops; + epf->header = &nvme_epf_pci_header; + epf_set_drvdata(epf, nvme_epf); + + return 0; +} + +#define to_nvme_epf(epf_group) \ + container_of(epf_group, struct nvmet_pci_epf, group) + +static ssize_t nvmet_pci_epf_portid_show(struct config_item *item, char *page) +{ + struct config_group *group = to_config_group(item); + struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group); + + return sysfs_emit(page, "%u\n", le16_to_cpu(nvme_epf->portid)); +} + +static ssize_t 
nvmet_pci_epf_portid_store(struct config_item *item, + const char *page, size_t len) +{ + struct config_group *group = to_config_group(item); + struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group); + u16 portid; + + /* Do not allow setting this when the function is already started. */ + if (nvme_epf->ctrl.tctrl) + return -EBUSY; + + if (!len) + return -EINVAL; + + if (kstrtou16(page, 0, &portid)) + return -EINVAL; + + nvme_epf->portid = cpu_to_le16(portid); + + return len; +} + +CONFIGFS_ATTR(nvmet_pci_epf_, portid); + +static ssize_t nvmet_pci_epf_subsysnqn_show(struct config_item *item, + char *page) +{ + struct config_group *group = to_config_group(item); + struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group); + + return sysfs_emit(page, "%s\n", nvme_epf->subsysnqn); +} + +static ssize_t nvmet_pci_epf_subsysnqn_store(struct config_item *item, + const char *page, size_t len) +{ + struct config_group *group = to_config_group(item); + struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group); + + /* Do not allow setting this when the function is already started. */ + if (nvme_epf->ctrl.tctrl) + return -EBUSY; + + if (!len) + return -EINVAL; + + strscpy(nvme_epf->subsysnqn, page, len); + + return len; +} + +CONFIGFS_ATTR(nvmet_pci_epf_, subsysnqn); + +static ssize_t nvmet_pci_epf_mdts_kb_show(struct config_item *item, char *page) +{ + struct config_group *group = to_config_group(item); + struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group); + + return sysfs_emit(page, "%u\n", nvme_epf->mdts_kb); +} + +static ssize_t nvmet_pci_epf_mdts_kb_store(struct config_item *item, + const char *page, size_t len) +{ + struct config_group *group = to_config_group(item); + struct nvmet_pci_epf *nvme_epf = to_nvme_epf(group); + unsigned long mdts_kb; + int ret; + + if (nvme_epf->ctrl.tctrl) + return -EBUSY; + + ret = kstrtoul(page, 0, &mdts_kb); + if (ret) + return ret; + if (!mdts_kb) + mdts_kb = NVMET_PCI_EPF_MDTS_KB; + else if (mdts_kb > NVMET_PCI_EPF_MAX_MDTS_KB) + mdts_kb = NVMET_PCI_EPF_MAX_MDTS_KB; + + if (!is_power_of_2(mdts_kb)) + return -EINVAL; + + nvme_epf->mdts_kb = mdts_kb; + + return len; +} + +CONFIGFS_ATTR(nvmet_pci_epf_, mdts_kb); + +static struct configfs_attribute *nvmet_pci_epf_attrs[] = { + &nvmet_pci_epf_attr_portid, + &nvmet_pci_epf_attr_subsysnqn, + &nvmet_pci_epf_attr_mdts_kb, + NULL, +}; + +static const struct config_item_type nvmet_pci_epf_group_type = { + .ct_attrs = nvmet_pci_epf_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_pci_epf_add_cfs(struct pci_epf *epf, + struct config_group *group) +{ + struct nvmet_pci_epf *nvme_epf = epf_get_drvdata(epf); + + config_group_init_type_name(&nvme_epf->group, "nvme", + &nvmet_pci_epf_group_type); + + return &nvme_epf->group; +} + +static const struct pci_epf_device_id nvmet_pci_epf_ids[] = { + { .name = "nvmet_pci_epf" }, + {}, +}; + +static struct pci_epf_ops nvmet_pci_epf_ops = { + .bind = nvmet_pci_epf_bind, + .unbind = nvmet_pci_epf_unbind, + .add_cfs = nvmet_pci_epf_add_cfs, +}; + +static struct pci_epf_driver nvmet_pci_epf_driver = { + .driver.name = "nvmet_pci_epf", + .probe = nvmet_pci_epf_probe, + .id_table = nvmet_pci_epf_ids, + .ops = &nvmet_pci_epf_ops, + .owner = THIS_MODULE, +}; + +static int __init nvmet_pci_epf_init_module(void) +{ + int ret; + + ret = pci_epf_register_driver(&nvmet_pci_epf_driver); + if (ret) + return ret; + + ret = nvmet_register_transport(&nvmet_pci_epf_fabrics_ops); + if (ret) { + pci_epf_unregister_driver(&nvmet_pci_epf_driver); + return ret; + } + + return 0; +} + +static void 
__exit nvmet_pci_epf_cleanup_module(void) +{ + nvmet_unregister_transport(&nvmet_pci_epf_fabrics_ops); + pci_epf_unregister_driver(&nvmet_pci_epf_driver); +} + +module_init(nvmet_pci_epf_init_module); +module_exit(nvmet_pci_epf_cleanup_module); + +MODULE_DESCRIPTION("NVMe PCI Endpoint Function target driver"); +MODULE_AUTHOR("Damien Le Moal "); +MODULE_LICENSE("GPL"); From 002ec8f1c69d3722a033eaf45102ba747ae80e94 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 4 Jan 2025 13:59:51 +0900 Subject: [PATCH 133/150] Documentation: Document the NVMe PCI endpoint target driver Add a documentation file (Documentation/nvme/nvme-pci-endpoint-target.rst) for the new NVMe PCI endpoint target driver. This provides an overview of the driver requirements, capabilities and limitations. A user guide describing how to setup a NVMe PCI endpoint device using this driver is also provided. This document is made accessible also from the PCI endpoint documentation using a link. Furthermore, since the existing nvme documentation was not accessible from the top documentation index, an index file is added to Documentation/nvme and this index listed as "NVMe Subsystem" in the "Storage interfaces" section of the subsystem API index. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Acked-by: Bjorn Helgaas Reviewed-by: Manivannan Sadhasivam Signed-off-by: Keith Busch --- Documentation/PCI/endpoint/index.rst | 1 + .../PCI/endpoint/pci-nvme-function.rst | 13 + Documentation/nvme/index.rst | 12 + .../nvme/nvme-pci-endpoint-target.rst | 368 ++++++++++++++++++ Documentation/subsystem-apis.rst | 1 + 5 files changed, 395 insertions(+) create mode 100644 Documentation/PCI/endpoint/pci-nvme-function.rst create mode 100644 Documentation/nvme/index.rst create mode 100644 Documentation/nvme/nvme-pci-endpoint-target.rst diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst index 4d2333e7ae06..dd1f62e731c9 100644 --- a/Documentation/PCI/endpoint/index.rst +++ b/Documentation/PCI/endpoint/index.rst @@ -15,6 +15,7 @@ PCI Endpoint Framework pci-ntb-howto pci-vntb-function pci-vntb-howto + pci-nvme-function function/binding/pci-test function/binding/pci-ntb diff --git a/Documentation/PCI/endpoint/pci-nvme-function.rst b/Documentation/PCI/endpoint/pci-nvme-function.rst new file mode 100644 index 000000000000..df57b8e7d066 --- /dev/null +++ b/Documentation/PCI/endpoint/pci-nvme-function.rst @@ -0,0 +1,13 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +PCI NVMe Function +================= + +:Author: Damien Le Moal + +The PCI NVMe endpoint function implements a PCI NVMe controller using the NVMe +subsystem target core code. The driver for this function resides with the NVMe +subsystem as drivers/nvme/target/nvmet-pciep.c. + +See Documentation/nvme/nvme-pci-endpoint-target.rst for more details. diff --git a/Documentation/nvme/index.rst b/Documentation/nvme/index.rst new file mode 100644 index 000000000000..13383c760cc7 --- /dev/null +++ b/Documentation/nvme/index.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============== +NVMe Subsystem +============== + +.. toctree:: + :maxdepth: 2 + :numbered: + + feature-and-quirk-policy + nvme-pci-endpoint-target diff --git a/Documentation/nvme/nvme-pci-endpoint-target.rst b/Documentation/nvme/nvme-pci-endpoint-target.rst new file mode 100644 index 000000000000..66e7b7d869b4 --- /dev/null +++ b/Documentation/nvme/nvme-pci-endpoint-target.rst @@ -0,0 +1,368 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +================================= +NVMe PCI Endpoint Function Target +================================= + +:Author: Damien Le Moal + +The NVMe PCI endpoint function target driver implements a NVMe PCIe controller +using a NVMe fabrics target controller configured with the PCI transport type. + +Overview +======== + +The NVMe PCI endpoint function target driver allows exposing a NVMe target +controller over a PCIe link, thus implementing an NVMe PCIe device similar to a +regular M.2 SSD. The target controller is created in the same manner as when +using NVMe over fabrics: the controller represents the interface to an NVMe +subsystem using a port. The port transfer type must be configured to be +"pci". The subsystem can be configured to have namespaces backed by regular +files or block devices, or can use NVMe passthrough to expose to the PCI host an +existing physical NVMe device or a NVMe fabrics host controller (e.g. a NVMe TCP +host controller). + +The NVMe PCI endpoint function target driver relies as much as possible on the +NVMe target core code to parse and execute NVMe commands submitted by the PCIe +host. However, using the PCI endpoint framework API and DMA API, the driver is +also responsible for managing all data transfers over the PCIe link. This +implies that the NVMe PCI endpoint function target driver implements several +NVMe data structure management and some NVMe command parsing. + +1) The driver manages retrieval of NVMe commands in submission queues using DMA + if supported, or MMIO otherwise. Each command retrieved is then executed + using a work item to maximize performance with the parallel execution of + multiple commands on different CPUs. The driver uses a work item to + constantly poll the doorbell of all submission queues to detect command + submissions from the PCIe host. + +2) The driver transfers completion queues entries of completed commands to the + PCIe host using MMIO copy of the entries in the host completion queue. + After posting completion entries in a completion queue, the driver uses the + PCI endpoint framework API to raise an interrupt to the host to signal the + commands completion. + +3) For any command that has a data buffer, the NVMe PCI endpoint target driver + parses the command PRPs or SGLs lists to create a list of PCI address + segments representing the mapping of the command data buffer on the host. + The command data buffer is transferred over the PCIe link using this list of + PCI address segments using DMA, if supported. If DMA is not supported, MMIO + is used, which results in poor performance. For write commands, the command + data buffer is transferred from the host into a local memory buffer before + executing the command using the target core code. For read commands, a local + memory buffer is allocated to execute the command and the content of that + buffer is transferred to the host once the command completes. + +Controller Capabilities +----------------------- + +The NVMe capabilities exposed to the PCIe host through the BAR 0 registers +are almost identical to the capabilities of the NVMe target controller +implemented by the target core code. There are some exceptions. + +1) The NVMe PCI endpoint target driver always sets the controller capability + CQR bit to request "Contiguous Queues Required". This is to facilitate the + mapping of a queue PCI address range to the local CPU address space. 
+
+2) The doorbell stride (DSTRD) is always set to be 4B.
+
+3) Since the PCI endpoint framework does not provide a way to handle PCI level
+   resets, the controller capability NSSR bit (NVM Subsystem Reset Supported)
+   is always cleared.
+
+4) The boot partition support (BPS), Persistent Memory Region Supported (PMRS)
+   and Controller Memory Buffer Supported (CMBS) capabilities are never
+   reported.
+
+Supported Features
+------------------
+
+The NVMe PCI endpoint target driver implements support for both PRPs and SGLs.
+The driver also implements IRQ vector coalescing and submission queue
+arbitration burst.
+
+The maximum number of queues and the maximum data transfer size (MDTS) are
+configurable through configfs before starting the controller. To avoid issues
+with excessive local memory usage for executing commands, MDTS defaults to 512
+KB and is limited to a maximum of 2 MB (arbitrary limit).
+
+Minimum number of PCI Address Mapping Windows Required
+------------------------------------------------------
+
+Most PCI endpoint controllers provide a limited number of mapping windows for
+mapping a PCI address range to local CPU memory addresses. The NVMe PCI
+endpoint target controller uses mapping windows for the following.
+
+1) One memory window for raising MSI or MSI-X interrupts
+2) One memory window for MMIO transfers
+3) One memory window for each completion queue
+
+Given the highly asynchronous nature of the NVMe PCI endpoint target driver
+operation, the memory windows as described above will generally not be used
+simultaneously, but that may happen. So a safe maximum number of completion
+queues that can be supported is equal to the total number of memory mapping
+windows of the PCI endpoint controller minus two. E.g. for an endpoint PCI
+controller with 32 outbound memory windows available, up to 30 completion
+queues can be safely operated without any risk of getting PCI address mapping
+errors due to the lack of memory windows.
+
+Maximum Number of Queue Pairs
+-----------------------------
+
+Upon binding of the NVMe PCI endpoint target driver to the PCI endpoint
+controller, BAR 0 is allocated with enough space to accommodate the admin queue
+and multiple I/O queues. The maximum number of I/O queue pairs that can be
+supported is limited by several factors.
+
+1) The NVMe target core code limits the maximum number of I/O queues to the
+   number of online CPUs.
+2) The total number of queue pairs, including the admin queue, cannot exceed
+   the number of MSI-X or MSI vectors available.
+3) The total number of completion queues must not exceed the total number of
+   PCI mapping windows minus 2 (see above).
+
+The NVMe endpoint function driver allows configuring the maximum number of
+queue pairs through configfs.
+
+Limitations and NVMe Specification Non-Compliance
+-------------------------------------------------
+
+Similar to the NVMe target core code, the NVMe PCI endpoint target driver does
+not support multiple submission queues using the same completion queue. All
+submission queues must specify a unique completion queue.
+
+
+User Guide
+==========
+
+This section describes the hardware requirements and how to set up an NVMe PCI
+endpoint target device.
+
+Kernel Requirements
+-------------------
+
+The kernel must be compiled with the configuration options CONFIG_PCI_ENDPOINT,
+CONFIG_PCI_ENDPOINT_CONFIGFS, and CONFIG_NVME_TARGET_PCI_EPF enabled.
+CONFIG_PCI, CONFIG_BLK_DEV_NVME and CONFIG_NVME_TARGET must also be enabled
+(obviously).
+ +In addition to this, at least one PCI endpoint controller driver should be +available for the endpoint hardware used. + +To facilitate testing, enabling the null-blk driver (CONFIG_BLK_DEV_NULL_BLK) +is also recommended. With this, a simple setup using a null_blk block device +as a subsystem namespace can be used. + +Hardware Requirements +--------------------- + +To use the NVMe PCI endpoint target driver, at least one endpoint controller +device is required. + +To find the list of endpoint controller devices in the system:: + + # ls /sys/class/pci_epc/ + a40000000.pcie-ep + +If PCI_ENDPOINT_CONFIGFS is enabled:: + + # ls /sys/kernel/config/pci_ep/controllers + a40000000.pcie-ep + +The endpoint board must of course also be connected to a host with a PCI cable +with RX-TX signal swapped. If the host PCI slot used does not have +plug-and-play capabilities, the host should be powered off when the NVMe PCI +endpoint device is configured. + +NVMe Endpoint Device +-------------------- + +Creating an NVMe endpoint device is a two step process. First, an NVMe target +subsystem and port must be defined. Second, the NVMe PCI endpoint device must +be setup and bound to the subsystem and port created. + +Creating a NVMe Subsystem and Port +---------------------------------- + +Details about how to configure a NVMe target subsystem and port are outside the +scope of this document. The following only provides a simple example of a port +and subsystem with a single namespace backed by a null_blk device. + +First, make sure that configfs is enabled:: + + # mount -t configfs none /sys/kernel/config + +Next, create a null_blk device (default settings give a 250 GB device without +memory backing). The block device created will be /dev/nullb0 by default:: + + # modprobe null_blk + # ls /dev/nullb0 + /dev/nullb0 + +The NVMe PCI endpoint function target driver must be loaded:: + + # modprobe nvmet_pci_epf + # lsmod | grep nvmet + nvmet_pci_epf 32768 0 + nvmet 118784 1 nvmet_pci_epf + nvme_core 131072 2 nvmet_pci_epf,nvmet + +Now, create a subsystem and a port that we will use to create a PCI target +controller when setting up the NVMe PCI endpoint target device. In this +example, the port is created with a maximum of 4 I/O queue pairs:: + + # cd /sys/kernel/config/nvmet/subsystems + # mkdir nvmepf.0.nqn + # echo -n "Linux-pci-epf" > nvmepf.0.nqn/attr_model + # echo "0x1b96" > nvmepf.0.nqn/attr_vendor_id + # echo "0x1b96" > nvmepf.0.nqn/attr_subsys_vendor_id + # echo 1 > nvmepf.0.nqn/attr_allow_any_host + # echo 4 > nvmepf.0.nqn/attr_qid_max + +Next, create and enable the subsystem namespace using the null_blk block +device:: + + # mkdir nvmepf.0.nqn/namespaces/1 + # echo -n "/dev/nullb0" > nvmepf.0.nqn/namespaces/1/device_path + # echo 1 > "nvmepf.0.nqn/namespaces/1/enable" + +Finally, create the target port and link it to the subsystem:: + + # cd /sys/kernel/config/nvmet/ports + # mkdir 1 + # echo -n "pci" > 1/addr_trtype + # ln -s /sys/kernel/config/nvmet/subsystems/nvmepf.0.nqn \ + /sys/kernel/config/nvmet/ports/1/subsystems/nvmepf.0.nqn + +Creating a NVMe PCI Endpoint Device +----------------------------------- + +With the NVMe target subsystem and port ready for use, the NVMe PCI endpoint +device can now be created and enabled. 
The NVMe PCI endpoint target driver
+should already be loaded (that is done automatically when the port is created)::
+
+   # ls /sys/kernel/config/pci_ep/functions
+   nvmet_pci_epf
+
+Next, create function 0::
+
+   # cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
+   # mkdir nvmepf.0
+   # ls nvmepf.0/
+   baseclass_code    msix_interrupts   secondary
+   cache_line_size   nvme              subclass_code
+   deviceid          primary           subsys_id
+   interrupt_pin     progif_code       subsys_vendor_id
+   msi_interrupts    revid             vendorid
+
+Configure the function using any device ID (the vendor ID for the device will
+be automatically set to the same value as the NVMe target subsystem vendor
+ID)::
+
+   # cd /sys/kernel/config/pci_ep/functions/nvmet_pci_epf
+   # echo 0xBEEF > nvmepf.0/deviceid
+   # echo 32 > nvmepf.0/msix_interrupts
+
+If the PCI endpoint controller used does not support MSI-X, MSI can be
+configured instead::
+
+   # echo 32 > nvmepf.0/msi_interrupts
+
+Next, let's bind our endpoint device with the target subsystem and port that we
+created::
+
+   # echo 1 > nvmepf.0/nvme/portid
+   # echo "nvmepf.0.nqn" > nvmepf.0/nvme/subsysnqn
+
+The endpoint function can then be bound to the endpoint controller and the
+controller started::
+
+   # cd /sys/kernel/config/pci_ep
+   # ln -s functions/nvmet_pci_epf/nvmepf.0 controllers/a40000000.pcie-ep/
+   # echo 1 > controllers/a40000000.pcie-ep/start
+
+On the endpoint machine, kernel messages will show information as the NVMe
+target device and endpoint device are created and connected.
+
+.. code-block:: text
+
+   null_blk: disk nullb0 created
+   null_blk: module loaded
+   nvmet: adding nsid 1 to subsystem nvmepf.0.nqn
+   nvmet_pci_epf nvmet_pci_epf.0: PCI endpoint controller supports MSI-X, 32 vectors
+   nvmet: Created nvm controller 1 for subsystem nvmepf.0.nqn for NQN nqn.2014-08.org.nvmexpress:uuid:2ab90791-2246-4fbb-961d-4c3d5a5a0176.
+   nvmet_pci_epf nvmet_pci_epf.0: New PCI ctrl "nvmepf.0.nqn", 4 I/O queues, mdts 524288 B
+
+PCI Root-Complex Host
+---------------------
+
+Booting the PCI host will result in the initialization of the PCIe link (this
+may be signaled by the PCI endpoint driver with a kernel message). A kernel
+message on the endpoint will also signal when the host NVMe driver enables the
+device controller::
+
+   nvmet_pci_epf nvmet_pci_epf.0: Enabling controller
+
+On the host side, the NVMe PCI endpoint function target device will be
+discoverable as a PCI device, with the vendor ID and device ID as configured::
+
+   # lspci -n
+   0000:01:00.0 0108: 1b96:beef
+
+And this device will be recognized as an NVMe device with a single namespace::
+
+   # lsblk
+   NAME        MAJ:MIN RM   SIZE RO TYPE MOUNTPOINTS
+   nvme0n1     259:0    0   250G  0 disk
+
+The NVMe endpoint block device can then be used as any other regular NVMe
+namespace block device. The *nvme* command line utility can be used to get more
+detailed information about the endpoint device::
+
+   # nvme id-ctrl /dev/nvme0
+   NVME Identify Controller:
+   vid       : 0x1b96
+   ssvid     : 0x1b96
+   sn        : 94993c85650ef7bcd625
+   mn        : Linux-pci-epf
+   fr        : 6.13.0-r
+   rab       : 6
+   ieee      : 000000
+   cmic      : 0xb
+   mdts      : 7
+   cntlid    : 0x1
+   ver       : 0x20100
+   ...
+
+
+Endpoint Bindings
+=================
+
+The NVMe PCI endpoint target driver uses the PCI endpoint configfs device
+attributes as follows.
+
+================ ===========================================================
+vendorid         Ignored (the vendor id of the NVMe target subsystem is used)
+deviceid         Anything is OK (e.g.
PCI_ANY_ID) +revid Do not care +progif_code Must be 0x02 (NVM Express) +baseclass_code Must be 0x01 (PCI_BASE_CLASS_STORAGE) +subclass_code Must be 0x08 (Non-Volatile Memory controller) +cache_line_size Do not care +subsys_vendor_id Ignored (the subsystem vendor id of the NVMe target subsystem + is used) +subsys_id Anything is OK (e.g. PCI_ANY_ID) +msi_interrupts At least equal to the number of queue pairs desired +msix_interrupts At least equal to the number of queue pairs desired +interrupt_pin Interrupt PIN to use if MSI and MSI-X are not supported +================ =========================================================== + +The NVMe PCI endpoint target function also has some specific configurable +fields defined in the *nvme* subdirectory of the function directory. These +fields are as follows. + +================ =========================================================== +mdts_kb Maximum data transfer size in KiB (default: 512) +portid The ID of the target port to use +subsysnqn The NQN of the target subsystem to use +================ =========================================================== diff --git a/Documentation/subsystem-apis.rst b/Documentation/subsystem-apis.rst index 74af50d2ef7f..b52ad5b969d4 100644 --- a/Documentation/subsystem-apis.rst +++ b/Documentation/subsystem-apis.rst @@ -60,6 +60,7 @@ Storage interfaces cdrom/index scsi/index target/index + nvme/index Other subsystems ---------------- From e4a0a3058de85bc623f1ba90eec68f239d0a11b2 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 8 Dec 2024 13:34:32 +0200 Subject: [PATCH 134/150] nvme-pci: fix comment typo envent -> event. Signed-off-by: Baruch Siach Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 57e8e32c4529..c3bfbe11ee57 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -372,7 +372,7 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, /* * Ensure that the doorbell is updated before reading the event * index from memory. The controller needs to provide similar - * ordering to ensure the envent index is updated before reading + * ordering to ensure the event index is updated before reading * the doorbell. */ mb(); From d4a95adeabc6b5a39405e49c6d5ed14dd83682c4 Mon Sep 17 00:00:00 2001 From: Keisuke Nishimura Date: Mon, 16 Dec 2024 16:27:20 +0100 Subject: [PATCH 135/150] nvme: Add error path for xa_store in nvme_init_effects The xa_store() may fail due to memory allocation failure because there is no guarantee that the index NVME_CSI_NVM is already used. This fix introduces a new function to handle the error path. 
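For reference, xa_store() can itself fail (typically -ENOMEM while
allocating internal XArray nodes) and reports that failure through the
returned entry, so the new helper has to check it; a minimal sketch of
the check (names follow the patch below, illustrative only):

    old = xa_store(&ctrl->cels, csi, effects, GFP_KERNEL);
    if (xa_is_err(old)) {           /* entry encodes an error, e.g. -ENOMEM */
            kfree(effects);         /* do not leak the log just allocated */
            return xa_err(old);     /* convert back to a negative errno */
    }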
Fixes: cc115cbe12d9 ("nvme: always initialize known command effects")
Signed-off-by: Keisuke Nishimura
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4bdd5144af7c..2a0555856795 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3175,6 +3175,25 @@ free_data:
 	return ret;
 }
 
+static int nvme_init_effects_log(struct nvme_ctrl *ctrl,
+		u8 csi, struct nvme_effects_log **log)
+{
+	struct nvme_effects_log *effects, *old;
+
+	effects = kzalloc(sizeof(*effects), GFP_KERNEL);
+	if (!effects)
+		return -ENOMEM;
+
+	old = xa_store(&ctrl->cels, csi, effects, GFP_KERNEL);
+	if (xa_is_err(old)) {
+		kfree(effects);
+		return xa_err(old);
+	}
+
+	*log = effects;
+	return 0;
+}
+
 static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl)
 {
 	struct nvme_effects_log *log = ctrl->effects;
@@ -3221,10 +3240,9 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	}
 
 	if (!ctrl->effects) {
-		ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
-		if (!ctrl->effects)
-			return -ENOMEM;
-		xa_store(&ctrl->cels, NVME_CSI_NVM, ctrl->effects, GFP_KERNEL);
+		ret = nvme_init_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
+		if (ret < 0)
+			return ret;
 	}
 
 	nvme_init_known_nvm_effects(ctrl);

From 4a324970fabad503260973cd588609f3a26baab9 Mon Sep 17 00:00:00 2001
From: Francis Pravin
Date: Fri, 10 Jan 2025 05:21:37 +0530
Subject: [PATCH 136/150] nvme-pci: use correct size to free the hmb buffer

dev->host_mem_size is updated only after the hmb descriptor buffer has
been allocated successfully; before that it may hold an undefined
value. So use the correct size to free the hmb buffer when the hmb
descriptor buffer allocation fails.

Signed-off-by: Francis Pravin
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c3bfbe11ee57..fe0795e16e25 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2085,8 +2085,8 @@ static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
 			sizeof(*dev->host_mem_descs), &dev->host_mem_descs_dma,
 			GFP_KERNEL);
 	if (!dev->host_mem_descs) {
-		dma_free_noncontiguous(dev->dev, dev->host_mem_size,
-				dev->hmb_sgt, DMA_BIDIRECTIONAL);
+		dma_free_noncontiguous(dev->dev, size, dev->hmb_sgt,
+				DMA_BIDIRECTIONAL);
 		dev->hmb_sgt = NULL;
 		return -ENOMEM;
 	}

From 7c0be4ead1f8f5f8be0803f347de0de81e3b8e1c Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Mon, 13 Jan 2025 09:58:33 +0800
Subject: [PATCH 137/150] block: mark GFP_NOIO around sysfs ->store()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sysfs ->store() is called with the queue frozen, and several ->store()
callbacks (update_nr_requests, wbt, scheduler) allocate memory with
GFP_KERNEL, which may run into the direct reclaim code path, so a
deadlock can be triggered.
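In sketch form, the problematic path in queue_attr_store() is
(simplified from the hunk below):

    mutex_lock(&q->sysfs_lock);
    blk_mq_freeze_queue(q);
    /*
     * ->store() may allocate with GFP_KERNEL; direct reclaim can then
     * end up waiting on I/O that cannot make progress while this same
     * queue is frozen.
     */
    res = entry->store(disk, page, length);
    blk_mq_unfreeze_queue(q);
    mutex_unlock(&q->sysfs_lock);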
Fix the issue by marking NOIO around sysfs ->store() Reported-by: Thomas Hellström Cc: stable@vger.kernel.org Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: John Garry Link: https://lore.kernel.org/r/20250113015833.698458-1-ming.lei@redhat.com Link: https://lore.kernel.org/linux-block/Z4RkemI9f6N5zoEF@fedora/T/#mc774c65eeca5c024d29695f9ac6152b87763f305 Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e828be777206..e09b455874bf 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -681,6 +681,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, struct queue_sysfs_entry *entry = to_queue(attr); struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; + unsigned int noio_flag; ssize_t res; if (!entry->store_limit && !entry->store) @@ -711,7 +712,9 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, mutex_lock(&q->sysfs_lock); blk_mq_freeze_queue(q); + noio_flag = memalloc_noio_save(); res = entry->store(disk, page, length); + memalloc_noio_restore(noio_flag); blk_mq_unfreeze_queue(q); mutex_unlock(&q->sysfs_lock); return res; From 8337b029f788272f5273887ccefb8226404658ce Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 29 Oct 2024 09:19:41 +0800 Subject: [PATCH 138/150] nbd: fix partial sending nbd driver sends request header and payload with multiple call of sock_sendmsg, and partial sending can't be avoided. However, nbd driver returns BLK_STS_RESOURCE to block core in this situation. This way causes one issue: request->tag may change in the next run of nbd_queue_rq(), but the original old tag has been sent as part of header cookie, this way confuses nbd driver reply handling, since the real request can't be retrieved any more with the obsolete old tag. Fix it by retrying sending directly in per-socket work function, meantime return BLK_STS_OK to block layer core. 
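For context on why the tag matters: nbd packs the block-layer tag into
the 64-bit handle that is sent as part of every request header, roughly
along these lines (paraphrased from drivers/block/nbd.c):

    static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
    {
            struct request *req = blk_mq_rq_from_pdu(cmd);
            u32 tag = blk_mq_unique_tag(req);
            u64 cookie = cmd->cmd_cookie;

            return (cookie << NBD_COOKIE_BITS) | tag;
    }

The reply path looks the request up again by unpacking this handle,
which is why a header that has already been (partially) sent pins the
request to its original tag and must not be re-dispatched.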
Cc: vincent.chen@sifive.com Cc: Leon Schuermann Cc: Bart Van Assche Reported-by: Kevin Wolf Signed-off-by: Ming Lei Tested-by: Kevin Wolf Reviewed-by: Kevin Wolf Link: https://lore.kernel.org/r/20241029011941.153037-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 95 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 85 insertions(+), 10 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index efa05c3c06bf..b63a0f29a54a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -62,6 +62,7 @@ struct nbd_sock { bool dead; int fallback_index; int cookie; + struct work_struct work; }; struct recv_thread_args { @@ -141,6 +142,9 @@ struct nbd_device { */ #define NBD_CMD_INFLIGHT 2 +/* Just part of request header or data payload is sent successfully */ +#define NBD_CMD_PARTIAL_SEND 3 + struct nbd_cmd { struct nbd_device *nbd; struct mutex lock; @@ -453,6 +457,12 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) if (!mutex_trylock(&cmd->lock)) return BLK_EH_RESET_TIMER; + /* partial send is handled in nbd_sock's work function */ + if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) { + mutex_unlock(&cmd->lock); + return BLK_EH_RESET_TIMER; + } + if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) { mutex_unlock(&cmd->lock); return BLK_EH_DONE; @@ -601,6 +611,30 @@ static inline int was_interrupted(int result) return result == -ERESTARTSYS || result == -EINTR; } +/* + * We've already sent header or part of data payload, have no choice but + * to set pending and schedule it in work. + * + * And we have to return BLK_STS_OK to block core, otherwise this same + * request may be re-dispatched with different tag, but our header has + * been sent out with old tag, and this way does confuse reply handling. + */ +static void nbd_sched_pending_work(struct nbd_device *nbd, + struct nbd_sock *nsock, + struct nbd_cmd *cmd, int sent) +{ + struct request *req = blk_mq_rq_from_pdu(cmd); + + /* pending work should be scheduled only once */ + WARN_ON_ONCE(test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)); + + nsock->pending = req; + nsock->sent = sent; + set_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags); + refcount_inc(&nbd->config_refs); + schedule_work(&nsock->work); +} + /* * Returns BLK_STS_RESOURCE if the caller should retry after a delay. * Returns BLK_STS_IOERR if sending failed. @@ -686,8 +720,8 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, * completely done. */ if (sent) { - nsock->pending = req; - nsock->sent = sent; + nbd_sched_pending_work(nbd, nsock, cmd, sent); + return BLK_STS_OK; } set_bit(NBD_CMD_REQUEUED, &cmd->flags); return BLK_STS_RESOURCE; @@ -724,14 +758,8 @@ send_pages: result = sock_xmit(nbd, index, 1, &from, flags, &sent); if (result < 0) { if (was_interrupted(result)) { - /* We've already sent the header, we - * have no choice but to set pending and - * return BUSY. - */ - nsock->pending = req; - nsock->sent = sent; - set_bit(NBD_CMD_REQUEUED, &cmd->flags); - return BLK_STS_RESOURCE; + nbd_sched_pending_work(nbd, nsock, cmd, sent); + return BLK_STS_OK; } dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", @@ -757,6 +785,14 @@ out: return BLK_STS_OK; requeue: + /* + * Can't requeue in case we are dealing with partial send + * + * We must run from pending work function. 
+ * */ + if (test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags)) + return BLK_STS_OK; + /* retry on a different socket */ dev_err_ratelimited(disk_to_dev(nbd->disk), "Request send failed, requeueing\n"); @@ -765,6 +801,44 @@ requeue: return BLK_STS_OK; } +/* handle partial sending */ +static void nbd_pending_cmd_work(struct work_struct *work) +{ + struct nbd_sock *nsock = container_of(work, struct nbd_sock, work); + struct request *req = nsock->pending; + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); + struct nbd_device *nbd = cmd->nbd; + unsigned long deadline = READ_ONCE(req->deadline); + unsigned int wait_ms = 2; + + mutex_lock(&cmd->lock); + + WARN_ON_ONCE(test_bit(NBD_CMD_REQUEUED, &cmd->flags)); + if (WARN_ON_ONCE(!test_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags))) + goto out; + + mutex_lock(&nsock->tx_lock); + while (true) { + nbd_send_cmd(nbd, cmd, cmd->index); + if (!nsock->pending) + break; + + /* don't bother timeout handler for partial sending */ + if (READ_ONCE(jiffies) + msecs_to_jiffies(wait_ms) >= deadline) { + cmd->status = BLK_STS_IOERR; + blk_mq_complete_request(req); + break; + } + msleep(wait_ms); + wait_ms *= 2; + } + mutex_unlock(&nsock->tx_lock); + clear_bit(NBD_CMD_PARTIAL_SEND, &cmd->flags); +out: + mutex_unlock(&cmd->lock); + nbd_config_put(nbd); +} + static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock, struct nbd_reply *reply) { @@ -1211,6 +1285,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, nsock->pending = NULL; nsock->sent = 0; nsock->cookie = 0; + INIT_WORK(&nsock->work, nbd_pending_cmd_work); socks[config->num_connections++] = nsock; atomic_inc(&config->live_connections); blk_mq_unfreeze_queue(nbd->disk->queue); From 4fa5c37012d71f6a39c4286ffabb9466f1728ba3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 10 Jan 2025 22:27:36 -0800 Subject: [PATCH 139/150] blk-cgroup: fix kernel-doc warnings in header file Correct the function parameters and function names to eliminate kernel-doc warnings: blk-cgroup.h:238: warning: Function parameter or struct member 'bio' not described in 'bio_issue_as_root_blkg' blk-cgroup.h:248: warning: bad line: blk-cgroup.h:279: warning: expecting prototype for blkg_to_pdata(). Prototype was for blkg_to_pd() instead blk-cgroup.h:296: warning: expecting prototype for pdata_to_blkg(). Prototype was for pd_to_blkg() instead Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Cc: cgroups@vger.kernel.org Link: https://lore.kernel.org/r/20250111062736.910383-1-rdunlap@infradead.org Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b9e3265c1eb3..2c4663bd993a 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -225,7 +225,9 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx); /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg - * @return: true if this bio needs to be submitted with the root blkg context. + * @bio: the target &bio + * + * Return: true if this bio needs to be submitted with the root blkg context. * * In order to avoid priority inversions we sometimes need to issue a bio as if * it were attached to the root blkg, and then backcharge to the actual owning @@ -245,7 +247,7 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio) * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. - + * * Must be called in a RCU critical section. 
*/ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, @@ -268,7 +270,7 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, } /** - * blkg_to_pdata - get policy private data + * blkg_to_pd - get policy private data * @blkg: blkg of interest * @pol: policy of interest * @@ -287,7 +289,7 @@ static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, } /** - * pdata_to_blkg - get blkg associated with policy private data + * pd_to_blkg - get blkg associated with policy private data * @pd: policy private data of interest * * @pd is policy private data. Determine the blkg it's associated with. From f403034e8afd12ed6ea5de64f0adda3d90e67c9d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 10 Jan 2025 22:27:48 -0800 Subject: [PATCH 140/150] blk-cgroup: rwstat: fix kernel-doc warnings in header file Correct the function parameters to eliminate kernel-doc warnings: blk-cgroup-rwstat.h:63: warning: Function parameter or struct member 'opf' not described in 'blkg_rwstat_add' blk-cgroup-rwstat.h:63: warning: Excess function parameter 'op' description in 'blkg_rwstat_add' blk-cgroup-rwstat.h:91: warning: Function parameter or struct member 'result' not described in 'blkg_rwstat_read' Signed-off-by: Randy Dunlap Cc: Tejun Heo Cc: Josef Bacik Cc: Jens Axboe Cc: cgroups@vger.kernel.org Link: https://lore.kernel.org/r/20250111062748.910442-1-rdunlap@infradead.org Signed-off-by: Jens Axboe --- block/blk-cgroup-rwstat.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index 022527b0b043..703a16fe1404 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -52,7 +52,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, /** * blkg_rwstat_add - add a value to a blkg_rwstat * @rwstat: target blkg_rwstat - * @op: REQ_OP and flags + * @opf: REQ_OP and flags * @val: value to add * * Add @val to @rwstat. The counters are chosen according to @rw. The @@ -83,8 +83,9 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, /** * blkg_rwstat_read - read the current values of a blkg_rwstat * @rwstat: blkg_rwstat to read + * @result: where to put the current values * - * Read the current snapshot of @rwstat and return it in the aux counts. + * Read the current snapshot of @rwstat and return it in the @result counts. */ static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, struct blkg_rwstat_sample *result) From e494e451611a3de6ae95f99e8339210c157d70fb Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 10 Jan 2025 22:27:58 -0800 Subject: [PATCH 141/150] partitions: ldm: remove the initial kernel-doc notation Remove the file's first comment describing what the file is. This comment is not in kernel-doc format so it causes a kernel-doc warning. ldm.h:13: warning: expecting prototype for ldm(). Prototype was for _FS_PT_LDM_H_() instead Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Randy Dunlap Cc: Richard Russon (FlatCap) Cc: linux-ntfs-dev@lists.sourceforge.net Cc: Jens Axboe Link: https://lore.kernel.org/r/20250111062758.910458-1-rdunlap@infradead.org Signed-off-by: Jens Axboe --- block/partitions/ldm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index e259180c8914..aa3bd050d8cd 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * ldm - Part of the Linux-NTFS project. 
* * Copyright (C) 2001,2002 Richard Russon From 127186cfb184eaccdfe948e6da66940cfa03efc5 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 2 Jan 2025 19:28:41 +0800 Subject: [PATCH 142/150] md: reintroduce md-linear THe md-linear is removed by commit 849d18e27be9 ("md: Remove deprecated CONFIG_MD_LINEAR") because it has been marked as deprecated for a long time. However, md-linear is used widely for underlying disks with different size, sadly we didn't know this until now, and it's true useful to create partitions and assemble multiple raid and then append one to the other. People have to use dm-linear in this case now, however, they will prefer to minimize the number of involved modules. Fixes: 849d18e27be9 ("md: Remove deprecated CONFIG_MD_LINEAR") Cc: stable@vger.kernel.org Signed-off-by: Yu Kuai Acked-by: Coly Li Acked-by: Mike Snitzer Link: https://lore.kernel.org/r/20250102112841.1227111-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/Kconfig | 13 ++ drivers/md/Makefile | 2 + drivers/md/md-autodetect.c | 8 +- drivers/md/md-linear.c | 354 +++++++++++++++++++++++++++++++++ drivers/md/md.c | 2 +- include/uapi/linux/raid/md_p.h | 2 +- include/uapi/linux/raid/md_u.h | 2 + 7 files changed, 379 insertions(+), 4 deletions(-) create mode 100644 drivers/md/md-linear.c diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 1e9db8e4acdf..0b1870a09e1f 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -61,6 +61,19 @@ config MD_BITMAP_FILE various kernel APIs and can only work with files on a file system not actually sitting on the MD device. +config MD_LINEAR + tristate "Linear (append) mode" + depends on BLK_DEV_MD + help + If you say Y here, then your multiple devices driver will be able to + use the so-called linear mode, i.e. it will combine the hard disk + partitions by simply appending one to the other. + + To compile this as a module, choose M here: the module + will be called linear. + + If unsure, say Y. + config MD_RAID0 tristate "RAID-0 (striping) mode" depends on BLK_DEV_MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 476a214e4bdc..87bdfc9fe14c 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,12 +29,14 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o +linear-y += md-linear.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise # themselves, and md.o may use the personalities when it # auto-initialised. +obj-$(CONFIG_MD_LINEAR) += linear.o obj-$(CONFIG_MD_RAID0) += raid0.o obj-$(CONFIG_MD_RAID1) += raid1.o obj-$(CONFIG_MD_RAID10) += raid10.o diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index b2a00f213c2c..4b80165afd23 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -49,6 +49,7 @@ static int md_setup_ents __initdata; * instead of just one. -- KTK * 18May2000: Added support for persistent-superblock arrays: * md=n,0,factor,fault,device-list uses RAID0 for device n + * md=n,-1,factor,fault,device-list uses LINEAR for device n * md=n,device-list reads a RAID superblock from the devices * elements in device-list are read by name_to_kdev_t so can be * a hex number or something like /dev/hda1 /dev/sdb @@ -87,7 +88,7 @@ static int __init md_setup(char *str) md_setup_ents++; switch (get_option(&str, &level)) { /* RAID level */ case 2: /* could be 0 or -1.. 
*/ - if (level == 0) { + if (level == 0 || level == LEVEL_LINEAR) { if (get_option(&str, &factor) != 2 || /* Chunk Size */ get_option(&str, &fault) != 2) { printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); @@ -95,7 +96,10 @@ static int __init md_setup(char *str) } md_setup_args[ent].level = level; md_setup_args[ent].chunk = 1 << (factor+12); - pername = "raid0"; + if (level == LEVEL_LINEAR) + pername = "linear"; + else + pername = "raid0"; break; } fallthrough; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c new file mode 100644 index 000000000000..53bc3fda9edb --- /dev/null +++ b/drivers/md/md-linear.c @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * linear.c : Multiple Devices driver for Linux Copyright (C) 1994-96 Marc + * ZYNGIER or + */ + +#include +#include +#include +#include +#include +#include +#include "md.h" + +struct dev_info { + struct md_rdev *rdev; + sector_t end_sector; +}; + +struct linear_conf { + struct rcu_head rcu; + sector_t array_sectors; + /* a copy of mddev->raid_disks */ + int raid_disks; + struct dev_info disks[] __counted_by(raid_disks); +}; + +/* + * find which device holds a particular offset + */ +static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) +{ + int lo, mid, hi; + struct linear_conf *conf; + + lo = 0; + hi = mddev->raid_disks - 1; + conf = mddev->private; + + /* + * Binary Search + */ + + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; +} + +static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct linear_conf *conf; + sector_t array_sectors; + + conf = mddev->private; + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + + return array_sectors; +} + +static int linear_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + int err; + + md_init_stacking_limits(&lim); + lim.max_hw_sectors = mddev->chunk_sectors; + lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.io_min = mddev->chunk_sectors << 9; + err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY); + if (err) { + queue_limits_cancel_update(mddev->gendisk->queue); + return err; + } + + return queue_limits_set(mddev->gendisk->queue, &lim); +} + +static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) +{ + struct linear_conf *conf; + struct md_rdev *rdev; + int ret = -EINVAL; + int cnt; + int i; + + conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL); + if (!conf) + return ERR_PTR(-ENOMEM); + + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + + cnt = 0; + conf->array_sectors = 0; + + rdev_for_each(rdev, mddev) { + int j = rdev->raid_disk; + struct dev_info *disk = conf->disks + j; + sector_t sectors; + + if (j < 0 || j >= raid_disks || disk->rdev) { + pr_warn("md/linear:%s: disk numbering problem. 
Aborting!\n", + mdname(mddev)); + goto out; + } + + disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } + + conf->array_sectors += rdev->sectors; + cnt++; + } + if (cnt != raid_disks) { + pr_warn("md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); + goto out; + } + + /* + * Here we calculate the device offsets. + */ + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; + + for (i = 1; i < raid_disks; i++) + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; + + if (!mddev_is_dm(mddev)) { + ret = linear_set_limits(mddev); + if (ret) + goto out; + } + + return conf; + +out: + kfree(conf); + return ERR_PTR(ret); +} + +static int linear_run(struct mddev *mddev) +{ + struct linear_conf *conf; + int ret; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + conf = linear_conf(mddev, mddev->raid_disks); + if (IS_ERR(conf)) + return PTR_ERR(conf); + + mddev->private = conf; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + + ret = md_integrity_register(mddev); + if (ret) { + kfree(conf); + mddev->private = NULL; + } + return ret; +} + +static int linear_add(struct mddev *mddev, struct md_rdev *rdev) +{ + /* Adding a drive to a linear array allows the array to grow. + * It is permitted if the new drive has a matching superblock + * already on it, with raid_disk equal to raid_disks. + * It is achieved by creating a new linear_private_data structure + * and swapping it in in-place of the current one. + * The current one is never freed until the array is stopped. + * This avoids races. + */ + struct linear_conf *newconf, *oldconf; + + if (rdev->saved_raid_disk != mddev->raid_disks) + return -EINVAL; + + rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; + + newconf = linear_conf(mddev, mddev->raid_disks + 1); + if (!newconf) + return -ENOMEM; + + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. 
+ */ + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); + mddev->raid_disks++; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); + kfree_rcu(oldconf, rcu); + return 0; +} + +static void linear_free(struct mddev *mddev, void *priv) +{ + struct linear_conf *conf = priv; + + kfree(conf); +} + +static bool linear_make_request(struct mddev *mddev, struct bio *bio) +{ + struct dev_info *tmp_dev; + sector_t start_sector, end_sector, data_offset; + sector_t bio_sector = bio->bi_iter.bi_sector; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) + return true; + + tmp_dev = which_dev(mddev, bio_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + end_sector = tmp_dev->end_sector; + data_offset = tmp_dev->rdev->data_offset; + + if (unlikely(bio_sector >= end_sector || + bio_sector < start_sector)) + goto out_of_bounds; + + if (unlikely(is_rdev_broken(tmp_dev->rdev))) { + md_error(mddev, tmp_dev->rdev); + bio_io_error(bio); + return true; + } + + if (unlikely(bio_end_sector(bio) > end_sector)) { + /* This bio crosses a device boundary, so we have to split it */ + struct bio *split = bio_split(bio, end_sector - bio_sector, + GFP_NOIO, &mddev->bio_set); + + if (IS_ERR(split)) { + bio->bi_status = errno_to_blk_status(PTR_ERR(split)); + bio_endio(bio); + return true; + } + + bio_chain(split, bio); + submit_bio_noacct(bio); + bio = split; + } + + md_account_bio(mddev, &bio); + bio_set_dev(bio, tmp_dev->rdev->bdev); + bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - + start_sector + data_offset; + + if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !bdev_max_discard_sectors(bio->bi_bdev))) { + /* Just ignore it */ + bio_endio(bio); + } else { + if (mddev->gendisk) + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_write_zeroes(mddev, bio); + submit_bio_noacct(bio); + } + return true; + +out_of_bounds: + pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_iter.bi_sector, + tmp_dev->rdev->bdev, + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + bio_io_error(bio); + return true; +} + +static void linear_status(struct seq_file *seq, struct mddev *mddev) +{ + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); +} + +static void linear_error(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) { + char *md_name = mdname(mddev); + + pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n", + md_name, rdev->bdev); + } +} + +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + +static struct md_personality linear_personality = { + .name = "linear", + .level = LEVEL_LINEAR, + .owner = THIS_MODULE, + .make_request = linear_make_request, + .run = linear_run, + .free = linear_free, + .status = linear_status, + .hot_add_disk = linear_add, + .size = linear_size, + .quiesce = linear_quiesce, + .error_handler = linear_error, +}; + +static int __init linear_init(void) +{ + return register_md_personality(&linear_personality); +} + +static void linear_exit(void) +{ + unregister_md_personality(&linear_personality); +} + +module_init(linear_init); 
+module_exit(linear_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)"); +MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ +MODULE_ALIAS("md-linear"); +MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md.c b/drivers/md/md.c index aebe12b0ee27..3dd013f82e26 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8124,7 +8124,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev) return; mddev->pers->error_handler(mddev, rdev); - if (mddev->pers->level == 0) + if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR) return; if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags)) diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 5a43c23f53bf..ff47b6f0ba0f 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -233,7 +233,7 @@ struct mdp_superblock_1 { char set_name[32]; /* set and interpreted by user-space */ __le64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ - __le32 level; /* 0,1,4,5 */ + __le32 level; /* 0,1,4,5, -1 (linear) */ __le32 layout; /* only for raid5 and raid10 currently */ __le64 size; /* used size of component devices, in 512byte sectors */ diff --git a/include/uapi/linux/raid/md_u.h b/include/uapi/linux/raid/md_u.h index 7be89a4906e7..a893010735fb 100644 --- a/include/uapi/linux/raid/md_u.h +++ b/include/uapi/linux/raid/md_u.h @@ -103,6 +103,8 @@ typedef struct mdu_array_info_s { } mdu_array_info_t; +#define LEVEL_LINEAR (-1) + /* we need a value for 'no level specified' and 0 * means 'raid0', so we need something else. This is * for internal use only From 4fa91616c078c203f1ab6c43f9524b7e352c8217 Mon Sep 17 00:00:00 2001 From: David Reaver Date: Wed, 8 Jan 2025 11:21:30 -0800 Subject: [PATCH 143/150] md: Replace deprecated kmap_atomic() with kmap_local_page() kmap_atomic() is deprecated and should be replaced with kmap_local_page() [1][2]. kmap_local_page() is faster in kernels with HIGHMEM enabled, can take page faults, and allows preemption. According to [2], this is safe as long as the code between kmap_atomic() and kunmap_atomic() does not implicitly depend on disabling page faults or preemption. It appears to me that none of the call sites in this patch depend on disabling page faults or preemption; they are all mapping a page to simply extract some information from it or print some debug info. 
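As a sketch, every conversion in the hunks below has the same shape
(page and seed names are illustrative):

    void *addr;

    addr = kmap_local_page(page);   /* mapping is local to this context;
                                     * may fault, preemption stays enabled */
    checksum = crc32c_le(seed, addr, PAGE_SIZE);
    kunmap_local(addr);             /* release in reverse order of mapping */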
[1] https://lwn.net/Articles/836144/ [2] https://docs.kernel.org/mm/highmem.html#temporary-virtual-mappings Signed-off-by: David Reaver Link: https://lore.kernel.org/r/20250108192131.46843-1-me@davidreaver.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 42 ++++++++++++++++++++-------------------- drivers/md/raid5-cache.c | 16 +++++++-------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c3a42dd66ce5..d16a89d673a1 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -682,7 +682,7 @@ static void bitmap_update_sb(void *data) return; if (!bitmap->storage.sb_page) /* no superblock */ return; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->events = cpu_to_le64(bitmap->mddev->events); if (bitmap->mddev->events < bitmap->events_cleared) /* rocking back to read-only */ @@ -702,7 +702,7 @@ static void bitmap_update_sb(void *data) sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> bitmap_info.space); - kunmap_atomic(sb); + kunmap_local(sb); if (bitmap->storage.file) write_file_page(bitmap, bitmap->storage.sb_page, 1); @@ -717,7 +717,7 @@ static void bitmap_print_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->storage.sb_page) return; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); pr_debug(" version: %u\n", le32_to_cpu(sb->version)); @@ -736,7 +736,7 @@ static void bitmap_print_sb(struct bitmap *bitmap) pr_debug(" sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb); + kunmap_local(sb); } /* @@ -760,7 +760,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) return -ENOMEM; bitmap->storage.sb_index = 0; - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->magic = cpu_to_le32(BITMAP_MAGIC); sb->version = cpu_to_le32(BITMAP_MAJOR_HI); @@ -768,7 +768,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) chunksize = bitmap->mddev->bitmap_info.chunksize; BUG_ON(!chunksize); if (!is_power_of_2(chunksize)) { - kunmap_atomic(sb); + kunmap_local(sb); pr_warn("bitmap chunksize not a power of 2\n"); return -EINVAL; } @@ -803,7 +803,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap) sb->events_cleared = cpu_to_le64(bitmap->mddev->events); bitmap->mddev->bitmap_info.nodes = 0; - kunmap_atomic(sb); + kunmap_local(sb); return 0; } @@ -865,7 +865,7 @@ re_read: return err; err = -EINVAL; - sb = kmap_atomic(sb_page); + sb = kmap_local_page(sb_page); chunksize = le32_to_cpu(sb->chunksize); daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; @@ -932,7 +932,7 @@ re_read: err = 0; out: - kunmap_atomic(sb); + kunmap_local(sb); if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { /* Assigning chunksize is required for "re_read" */ bitmap->mddev->bitmap_info.chunksize = chunksize; @@ -1161,12 +1161,12 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) bit = file_page_offset(&bitmap->storage, chunk); /* set the bit */ - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set_bit(bit, kaddr); else set_bit_le(bit, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); pr_debug("set file bit %lu page %lu\n", bit, 
index); /* record page number so it gets flushed to disk when unplug occurs */ set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY); @@ -1190,12 +1190,12 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) if (!page) return; bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) clear_bit(bit, paddr); else clear_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) { set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING); bitmap->allclean = 0; @@ -1214,12 +1214,12 @@ static int md_bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) if (!page) return -EINVAL; bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) set = test_bit(bit, paddr); else set = test_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); return set; } @@ -1388,9 +1388,9 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) * If the bitmap is out of date, dirty the whole page * and write it out */ - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); memset(paddr + offset, 0xff, PAGE_SIZE - offset); - kunmap_atomic(paddr); + kunmap_local(paddr); filemap_write_page(bitmap, i, true); if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) { @@ -1406,12 +1406,12 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) void *paddr; bool was_set; - paddr = kmap_atomic(page); + paddr = kmap_local_page(page); if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) was_set = test_bit(bit, paddr); else was_set = test_bit_le(bit, paddr); - kunmap_atomic(paddr); + kunmap_local(paddr); if (was_set) { /* if the disk bit is set, set the memory bit */ @@ -1546,10 +1546,10 @@ static void bitmap_daemon_work(struct mddev *mddev) bitmap_super_t *sb; bitmap->need_sync = 0; if (bitmap->storage.filemap) { - sb = kmap_atomic(bitmap->storage.sb_page); + sb = kmap_local_page(bitmap->storage.sb_page); sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb); + kunmap_local(sb); set_page_attr(bitmap, 0, BITMAP_PAGE_NEEDWRITE); } diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b4f7b79fd187..7a22cb2e5ac3 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1023,10 +1023,10 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) /* checksum is already calculated in last run */ if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) continue; - addr = kmap_atomic(sh->dev[i].page); + addr = kmap_local_page(sh->dev[i].page); sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); + kunmap_local(addr); } parity_pages = 1 + !!(sh->qd_idx >= 0); data_pages = write_disks - parity_pages; @@ -1979,9 +1979,9 @@ r5l_recovery_verify_data_checksum(struct r5l_log *log, u32 checksum; r5l_recovery_read_page(log, ctx, page, log_offset); - addr = kmap_atomic(page); + addr = kmap_local_page(page); checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); + kunmap_local(addr); return (le32_to_cpu(log_checksum) == checksum) ? 
0 : -EINVAL; } @@ -2381,11 +2381,11 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, payload->size = cpu_to_le32(BLOCK_SECTORS); payload->location = cpu_to_le64( raid5_compute_blocknr(sh, i, 0)); - addr = kmap_atomic(dev->page); + addr = kmap_local_page(dev->page); payload->checksum[0] = cpu_to_le32( crc32c_le(log->uuid_checksum, addr, PAGE_SIZE)); - kunmap_atomic(addr); + kunmap_local(addr); sync_page_io(log->rdev, write_pos, PAGE_SIZE, dev->page, REQ_OP_WRITE, false); write_pos = r5l_ring_add(log, write_pos, @@ -2888,10 +2888,10 @@ int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) continue; - addr = kmap_atomic(sh->dev[i].page); + addr = kmap_local_page(sh->dev[i].page); sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); + kunmap_local(addr); pages++; } WARN_ON(pages == 0); From 08c50142a128dcb2d7060aa3b4c5db8837f7a46a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 Jan 2025 09:51:41 +0800 Subject: [PATCH 144/150] md/md-bitmap: factor behind write counters out from bitmap_{start/end}write() behind_write is only used in raid1, prepare to refactor bitmap_{start/end}write(), there are no functional changes. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Link: https://lore.kernel.org/r/20250109015145.158868-2-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 57 +++++++++++++++++++++++++--------------- drivers/md/md-bitmap.h | 7 +++-- drivers/md/raid1.c | 12 +++++---- drivers/md/raid10.c | 6 ++--- drivers/md/raid5-cache.c | 3 +-- drivers/md/raid5.c | 11 ++++---- 6 files changed, 56 insertions(+), 40 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index d16a89d673a1..e1d3970c300d 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1671,24 +1671,13 @@ __acquires(bitmap->lock) } static int bitmap_startwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool behind) + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return 0; - if (behind) { - int bw; - atomic_inc(&bitmap->behind_writes); - bw = atomic_read(&bitmap->behind_writes); - if (bw > bitmap->behind_writes_used) - bitmap->behind_writes_used = bw; - - pr_debug("inc write-behind count %d/%lu\n", - bw, bitmap->mddev->bitmap_info.max_write_behind); - } - while (sectors) { sector_t blocks; bitmap_counter_t *bmc; @@ -1737,21 +1726,13 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, } static void bitmap_endwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool success, bool behind) + unsigned long sectors, bool success) { struct bitmap *bitmap = mddev->bitmap; if (!bitmap) return; - if (behind) { - if (atomic_dec_and_test(&bitmap->behind_writes)) - wake_up(&bitmap->behind_wait); - pr_debug("dec write-behind count %d/%lu\n", - atomic_read(&bitmap->behind_writes), - bitmap->mddev->bitmap_info.max_write_behind); - } - while (sectors) { sector_t blocks; unsigned long flags; @@ -2062,6 +2043,37 @@ static void md_bitmap_free(void *data) kfree(bitmap); } +static void bitmap_start_behind_write(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + int bw; + + if (!bitmap) + return; + + atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + + pr_debug("inc write-behind count %d/%lu\n", + bw, bitmap->mddev->bitmap_info.max_write_behind); +} + +static void 
bitmap_end_behind_write(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + return; + + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); + pr_debug("dec write-behind count %d/%lu\n", + atomic_read(&bitmap->behind_writes), + bitmap->mddev->bitmap_info.max_write_behind); +} + static void bitmap_wait_behind_writes(struct mddev *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -2981,6 +2993,9 @@ static struct bitmap_operations bitmap_ops = { .dirty_bits = bitmap_dirty_bits, .unplug = bitmap_unplug, .daemon_work = bitmap_daemon_work, + + .start_behind_write = bitmap_start_behind_write, + .end_behind_write = bitmap_end_behind_write, .wait_behind_writes = bitmap_wait_behind_writes, .startwrite = bitmap_startwrite, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 662e6fc141a7..e87a1f493d3c 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -84,12 +84,15 @@ struct bitmap_operations { unsigned long e); void (*unplug)(struct mddev *mddev, bool sync); void (*daemon_work)(struct mddev *mddev); + + void (*start_behind_write)(struct mddev *mddev); + void (*end_behind_write)(struct mddev *mddev); void (*wait_behind_writes)(struct mddev *mddev); int (*startwrite)(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool behind); + unsigned long sectors); void (*endwrite)(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool success, bool behind); + unsigned long sectors, bool success); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 519c56f0ee3d..15ba7a001f30 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -420,10 +420,11 @@ static void close_write(struct r1bio *r1_bio) r1_bio->behind_master_bio = NULL; } + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->end_behind_write(mddev); /* clear the bitmap if all writes complete successfully */ mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state), - test_bit(R1BIO_BehindIO, &r1_bio->state)); + !test_bit(R1BIO_Degraded, &r1_bio->state)); md_write_end(mddev); } @@ -1645,9 +1646,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, stats.behind_writes < max_write_behind) alloc_behind_master_bio(r1_bio, bio); - mddev->bitmap_ops->startwrite( - mddev, r1_bio->sector, r1_bio->sectors, - test_bit(R1BIO_BehindIO, &r1_bio->state)); + if (test_bit(R1BIO_BehindIO, &r1_bio->state)) + mddev->bitmap_ops->start_behind_write(mddev); + mddev->bitmap_ops->startwrite(mddev, r1_bio->sector, + r1_bio->sectors); first_clone = 0; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7d7a8a2524dc..c3a93b2a26a6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -430,8 +430,7 @@ static void close_write(struct r10bio *r10_bio) /* clear the bitmap if all writes complete successfully */ mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state), - false); + !test_bit(R10BIO_Degraded, &r10_bio->state)); md_write_end(mddev); } @@ -1519,8 +1518,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, md_account_bio(mddev, &bio); r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); - mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors, - false); + 
mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 7a22cb2e5ac3..1ac33d82bf8c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -315,8 +315,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, r5c_return_dev_pending_writes(conf, &sh->dev[i]); conf->mddev->bitmap_ops->endwrite(conf->mddev, sh->sector, RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - false); + !test_bit(STRIPE_DEGRADED, &sh->state)); } } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f09e7677ee9f..93cc7e252dd4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3564,7 +3564,7 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, - RAID5_STRIPE_SECTORS(conf), false); + RAID5_STRIPE_SECTORS(conf)); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3665,7 +3665,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bitmap_end) conf->mddev->bitmap_ops->endwrite(conf->mddev, sh->sector, RAID5_STRIPE_SECTORS(conf), - false, false); + false); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3712,7 +3712,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bitmap_end) conf->mddev->bitmap_ops->endwrite(conf->mddev, sh->sector, RAID5_STRIPE_SECTORS(conf), - false, false); + false); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -4063,8 +4063,7 @@ returnbi: } conf->mddev->bitmap_ops->endwrite(conf->mddev, sh->sector, RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state), - false); + !test_bit(STRIPE_DEGRADED, &sh->state)); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -5787,7 +5786,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) for (d = 0; d < conf->raid_disks - conf->max_degraded; d++) mddev->bitmap_ops->startwrite(mddev, sh->sector, - RAID5_STRIPE_SECTORS(conf), false); + RAID5_STRIPE_SECTORS(conf)); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); } From 4f0e7d0e03b7b80af84759a9e7cfb0f81ac4adae Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 Jan 2025 09:51:42 +0800 Subject: [PATCH 145/150] md/md-bitmap: remove the last parameter for bimtap_ops->endwrite() For the case that IO failed for one rdev, the bit will be mark as NEEDED in following cases: 1) If badblocks is set and rdev is not faulty; 2) If rdev is faulty; Case 1) is useless because synchronize data to badblocks make no sense. Case 2) can be replaced with mddev->degraded. Also remove R1BIO_Degraded, R10BIO_Degraded and STRIPE_DEGRADED since case 2) no longer use them. 
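Condensed from the md-bitmap.c hunk below, the per-counter logic in
bitmap_endwrite() becomes (bmc is the in-memory counter for the
region):

    if (!bitmap->mddev->degraded) {
            /* no missing device: safe to advance events_cleared */
            if (bitmap->events_cleared < bitmap->mddev->events) {
                    bitmap->events_cleared = bitmap->mddev->events;
                    bitmap->need_sync = 1;
            }
    } else if (!NEEDED(*bmc)) {
            /* degraded array: keep the region marked for later resync */
            *bmc |= NEEDED_MASK;
    }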
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20250109015145.158868-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 19 ++++++++++--------- drivers/md/md-bitmap.h | 2 +- drivers/md/raid1.c | 26 +++----------------------- drivers/md/raid1.h | 1 - drivers/md/raid10.c | 23 +++-------------------- drivers/md/raid10.h | 1 - drivers/md/raid5-cache.c | 3 +-- drivers/md/raid5.c | 15 +++------------ drivers/md/raid5.h | 1 - 9 files changed, 21 insertions(+), 70 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index e1d3970c300d..ec4ecd96e6b1 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1726,7 +1726,7 @@ static int bitmap_startwrite(struct mddev *mddev, sector_t offset, } static void bitmap_endwrite(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool success) + unsigned long sectors) { struct bitmap *bitmap = mddev->bitmap; @@ -1745,15 +1745,16 @@ static void bitmap_endwrite(struct mddev *mddev, sector_t offset, return; } - if (success && !bitmap->mddev->degraded && - bitmap->events_cleared < bitmap->mddev->events) { - bitmap->events_cleared = bitmap->mddev->events; - bitmap->need_sync = 1; - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); - } - - if (!success && !NEEDED(*bmc)) + if (!bitmap->mddev->degraded) { + if (bitmap->events_cleared < bitmap->mddev->events) { + bitmap->events_cleared = bitmap->mddev->events; + bitmap->need_sync = 1; + sysfs_notify_dirent_safe( + bitmap->sysfs_can_clear); + } + } else if (!NEEDED(*bmc)) { *bmc |= NEEDED_MASK; + } if (COUNTER(*bmc) == COUNTER_MAX) wake_up(&bitmap->overflow_wait); diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index e87a1f493d3c..31c93019c76b 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -92,7 +92,7 @@ struct bitmap_operations { int (*startwrite)(struct mddev *mddev, sector_t offset, unsigned long sectors); void (*endwrite)(struct mddev *mddev, sector_t offset, - unsigned long sectors, bool success); + unsigned long sectors); bool (*start_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks, bool degraded); void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 15ba7a001f30..4d09d85321b6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -423,8 +423,7 @@ static void close_write(struct r1bio *r1_bio) if (test_bit(R1BIO_BehindIO, &r1_bio->state)) mddev->bitmap_ops->end_behind_write(mddev); /* clear the bitmap if all writes complete successfully */ - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors, - !test_bit(R1BIO_Degraded, &r1_bio->state)); + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors); md_write_end(mddev); } @@ -481,8 +480,6 @@ static void raid1_end_write_request(struct bio *bio) if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); else { - /* Fail the request */ - set_bit(R1BIO_Degraded, &r1_bio->state); /* Finished with this branch */ r1_bio->bios[mirror] = NULL; to_put = bio; @@ -1536,11 +1533,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, write_behind = true; r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags)) { - if (i < conf->raid_disks) - set_bit(R1BIO_Degraded, &r1_bio->state); + if (!rdev || test_bit(Faulty, &rdev->flags)) continue; - } atomic_inc(&rdev->nr_pending); if (test_bit(WriteErrorSeen, &rdev->flags)) { @@ -1559,16 +1553,6 @@ static void raid1_write_request(struct mddev *mddev, 
struct bio *bio, */ max_sectors = bad_sectors; rdev_dec_pending(rdev, mddev); - /* We don't set R1BIO_Degraded as that - * only applies if the disk is - * missing, so it might be re-added, - * and we want to know to recover this - * chunk. - * In this case the device is here, - * and the fact that this chunk is not - * in-sync is recorded in the bad - * block log - */ continue; } if (is_bad) { @@ -2616,12 +2600,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * errors. */ fail = true; - if (!narrow_write_error(r1_bio, m)) { + if (!narrow_write_error(r1_bio, m)) md_error(conf->mddev, conf->mirrors[m].rdev); /* an I/O failed, we can't clear the bitmap */ - set_bit(R1BIO_Degraded, &r1_bio->state); - } rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } @@ -2712,8 +2694,6 @@ static void raid1d(struct md_thread *thread) list_del(&r1_bio->retry_list); idx = sector_to_idx(r1_bio->sector); atomic_dec(&conf->nr_queued[idx]); - if (mddev->degraded) - set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(R1BIO_WriteError, &r1_bio->state)) close_write(r1_bio); raid_end_bio_io(r1_bio); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 5300cbaa58a4..33f318fcc268 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -188,7 +188,6 @@ struct r1bio { enum r1bio_state { R1BIO_Uptodate, R1BIO_IsSync, - R1BIO_Degraded, R1BIO_BehindIO, /* Set ReadError on bios that experience a readerror so that * raid1d knows what to do with them. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c3a93b2a26a6..340a4710c222 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -429,8 +429,7 @@ static void close_write(struct r10bio *r10_bio) struct mddev *mddev = r10_bio->mddev; /* clear the bitmap if all writes complete successfully */ - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors, - !test_bit(R10BIO_Degraded, &r10_bio->state)); + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors); md_write_end(mddev); } @@ -500,7 +499,6 @@ static void raid10_end_write_request(struct bio *bio) set_bit(R10BIO_WriteError, &r10_bio->state); else { /* Fail the request */ - set_bit(R10BIO_Degraded, &r10_bio->state); r10_bio->devs[slot].bio = NULL; to_put = bio; dec_rdev = 1; @@ -1437,10 +1435,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev && !rrdev) { - set_bit(R10BIO_Degraded, &r10_bio->state); + if (!rdev && !rrdev) continue; - } if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; @@ -1457,14 +1453,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, * to other devices yet */ max_sectors = bad_sectors; - /* We don't set R10BIO_Degraded as that - * only applies if the disk is missing, - * so it might be re-added, and we want to - * know to recover this chunk. - * In this case the device is here, and the - * fact that this chunk is not in-sync is - * recorded in the bad block log. 
- */ continue; } if (is_bad) { @@ -2964,11 +2952,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && bio->bi_status) { fail = true; - if (!narrow_write_error(r10_bio, m)) { + if (!narrow_write_error(r10_bio, m)) md_error(conf->mddev, rdev); - set_bit(R10BIO_Degraded, - &r10_bio->state); - } rdev_dec_pending(rdev, conf->mddev); } bio = r10_bio->devs[m].repl_bio; @@ -3027,8 +3012,6 @@ static void raid10d(struct md_thread *thread) r10_bio = list_first_entry(&tmp, struct r10bio, retry_list); list_del(&r10_bio->retry_list); - if (mddev->degraded) - set_bit(R10BIO_Degraded, &r10_bio->state); if (test_bit(R10BIO_WriteError, &r10_bio->state)) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 2e75e88d0802..3f16ad6904a9 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -161,7 +161,6 @@ enum r10bio_state { R10BIO_IsSync, R10BIO_IsRecover, R10BIO_IsReshape, - R10BIO_Degraded, /* Set ReadError on bios that experience a read error * so that raid10d knows what to do with them. */ diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 1ac33d82bf8c..217b3223ef70 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -314,8 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state)); + sh->sector, RAID5_STRIPE_SECTORS(conf)); } } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 93cc7e252dd4..a5a619400d8f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1345,8 +1345,6 @@ again: submit_bio_noacct(rbi); } if (!rdev && !rrdev) { - if (op_is_write(op)) - set_bit(STRIPE_DEGRADED, &sh->state); pr_debug("skip op %d on disc %d for sector %llu\n", bi->bi_opf, i, (unsigned long long)sh->sector); clear_bit(R5_LOCKED, &sh->dev[i].flags); @@ -2884,7 +2882,6 @@ static void raid5_end_write_request(struct bio *bi) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { if (bi->bi_status) { - set_bit(STRIPE_DEGRADED, &sh->state); set_bit(WriteErrorSeen, &rdev->flags); set_bit(R5_WriteError, &sh->dev[i].flags); if (!test_and_set_bit(WantReplacement, &rdev->flags)) @@ -3664,8 +3661,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - false); + sh->sector, RAID5_STRIPE_SECTORS(conf)); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3711,8 +3707,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - false); + sh->sector, RAID5_STRIPE_SECTORS(conf)); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -4062,8 +4057,7 @@ returnbi: wbi = wbi2; } conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf), - !test_bit(STRIPE_DEGRADED, &sh->state)); + sh->sector, RAID5_STRIPE_SECTORS(conf)); if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -4340,7 +4334,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, s->locked++; set_bit(R5_Wantwrite, &dev->flags); - clear_bit(STRIPE_DEGRADED, &sh->state); 
set_bit(STRIPE_INSYNC, &sh->state); break; case check_state_run: @@ -4497,7 +4490,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, clear_bit(R5_Wantwrite, &dev->flags); s->locked--; } - clear_bit(STRIPE_DEGRADED, &sh->state); set_bit(STRIPE_INSYNC, &sh->state); break; @@ -4899,7 +4891,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED) | (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d174e586698f..69000fb90bd5 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -358,7 +358,6 @@ enum { STRIPE_REPLACED, STRIPE_PREREAD_ACTIVE, STRIPE_DELAYED, - STRIPE_DEGRADED, STRIPE_BIT_DELAY, STRIPE_EXPANDING, STRIPE_EXPAND_SOURCE, From 0c984a283a3ea3f10bebecd6c57c1d41b2e4f518 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 Jan 2025 09:51:43 +0800 Subject: [PATCH 146/150] md: add a new callback pers->bitmap_sector() This callback will be used in raid5 to convert io ranges from array to bitmap. Signed-off-by: Yu Kuai Reviewed-by: Xiao Ni Link: https://lore.kernel.org/r/20250109015145.158868-4-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/md.h b/drivers/md/md.h index 4ba93af36126..de6dadb9a40b 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -746,6 +746,9 @@ struct md_personality void *(*takeover) (struct mddev *mddev); /* Changes the consistency policy of an active array. */ int (*change_consistency_policy)(struct mddev *mddev, const char *buf); + /* convert io ranges from array to bitmap */ + void (*bitmap_sector)(struct mddev *mddev, sector_t *offset, + unsigned long *sectors); }; struct md_sysfs_entry { From 9c89f604476cf15c31fbbdb043cff7fbf1dbe0cb Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 Jan 2025 09:51:44 +0800 Subject: [PATCH 147/150] md/raid5: implement pers->bitmap_sector() Bitmap is used for the whole array for raid1/raid10, hence IO for the array can be used directly for bitmap. However, bitmap is used for underlying disks for raid5, hence IO for the array can't be used directly for bitmap. Implement pers->bitmap_sector() for raid5 to convert IO ranges from the array to the underlying disks. 
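To illustrate the contract of the new hook, here is a minimal user-space sketch (not kernel code; the struct and function names below, pers_ops, start_array_write and raid5_like_convert, plus the toy divide-by-three conversion, are invented for the sketch). Personalities whose bitmap already tracks array sectors, such as raid1/raid10, leave the callback NULL; raid5 installs a converter that rewrites the range in place before the bitmap is touched, which is roughly how the md core consumes the hook later in this series.

#include <stdio.h>

typedef unsigned long long sector_t;

struct pers_ops {
	/* optional: convert an IO range from array sectors to the
	 * sectors the bitmap actually tracks */
	void (*bitmap_sector)(sector_t *offset, unsigned long *sectors);
};

static void bitmap_startwrite(sector_t offset, unsigned long sectors)
{
	printf("bitmap_startwrite: %llu + %lu\n", offset, sectors);
}

static void start_array_write(const struct pers_ops *pers,
			      sector_t offset, unsigned long sectors)
{
	if (pers->bitmap_sector)	/* raid1/raid10 leave this NULL */
		pers->bitmap_sector(&offset, &sectors);
	bitmap_startwrite(offset, sectors);
}

/* toy converter: pretend three data disks share the range evenly */
static void raid5_like_convert(sector_t *offset, unsigned long *sectors)
{
	*offset /= 3;
	*sectors = (*sectors + 2) / 3;
}

int main(void)
{
	struct pers_ops raid1 = { 0 };
	struct pers_ops raid5 = { .bitmap_sector = raid5_like_convert };

	start_array_write(&raid1, 0, 96);	/* range hits the bitmap unchanged */
	start_array_write(&raid5, 0, 96);	/* range is converted first */
	return 0;
}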
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20250109015145.158868-5-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/raid5.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a5a619400d8f..a08f49b47b7e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5918,6 +5918,54 @@ static enum reshape_loc get_reshape_loc(struct mddev *mddev, return LOC_BEHIND_RESHAPE; } +static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset, + unsigned long *sectors) +{ + struct r5conf *conf = mddev->private; + sector_t start = *offset; + sector_t end = start + *sectors; + sector_t prev_start = start; + sector_t prev_end = end; + int sectors_per_chunk; + enum reshape_loc loc; + int dd_idx; + + sectors_per_chunk = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + start = round_down(start, sectors_per_chunk); + end = round_up(end, sectors_per_chunk); + + start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL); + end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL); + + /* + * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make + * progress, hence it's the same as LOC_BEHIND_RESHAPE. + */ + loc = get_reshape_loc(mddev, conf, prev_start); + if (likely(loc != LOC_AHEAD_OF_RESHAPE)) { + *offset = start; + *sectors = end - start; + return; + } + + sectors_per_chunk = conf->prev_chunk_sectors * + (conf->previous_raid_disks - conf->max_degraded); + prev_start = round_down(prev_start, sectors_per_chunk); + prev_end = round_down(prev_end, sectors_per_chunk); + + prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL); + prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL); + + /* + * for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO + * is handled in make_stripe_request(), we can't know this here hence + * we set bits for both. 
+ */ + *offset = min(start, prev_start); + *sectors = max(end, prev_end) - *offset; +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -8966,6 +9014,7 @@ static struct md_personality raid6_personality = .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, .prepare_suspend = raid5_prepare_suspend, + .bitmap_sector = raid5_bitmap_sector, }; static struct md_personality raid5_personality = { @@ -8991,6 +9040,7 @@ static struct md_personality raid5_personality = .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, .prepare_suspend = raid5_prepare_suspend, + .bitmap_sector = raid5_bitmap_sector, }; static struct md_personality raid4_personality = @@ -9017,6 +9067,7 @@ static struct md_personality raid4_personality = .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, .prepare_suspend = raid5_prepare_suspend, + .bitmap_sector = raid5_bitmap_sector, }; static int __init raid5_init(void) From cd5fc653381811f1e0ba65f5d169918cab61476f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 9 Jan 2025 09:51:45 +0800 Subject: [PATCH 148/150] md/md-bitmap: move bitmap_{start, end}write to md upper layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two BUG reports that raid5 will hang at bitmap_startwrite([1],[2]), root cause is that bitmap start write and end write is unbalanced, it's not quite clear where, and while reviewing raid5 code, it's found that bitmap operations can be optimized. For example, for a 4 disks raid5, with chunksize=8k, if user issue a IO (0 + 48k) to the array: ┌────────────────────────────────────────────────────────────┐ │chunk 0 │ │ ┌────────────┬─────────────┬─────────────┬────────────┼ │ sh0 │A0: 0 + 4k │A1: 8k + 4k │A2: 16k + 4k │A3: P │ │ ┼────────────┼─────────────┼─────────────┼────────────┼ │ sh1 │B0: 4k + 4k │B1: 12k + 4k │B2: 20k + 4k │B3: P │ ┼──────┴────────────┴─────────────┴─────────────┴────────────┼ │chunk 1 │ │ ┌────────────┬─────────────┬─────────────┬────────────┤ │ sh2 │C0: 24k + 4k│C1: 32k + 4k │C2: P │C3: 40k + 4k│ │ ┼────────────┼─────────────┼─────────────┼────────────┼ │ sh3 │D0: 28k + 4k│D1: 36k + 4k │D2: P │D3: 44k + 4k│ └──────┴────────────┴─────────────┴─────────────┴────────────┘ Before this patch, 4 stripe head will be used, and each sh will attach bio for 3 disks, and each attached bio will trigger bitmap_startwrite() once, which means total 12 times. - 3 times (0 + 4k), for (A0, A1 and A2) - 3 times (4 + 4k), for (B0, B1 and B2) - 3 times (8 + 4k), for (C0, C1 and C3) - 3 times (12 + 4k), for (D0, D1 and D3) After this patch, md upper layer will calculate that IO range (0 + 48k) is corresponding to the bitmap (0 + 16k), and call bitmap_startwrite() just once. Noted that this patch will align bitmap ranges to the chunks, for example, if user issue a IO (0 + 4k) to array: - Before this patch, 1 time (0 + 4k), for A0; - After this patch, 1 time (0 + 8k) for chunk 0; Usually, one bitmap bit will represent more than one disk chunk, and this doesn't have any difference. And even if user really created a array that one chunk contain multiple bits, the overhead is that more data will be recovered after power failure. Also remove STRIPE_BITMAP_PENDING since it's not used anymore. 
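For reference, the range calculation in the example above can be reproduced with a small user-space sketch, assuming no reshape and ignoring the parity rotation that raid5_compute_sector() performs: round the array range out to whole chunks across the data disks, then scale it down to per-disk sectors. The bitmap_range() helper below is illustrative only, not kernel API.

#include <stdio.h>

/*
 * Round an array IO range out to whole chunks and map it to the
 * per-disk range the bitmap tracks. Sketch only: ignores reshape and
 * the parity rotation handled by raid5_compute_sector().
 */
static void bitmap_range(unsigned long chunk_sectors, int data_disks,
			 unsigned long *offset, unsigned long *sectors)
{
	unsigned long stride = chunk_sectors * data_disks; /* array sectors per stripe row */
	unsigned long start = *offset / stride * chunk_sectors;
	unsigned long end = (*offset + *sectors + stride - 1) / stride * chunk_sectors;

	*offset = start;
	*sectors = end - start;
}

int main(void)
{
	/* 4-disk raid5 => 3 data disks, 8k chunks = 16 sectors */
	unsigned long offset, sectors;

	offset = 0; sectors = 96;	/* the 0 + 48k write */
	bitmap_range(16, 3, &offset, &sectors);
	printf("%lu + %lu sectors\n", offset, sectors);	/* 0 + 32, i.e. 0 + 16k */

	offset = 0; sectors = 8;	/* the 0 + 4k write */
	bitmap_range(16, 3, &offset, &sectors);
	printf("%lu + %lu sectors\n", offset, sectors);	/* 0 + 16, i.e. 0 + 8k */
	return 0;
}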
[1] https://lore.kernel.org/all/CAJpMwyjmHQLvm6zg1cmQErttNNQPDAAXPKM3xgTjMhbfts986Q@mail.gmail.com/ [2] https://lore.kernel.org/all/ADF7D720-5764-4AF3-B68E-1845988737AA@flyingcircus.io/ Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20250109015145.158868-6-yukuai1@huaweicloud.com Signed-off-by: Song Liu --- drivers/md/md.c | 29 +++++++++++++++++++++++ drivers/md/md.h | 2 ++ drivers/md/raid1.c | 4 ---- drivers/md/raid10.c | 3 --- drivers/md/raid5-cache.c | 2 -- drivers/md/raid5.c | 50 +++++----------------------------------- drivers/md/raid5.h | 3 --- 7 files changed, 37 insertions(+), 56 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3dd013f82e26..866015b681af 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8745,12 +8745,32 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); +static void md_bitmap_start(struct mddev *mddev, + struct md_io_clone *md_io_clone) +{ + if (mddev->pers->bitmap_sector) + mddev->pers->bitmap_sector(mddev, &md_io_clone->offset, + &md_io_clone->sectors); + + mddev->bitmap_ops->startwrite(mddev, md_io_clone->offset, + md_io_clone->sectors); +} + +static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone) +{ + mddev->bitmap_ops->endwrite(mddev, md_io_clone->offset, + md_io_clone->sectors); +} + static void md_end_clone_io(struct bio *bio) { struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; @@ -8775,6 +8795,12 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); + if (bio_data_dir(*bio) == WRITE && mddev->bitmap) { + md_io_clone->offset = (*bio)->bi_iter.bi_sector; + md_io_clone->sectors = bio_sectors(*bio); + md_bitmap_start(mddev, md_io_clone); + } + clone->bi_end_io = md_end_clone_io; clone->bi_private = md_io_clone; *bio = clone; @@ -8793,6 +8819,9 @@ void md_free_cloned_bio(struct bio *bio) struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap) + md_bitmap_end(mddev, md_io_clone); + if (bio->bi_status && !orig_bio->bi_status) orig_bio->bi_status = bio->bi_status; diff --git a/drivers/md/md.h b/drivers/md/md.h index de6dadb9a40b..def808064ad8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -831,6 +831,8 @@ struct md_io_clone { struct mddev *mddev; struct bio *orig_bio; unsigned long start_time; + sector_t offset; + unsigned long sectors; struct bio bio_clone; }; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4d09d85321b6..a5cd6522fc2d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -422,8 +422,6 @@ static void close_write(struct r1bio *r1_bio) if (test_bit(R1BIO_BehindIO, &r1_bio->state)) mddev->bitmap_ops->end_behind_write(mddev); - /* clear the bitmap if all writes complete successfully */ - mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors); md_write_end(mddev); } @@ -1632,8 +1630,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (test_bit(R1BIO_BehindIO, &r1_bio->state)) mddev->bitmap_ops->start_behind_write(mddev); - mddev->bitmap_ops->startwrite(mddev, r1_bio->sector, - r1_bio->sectors); first_clone = 0; } diff 
--git a/drivers/md/raid10.c b/drivers/md/raid10.c index 340a4710c222..e1e6cd7fb125 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -428,8 +428,6 @@ static void close_write(struct r10bio *r10_bio) { struct mddev *mddev = r10_bio->mddev; - /* clear the bitmap if all writes complete successfully */ - mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors); md_write_end(mddev); } @@ -1506,7 +1504,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, md_account_bio(mddev, &bio); r10_bio->master_bio = bio; atomic_set(&r10_bio->remaining, 1); - mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors); for (i = 0; i < conf->copies; i++) { if (r10_bio->devs[i].bio) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 217b3223ef70..e530271cb86b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -313,8 +313,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, if (sh->dev[i].written) { set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf)); } } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a08f49b47b7e..5c79429acc64 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -906,8 +906,7 @@ static bool stripe_can_batch(struct stripe_head *sh) if (raid5_has_log(conf) || raid5_has_ppl(conf)) return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && - !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && - is_full_stripe_write(sh); + is_full_stripe_write(sh); } /* we only do back search */ @@ -3545,29 +3544,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, sh->dev[dd_idx].sector); - if (conf->mddev->bitmap && firstwrite) { - /* Cannot hold spinlock over bitmap_startwrite, - * but must ensure this isn't added to a batch until - * we have added to the bitmap and set bm_seq. - * So set STRIPE_BITMAP_PENDING to prevent - * batching. - * If multiple __add_stripe_bio() calls race here they - * much all set STRIPE_BITMAP_PENDING. So only the first one - * to complete "bitmap_startwrite" gets to set - * STRIPE_BIT_DELAY. This is important as once a stripe - * is added to a batch, STRIPE_BIT_DELAY cannot be changed - * any more. 
- */ - set_bit(STRIPE_BITMAP_PENDING, &sh->state); - spin_unlock_irq(&sh->stripe_lock); - conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, - RAID5_STRIPE_SECTORS(conf)); - spin_lock_irq(&sh->stripe_lock); - clear_bit(STRIPE_BITMAP_PENDING, &sh->state); - if (!sh->batch_head) { - sh->bm_seq = conf->seq_flush+1; - set_bit(STRIPE_BIT_DELAY, &sh->state); - } + if (conf->mddev->bitmap && firstwrite && !sh->batch_head) { + sh->bm_seq = conf->seq_flush+1; + set_bit(STRIPE_BIT_DELAY, &sh->state); } } @@ -3618,7 +3597,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(sh->batch_head); for (i = disks; i--; ) { struct bio *bi; - int bitmap_end = 0; if (test_bit(R5_ReadError, &sh->dev[i].flags)) { struct md_rdev *rdev = conf->disks[i].rdev; @@ -3643,8 +3621,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].towrite = NULL; sh->overwrite_disks = 0; spin_unlock_irq(&sh->stripe_lock); - if (bi) - bitmap_end = 1; log_stripe_write_finished(sh); @@ -3659,10 +3635,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bio_io_error(bi); bi = nextbi; } - if (bitmap_end) - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf)); - bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; @@ -3671,7 +3643,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, sh->dev[i].page = sh->dev[i].orig_page; } - if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); @@ -3705,9 +3676,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = nextbi; } } - if (bitmap_end) - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf)); /* If we were in the middle of a write the parity block might * still be locked - so just clear all R5_LOCKED flags */ @@ -4056,8 +4024,7 @@ returnbi: bio_endio(wbi); wbi = wbi2; } - conf->mddev->bitmap_ops->endwrite(conf->mddev, - sh->sector, RAID5_STRIPE_SECTORS(conf)); + if (head_sh->batch_head) { sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -4882,8 +4849,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, (1 << STRIPE_COMPUTE_RUN) | (1 << STRIPE_DISCARD) | (1 << STRIPE_BATCH_READY) | - (1 << STRIPE_BATCH_ERR) | - (1 << STRIPE_BITMAP_PENDING)), + (1 << STRIPE_BATCH_ERR)), "stripe state: %lx\n", sh->state); WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) | (1 << STRIPE_REPLACED)), @@ -5774,10 +5740,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) } spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap) { - for (d = 0; d < conf->raid_disks - conf->max_degraded; - d++) - mddev->bitmap_ops->startwrite(mddev, sh->sector, - RAID5_STRIPE_SECTORS(conf)); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 69000fb90bd5..eafc6e9ed6ee 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -371,9 +371,6 @@ enum { STRIPE_ON_RELEASE_LIST, STRIPE_BATCH_READY, STRIPE_BATCH_ERR, - STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add - * to batch yet. 
- */ STRIPE_LOG_TRAPPED, /* trapped into log (see raid5-cache.c) * this bit is used in two scenarios: *

From 170e086ad3997f816d1f551f178a03a626a130b7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jan 2025 10:27:54 -0700 Subject: [PATCH 149/150] nvme: fix bogus kzalloc() return check in nvme_init_effects_log()

nvme_init_effects_log() returns failure when kzalloc() is successful, which is obviously wrong and causes failures to boot. Correct the check.

Fixes: d4a95adeabc6 ("nvme: Add error path for xa_store in nvme_init_effects") Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9e7f1bb81973..0d21258e2283 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3182,7 +3182,7 @@ static int nvme_init_effects_log(struct nvme_ctrl *ctrl, struct nvme_effects_log *effects, *old; effects = kzalloc(sizeof(*effects), GFP_KERNEL); - if (effects) + if (!effects) return -ENOMEM; old = xa_store(&ctrl->cels, csi, effects, GFP_KERNEL);

From a13030fd194c88961be4679f87a1380f1bda0ebe Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Mon, 13 Jan 2025 23:03:31 +0700 Subject: [PATCH 150/150] io_uring: simplify the SQPOLL thread check when cancelling requests

In io_uring_try_cancel_requests, we check whether sq_data->thread == current to determine if the function is called by the SQPOLL thread to do iopoll when IORING_SETUP_SQPOLL is set. This check can race with SQPOLL thread termination.

io_uring_try_cancel_requests is called from 2 places: io_uring_cancel_generic and io_ring_exit_work. In io_uring_cancel_generic, we already know whether current is the SQPOLL thread, and the SQPOLL thread never reaches io_ring_exit_work. So, to avoid the racy check, this commit adds a boolean flag to io_uring_try_cancel_requests that tells it whether the caller is the SQPOLL thread.
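As a rough user-space analogy of the design choice (this is not the io_uring code; sq_data_like, is_owner_racy and try_cancel are invented names), comparing a shared thread pointer against the current thread is inherently racy when the owning thread can tear that pointer down concurrently, while passing the fact the caller already knows sidesteps the check entirely:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Shared state: the owning thread may clear this as it exits. */
struct sq_data_like {
	pthread_t thread;
	bool thread_set;
};

/* Racy: thread_set/thread can change between the test and the compare. */
static bool is_owner_racy(const struct sq_data_like *sqd)
{
	return sqd->thread_set && pthread_equal(sqd->thread, pthread_self());
}

/* Race-free: the caller simply states what it already knows about itself. */
static void try_cancel(bool is_owner_thread)
{
	if (is_owner_thread)
		puts("owner thread: reap poll events inline");
	else
		puts("other task: leave polling to the owner");
}

int main(void)
{
	struct sq_data_like sqd = { .thread = pthread_self(), .thread_set = true };

	printf("racy check (safe here only because we are single-threaded): %d\n",
	       is_owner_racy(&sqd));
	try_cancel(true);	/* e.g. the owner cancelling its own contexts */
	try_cancel(false);	/* e.g. ring exit work, never the owner */
	return 0;
}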
Reported-by: syzbot+3c750be01dab672c513d@syzkaller.appspotmail.com Reported-by: Li Zetao Reviewed-by: Li Zetao Signed-off-by: Bui Quang Minh Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20250113160331.44057-1-minhquangbui99@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index af03e9973b58..20a46bc671ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -143,7 +143,8 @@ struct io_defer_entry { static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_uring_task *tctx, - bool cancel_all); + bool cancel_all, + bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); @@ -2869,7 +2870,8 @@ static __cold void io_ring_exit_work(struct work_struct *work) if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); - while (io_uring_try_cancel_requests(ctx, NULL, true)) + /* The SQPOLL thread never reaches this path */ + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { @@ -3037,7 +3039,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct io_uring_task *tctx, - bool cancel_all) + bool cancel_all, + bool is_sqpoll_thread) { struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; enum io_wq_cancel cret; @@ -3067,7 +3070,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { + is_sqpoll_thread) { while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; @@ -3140,13 +3143,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) continue; loop |= io_uring_try_cancel_requests(node->ctx, current->io_uring, - cancel_all); + cancel_all, + false); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, current->io_uring, - cancel_all); + cancel_all, + true); } if (loop) {