From 636119af94f2fbf3e4458be66a1bc740ba69ce6d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 14 Sep 2024 08:51:15 -0600
Subject: [PATCH 01/10] io_uring: rename "copy buffers" to "clone buffers"

A recent commit added support for copying registered buffers from one
ring to another. But that term is a bit confusing, as no copying of
buffer data is done here. What is being done is simply cloning the
buffer registrations from one ring to another.

Rename it while we still can, so that it's more descriptive. No
functional changes in this patch.
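For illustration only, a rough userspace sketch of driving the renamed
opcode could look like this (assumes uapi headers that already carry the
new names; error handling trimmed):

	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/io_uring.h>

	/* Clone the buffers registered with src_fd into dst_fd's table */
	static int clone_bufs(int dst_fd, int src_fd)
	{
		struct io_uring_clone_buffers buf;

		memset(&buf, 0, sizeof(buf));
		buf.src_fd = src_fd;	/* or a registered ring index, with */
		buf.flags = 0;		/* IORING_REGISTER_SRC_REGISTERED set */

		return syscall(__NR_io_uring_register, dst_fd,
			       IORING_REGISTER_CLONE_BUFFERS, &buf, 1);
	}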
Fixes: 7cc2a6eadcd7 ("io_uring: add IORING_REGISTER_COPY_BUFFERS method")
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h | 6 +++---
 io_uring/register.c           | 4 ++--
 io_uring/rsrc.c               | 8 ++++----
 io_uring/rsrc.h               | 2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 9dc5bb428c8a..1fe79e750470 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -609,8 +609,8 @@ enum io_uring_register_op {
 	IORING_REGISTER_CLOCK			= 29,
 
-	/* copy registered buffers from source ring to current ring */
-	IORING_REGISTER_COPY_BUFFERS		= 30,
+	/* clone registered buffers from source ring to current ring */
+	IORING_REGISTER_CLONE_BUFFERS		= 30,
 
 	/* this goes last */
 	IORING_REGISTER_LAST,
@@ -701,7 +701,7 @@ enum {
 	IORING_REGISTER_SRC_REGISTERED = 1,
 };
 
-struct io_uring_copy_buffers {
+struct io_uring_clone_buffers {
 	__u32	src_fd;
 	__u32	flags;
 	__u32	pad[6];
diff --git a/io_uring/register.c b/io_uring/register.c
index dab0f8024ddf..b8a48a6a89ee 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -542,11 +542,11 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clock(ctx, arg);
 		break;
-	case IORING_REGISTER_COPY_BUFFERS:
+	case IORING_REGISTER_CLONE_BUFFERS:
 		ret = -EINVAL;
 		if (!arg || nr_args != 1)
 			break;
-		ret = io_register_copy_buffers(ctx, arg);
+		ret = io_register_clone_buffers(ctx, arg);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 40696a395f0a..9264e555ae59 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -1139,7 +1139,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	return 0;
 }
 
-static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
 	struct io_mapped_ubuf **user_bufs;
 	struct io_rsrc_data *data;
@@ -1203,9 +1203,9 @@ static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
  *
  * Since the memory is already accounted once, don't account it again.
  */
-int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
+int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 {
-	struct io_uring_copy_buffers buf;
+	struct io_uring_clone_buffers buf;
 	bool registered_src;
 	struct file *file;
 	int ret;
@@ -1223,7 +1223,7 @@ int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	file = io_uring_register_get_file(buf.src_fd, registered_src);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
-	ret = io_copy_buffers(ctx, file->private_data);
+	ret = io_clone_buffers(ctx, file->private_data);
 	if (!registered_src)
 		fput(file);
 	return ret;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 93546ab337a6..eb4803e473b0 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -68,7 +68,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 			struct io_mapped_ubuf *imu,
 			u64 buf_addr, size_t len);
 
-int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,

From 8b0c6025a02ddec2b497f83e7d2f27a07f1d0653 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 15 Sep 2024 08:51:20 -0600
Subject: [PATCH 02/10] io_uring/rsrc: get rid of io_mapped_ubuf->folio_mask

We don't really need to cache this, let's reclaim 8 bytes from struct
io_mapped_ubuf and just calculate it when we need it. The only hot path
here is io_import_fixed().

Signed-off-by: Jens Axboe
---
 io_uring/rsrc.c | 9 +++------
 io_uring/rsrc.h | 1 -
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 9264e555ae59..2477995e2d65 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -988,13 +988,10 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
-	imu->folio_mask = PAGE_MASK;
-	if (coalesced) {
+	if (coalesced)
 		imu->folio_shift = data.folio_shift;
-		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
-	}
 	refcount_set(&imu->refs, 1);
-	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
+	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
 	*pimu = imu;
 	ret = 0;
 
@@ -1132,7 +1129,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 			iter->bvec = bvec + seg_skip;
 			iter->nr_segs -= seg_skip;
 			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ~imu->folio_mask;
+			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
 		}
 	}
 
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index eb4803e473b0..e290d2be3285 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -46,7 +46,6 @@ struct io_mapped_ubuf {
 	unsigned int	nr_bvecs;
 	unsigned int	folio_shift;
 	unsigned long	acct_pages;
-	unsigned long	folio_mask;
 	refcount_t	refs;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };

From 9753c642a53bc4fbdef06d372d389dce7d8cddc2 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sun, 15 Sep 2024 08:53:45 -0600
Subject: [PATCH 03/10] io_uring/rsrc: change ubuf->ubuf_end to length tracking

If we change it to tracking ubuf->start + ubuf->len, then we can reduce
the size of struct io_mapped_ubuf by another 4 bytes, effectively 8
bytes, as a hole is eliminated too. This shrinks io_mapped_ubuf to 32
bytes.
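For reference, on a typical 64-bit build the resulting layout works out
roughly as follows (offsets are illustrative, not pahole output):

	struct io_mapped_ubuf {
		u64		ubuf;		/* offset  0, size 8 */
		unsigned int	len;		/* offset  8, size 4 */
		unsigned int	nr_bvecs;	/* offset 12, size 4 */
		unsigned int	folio_shift;	/* offset 16, size 4 */
		refcount_t	refs;		/* offset 20, size 4 */
		unsigned long	acct_pages;	/* offset 24, size 8 */
		struct bio_vec	bvec[];		/* offset 32 */
	};					/* total: 32 bytes */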
Signed-off-by: Jens Axboe
---
 io_uring/fdinfo.c | 3 +--
 io_uring/rsrc.c   | 6 +++---
 io_uring/rsrc.h   | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index d43e1b5fcb36..6b1247664b35 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -177,9 +177,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
 		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
-		unsigned int len = buf->ubuf_end - buf->ubuf;
 
-		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
+		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
 	if (has_lock && !xa_empty(&ctx->personalities)) {
 		unsigned long index;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 2477995e2d65..131bcdda577a 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -38,7 +38,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 static const struct io_mapped_ubuf dummy_ubuf = {
 	/* set invalid range, so io_import_fixed() fails meeting it */
 	.ubuf = -1UL,
-	.ubuf_end = 0,
+	.len = UINT_MAX,
 };
 
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
@@ -985,7 +985,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
-	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->len = iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
 	if (coalesced)
@@ -1086,7 +1086,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
 		return -EFAULT;
 	/* not inside the mapped region */
-	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
+	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
 		return -EFAULT;
 
 	/*
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index e290d2be3285..8ed588036210 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -42,11 +42,11 @@ struct io_rsrc_node {
 
 struct io_mapped_ubuf {
 	u64		ubuf;
-	u64		ubuf_end;
+	unsigned int	len;
 	unsigned int	nr_bvecs;
 	unsigned int	folio_shift;
-	unsigned long	acct_pages;
 	refcount_t	refs;
+	unsigned long	acct_pages;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };

From a09c17240bdf2e9fa6d0591afa9448b59785f7d4 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 16 Sep 2024 02:58:06 -0600
Subject: [PATCH 04/10] io_uring/sqpoll: retain test for whether the CPU is valid

A recent commit ensured that SQPOLL cannot be set up with a CPU that
isn't in the current task's cpuset, but it also dropped testing whether
the CPU is valid in the first place.
Without that, if a task passes in a CPU value that is too high, the
following KASAN splat can get triggered:

BUG: KASAN: stack-out-of-bounds in io_sq_offload_create+0x858/0xaa4
Read of size 8 at addr ffff800089bc7b90 by task wq-aff.t/1391

CPU: 4 UID: 1000 PID: 1391 Comm: wq-aff.t Not tainted 6.11.0-rc7-00227-g371c468f4db6 #7080
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace.part.0+0xcc/0xe0
 show_stack+0x14/0x1c
 dump_stack_lvl+0x58/0x74
 print_report+0x16c/0x4c8
 kasan_report+0x9c/0xe4
 __asan_report_load8_noabort+0x1c/0x24
 io_sq_offload_create+0x858/0xaa4
 io_uring_setup+0x1394/0x17c4
 __arm64_sys_io_uring_setup+0x6c/0x180
 invoke_syscall+0x6c/0x260
 el0_svc_common.constprop.0+0x158/0x224
 do_el0_svc+0x3c/0x5c
 el0_svc+0x34/0x70
 el0t_64_sync_handler+0x118/0x124
 el0t_64_sync+0x168/0x16c

The buggy address belongs to stack of task wq-aff.t/1391
 and is located at offset 48 in frame:
 io_sq_offload_create+0x0/0xaa4

This frame has 1 object:
 [32, 40) 'allowed_mask'

The buggy address belongs to the virtual mapping at
 [ffff800089bc0000, ffff800089bc9000) created by:
 kernel_clone+0x124/0x7e0

The buggy address belongs to the physical page:
page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff0000d740af80 pfn:0x11740a
memcg:ffff0000c2706f02
flags: 0xbffe00000000000(node=0|zone=2|lastcpupid=0x1fff)
raw: 0bffe00000000000 0000000000000000 dead000000000122 0000000000000000
raw: ffff0000d740af80 0000000000000000 00000001ffffffff ffff0000c2706f02
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff800089bc7a80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 ffff800089bc7b00: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1
>ffff800089bc7b80: 00 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
                         ^
 ffff800089bc7c00: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1
 ffff800089bc7c80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f3

Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-lkp/202409161632.cbeeca0d-lkp@intel.com
Fixes: f011c9cf04c0 ("io_uring/sqpoll: do not allow pinning outside of cpuset")
Tested-by: Felix Moessbauer
Signed-off-by: Jens Axboe
---
 io_uring/sqpoll.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 272df9d00f45..7adfcf6818ff 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -465,6 +465,8 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 		int cpu = p->sq_thread_cpu;
 
 		ret = -EINVAL;
+		if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+			goto err_sqpoll;
 		cpuset_cpus_allowed(current, &allowed_mask);
 		if (!cpumask_test_cpu(cpu, &allowed_mask))
 			goto err_sqpoll;

From 7f44beadcc11adb98220556d2ddbe9c97aa6d42d Mon Sep 17 00:00:00 2001
From: Felix Moessbauer
Date: Mon, 16 Sep 2024 13:11:50 +0200
Subject: [PATCH 05/10] io_uring/sqpoll: do not put cpumask on stack

Putting the cpumask on the stack has been deprecated for a long time
(since commit 2d3854a37e8), as these masks can be big. Given that,
change the on-stack allowed_mask to a dynamically allocated one.
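The general shape of the pattern (a sketch of the API usage, not the
exact hunk below) is:

	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpuset_cpus_allowed(current, mask);
	if (!cpumask_test_cpu(cpu, mask))
		ret = -EINVAL;
	free_cpumask_var(mask);

With CONFIG_CPUMASK_OFFSTACK the mask is heap allocated; otherwise
cpumask_var_t is a plain array and the alloc/free calls compile down to
almost nothing, so the pattern costs nothing on small configurations.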
Fixes: f011c9cf04c0 ("io_uring/sqpoll: do not allow pinning outside of cpuset")
Signed-off-by: Felix Moessbauer
Link: https://lore.kernel.org/r/20240916111150.1266191-1-felix.moessbauer@siemens.com
Signed-off-by: Jens Axboe
---
 io_uring/sqpoll.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 7adfcf6818ff..7c79685baeb1 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -461,15 +461,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 		return 0;
 
 	if (p->flags & IORING_SETUP_SQ_AFF) {
-		struct cpumask allowed_mask;
+		cpumask_var_t allowed_mask;
 		int cpu = p->sq_thread_cpu;
 
 		ret = -EINVAL;
 		if (cpu >= nr_cpu_ids || !cpu_online(cpu))
 			goto err_sqpoll;
-		cpuset_cpus_allowed(current, &allowed_mask);
-		if (!cpumask_test_cpu(cpu, &allowed_mask))
+		ret = -ENOMEM;
+		if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
 			goto err_sqpoll;
+		ret = -EINVAL;
+		cpuset_cpus_allowed(current, allowed_mask);
+		if (!cpumask_test_cpu(cpu, allowed_mask)) {
+			free_cpumask_var(allowed_mask);
+			goto err_sqpoll;
+		}
+		free_cpumask_var(allowed_mask);
 		sqd->sq_cpu = cpu;
 	} else {
 		sqd->sq_cpu = -1;

From 2f6a55e4235f596b7dd9e8a7cf3e07f39ac5e9c2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Mon, 16 Sep 2024 17:07:10 +0300
Subject: [PATCH 06/10] io_uring: clean up a type in io_uring_register_get_file()

Originally "fd" was unsigned int but it was changed to int when we
pulled this code into a separate function in commit 0b6d253e084a
("io_uring/register: provide helper to get io_ring_ctx from 'fd'").
This doesn't really cause a runtime problem because the call to
array_index_nospec() will clamp negative fds to 0 and nothing else
uses the negative values.

Signed-off-by: Dan Carpenter
Link: https://lore.kernel.org/r/6f6cb630-079f-4fdf-bf95-1082e0a3fc6e@stanley.mountain
Signed-off-by: Jens Axboe
---
 io_uring/register.c | 2 +-
 io_uring/register.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/register.c b/io_uring/register.c
index b8a48a6a89ee..eca26d4884d9 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -561,7 +561,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
  * true, then the registered index is used. Otherwise, the normal fd table.
  * Caller must call fput() on the returned file, unless it's an ERR_PTR.
  */
-struct file *io_uring_register_get_file(int fd, bool registered)
+struct file *io_uring_register_get_file(unsigned int fd, bool registered)
 {
 	struct file *file;
 
diff --git a/io_uring/register.h b/io_uring/register.h
index cc69b88338fe..a5f39d5ef9e0 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -4,6 +4,6 @@
 
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
-struct file *io_uring_register_get_file(int fd, bool registered);
+struct file *io_uring_register_get_file(unsigned int fd, bool registered);
 
 #endif

From 53d69bdd5b19bb17602cb224e01aeed730ff3289 Mon Sep 17 00:00:00 2001
From: Olivier Langlois
Date: Mon, 16 Sep 2024 15:17:56 -0400
Subject: [PATCH 07/10] io_uring/sqpoll: do the napi busy poll outside the
 submission block

There are many small reasons justifying this change.

1. The busy poll must be performed even on rings that have no iopoll
   and no new sqe. It is quite possible that a ring configured for
   inbound traffic with multishot requests goes several hours without
   receiving new request submissions.

2. The NAPI busy poll does not perform any credential validation.
3. If the thread is awakened by task work, processing the task work
   takes priority over the NAPI busy loop. This is why a second loop
   has been created after the io_sq_tw() call instead of doing the busy
   loop in __io_sq_thread() outside its credential acquisition block.

Signed-off-by: Olivier Langlois
Link: https://lore.kernel.org/r/de7679adf1249446bd47426db01d82b9603b7224.1726161831.git.olivier@trillion01.com
Signed-off-by: Jens Axboe
---
 io_uring/sqpoll.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 7c79685baeb1..b800be719260 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -196,9 +196,6 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 			ret = io_submit_sqes(ctx, to_submit);
 		mutex_unlock(&ctx->uring_lock);
 
-		if (io_napi(ctx))
-			ret += io_napi_sqpoll_busy_poll(ctx);
-
 		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
 			wake_up(&ctx->sqo_sq_wait);
 		if (creds)
@@ -323,6 +320,10 @@ static int io_sq_thread(void *data)
 		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
 			sqt_spin = true;
 
+		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+			if (io_napi(ctx))
+				io_napi_sqpoll_busy_poll(ctx);
+
 		if (sqt_spin || !time_after(jiffies, timeout)) {
 			if (sqt_spin) {
 				io_sq_update_worktime(sqd, &start);

From 04beb6e0e08c30c6f845f50afb7d7953603d7a6f Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 18 Sep 2024 11:58:19 -0600
Subject: [PATCH 08/10] io_uring: check for presence of task_work rather than
 TIF_NOTIFY_SIGNAL

If some part of the kernel adds task_work that needs executing, in
terms of signaling it'll generally use TWA_SIGNAL or TWA_RESUME. Those
two directly translate to TIF_NOTIFY_SIGNAL or TIF_NOTIFY_RESUME, and
can be used for a variety of use cases outside of task_work.

However, io_cqring_wait_schedule() only tests explicitly for
TIF_NOTIFY_SIGNAL. This means it can miss if task_work got added for
the task, but used a different kind of signaling mechanism (or none at
all). Normally this doesn't matter as any task_work will be run once
the task exits to userspace, except if:

1) The ring is set up with DEFER_TASKRUN

2) The local work item may generate normal task_work

For condition 2, this can happen when closing a file and it's the final
put of that file, for example. This can cause stalls where a task is
waiting to make progress inside io_cqring_wait(), but there's nothing
else that will wake it up. Hence change the "should we schedule or loop
around" check to check for the presence of task_work explicitly, rather
than just TIF_NOTIFY_SIGNAL as the mechanism.

While in there, also change which type of task_work is run first, both
to make it consistent with other task_work runs in io_uring, and to
better handle the case of deferred task_work generating normal
task_work, like in the above example.
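For reference, task_work_pending() boils down to roughly the following
check (see <linux/task_work.h>), which is why it also catches work that
was queued with TWA_RESUME or TWA_NONE rather than TWA_SIGNAL:

	static inline bool task_work_pending(struct task_struct *task)
	{
		/* any callback queued via task_work_add(), regardless of
		 * how (or whether) the task was notified about it */
		return READ_ONCE(task->task_works);
	}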
Reported-by: Jan Hendrik Farr
Link: https://github.com/axboe/liburing/issues/1235
Cc: stable@vger.kernel.org
Fixes: 846072f16eed ("io_uring: mimimise io_cqring_wait_schedule")
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1aca501efaf6..11c455b638a2 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2461,7 +2461,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		return 1;
 	if (unlikely(!llist_empty(&ctx->work_llist)))
 		return 1;
-	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
+	if (unlikely(task_work_pending(current)))
 		return 1;
 	if (unlikely(task_sigpending(current)))
 		return -EINTR;
@@ -2568,9 +2568,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 	 * If we got woken because of task_work being processed, run it
 	 * now rather than let the caller do another wait loop.
 	 */
-	io_run_task_work();
 	if (!llist_empty(&ctx->work_llist))
 		io_run_local_work(ctx, nr_wait);
+	io_run_task_work();
 
 	/*
 	 * Non-local task_work will be run on exit to userspace, but

From eed138d67d99312f07ed3bc326903b6808885571 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Thu, 19 Sep 2024 23:38:01 -0600
Subject: [PATCH 09/10] io_uring: improve request linking trace

Right now any link trace is listed as being linked after the head
request in the chain, but it's more useful to note explicitly which
request a given new request is chained to. Change the link trace to
dump the tail request so that chains are immediately apparent when
looking at traces.

Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 11c455b638a2..d306f566a944 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2153,7 +2153,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * conditions are true (normal request), then just queue it.
 	 */
 	if (unlikely(link->head)) {
-		trace_io_uring_link(req, link->head);
+		trace_io_uring_link(req, link->last);
 		link->last->link = req;
 		link->last = req;

From eac2ca2d682f94f46b1973bdf5e77d85d77b8e53 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 20 Sep 2024 02:51:20 -0600
Subject: [PATCH 10/10] io_uring: check if we need to reschedule during
 overflow flush

In terms of normal application usage, this list will always be empty.
And if an application does overflow a bit, it'll have a few entries.
However, nothing obviously prevents syzbot from running a test case
that generates a ton of overflow entries, and then flushing them can
take quite a while.

Check for needing to reschedule while flushing, and drop our locks and
do so if necessary. There's no state to maintain here as overflows
always prune from head-of-list, hence it's fine to drop and reacquire
the locks at the end of the loop.
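As an aside, normal applications only meet this path once the CQ ring
has actually overflowed. A rough userspace sketch of how that is
detected and flushed (variable names and ring-mapping details are
illustrative only):

	/* sq_flags points at the mmap'ed SQ ring flags word */
	if (*(volatile unsigned int *)sq_flags & IORING_SQ_CQ_OVERFLOW)
		/* entering the kernel flushes overflowed CQEs */
		syscall(__NR_io_uring_enter, ring_fd, 0, 0,
			IORING_ENTER_GETEVENTS, NULL, 0);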
Link: https://lore.kernel.org/io-uring/66ed061d.050a0220.29194.0053.GAE@google.com/
Reported-by: syzbot+5fca234bd7eb378ff78e@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe
---
 io_uring/io_uring.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index d306f566a944..4199fbe6ce13 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -624,6 +624,21 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 		}
 		list_del(&ocqe->list);
 		kfree(ocqe);
+
+		/*
+		 * For silly syzbot cases that deliberately overflow by huge
+		 * amounts, check if we need to resched and drop and
+		 * reacquire the locks if so. Nothing real would ever hit this.
+		 * Ideally we'd have a non-posting unlock for this, but hard
+		 * to care for a non-real case.
+		 */
+		if (need_resched()) {
+			io_cq_unlock_post(ctx);
+			mutex_unlock(&ctx->uring_lock);
+			cond_resched();
+			mutex_lock(&ctx->uring_lock);
+			io_cq_lock(ctx);
+		}
 	}
 
 	if (list_empty(&ctx->cq_overflow_list)) {