From c750629caeca01979da3403f4bebecda88713233 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Nov 2024 15:14:34 +0000 Subject: [PATCH 01/10] io_uring: remove io_uring_cqwait_reg_arg The separate wait argument registration API was removed; also delete the leftover uapi definitions. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/143b6a53591badac23632d3e6fa3e5db4b342ee2.1731942445.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4418d0192959..aac9a4f8fa9a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -873,20 +873,6 @@ enum { IORING_REG_WAIT_TS = (1U << 0), }; -/* - * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of - * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is - * called rather than pass in a wait argument structure separately. - */ -struct io_uring_cqwait_reg_arg { - __u32 flags; - __u32 struct_size; - __u32 nr_entries; - __u32 pad; - __u64 user_addr; - __u64 pad2[3]; -}; - /* * Argument for io_uring_enter(2) with * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument From e358e09a894dbcd51fdbbcf62bec1df249915834 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Nov 2024 15:14:50 +0000 Subject: [PATCH 02/10] io_uring: protect register tracing Syz reports: BUG: KCSAN: data-race in __se_sys_io_uring_register / io_sqe_files_register read-write to 0xffff8881021940b8 of 4 bytes by task 5923 on cpu 1: io_sqe_files_register+0x2c4/0x3b0 io_uring/rsrc.c:713 __io_uring_register io_uring/register.c:403 [inline] __do_sys_io_uring_register io_uring/register.c:611 [inline] __se_sys_io_uring_register+0x8d0/0x1280 io_uring/register.c:591 __x64_sys_io_uring_register+0x55/0x70 io_uring/register.c:591 x64_sys_call+0x202/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:428 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f read to 0xffff8881021940b8 of 4 bytes by task 5924 on cpu 0: __do_sys_io_uring_register io_uring/register.c:613 [inline] __se_sys_io_uring_register+0xe4a/0x1280 io_uring/register.c:591 __x64_sys_io_uring_register+0x55/0x70 io_uring/register.c:591 x64_sys_call+0x202/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:428 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xc9/0x1c0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f This should be due to reading the table size after the unlock. We don't care much, as the value is only printed in the trace, but we might as well read it under the lock.
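In other words, the fix is simply to emit the tracepoint before dropping the lock, so the file/buffer table sizes it reports are read under ctx->uring_lock. A condensed sketch of the resulting code, using only names from the hunk below (not the literal diff):

    mutex_lock(&ctx->uring_lock);
    ret = __io_uring_register(ctx, opcode, arg, nr_args);
    /* read the table sizes for the tracepoint while still holding the lock */
    trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
                            ctx->buf_table.nr, ret);
    mutex_unlock(&ctx->uring_lock);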
Reported-by: syzbot+5a486fef3de40e0d8c76@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8233af2886a37b57f79e444e3db88fcfda1817ac.1731942203.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/register.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/register.c b/io_uring/register.c index 1a60f4916649..1e99c783abdf 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -905,9 +905,10 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); - mutex_unlock(&ctx->uring_lock); + trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, ctx->buf_table.nr, ret); + mutex_unlock(&ctx->uring_lock); if (!use_registered_ring) fput(file); return ret; From 2ae6bdb1e145af0a47253953132decbd2d52f4b2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Nov 2024 12:15:05 +0300 Subject: [PATCH 03/10] io_uring/region: return negative -E2BIG in io_create_region() This code accidentally returns positive E2BIG instead of negative -E2BIG. The callers treat negatives and positives the same, so this doesn't affect the kernel, but the error code is returned to userspace via the system call. Fixes: dfbbfbf19187 ("io_uring: introduce concept of memory regions") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/d8ea3bef-74d8-4f77-8223-6d36464dd4dc@stanley.mountain Signed-off-by: Jens Axboe --- io_uring/memmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 6e6ee79ba94f..3d71756bc598 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -229,7 +229,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, if (!reg->size || reg->mmap_offset || reg->id) return -EINVAL; if ((reg->size >> PAGE_SHIFT) > INT_MAX) - return E2BIG; + return -E2BIG; if ((reg->user_addr | reg->size) & ~PAGE_MASK) return -EINVAL; if (check_add_overflow(reg->user_addr, reg->size, &end)) From 40cfe553240b32333b42652370ef5232e6ac59e1 Mon Sep 17 00:00:00 2001 From: David Wei Date: Wed, 20 Nov 2024 14:14:51 -0800 Subject: [PATCH 04/10] io_uring: add io_local_work_pending() In preparation for adding a new llist of tw to retry due to hitting the tw limit, add a helper io_local_work_pending(). This function returns true if there is any local tw pending. For now it only checks ctx->work_llist.
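For reference, the helper is a one-liner and call sites convert to it mechanically; a condensed sketch using only names from the diff below (not the literal hunks):

    static inline bool io_local_work_pending(struct io_ring_ctx *ctx)
    {
            return !llist_empty(&ctx->work_llist);
    }

    /* callers then do, e.g.: */
    if (io_local_work_pending(ctx))
            io_run_local_work(ctx, min_events);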
Signed-off-by: David Wei Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20241120221452.3762588-2-dw@davidwei.uk Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 +++++++------- io_uring/io_uring.h | 9 +++++++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index da8fd460977b..55e3618b726d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1261,7 +1261,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, int min_events) { - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return false; if (events < min_events) return true; @@ -1314,7 +1314,7 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, { struct io_tw_state ts = {}; - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return 0; return __io_run_local_work(ctx, &ts, min_events); } @@ -2329,7 +2329,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (!llist_empty(&ctx->work_llist)) { + if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); if (io_run_local_work(ctx, INT_MAX) > 0) return 0; @@ -2459,7 +2459,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, { if (unlikely(READ_ONCE(ctx->check_cq))) return 1; - if (unlikely(!llist_empty(&ctx->work_llist))) + if (unlikely(io_local_work_pending(ctx))) return 1; if (unlikely(task_work_pending(current))) return 1; @@ -2493,7 +2493,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, if (!io_allowed_run_tw(ctx)) return -EEXIST; - if (!llist_empty(&ctx->work_llist)) + if (io_local_work_pending(ctx)) io_run_local_work(ctx, min_events); io_run_task_work(); @@ -2564,7 +2564,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. 
*/ - if (!llist_empty(&ctx->work_llist)) + if (io_local_work_pending(ctx)) io_run_local_work(ctx, nr_wait); io_run_task_work(); @@ -3158,7 +3158,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) io_run_task_work(); io_uring_drop_tctx_refs(current); xa_for_each(&tctx->xa, index, node) { - if (!llist_empty(&node->ctx->work_llist)) { + if (io_local_work_pending(node->ctx)) { WARN_ON_ONCE(node->ctx->submitter_task && node->ctx->submitter_task != current); goto end_wait; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4070d4c8ef97..69eb3b23a5a0 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -347,9 +347,14 @@ static inline int io_run_task_work(void) return ret; } +static inline bool io_local_work_pending(struct io_ring_ctx *ctx) +{ + return !llist_empty(&ctx->work_llist); +} + static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return task_work_pending(current) || !llist_empty(&ctx->work_llist); + return task_work_pending(current) || io_local_work_pending(ctx); } static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) @@ -484,6 +489,6 @@ enum { static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - !llist_empty(&ctx->work_llist); + io_local_work_pending(ctx); } #endif From f46b9cdb22f7a167c36b6bcddaef7e8aee2598fa Mon Sep 17 00:00:00 2001 From: David Wei Date: Wed, 20 Nov 2024 14:14:52 -0800 Subject: [PATCH 05/10] io_uring: limit local tw done Instead of eagerly running all available local tw, limit the amount of local tw done to the max of IO_LOCAL_TW_DEFAULT_MAX (20) or wait_nr. The value of 20 is chosen as a reasonable heuristic to allow enough work batching but also keep latency down. Add a retry_llist that maintains a list of local tw that couldn't be done in time. No synchronisation is needed since it is only modified within the task context. 
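The resulting flow is roughly: drain the retry list first, run freshly queued work against whatever budget remains, and park anything over budget on the retry list for the next invocation. A condensed sketch of the new __io_run_local_work() logic (the loop helper counts the budget down; the diff below has the real code):

    limit = max(IO_LOCAL_TW_DEFAULT_MAX, min_events);   /* default cap: 20 */
again:
    /* 1) retry work left over from a previous, capped run */
    ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, limit);
    if (ctx->retry_llist.first)
            goto retry_done;    /* budget exhausted, leave work_llist alone */

    /* 2) run newly queued work with the remaining budget ... */
    node = llist_reverse_order(llist_del_all(&ctx->work_llist));
    ret = __io_run_local_work_loop(&node, ts, ret);
    /* 3) ... and park whatever didn't fit for the next run */
    ctx->retry_llist.first = node;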
Signed-off-by: David Wei Link: https://lore.kernel.org/r/20241120221452.3762588-3-dw@davidwei.uk Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 43 +++++++++++++++++++++++++--------- io_uring/io_uring.h | 2 +- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index aa5f5ea98076..3e934feb3187 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -335,6 +335,7 @@ struct io_ring_ctx { */ struct { struct llist_head work_llist; + struct llist_head retry_llist; unsigned long check_cq; atomic_t cq_wait_nr; atomic_t cq_timeouts; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 55e3618b726d..bfa93888f862 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -122,6 +122,7 @@ #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 +#define IO_LOCAL_TW_DEFAULT_MAX 20 struct io_defer_entry { struct list_head list; @@ -1256,6 +1257,8 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) struct llist_node *node = llist_del_all(&ctx->work_llist); __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); } static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, @@ -1270,37 +1273,55 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, return false; } +static int __io_run_local_work_loop(struct llist_node **node, + struct io_tw_state *ts, + int events) +{ + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, + io_task_work.node); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); + *node = next; + if (--events <= 0) + break; + } + + return events; +} + static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, int min_events) { struct llist_node *node; unsigned int loops = 0; - int ret = 0; + int ret, limit; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + limit = max(IO_LOCAL_TW_DEFAULT_MAX, min_events); again: + ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, limit); + if (ctx->retry_llist.first) + goto retry_done; + /* * llists are in reverse order, flip it back the right way before * running the pending items. 
*/ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - while (node) { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - req, ts); - ret++; - node = next; - } + ret = __io_run_local_work_loop(&node, ts, ret); + ctx->retry_llist.first = node; loops++; + ret = limit - ret; if (io_run_local_work_continue(ctx, ret, min_events)) goto again; +retry_done: io_submit_flush_completions(ctx); if (io_run_local_work_continue(ctx, ret, min_events)) goto again; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 69eb3b23a5a0..12abee607e4a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -349,7 +349,7 @@ static inline int io_run_task_work(void) static inline bool io_local_work_pending(struct io_ring_ctx *ctx) { - return !llist_empty(&ctx->work_llist); + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); } static inline bool io_task_work_pending(struct io_ring_ctx *ctx) From ee116574de8415b0673c466e6cd28ba5f70c41a2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Nov 2024 07:12:17 -0700 Subject: [PATCH 06/10] io_uring/nop: ensure nop->fd is always initialized A previous commit added file support for nop, but it only initializes nop->fd if IORING_NOP_FIXED_FILE is set. That check should be IORING_NOP_FILE. Fix up the condition in nop preparation, and initialize it to a sane value even if we're not going to be directly using it. While in there, do the same thing for the nop->buffer field. Reported-by: syzbot+9a8500a45c2cabdf9577@syzkaller.appspotmail.com Fixes: a85f31052bce ("io_uring/nop: add support for testing registered files and buffers") Signed-off-by: Jens Axboe --- io_uring/nop.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/io_uring/nop.c b/io_uring/nop.c index 6d470d4251ee..5e5196df650a 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -35,10 +35,14 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) nop->result = READ_ONCE(sqe->len); else nop->result = 0; - if (nop->flags & IORING_NOP_FIXED_FILE) + if (nop->flags & IORING_NOP_FILE) nop->fd = READ_ONCE(sqe->fd); + else + nop->fd = -1; if (nop->flags & IORING_NOP_FIXED_BUFFER) nop->buffer = READ_ONCE(sqe->buf_index); + else + nop->buffer = -1; return 0; } From 0c0a4eae26ac78379d0c1db053de168a8febc6c9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 26 Nov 2024 00:34:18 +0000 Subject: [PATCH 07/10] io_uring: check for overflows in io_pin_pages WARNING: CPU: 0 PID: 5834 at io_uring/memmap.c:144 io_pin_pages+0x149/0x180 io_uring/memmap.c:144 CPU: 0 UID: 0 PID: 5834 Comm: syz-executor825 Not tainted 6.12.0-next-20241118-syzkaller #0 Call Trace: __io_uaddr_map+0xfb/0x2d0 io_uring/memmap.c:183 io_rings_map io_uring/io_uring.c:2611 [inline] io_allocate_scq_urings+0x1c0/0x650 io_uring/io_uring.c:3470 io_uring_create+0x5b5/0xc00 io_uring/io_uring.c:3692 io_uring_setup io_uring/io_uring.c:3781 [inline] ... io_pin_pages()'s uaddr parameter came directly from the user and can be garbage. Don't just add size to it as it can overflow. 
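The fix replaces the open-coded page rounding with overflow-checked additions; a condensed before/after sketch of the hunk below:

    /* before: uaddr/len come straight from userspace and the sum can wrap */
    end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;

    /* after: reject the range if either addition overflows */
    if (check_add_overflow(uaddr, len, &end))
            return ERR_PTR(-EOVERFLOW);
    if (check_add_overflow(end, PAGE_SIZE - 1, &end))
            return ERR_PTR(-EOVERFLOW);
    end = end >> PAGE_SHIFT;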
Cc: stable@vger.kernel.org Reported-by: syzbot+2159cbb522b02847c053@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1b7520ddb168e1d537d64be47414a0629d0d8f8f.1732581026.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 3d71756bc598..ea08f19dc648 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -136,7 +136,12 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) struct page **pages; int ret; - end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (check_add_overflow(uaddr, len, &end)) + return ERR_PTR(-EOVERFLOW); + if (check_add_overflow(end, PAGE_SIZE - 1, &end)) + return ERR_PTR(-EOVERFLOW); + + end = end >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) From 49c5c63d48eb5b110580e4c4b937f0006fcc9b10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Nov 2024 13:42:27 -0700 Subject: [PATCH 08/10] io_uring: fix task_work cap overshooting A previous commit fixed task_work overrunning by a lot more than what the user asked for, by adding a retry list. However, it didn't cap the overall count, hence for multiple task_work runs inside the same wait loop, it'd still overshoot the target by potentially a large amount. Cap it generally inside the wait path. Note that this will still overshoot the default limit of 20, but should overshoot by no more than limit-1 in addition to the limit. That still provides a ceiling over how much task_work will be run, rather than still having gaps where it was uncapped essentially. Fixes: f46b9cdb22f7 ("io_uring: limit local tw done") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bfa93888f862..ae199e44da57 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1277,6 +1277,8 @@ static int __io_run_local_work_loop(struct llist_node **node, struct io_tw_state *ts, int events) { + int ret = 0; + while (*node) { struct llist_node *next = (*node)->next; struct io_kiocb *req = container_of(*node, struct io_kiocb, @@ -1285,27 +1287,27 @@ static int __io_run_local_work_loop(struct llist_node **node, io_poll_task_func, io_req_rw_complete, req, ts); *node = next; - if (--events <= 0) + if (++ret >= events) break; } - return events; + return ret; } static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, - int min_events) + int min_events, int max_events) { struct llist_node *node; unsigned int loops = 0; - int ret, limit; + int ret = 0; if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - limit = max(IO_LOCAL_TW_DEFAULT_MAX, min_events); again: - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, limit); + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); if (ctx->retry_llist.first) goto retry_done; @@ -1314,11 +1316,10 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, * running the pending items. 
*/ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - ret = __io_run_local_work_loop(&node, ts, ret); + ret += __io_run_local_work_loop(&node, ts, max_events - ret); ctx->retry_llist.first = node; loops++; - ret = limit - ret; if (io_run_local_work_continue(ctx, ret, min_events)) goto again; retry_done: @@ -1337,16 +1338,18 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, if (!io_local_work_pending(ctx)) return 0; - return __io_run_local_work(ctx, &ts, min_events); + return __io_run_local_work(ctx, &ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } -static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) +static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, + int max_events) { struct io_tw_state ts = {}; int ret; mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events); + ret = __io_run_local_work(ctx, &ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } @@ -2352,7 +2355,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) { if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx, INT_MAX) > 0) + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) return 0; } if (io_run_task_work() > 0) @@ -2515,7 +2518,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, if (!io_allowed_run_tw(ctx)) return -EEXIST; if (io_local_work_pending(ctx)) - io_run_local_work(ctx, min_events); + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); io_run_task_work(); if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) @@ -2586,7 +2590,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, * now rather than let the caller do another wait loop. */ if (io_local_work_pending(ctx)) - io_run_local_work(ctx, nr_wait); + io_run_local_work(ctx, nr_wait, nr_wait); io_run_task_work(); /* @@ -3098,7 +3102,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) - ret |= io_run_local_work(ctx, INT_MAX) > 0; + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, tctx, cancel_all); From 43eef70e7e2ac74e7767731dd806720c7fb5e010 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 25 Nov 2024 23:10:31 +0000 Subject: [PATCH 09/10] io_uring: fix corner case forgetting to vunmap io_pages_unmap() is a bit tricky in trying to figure out whether the pages were previously vmap'ed or not. In particular, if there is just one page, it believes there is no need to vunmap. The paired io_pages_map(), however, could have failed io_mem_alloc_compound() and fallen back to io_mem_alloc_single(), which does vmap, and that leads to an unpaired vmap. The solution is to fail when only a single page is needed and io_mem_alloc_compound() can't allocate it. That's the easiest way to deal with it, and those two functions are getting removed soon, so there is no need to overcomplicate it.
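Condensed sketch of the allocation path after the change, using names from the diff below: a single page may only come from the compound path, so io_pages_unmap() never has to guess whether a one-page mapping was vmap'ed:

    ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
    if (!IS_ERR(ret))
            goto done;
    /* one page: don't fall back, the fallback vmap()s and
     * io_pages_unmap() assumes single-page mappings were never vmap'ed */
    if (nr_pages == 1)
            goto fail;
    ret = io_mem_alloc_single(pages, nr_pages, size, gfp);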
Cc: stable@vger.kernel.org Fixes: 3ab1db3c6039e ("io_uring: get rid of remap_pfn_range() for mapping rings/sqes") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/477e75a3907a2fe83249e49c0a92cd480b2c60e0.1732569842.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/memmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/memmap.c b/io_uring/memmap.c index ea08f19dc648..57de9bccbf50 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -73,6 +73,8 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages, ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) goto done; + if (nr_pages == 1) + goto fail; ret = io_mem_alloc_single(pages, nr_pages, size, gfp); if (!IS_ERR(ret)) { @@ -81,7 +83,7 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages, *npages = nr_pages; return ret; } - +fail: kvfree(pages); *out_pages = NULL; *npages = 0; From 7eb75ce7527129d7f1fee6951566af409a37a1c4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Nov 2024 07:20:28 -0700 Subject: [PATCH 10/10] io_uring/tctx: work around xa_store() allocation error issue syzbot triggered the following WARN_ON: WARNING: CPU: 0 PID: 16 at io_uring/tctx.c:51 __io_uring_free+0xfa/0x140 io_uring/tctx.c:51 which is the WARN_ON_ONCE(!xa_empty(&tctx->xa)); sanity check in __io_uring_free() when a io_uring_task is going through its final put. The syzbot test case includes injecting memory allocation failures, and it very much looks like xa_store() can fail one of its memory allocations and end up with ->head being non-NULL even though no entries exist in the xarray. Until this issue gets sorted out, work around it by attempting to iterate entries in our xarray, and WARN_ON_ONCE() if one is found. Reported-by: syzbot+cc36d44ec9f368e443d3@syzkaller.appspotmail.com Link: https://lore.kernel.org/io-uring/673c1643.050a0220.87769.0066.GAE@google.com/ Signed-off-by: Jens Axboe --- io_uring/tctx.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 503f3ff8bc4f..adc6e42c14df 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -47,8 +47,19 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, void __io_uring_free(struct task_struct *tsk) { struct io_uring_task *tctx = tsk->io_uring; + struct io_tctx_node *node; + unsigned long index; - WARN_ON_ONCE(!xa_empty(&tctx->xa)); + /* + * Fault injection forcing allocation errors in the xa_store() path + * can lead to xa_empty() returning false, even though no actual + * node is stored in the xarray. Until that gets sorted out, attempt + * an iteration here and warn if any entries are found. + */ + xa_for_each(&tctx->xa, index, node) { + WARN_ON_ONCE(1); + break; + } WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs);
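As a side note, the property the workaround checks is "no real entries are stored"; a minimal sketch of how the same check could be written with the existing xa_find() helper instead of an open-coded loop (not part of this patch, and the helper name is made up for illustration):

    /* true if at least one entry is actually present in the xarray */
    static bool tctx_xa_has_entries(struct xarray *xa)
    {
            unsigned long index = 0;

            return xa_find(xa, &index, ULONG_MAX, XA_PRESENT) != NULL;
    }

    /* __io_uring_free() could then warn with: */
    WARN_ON_ONCE(tctx_xa_has_entries(&tctx->xa));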