From fe80eb15dea5125ea64845c9de0dd7f8478dd267 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Jan 2024 10:05:32 -0700 Subject: [PATCH 1/7] io_uring/rw: cleanup io_rw_done() This originally came from the aio side, and it's laid out rather oddly. The common case here is that we either get -EIOCBQUEUED from submitting an async request, or that we complete the request correctly with the given number of bytes. Handling the odd internal restart error codes is not a common operation. Lay it out a bit more optimally that better explains the normal flow, and switch to avoiding the indirect call completely as this is our kiocb and we know the completion handler can only be one of two possible variants. While at it, move it to where it belongs in the file, with fellow end IO helpers. Outside of being easier to read, this also reduces the text size of the function by 24 bytes for me on arm64. Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- io_uring/rw.c | 48 +++++++++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0c856726b15d..118cc9f1cf16 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -168,27 +168,6 @@ void io_readv_writev_cleanup(struct io_kiocb *req) kfree(io->free_iovec); } -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) -{ - switch (ret) { - case -EIOCBQUEUED: - break; - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - case -ERESTART_RESTARTBLOCK: - /* - * We can't just restart the syscall, since previously - * submitted sqes may already be in progress. Just fail this - * IO with EINTR. - */ - ret = -EINTR; - fallthrough; - default: - kiocb->ki_complete(kiocb, ret); - } -} - static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); @@ -371,6 +350,33 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) smp_store_release(&req->iopoll_completed, 1); } +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + /* IO was queued async, completion will happen later */ + if (ret == -EIOCBQUEUED) + return; + + /* transform internal restart error codes */ + if (unlikely(ret < 0)) { + switch (ret) { + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail + * this IO with EINTR. + */ + ret = -EINTR; + break; + } + } + + INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll, + io_complete_rw, kiocb, ret); +} + static int kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { From 3f302388d45855c0b24802e7b414e3fb29f172e3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Jan 2024 13:34:33 -0700 Subject: [PATCH 2/7] io_uring/rsrc: improve code generation for fixed file assignment For the normal read/write path, we have already locked the ring submission side when assigning the file. This causes branch mispredictions when we then check and try and lock again in io_req_set_rsrc_node(). As this is a very hot path, this matters. Add a basic helper that already assumes we already have it locked, and use that in io_file_get_fixed(). Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +++-- io_uring/rsrc.h | 14 +++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4afb911fc042..50c9f04bc193 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2000,9 +2000,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, goto out; fd = array_index_nospec(fd, ctx->nr_user_files); slot = io_fixed_file_slot(&ctx->file_table, fd); - file = io_slot_file(slot); + if (!req->rsrc_node) + __io_req_set_rsrc_node(req, ctx); req->flags |= io_slot_flags(slot); - io_req_set_rsrc_node(req, ctx, 0); + file = io_slot_file(slot); out: io_ring_submit_unlock(ctx, issue_flags); return file; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7238b9cfe33b..c6f199bbee28 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -102,17 +102,21 @@ static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, node->refs++; } +static inline void __io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx) +{ + lockdep_assert_held(&ctx->uring_lock); + req->rsrc_node = ctx->rsrc_node; + io_charge_rsrc_node(ctx, ctx->rsrc_node); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) { if (!req->rsrc_node) { io_ring_submit_lock(ctx, issue_flags); - - lockdep_assert_held(&ctx->uring_lock); - - req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx, ctx->rsrc_node); + __io_req_set_rsrc_node(req, ctx); io_ring_submit_unlock(ctx, issue_flags); } } From baf59771343dc0c2ef9ac3189bf9df2d6143654f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 17:08:32 -0700 Subject: [PATCH 3/7] io_uring/register: guard compat syscall with CONFIG_COMPAT Add compat.h include to avoid a potential build issue: io_uring/register.c:281:6: error: call to undeclared function 'in_compat_syscall'; ISO C99 and later do not support implicit function declarations [-Werror,-Wimplicit-function-declaration] if (in_compat_syscall()) { ^ 1 warning generated. io_uring/register.c:282:9: error: call to undeclared function 'compat_get_bitmap'; ISO C99 and later do not support implicit function declarations [-Werror,-Wimplicit-function-declaration] ret = compat_get_bitmap(cpumask_bits(new_mask), ^ Fixes: c43203154d8a ("io_uring/register: move io_uring_register(2) related code to register.c") Reported-by: Manu Bretelle Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- io_uring/register.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/io_uring/register.c b/io_uring/register.c index 708dd1d89add..5e62c1208996 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -278,13 +279,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, if (len > cpumask_size()) len = cpumask_size(); - if (in_compat_syscall()) { +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) ret = compat_get_bitmap(cpumask_bits(new_mask), (const compat_ulong_t __user *)arg, len * 8 /* CHAR_BIT */); - } else { + else +#endif ret = copy_from_user(new_mask, arg, len); - } if (ret) { free_cpumask_var(new_mask); From dc12d1799ce710fd90abbe0ced71e7e1ae0894fc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:26 +0000 Subject: [PATCH 4/7] io_uring: adjust defer tw counting The UINT_MAX work item counting bias in io_req_local_work_add() in case of !IOU_F_TWQ_LAZY_WAKE works in a sense that we will not miss a wake up, however it's still eerie. In particular, if we add a lazy work item after a non-lazy one, we'll increment it and get nr_tw==0, and subsequent adds may try to unnecessarily wake up the task, which is though not so likely to happen in real workloads. Half the bias, it's still large enough to be larger than any valid ->cq_wait_nr, which is limited by IORING_MAX_CQ_ENTRIES, but further have a good enough of space before it overflows. Fixes: 8751d15426a31 ("io_uring: reduce scheduling due to tw") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/108b971e958deaf7048342930c341ba90f75d806.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 50c9f04bc193..d40c767a6216 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1325,7 +1325,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) nr_tw = nr_tw_prev + 1; /* Large enough to fail the nr_wait comparison below */ if (!(flags & IOU_F_TWQ_LAZY_WAKE)) - nr_tw = -1U; + nr_tw = INT_MAX; req->nr_tw = nr_tw; req->io_task_work.node.next = first; From d381099f980b5f6c3c7e150baf13b0aaefc66c29 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:27 +0000 Subject: [PATCH 5/7] io_uring: clean up local tw add-wait sync Kill a smp_mb__after_atomic() right before wake_up, it's useless, and add a comment explaining implicit barriers from cmpxchg and synchronsation around ->cq_wait_nr with the waiter. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3007f3c2d53c72b61de56919ef56b53158b8276f.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d40c767a6216..3ab7e6a46149 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1332,6 +1332,14 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) } while (!try_cmpxchg(&ctx->work_llist.first, &first, &req->io_task_work.node)); + /* + * cmpxchg implies a full barrier, which pairs with the barrier + * in set_current_state() on the io_cqring_wait() side. It's used + * to ensure that either we see updated ->cq_wait_nr, or waiters + * going to sleep will observe the work added to the list, which + * is similar to the wait/wawke task state sync. + */ + if (!first) { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); @@ -1346,8 +1354,6 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) /* either not enough or the previous add has already woken it up */ if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) return; - /* pairs with set_current_state() in io_cqring_wait() */ - smp_mb__after_atomic(); wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } From e8c407717b4814dac5641d93cbbbb9fc394f7cf0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:28 +0000 Subject: [PATCH 6/7] io_uring: clean *local_work_add var naming if (!first) { ... } While it reads as do something if it's not the first entry, it does exactly the opposite because "first" here is a pointer to the first entry. Remove the confusion by renaming it into "head". Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3b8be483b52f58a524185bb88694b8a268e7e85d.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3ab7e6a46149..3508198d17ba 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1304,16 +1304,16 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; - struct llist_node *first; + struct llist_node *head; if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; - first = READ_ONCE(ctx->work_llist.first); + head = READ_ONCE(ctx->work_llist.first); do { nr_tw_prev = 0; - if (first) { - struct io_kiocb *first_req = container_of(first, + if (head) { + struct io_kiocb *first_req = container_of(head, struct io_kiocb, io_task_work.node); /* @@ -1328,8 +1328,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) nr_tw = INT_MAX; req->nr_tw = nr_tw; - req->io_task_work.node.next = first; - } while (!try_cmpxchg(&ctx->work_llist.first, &first, + req->io_task_work.node.next = head; + } while (!try_cmpxchg(&ctx->work_llist.first, &head, &req->io_task_work.node)); /* @@ -1340,7 +1340,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) * is similar to the wait/wawke task state sync. */ - if (!first) { + if (!head) { if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) From b4bc35cf8704db86203c0739711dab1804265bf3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 17 Jan 2024 00:57:29 +0000 Subject: [PATCH 7/7] io_uring: combine cq_wait_nr checks Instead of explicitly checking ->cq_wait_nr for whether there are waiting, which is currently represented by 0, we can store there a large value and the nr_tw will automatically filter out those cases. Add a named constant for that and for the wake up bias value. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/38def30282654d980673976cd42fde9bab19b297.1705438669.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3508198d17ba..b5fa3c7df1cf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -137,6 +137,14 @@ struct io_defer_entry { #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) +/* + * No waiters. It's larger than any valid value of the tw counter + * so that tests against ->cq_wait_nr would fail and skip wake_up(). + */ +#define IO_CQ_WAKE_INIT (-1U) +/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ +#define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) + static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -303,6 +311,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); @@ -1306,6 +1315,13 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; + /* See comment above IO_CQ_WAKE_INIT */ + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); + + /* + * We don't know how many reuqests is there in the link and whether + * they can even be queued lazily, fall back to non-lazy. + */ if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) flags &= ~IOU_F_TWQ_LAZY_WAKE; @@ -1322,10 +1338,14 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) */ nr_tw_prev = READ_ONCE(first_req->nr_tw); } + + /* + * Theoretically, it can overflow, but that's fine as one of + * previous adds should've tried to wake the task. + */ nr_tw = nr_tw_prev + 1; - /* Large enough to fail the nr_wait comparison below */ if (!(flags & IOU_F_TWQ_LAZY_WAKE)) - nr_tw = INT_MAX; + nr_tw = IO_CQ_WAKE_FORCE; req->nr_tw = nr_tw; req->io_task_work.node.next = head; @@ -1348,11 +1368,11 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) } nr_wait = atomic_read(&ctx->cq_wait_nr); - /* no one is waiting */ - if (!nr_wait) + /* not enough or no one is waiting */ + if (nr_tw < nr_wait) return; - /* either not enough or the previous add has already woken it up */ - if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) + /* the previous add has already woken it up */ + if (nr_tw_prev >= nr_wait) return; wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } @@ -2620,7 +2640,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); - atomic_set(&ctx->cq_wait_nr, 0); + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); /* * Run task_work after scheduling and before io_should_wake().