diff --git a/fs/io_uring.c b/fs/io_uring.c index 2a89928ac1ba..7d3589e3a277 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -106,7 +106,8 @@ #define IORING_MAX_REG_BUFFERS (1U << 14) #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC) + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_CQE_SKIP_SUCCESS) #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) @@ -721,6 +722,7 @@ enum { REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, + REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, /* first byte is taken by user flags, shift it to not overlap */ REQ_F_FAIL_BIT = 8, @@ -737,6 +739,7 @@ enum { REQ_F_REFCOUNT_BIT, REQ_F_ARM_LTIMEOUT_BIT, REQ_F_ASYNC_DATA_BIT, + REQ_F_SKIP_LINK_CQES_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -758,6 +761,8 @@ enum { REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), /* IOSQE_BUFFER_SELECT */ REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + /* IOSQE_CQE_SKIP_SUCCESS */ + REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), /* fail rest of links */ REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), @@ -791,6 +796,8 @@ enum { REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), /* ->async_data allocated */ REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + /* don't post CQEs while failing linked requests */ + REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), }; struct async_poll { @@ -1301,6 +1308,10 @@ static inline bool req_has_async_data(struct io_kiocb *req) static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; + if (req->flags & REQ_F_CQE_SKIP) { + req->flags &= ~REQ_F_CQE_SKIP; + req->flags |= REQ_F_SKIP_LINK_CQES; + } } static inline void req_fail_link_node(struct io_kiocb *req, int res) @@ -1843,7 +1854,8 @@ static inline bool __io_fill_cqe(struct io_ring_ctx *ctx, u64 user_data, static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags) { - __io_fill_cqe(req->ctx, req->user_data, res, cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(req->ctx, req->user_data, res, cflags); } static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, @@ -1859,7 +1871,8 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - __io_fill_cqe(ctx, req->user_data, res, cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked * free_list cache. @@ -2067,6 +2080,7 @@ static bool io_kill_linked_timeout(struct io_kiocb *req) link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { list_del(&link->timeout.list); + /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); return true; @@ -2079,6 +2093,7 @@ static void io_fail_links(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { struct io_kiocb *nxt, *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; req->link = NULL; while (link) { @@ -2091,7 +2106,10 @@ static void io_fail_links(struct io_kiocb *req) link->link = NULL; trace_io_uring_fail_link(req, link); - io_fill_cqe_req(link, res, 0); + if (!ignore_cqes) { + link->flags &= ~REQ_F_CQE_SKIP; + io_fill_cqe_req(link, res, 0); + } io_put_req_deferred(link); link = nxt; } @@ -2108,6 +2126,7 @@ static bool io_disarm_next(struct io_kiocb *req) req->flags &= ~REQ_F_ARM_LTIMEOUT; if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { io_remove_next_linked(req); + /* leave REQ_F_CQE_SKIP to io_fill_cqe_req */ io_fill_cqe_req(link, -ECANCELED, 0); io_put_req_deferred(link); posted = true; @@ -2372,8 +2391,9 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - __io_fill_cqe(ctx, req->user_data, req->result, - req->cflags); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(ctx, req->user_data, req->result, + req->cflags); } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); @@ -2503,12 +2523,14 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) prev = start; wq_list_for_each_resume(pos, prev) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + u32 cflags; /* order with io_complete_rw_iopoll(), e.g. ->result updates */ if (!smp_load_acquire(&req->iopoll_completed)) break; - __io_fill_cqe(ctx, req->user_data, req->result, - io_put_rw_kbuf(req)); + cflags = io_put_rw_kbuf(req); + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe(ctx, req->user_data, req->result, cflags); nr_events++; } @@ -5832,6 +5854,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe flags = READ_ONCE(sqe->len); if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; + if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) + return -EINVAL; io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); @@ -10442,7 +10466,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | - IORING_FEAT_RSRC_TAGS; + IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index c45b5e9a9387..787f491f0d2a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -70,6 +70,7 @@ enum { IOSQE_IO_HARDLINK_BIT, IOSQE_ASYNC_BIT, IOSQE_BUFFER_SELECT_BIT, + IOSQE_CQE_SKIP_SUCCESS_BIT, }; /* @@ -87,6 +88,8 @@ enum { #define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT) /* select buffer from sqe->buf_group */ #define IOSQE_BUFFER_SELECT (1U << IOSQE_BUFFER_SELECT_BIT) +/* don't post CQE if request succeeded */ +#define IOSQE_CQE_SKIP_SUCCESS (1U << IOSQE_CQE_SKIP_SUCCESS_BIT) /* * io_uring_setup() flags @@ -289,6 +292,7 @@ struct io_uring_params { #define IORING_FEAT_EXT_ARG (1U << 8) #define IORING_FEAT_NATIVE_WORKERS (1U << 9) #define IORING_FEAT_RSRC_TAGS (1U << 10) +#define IORING_FEAT_CQE_SKIP (1U << 11) /* * io_uring_register(2) opcodes and arguments