io_uring: return an error when cqe is dropped

Right now io_uring will not actively inform userspace if a CQE is
dropped. This is extremely rare, requiring a CQ ring overflow, as well as
a GFP_ATOMIC kmalloc failure. However, the consequences can be serious: for
example, an application could end up in an undefined state, possibly waiting
for a CQE that never arrives.

Return an error code (EBADR) in these cases. Since this is expected to be
incredibly rare, try to avoid affecting the hot code paths as much as
possible: the error is only returned lazily, when there are no other CQEs
available.

Once the error is returned, reset the error condition, assuming the user is
either ok with it or will clean up appropriately.

Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220421091345.2115755-6-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Dylan Yudaken 2022-04-21 02:13:44 -07:00 committed by Jens Axboe
parent 10988a0a67
commit 155bc9505d

View File

@ -905,6 +905,7 @@ struct io_cqe {
enum { enum {
IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_OVERFLOW_BIT,
IO_CHECK_CQ_DROPPED_BIT,
}; };
/* /*
@ -2119,6 +2120,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
* on the floor. * on the floor.
*/ */
io_account_cq_overflow(ctx); io_account_cq_overflow(ctx);
set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
return false; return false;
} }
if (list_empty(&ctx->cq_overflow_list)) { if (list_empty(&ctx->cq_overflow_list)) {
@ -2958,16 +2960,26 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{ {
unsigned int nr_events = 0; unsigned int nr_events = 0;
int ret = 0; int ret = 0;
unsigned long check_cq;
/* /*
* Don't enter poll loop if we already have events pending. * Don't enter poll loop if we already have events pending.
* If we do, we can potentially be spinning for commands that * If we do, we can potentially be spinning for commands that
* already triggered a CQE (eg in error). * already triggered a CQE (eg in error).
*/ */
if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) check_cq = READ_ONCE(ctx->check_cq);
if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
__io_cqring_overflow_flush(ctx, false); __io_cqring_overflow_flush(ctx, false);
if (io_cqring_events(ctx)) if (io_cqring_events(ctx))
return 0; return 0;
/*
* Similarly do not spin if we have not informed the user of any
* dropped CQE.
*/
if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
return -EBADR;
do { do {
/* /*
* If a submit got punted to a workqueue, we can have the * If a submit got punted to a workqueue, we can have the
@ -8327,15 +8339,18 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
ktime_t timeout) ktime_t timeout)
{ {
int ret; int ret;
unsigned long check_cq;
/* make sure we run task_work before checking for signals */ /* make sure we run task_work before checking for signals */
ret = io_run_task_work_sig(); ret = io_run_task_work_sig();
if (ret || io_should_wake(iowq)) if (ret || io_should_wake(iowq))
return ret; return ret;
check_cq = READ_ONCE(ctx->check_cq);
/* let the caller flush overflows, retry */ /* let the caller flush overflows, retry */
if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
return 1; return 1;
if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)))
return -EBADR;
if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS))
return -ETIME; return -ETIME;
return 1; return 1;
@ -10987,9 +11002,18 @@ iopoll_locked:
} }
} }
if (!ret) if (!ret) {
ret = ret2; ret = ret2;
/*
* EBADR indicates that one or more CQE were dropped.
* Once the user has been informed we can clear the bit
* as they are obviously ok with those drops.
*/
if (unlikely(ret2 == -EBADR))
clear_bit(IO_CHECK_CQ_DROPPED_BIT,
&ctx->check_cq);
}
} }
out: out: