linux-stable/io_uring/eventfd.c
commit f4bb2f65bb ("io_uring/eventfd: move ctx->evfd_last_cq_tail into io_ev_fd") by Jens Axboe
Everything else about the io_uring eventfd support is nicely kept
private to that code, except the cached_cq_tail tracking. With
everything else in place, move io_eventfd_flush_signal() to using
the ev_fd grab+release helpers, which then enables the direct use of
io_ev_fd for this tracking too.

Link: https://lore.kernel.org/r/20240921080307.185186-7-axboe@kernel.dk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Date: 2024-10-29 13:43:26 -06:00

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/eventfd.h>
#include <linux/eventpoll.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include "io-wq.h"
#include "eventfd.h"
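
/*
 * Per-ring eventfd state. Published via RCU through ctx->io_ev_fd and
 * reference counted, so a signal still in flight can outlive
 * unregistration.
 */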
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async;
/* protected by ->completion_lock */
unsigned last_cq_tail;
refcount_t refs;
atomic_t ops;
struct rcu_head rcu;
};
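
/* flag bits for io_ev_fd->ops */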
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
};
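
/*
 * Final teardown: drop the eventfd context reference and free the io_ev_fd.
 * Invoked as an RCU callback via io_eventfd_put(), or called directly from
 * io_eventfd_do_signal() which already runs after a grace period.
 */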
static void io_eventfd_free(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
eventfd_ctx_put(ev_fd->cq_ev_fd);
kfree(ev_fd);
}
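
/*
 * Deferred signal path, run as an RCU callback when eventfd_signal_allowed()
 * ruled out signaling directly from the original context. Drops the
 * reference that __io_eventfd_signal() handed off to it.
 */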
static void io_eventfd_do_signal(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
if (refcount_dec_and_test(&ev_fd->refs))
io_eventfd_free(rcu);
}
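
/* Drop a reference; the final put frees the io_ev_fd after a grace period */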
static void io_eventfd_put(struct io_ev_fd *ev_fd)
{
if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
}
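
/*
 * Counterpart to io_eventfd_grab(): drops the RCU read lock, and the grabbed
 * reference too unless it was handed off to the deferred signal path.
 */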
static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref)
{
if (put_ref)
io_eventfd_put(ev_fd);
rcu_read_unlock();
}

/*
* Returns true if the caller should put the ev_fd reference, false if not.
*/
static bool __io_eventfd_signal(struct io_ev_fd *ev_fd)
{
if (eventfd_signal_allowed()) {
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
return true;
}
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
return false;
}
return true;
}

/*
* Trigger if eventfd_async isn't set, or if it's set and the caller is
* an async worker. If ev_fd isn't valid, obviously return false.
*/
static bool io_eventfd_trigger(struct io_ev_fd *ev_fd)
{
if (ev_fd)
return !ev_fd->eventfd_async || io_wq_current_is_worker();
return false;
}

/*
* On success, returns with an ev_fd reference grabbed and the RCU read
* lock held.
*/
static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return NULL;
rcu_read_lock();
/*
 * rcu_dereference ctx->io_ev_fd once and use it both for the trigger
 * check and for eventfd_signal.
 */
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
 * Check that an ev_fd is still registered and take a reference, in case
 * an io_eventfd_unregister() call completed between the cq_flags check
 * above and rcu_read_lock().
 */
if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs))
return ev_fd;
rcu_read_unlock();
return NULL;
}
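
/* Signal the registered eventfd, if one exists and is eligible to trigger */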
void io_eventfd_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = io_eventfd_grab(ctx);
if (ev_fd)
io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd));
}
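
/*
 * Like io_eventfd_signal(), but skip the signal if no new CQEs have been
 * posted since the last check.
 */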
void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = io_eventfd_grab(ctx);
if (ev_fd) {
bool skip, put_ref = true;
/*
 * Eventfd should only get triggered when at least one event has been
 * posted. Some applications rely on the eventfd notification count
 * only changing IFF a new CQE has been added to the CQ ring. There
 * need not be a 1:1 relationship between how many times this function
 * is called (and hence the eventfd count) and the number of CQEs
 * posted to the CQ ring.
 */
spin_lock(&ctx->completion_lock);
skip = ctx->cached_cq_tail == ev_fd->last_cq_tail;
ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
if (!skip)
put_ref = __io_eventfd_signal(ev_fd);
io_eventfd_release(ev_fd, put_ref);
}
}
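
/*
 * Register an eventfd for CQE notifications. Expects ctx->uring_lock to be
 * held, which serializes registration against io_eventfd_unregister().
 */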
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async)
{
struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
if (!ev_fd)
return -ENOMEM;
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd);
return ret;
}
spin_lock(&ctx->completion_lock);
ev_fd->last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
refcount_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
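
/*
 * Remove a registered eventfd, dropping the registration reference. Also
 * expects ctx->uring_lock to be held.
 */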
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
io_eventfd_put(ev_fd);
return 0;
}
return -ENXIO;
}