for-6.11/io_uring-20240714

-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmaTgusQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpr+1EAC4I7pRAM341sfmhe/9QQKMM8VzGwy5Tlr1 AFLO3BujRTl6X8S9fQjIjN1coW6u4F42I19+vVlxqvB7CUnqt9VWpexEjxe4K0FR R+hIZW+fWV9K/eMrcsLcI7oReN5kIihHOzzy3wz0rENoGB5dCl6JAZMHDUCSqP0/ ZJJQ5ut8ah20Y/myHnzP5o4TfdE7nGo73Di2YoE2g3KqeX/dlAKW9+5hqKzzrHhM 2U25k/6KLy0ROzKpy2qW0QRE3pT5udoHLK2ue9+XwXF8JWVTlfVkHBzGY7NstyyT z07SEzW1q4xV1HdCwGDAU7cL2NJMRXSG0p2WZTm8QyaVTdsZQvEx08GLsVdLvFH5 Gg+oOaxVE+INzW+/Lwz7lFHgq6XEjdAlEAOXDtGkZoni6Rt6iCzFCW6RTf/guy8o Cub7tatMyegxai9+FTN/oFVoydRR0tsMf0OHrWnLOperh9CaxAwXvmKFeT/UTwiB KIuIOJop7aThJbiV42a/xwTrEjNMZRv6uVBBEtJX3rxpmIhqTbjcAv9rKMmgtLMk s6yX1MvYdOLhhEDyoUBX0dJdEETBf3KbnYIwi8kb4Sbkw/ZDgnkmSxFysom61wUF byAFEpah3ZFR8aES0uNKUE6UHK6i5qqp0Za/n6gA927E/WGCU9ndaS+01gyknog0 8FqFYwruHQ== =50CO -----END PGP SIGNATURE----- Merge tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux Pull io_uring updates from Jens Axboe: "Here are the io_uring updates queued up for 6.11. Nothing major this time around, various minor improvements and cleanups/fixes. This contains: - Add bind/listen opcodes. Main motivation is to support direct descriptors, to avoid needing a regular fd just for doing these two operations (Gabriel) - Probe fixes (Gabriel) - Treat io-wq work flags as atomics. Not fixing a real issue, but may as well and it silences a KCSAN warning (me) - Cleanup of rsrc __set_current_state() usage (me) - Add 64-bit for {m,f}advise operations (me) - Improve performance of data ring messages (me) - Fix for ring message overflow posting (Pavel) - Fix for freezer interaction with TWA_NOTIFY_SIGNAL. 
Not strictly an io_uring thing, but since TWA_NOTIFY_SIGNAL was originally added for faster task_work signaling for io_uring, bundling it with this pull (Pavel) - Add Pavel as a co-maintainer - Various cleanups (me, Thorsten)" * tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux: (28 commits) io_uring/net: check socket is valid in io_bind()/io_listen() kernel: rerun task_work while freezing in get_signal() io_uring/io-wq: limit retrying worker initialisation io_uring/napi: Remove unnecessary s64 cast io_uring/net: cleanup io_recv_finish() bundle handling io_uring/msg_ring: fix overflow posting MAINTAINERS: change Pavel Begunkov from io_uring reviewer to maintainer io_uring/msg_ring: use kmem_cache_free() to free request io_uring/msg_ring: check for dead submitter task io_uring/msg_ring: add an alloc cache for io_kiocb entries io_uring/msg_ring: improve handling of target CQE posting io_uring: add io_add_aux_cqe() helper io_uring: add remote task_work execution helper io_uring/msg_ring: tighten requirement for remote posting io_uring: Allocate only necessary memory in io_probe io_uring: Fix probe of disabled operations io_uring: Introduce IORING_OP_LISTEN io_uring: Introduce IORING_OP_BIND net: Split a __sys_listen helper for io_uring net: Split a __sys_bind helper for io_uring ...
2024-12-29 09:13:38 +00:00 · 2024-07-15 13:49:10 -07:00 · 2024-07-15 13:49:10 -07:00 · 3a56e24173
commit 3a56e24173
parent 4f5e249ec0 ad00e62914
23 changed files with 547 additions and 319 deletions
--- a/2
+++ b/2
@ -11551,7 +11551,7 @@ F:	include/linux/iosys-map.h
 IO_URING
 M:	Jens Axboe <axboe@kernel.dk>
-R:	Pavel Begunkov <asml.silence@gmail.com>
+M:	Pavel Begunkov <asml.silence@gmail.com>
 L:	io-uring@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.dk/linux-block
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@ -50,7 +50,7 @@ struct io_wq_work_list {
 struct io_wq_work {
 	struct io_wq_work_node list;
-	unsigned flags;
+	atomic_t flags;
 	/* place it here instead of io_kiocb as it fills padding and saves 4B */
 	int cancel_seq;
 };
@ -210,14 +210,6 @@ struct io_submit_state {
 	struct blk_plug		plug;
 };
 struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
 	unsigned int		eventfd_async: 1;
 	struct rcu_head		rcu;
 	atomic_t		refs;
 	atomic_t		ops;
 };
 struct io_alloc_cache {
 	void			**entries;
 	unsigned int		nr_cached;
@ -372,7 +364,6 @@ struct io_ring_ctx {
 	struct io_restriction		restrictions;
 	/* slow path rsrc auxiliary data, used by update/register */
 	struct io_mapped_ubuf		*dummy_ubuf;
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
@ -405,6 +396,9 @@ struct io_ring_ctx {
 	struct callback_head		poll_wq_task_work;
 	struct list_head		defer_list;
 	struct io_alloc_cache		msg_cache;
 	spinlock_t			msg_lock;
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	struct list_head	napi_list;	/* track busy poll napi_id */
 	spinlock_t		napi_lock;	/* napi_list lock */
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 extern int __sys_socket(int family, int type, int protocol);
 extern struct file *__sys_socket_file(int family, int type, int protocol);
 extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
 extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
 			     int addrlen);
 extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
 			      int addrlen, int file_flags);
 extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
 			 int addrlen);
 extern int __sys_listen(int fd, int backlog);
 extern int __sys_listen_socket(struct socket *sock, int backlog);
 extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
 			     int __user *usockaddr_len);
 extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@ -257,6 +257,8 @@ enum io_uring_op {
 	IORING_OP_FUTEX_WAITV,
 	IORING_OP_FIXED_FD_INSTALL,
 	IORING_OP_FTRUNCATE,
 	IORING_OP_BIND,
 	IORING_OP_LISTEN,
 	/* this goes last, obviously */
 	IORING_OP_LAST,
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@ -4,9 +4,9 @@
 obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					tctx.o filetable.o rw.o net.o poll.o \
-					uring_cmd.o openclose.o sqpoll.o \
+					eventfd.o uring_cmd.o openclose.o \
-					xattr.o nop.o fs.o splice.o sync.o \
+					sqpoll.o xattr.o nop.o fs.o splice.o \
-					msg_ring.o advise.o openclose.o \
+					sync.o msg_ring.o advise.o openclose.o \
 					epoll.o statx.o timeout.o fdinfo.o \
 					cancel.o waitid.o register.o \
 					truncate.o memmap.o
--- a/io_uring/advise.c
+++ b/io_uring/advise.c
@ -17,14 +17,14 @@
 struct io_fadvise {
 	struct file			*file;
 	u64				offset;
-	u32				len;
+	u64				len;
 	u32				advice;
 };
 struct io_madvise {
 	struct file			*file;
 	u64				addr;
-	u32				len;
+	u64				len;
 	u32				advice;
 };
@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
 	struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
-	if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
+	if (sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	ma->addr = READ_ONCE(sqe->addr);
-	ma->len = READ_ONCE(sqe->len);
+	ma->len = READ_ONCE(sqe->off);
 	if (!ma->len)
 		ma->len = READ_ONCE(sqe->len);
 	ma->advice = READ_ONCE(sqe->fadvise_advice);
 	req->flags |= REQ_F_FORCE_ASYNC;
 	return 0;
@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
-	if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
+	if (sqe->buf_index || sqe->splice_fd_in)
 		return -EINVAL;
 	fa->offset = READ_ONCE(sqe->off);
-	fa->len = READ_ONCE(sqe->len);
+	fa->len = READ_ONCE(sqe->addr);
 	if (!fa->len)
 		fa->len = READ_ONCE(sqe->len);
 	fa->advice = READ_ONCE(sqe->fadvise_advice);
 	if (io_fadvise_force_async(fa))
 		req->flags |= REQ_F_FORCE_ASYNC;
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@ -0,0 +1,160 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/eventfd.h>
 #include <linux/eventpoll.h>
 #include <linux/io_uring.h>
 #include <linux/io_uring_types.h>
 #include "io-wq.h"
 #include "eventfd.h"
 /*
  * Per-ring eventfd registration state. Allocated in io_eventfd_register(),
  * published through ctx->io_ev_fd, and released by io_eventfd_free().
  */
 struct io_ev_fd {
 	/* eventfd context obtained with eventfd_ctx_fdget() at registration */
 	struct eventfd_ctx	*cq_ev_fd;
 	/* when set, io_eventfd_signal() only signals from an io-wq worker */
 	unsigned int		eventfd_async: 1;
 	/* RCU head used for both the deferred-signal and deferred-free callbacks */
 	struct rcu_head		rcu;
 	/* reference count; set to 1 at registration */
 	atomic_t		refs;
 	/* bitmask of pending IO_EVENTFD_OP_*_BIT operations */
 	atomic_t		ops;
 };
 /* Bit numbers used in io_ev_fd->ops. */
 enum {
 	/* set by io_eventfd_signal() when it hands the signal to an RCU callback */
 	IO_EVENTFD_OP_SIGNAL_BIT,
 };
 /*
  * RCU callback: drop the eventfd context reference held by the io_ev_fd that
  * embeds @rcu, then free the io_ev_fd itself.
  */
 static void io_eventfd_free(struct rcu_head *rcu)
 {
 	struct io_ev_fd *evfd;
 	evfd = container_of(rcu, struct io_ev_fd, rcu);
 	eventfd_ctx_put(evfd->cq_ev_fd);
 	kfree(evfd);
 }
 static void io_eventfd_do_signal(struct rcu_head *rcu)
 {
 	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
 	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	if (atomic_dec_and_test(&ev_fd->refs))
 		io_eventfd_free(rcu);
 }
 /*
  * Signal the eventfd registered on @ctx, if there is one and signalling has
  * not been disabled through IORING_CQ_EVENTFD_DISABLED in the CQ ring flags.
  * A reference on the io_ev_fd is taken for the duration of the call; if the
  * signal has to be deferred, that reference is handed to the RCU callback.
  */
 void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd = NULL;
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 		return;
 	/* RCU read-side section, held until the function returns */
 	guard(rcu)();
 	/*
 	 * Dereference ctx->io_ev_fd once and use that same pointer for both
 	 * the NULL check and the signal.
 	 */
 	ev_fd = rcu_dereference(ctx->io_ev_fd);
 	/*
 	 * ev_fd is NULL when no eventfd is registered, or when
 	 * io_eventfd_unregister() has already cleared ctx->io_ev_fd.
 	 */
 	if (unlikely(!ev_fd))
 		return;
 	/* refs already dropped to zero: the io_ev_fd is on its way to being freed */
 	if (!atomic_inc_not_zero(&ev_fd->refs))
 		return;
 	/* async-only registrations are signalled solely from io-wq workers */
 	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
 		goto out;
 	if (likely(eventfd_signal_allowed())) {
 		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	} else {
 		/*
 		 * Cannot signal from this context. If no deferred signal is
 		 * marked in ->ops yet, queue one; the callback inherits the
 		 * reference taken above, hence the return without a put.
 		 */
 		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
 			call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
 			return;
 		}
 	}
 out:
 	/* drop our reference; free after a grace period if it was the last one */
 	if (atomic_dec_and_test(&ev_fd->refs))
 		call_rcu(&ev_fd->rcu, io_eventfd_free);
 }
 /*
  * Signal the registered eventfd, but only when the cached CQ tail has moved
  * since it was last recorded in ctx->evfd_last_cq_tail.
  */
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
 {
 	bool tail_moved;
 	spin_lock(&ctx->completion_lock);
 	/*
 	 * The eventfd should only be triggered once at least one event has
 	 * been posted. Some applications rely on the eventfd notification
 	 * count changing if and only if a new CQE was added to the CQ ring.
 	 * There is no dependency on a 1:1 relationship between the number of
 	 * calls to this function (and hence the eventfd count) and the number
 	 * of CQEs posted to the CQ ring.
 	 */
 	tail_moved = ctx->cached_cq_tail != ctx->evfd_last_cq_tail;
 	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
 	spin_unlock(&ctx->completion_lock);
 	if (tail_moved)
 		io_eventfd_signal(ctx);
 }
 /*
  * Register the eventfd whose descriptor is stored at user address @arg as the
  * completion notification target of @ctx.
  *
  * Returns 0 on success, -EBUSY if an eventfd is already registered, -EFAULT
  * if the descriptor cannot be copied from user space, -ENOMEM on allocation
  * failure, or the error reported by eventfd_ctx_fdget().
  */
 int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 			unsigned int eventfd_async)
 {
 	__s32 __user *ufd = arg;
 	struct io_ev_fd *new_evfd;
 	int fd;
 	/* only a single eventfd may be registered at a time */
 	if (rcu_dereference_protected(ctx->io_ev_fd,
 				      lockdep_is_held(&ctx->uring_lock)))
 		return -EBUSY;
 	if (copy_from_user(&fd, ufd, sizeof(*ufd)))
 		return -EFAULT;
 	new_evfd = kmalloc(sizeof(*new_evfd), GFP_KERNEL);
 	if (!new_evfd)
 		return -ENOMEM;
 	new_evfd->cq_ev_fd = eventfd_ctx_fdget(fd);
 	if (IS_ERR(new_evfd->cq_ev_fd)) {
 		int err = PTR_ERR(new_evfd->cq_ev_fd);
 		kfree(new_evfd);
 		return err;
 	}
 	/* finish setting up the still-private object before publishing it */
 	new_evfd->eventfd_async = eventfd_async;
 	atomic_set(&new_evfd->refs, 1);
 	atomic_set(&new_evfd->ops, 0);
 	/* start tracking from the current tail so old CQEs do not trigger a signal */
 	spin_lock(&ctx->completion_lock);
 	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
 	spin_unlock(&ctx->completion_lock);
 	ctx->has_evfd = true;
 	rcu_assign_pointer(ctx->io_ev_fd, new_evfd);
 	return 0;
 }
 /*
  * Remove the eventfd registered on @ctx. The registration reference is
  * dropped and, if it was the last one, the io_ev_fd is freed after an RCU
  * grace period.
  *
  * Returns 0 on success or -ENXIO if no eventfd was registered.
  */
 int io_eventfd_unregister(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *old_evfd;
 	old_evfd = rcu_dereference_protected(ctx->io_ev_fd,
 					lockdep_is_held(&ctx->uring_lock));
 	if (!old_evfd)
 		return -ENXIO;
 	ctx->has_evfd = false;
 	rcu_assign_pointer(ctx->io_ev_fd, NULL);
 	if (atomic_dec_and_test(&old_evfd->refs))
 		call_rcu(&old_evfd->rcu, io_eventfd_free);
 	return 0;
 }
--- a/io_uring/eventfd.h
+++ b/io_uring/eventfd.h
@ -0,0 +1,8 @@
 struct io_ring_ctx;
 int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 			unsigned int eventfd_async);
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
 void io_eventfd_signal(struct io_ring_ctx *ctx);
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@ -23,6 +23,7 @@
 #include "io_uring.h"
 #define WORKER_IDLE_TIMEOUT	(5 * HZ)
 #define WORKER_INIT_LIMIT	3
 enum {
 	IO_WORKER_F_UP		= 0,	/* up and active */
@ -58,6 +59,7 @@ struct io_worker {
 	unsigned long create_state;
 	struct callback_head create_work;
 	int init_retries;
 	union {
 		struct rcu_head rcu;
@ -159,7 +161,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound)
 static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
 						  struct io_wq_work *work)
 {
-	return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND));
+	return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND));
 }
 static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
@ -451,7 +453,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
 static inline unsigned int io_get_work_hash(struct io_wq_work *work)
 {
-	return work->flags >> IO_WQ_HASH_SHIFT;
+	return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT;
 }
 static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
@ -592,8 +594,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
 			next_hashed = wq_next_work(work);
-			if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
+			if (do_kill &&
-				work->flags |= IO_WQ_WORK_CANCEL;
+			    (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND))
 				atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
 			wq->do_work(work);
 			io_assign_current_work(worker, NULL);
@ -744,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
 	return true;
 }
-static inline bool io_should_retry_thread(long err)
+static inline bool io_should_retry_thread(struct io_worker *worker, long err)
 {
 	/*
 	 * Prevent perpetual task_work retry, if the task (or its group) is
@ -752,6 +755,8 @@ static inline bool io_should_retry_thread(long err)
 	 */
 	if (fatal_signal_pending(current))
 		return false;
 	if (worker->init_retries++ >= WORKER_INIT_LIMIT)
 		return false;
 	switch (err) {
 	case -EAGAIN:
@ -778,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb)
 		io_init_new_worker(wq, worker, tsk);
 		io_worker_release(worker);
 		return;
-	} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+	} else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
 		struct io_wq_acct *acct = io_wq_get_acct(worker);
 		atomic_dec(&acct->nr_running);
@ -845,7 +850,7 @@ static bool create_io_worker(struct io_wq *wq, int index)
 	tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
 	if (!IS_ERR(tsk)) {
 		io_init_new_worker(wq, worker, tsk);
-	} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+	} else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
 		kfree(worker);
 		goto fail;
 	} else {
@ -891,7 +896,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
 {
 	do {
-		work->flags |= IO_WQ_WORK_CANCEL;
+		atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
 		wq->do_work(work);
 		work = wq->free_work(work);
 	} while (work);
@ -926,7 +931,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 {
 	struct io_wq_acct *acct = io_work_get_acct(wq, work);
-	unsigned long work_flags = work->flags;
+	unsigned int work_flags = atomic_read(&work->flags);
 	struct io_cb_cancel_data match = {
 		.fn		= io_wq_work_match_item,
 		.data		= work,
@ -939,7 +944,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
 	 * been marked as one that should not get executed, cancel it here.
 	 */
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
-	    (work->flags & IO_WQ_WORK_CANCEL)) {
+	    (work_flags & IO_WQ_WORK_CANCEL)) {
 		io_run_cancel(work, wq);
 		return;
 	}
@ -982,7 +987,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
 	unsigned int bit;
 	bit = hash_ptr(val, IO_WQ_HASH_ORDER);
-	work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
+	atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags);
 }
 static bool __io_wq_worker_cancel(struct io_worker *worker,
@ -990,7 +995,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
 				  struct io_wq_work *work)
 {
 	if (work && match->fn(work, match->data)) {
-		work->flags |= IO_WQ_WORK_CANCEL;
+		atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
 		__set_notify_signal(worker->task);
 		return true;
 	}
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void);
 static inline bool io_wq_is_hashed(struct io_wq_work *work)
 {
-	return work->flags & IO_WQ_WORK_HASHED;
+	return atomic_read(&work->flags) & IO_WQ_WORK_HASHED;
 }
 typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@ -95,12 +95,14 @@
 #include "futex.h"
 #include "napi.h"
 #include "uring_cmd.h"
 #include "msg_ring.h"
 #include "memmap.h"
 #include "timeout.h"
 #include "poll.h"
 #include "rw.h"
 #include "alloc_cache.h"
 #include "eventfd.h"
 #define IORING_MAX_ENTRIES	32768
 #define IORING_MAX_CQ_ENTRIES	(2 * IORING_MAX_ENTRIES)
@ -314,6 +316,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 			    sizeof(struct io_async_rw));
 	ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct uring_cache));
 	spin_lock_init(&ctx->msg_lock);
 	ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_kiocb));
 	ret |= io_futex_cache_init(ctx);
 	if (ret)
 		goto err;
@ -350,6 +355,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
 	io_alloc_cache_free(&ctx->uring_cache, kfree);
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	kfree(ctx->cancel_table.hbs);
 	kfree(ctx->cancel_table_locked.hbs);
@ -461,9 +467,9 @@ static void io_prep_async_work(struct io_kiocb *req)
 	}
 	req->work.list.next = NULL;
-	req->work.flags = 0;
+	atomic_set(&req->work.flags, 0);
 	if (req->flags & REQ_F_FORCE_ASYNC)
-		req->work.flags |= IO_WQ_WORK_CONCURRENT;
+		atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags);
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		req->flags |= io_file_get_flags(req->file);
@ -479,7 +485,7 @@ static void io_prep_async_work(struct io_kiocb *req)
 			io_wq_hash_work(&req->work, file_inode(req->file));
 	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
 		if (def->unbound_nonreg_file)
-			req->work.flags |= IO_WQ_WORK_UNBOUND;
+			atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags);
 	}
 }
@ -519,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req)
 	 * worker for it).
 	 */
 	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
-		req->work.flags |= IO_WQ_WORK_CANCEL;
+		atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
 	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
 	io_wq_enqueue(tctx->io_wq, &req->work);
@ -541,84 +547,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
 	}
 }
 void io_eventfd_ops(struct rcu_head *rcu)
 {
 	struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
 	int ops = atomic_xchg(&ev_fd->ops, 0);
 	if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
 		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
 	 * ordering in a race but if references are 0 we know we have to free
 	 * it regardless.
 	 */
 	if (atomic_dec_and_test(&ev_fd->refs)) {
 		eventfd_ctx_put(ev_fd->cq_ev_fd);
 		kfree(ev_fd);
 	}
 }
 static void io_eventfd_signal(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd = NULL;
 	rcu_read_lock();
 	/*
 	 * rcu_dereference ctx->io_ev_fd once and use it for both for checking
 	 * and eventfd_signal
 	 */
 	ev_fd = rcu_dereference(ctx->io_ev_fd);
 	/*
 	 * Check again if ev_fd exists incase an io_eventfd_unregister call
 	 * completed between the NULL check of ctx->io_ev_fd at the start of
 	 * the function and rcu_read_lock.
 	 */
 	if (unlikely(!ev_fd))
 		goto out;
 	if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
 		goto out;
 	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
 		goto out;
 	if (likely(eventfd_signal_allowed())) {
 		eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
 	} else {
 		atomic_inc(&ev_fd->refs);
 		if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
 			call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
 		else
 			atomic_dec(&ev_fd->refs);
 	}
 out:
 	rcu_read_unlock();
 }
 static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
 {
 	bool skip;
 	spin_lock(&ctx->completion_lock);
 	/*
 	 * Eventfd should only get triggered when at least one event has been
 	 * posted. Some applications rely on the eventfd notification count
 	 * only changing IFF a new CQE has been added to the CQ ring. There's
 	 * no depedency on 1:1 relationship between how many times this
 	 * function is called (and hence the eventfd count) and number of CQEs
 	 * posted to the CQ ring.
 	 */
 	skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
 	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
 	spin_unlock(&ctx->completion_lock);
 	if (skip)
 		return;
 	io_eventfd_signal(ctx);
 }
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
 	if (ctx->poll_activated)
@ -878,19 +806,42 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	return false;
 }
 /*
  * Post an auxiliary CQE to @ctx, falling back to the overflow path when
  * io_fill_cqe_aux() cannot place it. Takes no lock itself.
  * Returns true if the CQE was posted either way.
  */
 static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 			      u32 cflags)
 {
 	if (io_fill_cqe_aux(ctx, user_data, res, cflags))
 		return true;
 	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
 }
 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 {
 	bool filled;
 	io_cq_lock(ctx);
-	filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
+	filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
 	if (!filled)
 		filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
 	io_cq_unlock_post(ctx);
 	return filled;
 }
 /*
  * Must be called from inline task_work so we know a flush will happen later,
  * and obviously with ctx->uring_lock held (tw always has that).
  */
 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 {
 	bool posted = io_fill_cqe_aux(ctx, user_data, res, cflags);
 	if (!posted) {
 		/* could not be filled directly: queue it as overflow under completion_lock */
 		spin_lock(&ctx->completion_lock);
 		io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
 		spin_unlock(&ctx->completion_lock);
 	}
 	/* mark the submit state so the posted CQE gets flushed */
 	ctx->submit_state.cq_flush = true;
 }
 /*
 * A helper for multishot requests posting additional CQEs.
 * Should only be used from a task_work including IO_URING_F_MULTISHOT.
@ -1175,9 +1126,10 @@ void tctx_task_work(struct callback_head *cb)
 	WARN_ON_ONCE(ret);
 }
-static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
+static inline void io_req_local_work_add(struct io_kiocb *req,
 					 struct io_ring_ctx *ctx,
 					 unsigned flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned nr_wait, nr_tw, nr_tw_prev;
 	struct llist_node *head;
@ -1191,6 +1143,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
 	if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
 		flags &= ~IOU_F_TWQ_LAZY_WAKE;
 	guard(rcu)();
 	head = READ_ONCE(ctx->work_llist.first);
 	do {
 		nr_tw_prev = 0;
@ -1272,13 +1226,18 @@ static void io_req_normal_work_add(struct io_kiocb *req)
 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
 {
-	if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
+	if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
-		rcu_read_lock();
+		io_req_local_work_add(req, req->ctx, flags);
-		io_req_local_work_add(req, flags);
+	else
 		rcu_read_unlock();
 	} else {
 		io_req_normal_work_add(req);
-	}
+}
 void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
 				 unsigned flags)
 {
 	if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)))
 		return;
 	io_req_local_work_add(req, ctx, flags);
 }
 static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
@ -1467,7 +1426,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	}
 	__io_cq_unlock_post(ctx);
-	if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
+	if (!wq_list_empty(&state->compl_reqs)) {
 		io_free_batch_list(ctx, state->compl_reqs.first);
 		INIT_WQ_LIST(&state->compl_reqs);
 	}
@ -1813,14 +1772,14 @@ void io_wq_submit_work(struct io_wq_work *work)
 	io_arm_ltimeout(req);
 	/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
-	if (work->flags & IO_WQ_WORK_CANCEL) {
+	if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) {
 fail:
 		io_req_task_queue_fail(req, err);
 		return;
 	}
 	if (!io_assign_file(req, def, issue_flags)) {
 		err = -EBADF;
-		work->flags |= IO_WQ_WORK_CANCEL;
+		atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
 		goto fail;
 	}
@ -2649,6 +2608,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
 	io_alloc_cache_free(&ctx->uring_cache, kfree);
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
 	mutex_unlock(&ctx->uring_lock);
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@ -65,6 +65,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
 int io_run_task_work_sig(struct io_ring_ctx *ctx);
 void io_req_defer_failed(struct io_kiocb *req, s32 res);
 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
 bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
@ -73,6 +74,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 			       unsigned issue_flags);
 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
 void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
 				 unsigned flags);
 bool io_alloc_async_data(struct io_kiocb *req);
 void io_req_task_queue(struct io_kiocb *req);
 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
@ -104,12 +107,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
 enum {
 	IO_EVENTFD_OP_SIGNAL_BIT,
 	IO_EVENTFD_OP_FREE_BIT,
 };
 void io_eventfd_ops(struct rcu_head *rcu);
 void io_activate_pollwq(struct io_ring_ctx *ctx);
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@ -11,9 +11,9 @@
 #include "io_uring.h"
 #include "rsrc.h"
 #include "filetable.h"
 #include "alloc_cache.h"
 #include "msg_ring.h"
 /* All valid masks for MSG_RING */
 #define IORING_MSG_RING_MASK		(IORING_MSG_RING_CQE_SKIP | \
 					IORING_MSG_RING_FLAGS_PASS)
@ -68,59 +68,70 @@ void io_msg_ring_cleanup(struct io_kiocb *req)
 static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
 {
-	if (!target_ctx->task_complete)
+	return target_ctx->task_complete;
 		return false;
 	return current != target_ctx->submitter_task;
 }
-static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func)
+static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	struct io_ring_ctx *ctx = req->file->private_data;
+	struct io_ring_ctx *ctx = req->ctx;
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct task_struct *task = READ_ONCE(ctx->submitter_task);
-	if (unlikely(!task))
+	io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
-		return -EOWNERDEAD;
+	if (spin_trylock(&ctx->msg_lock)) {
-
+		if (io_alloc_cache_put(&ctx->msg_cache, req))
-	init_task_work(&msg->tw, func);
+			req = NULL;
-	if (task_work_add(task, &msg->tw, TWA_SIGNAL))
+		spin_unlock(&ctx->msg_lock);
 		return -EOWNERDEAD;
 	return IOU_ISSUE_SKIP_COMPLETE;
 }
 static void io_msg_tw_complete(struct callback_head *head)
 {
 	struct io_msg *msg = container_of(head, struct io_msg, tw);
 	struct io_kiocb *req = cmd_to_io_kiocb(msg);
 	struct io_ring_ctx *target_ctx = req->file->private_data;
 	int ret = 0;
 	if (current->flags & PF_EXITING) {
 		ret = -EOWNERDEAD;
 	} else {
 		u32 flags = 0;
 		if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
 			flags = msg->cqe_flags;
 		/*
 		 * If the target ring is using IOPOLL mode, then we need to be
 		 * holding the uring_lock for posting completions. Other ring
 		 * types rely on the regular completion locking, which is
 		 * handled while posting.
 		 */
 		if (target_ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_lock(&target_ctx->uring_lock);
 		if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
 			ret = -EOVERFLOW;
 		if (target_ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_unlock(&target_ctx->uring_lock);
 	}
 	if (req)
 		kmem_cache_free(req_cachep, req);
 	percpu_ref_put(&ctx->refs);
 }
-	if (ret < 0)
+static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
-		req_set_fail(req);
+			      int res, u32 cflags, u64 user_data)
-	io_req_queue_tw_complete(req, ret);
+{
 	req->task = READ_ONCE(ctx->submitter_task);
 	if (!req->task) {
 		kmem_cache_free(req_cachep, req);
 		return -EOWNERDEAD;
 	}
 	req->cqe.user_data = user_data;
 	io_req_set_res(req, res, cflags);
 	percpu_ref_get(&ctx->refs);
 	req->ctx = ctx;
 	req->io_task_work.func = io_msg_tw_complete;
 	io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE);
 	return 0;
 }
 /*
  * Get an io_kiocb for a remote message post: reuse one from ctx->msg_cache
  * when the lock is free and the cache has an entry, otherwise allocate from
  * req_cachep. Returns NULL if the allocation fails.
  */
 static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
 {
 	struct io_kiocb *cached;
 	/* best effort: bypass the cache rather than wait on a contended msg_lock */
 	if (!spin_trylock(&ctx->msg_lock))
 		goto alloc;
 	cached = io_alloc_cache_get(&ctx->msg_cache);
 	spin_unlock(&ctx->msg_lock);
 	if (cached)
 		return cached;
 alloc:
 	return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN);
 }
 /*
  * Post the data message carried by @req to the target ring through a
  * separately obtained io_kiocb. The CQE flags are forwarded only when
  * IORING_MSG_RING_FLAGS_PASS is set. Returns -ENOMEM if no io_kiocb could be
  * obtained, otherwise the result of io_msg_remote_post().
  */
 static int io_msg_data_remote(struct io_kiocb *req)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_kiocb *target = io_msg_get_kiocb(req->ctx);
 	u32 cqe_flags;
 	if (unlikely(!target))
 		return -ENOMEM;
 	cqe_flags = (msg->flags & IORING_MSG_RING_FLAGS_PASS) ? msg->cqe_flags : 0;
 	return io_msg_remote_post(target_ctx, target, msg->len, cqe_flags,
 				  msg->user_data);
 }
 static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
@ -138,7 +149,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 		return -EBADFD;
 	if (io_msg_need_remote(target_ctx))
-		return io_msg_exec_remote(req, io_msg_tw_complete);
+		return io_msg_data_remote(req);
 	if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
 		flags = msg->cqe_flags;
@ -218,6 +229,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head)
 	io_req_queue_tw_complete(req, ret);
 }
 /*
  * Hand the fd-passing message in @req to the target ring's submitter task by
  * queueing io_msg_tw_fd_complete() as task_work on it.
  *
  * Returns IOU_ISSUE_SKIP_COMPLETE once the work is queued, or -EOWNERDEAD if
  * the target has no submitter task or task_work_add() fails.
  */
 static int io_msg_fd_remote(struct io_kiocb *req)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct task_struct *submitter = READ_ONCE(target_ctx->submitter_task);
 	int ret = -EOWNERDEAD;
 	if (likely(submitter)) {
 		init_task_work(&msg->tw, io_msg_tw_fd_complete);
 		if (!task_work_add(submitter, &msg->tw, TWA_SIGNAL))
 			ret = IOU_ISSUE_SKIP_COMPLETE;
 	}
 	return ret;
 }
 static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
@ -240,7 +267,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 	}
 	if (io_msg_need_remote(target_ctx))
-		return io_msg_exec_remote(req, io_msg_tw_fd_complete);
+		return io_msg_fd_remote(req);
 	return io_msg_install_complete(req, issue_flags);
 }
@ -294,3 +321,10 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 	io_req_set_res(req, ret, 0);
 	return IOU_OK;
 }
 /*
  * Release @entry, treated as a struct io_kiocb, back to req_cachep.
  */
 void io_msg_cache_free(const void *entry)
 {
 	/* the signature is const-qualified, the slab free is not */
 	kmem_cache_free(req_cachep, (struct io_kiocb *) entry);
 }
--- a/io_uring/msg_ring.h
+++ b/io_uring/msg_ring.h
@ -3,3 +3,4 @@
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
 void io_msg_ring_cleanup(struct io_kiocb *req);
 void io_msg_cache_free(const void *entry);
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@ -283,7 +283,7 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
 			s64 poll_to_ns = timespec64_to_ns(ts);
 			if (poll_to_ns > 0) {
 				u64 val = poll_to_ns + 999;
-				do_div(val, (s64) 1000);
+				do_div(val, 1000);
 				poll_to = val;
 			}
 		}
--- a/io_uring/net.c
+++ b/io_uring/net.c
@ -51,6 +51,16 @@ struct io_connect {
 	bool				seen_econnaborted;
 };
 /* Per-request command data for IORING_OP_BIND. */
 struct io_bind {
 	struct file			*file;
 	/* address length, read from sqe->addr2 in io_bind_prep() */
 	int				addr_len;
 };
 /* Per-request command data for IORING_OP_LISTEN. */
 struct io_listen {
 	struct file			*file;
 	/* listen backlog, read from sqe->len in io_listen_prep() */
 	int				backlog;
 };
 struct io_sr_msg {
 	struct file			*file;
 	union {
@ -817,20 +827,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 				  bool mshot_finished, unsigned issue_flags)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
-	unsigned int cflags;
+	unsigned int cflags = 0;
 	if (sr->flags & IORING_RECVSEND_BUNDLE)
 		cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
 				      issue_flags);
 	else
 		cflags = io_put_kbuf(req, issue_flags);
 	if (kmsg->msg.msg_inq > 0)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
-	/* bundle with no more immediate buffers, we're done */
+	if (sr->flags & IORING_RECVSEND_BUNDLE) {
-	if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
+		cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
-		goto finish;
+				      issue_flags);
 		/* bundle with no more immediate buffers, we're done */
 		if (req->flags & REQ_F_BL_EMPTY)
 			goto finish;
 	} else {
 		cflags |= io_put_kbuf(req, issue_flags);
 	}
 	/*
 	 * Fill CQE for this receive and see if we should keep trying to
@ -1717,6 +1727,70 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
 	struct sockaddr __user *uaddr;
 	struct io_async_msghdr *io;
 	if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
 		return -EINVAL;
 	uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	bind->addr_len =  READ_ONCE(sqe->addr2);
 	io = io_msg_alloc_async(req);
 	if (unlikely(!io))
 		return -ENOMEM;
 	return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
 }
 /*
  * Issue handler for bind: bind the socket behind req->file to the address
  * stored in the request's async data.
  *
  * Returns -ENOTSOCK if req->file is not a socket. Otherwise the outcome of
  * __sys_bind_socket() is recorded as the request result (marking the
  * request failed when it is negative) and 0 is returned.
  */
 int io_bind(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
 	struct io_async_msghdr *kmsg = req->async_data;
 	struct socket *sock = sock_from_file(req->file);
 	int res;
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 	res = __sys_bind_socket(sock, &kmsg->addr, bind->addr_len);
 	if (res < 0)
 		req_set_fail(req);
 	io_req_set_res(req, res, 0);
 	return 0;
 }
 /*
  * Prepare a listen request from @sqe: the backlog is taken from sqe->len.
  *
  * Returns -EINVAL if addr, addr2, buf_index, rw_flags or splice_fd_in is
  * non-zero, 0 otherwise.
  */
 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
 	/* these SQE fields must be zero for this opcode */
 	if (sqe->addr || sqe->buf_index || sqe->rw_flags)
 		return -EINVAL;
 	if (sqe->splice_fd_in || sqe->addr2)
 		return -EINVAL;
 	listen->backlog = READ_ONCE(sqe->len);
 	return 0;
 }
 /*
  * Issue handler for listen: call __sys_listen_socket() on the socket behind
  * req->file with the backlog saved at prep time.
  *
  * Returns -ENOTSOCK if req->file is not a socket. Otherwise the outcome of
  * __sys_listen_socket() is recorded as the request result (marking the
  * request failed when it is negative) and 0 is returned.
  */
 int io_listen(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
 	struct socket *sock = sock_from_file(req->file);
 	int res;
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 	res = __sys_listen_socket(sock, listen->backlog);
 	if (res < 0)
 		req_set_fail(req);
 	io_req_set_res(req, res, 0);
 	return 0;
 }
 void io_netmsg_cache_free(const void *entry)
 {
 	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
--- a/io_uring/net.h
+++ b/io_uring/net.h
@ -49,6 +49,12 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 void io_send_zc_cleanup(struct io_kiocb *req);
 int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_bind(struct io_kiocb *req, unsigned int issue_flags);
 int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_listen(struct io_kiocb *req, unsigned int issue_flags);
 void io_netmsg_cache_free(const void *entry);
 #else
 static inline void io_netmsg_cache_free(const void *entry)
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@ -495,6 +495,26 @@ const struct io_issue_def io_issue_defs[] = {
 		.prep			= io_ftruncate_prep,
 		.issue			= io_ftruncate,
 	},
 	[IORING_OP_BIND] = {
 #if defined(CONFIG_NET)
 		.needs_file		= 1,
 		.prep			= io_bind_prep,
 		.issue			= io_bind,
 		.async_size		= sizeof(struct io_async_msghdr),
 #else
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
 	[IORING_OP_LISTEN] = {
 #if defined(CONFIG_NET)
 		.needs_file		= 1,
 		.prep			= io_listen_prep,
 		.issue			= io_listen,
 		.async_size		= sizeof(struct io_async_msghdr),
 #else
 		.prep			= io_eopnotsupp_prep,
 #endif
 	},
 };
 const struct io_cold_def io_cold_defs[] = {
@ -716,6 +736,12 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_FTRUNCATE] = {
 		.name			= "FTRUNCATE",
 	},
 	[IORING_OP_BIND] = {
 		.name			= "BIND",
 	},
 	[IORING_OP_LISTEN] = {
 		.name			= "LISTEN",
 	},
 };
 const char *io_uring_get_opcode(u8 opcode)
@ -725,6 +751,14 @@ const char *io_uring_get_opcode(u8 opcode)
 	return "INVALID";
 }
 /*
  * Report whether @opcode is usable: it must be below IORING_OP_LAST and its
  * io_issue_defs[] entry must not use io_eopnotsupp_prep as the prep handler.
  */
 bool io_uring_op_supported(u8 opcode)
 {
 	/* the range check comes first so the table is never indexed out of bounds */
 	return opcode < IORING_OP_LAST &&
 	       io_issue_defs[opcode].prep != io_eopnotsupp_prep;
 }
 void __init io_uring_optable_init(void)
 {
 	int i;
--- a/io_uring/opdef.h
+++ b/io_uring/opdef.h
@ -17,8 +17,6 @@ struct io_issue_def {
 	unsigned		poll_exclusive : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
 	/* opcode is not supported by this kernel */
 	unsigned		not_supported : 1;
 	/* skip auditing */
 	unsigned		audit_skip : 1;
 	/* supports ioprio */
@ -47,5 +45,7 @@ struct io_cold_def {
 extern const struct io_issue_def io_issue_defs[];
 extern const struct io_cold_def io_cold_defs[];
 bool io_uring_op_supported(u8 opcode);
 void io_uring_optable_init(void);
 #endif
--- a/io_uring/register.c
+++ b/io_uring/register.c
@ -27,65 +27,11 @@
 #include "cancel.h"
 #include "kbuf.h"
 #include "napi.h"
 #include "eventfd.h"
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
 /*
  * Attach an eventfd to the ring.
  *
  * @ctx:           ring to attach to; ctx->uring_lock is expected to be held
  *                 (asserted through lockdep in the protected dereference)
  * @arg:           user pointer to an __s32 holding the eventfd descriptor
  * @eventfd_async: stored in the new io_ev_fd as its eventfd_async setting
  *
  * Returns 0 on success, -EBUSY if an eventfd is already registered,
  * -EFAULT if the descriptor cannot be copied from user space, -ENOMEM on
  * allocation failure, or the error from eventfd_ctx_fdget().
  */
 static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 			       unsigned int eventfd_async)
 {
 	struct io_ev_fd *ev_fd;
 	__s32 __user *fds = arg;
 	int fd;
 	ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
 					lockdep_is_held(&ctx->uring_lock));
 	if (ev_fd)
 		return -EBUSY;
 	if (copy_from_user(&fd, fds, sizeof(*fds)))
 		return -EFAULT;
 	ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
 	if (!ev_fd)
 		return -ENOMEM;
 	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
 	if (IS_ERR(ev_fd->cq_ev_fd)) {
 		int ret = PTR_ERR(ev_fd->cq_ev_fd);
 		kfree(ev_fd);
 		return ret;
 	}
 	/* snapshot the CQ tail under completion_lock */
 	spin_lock(&ctx->completion_lock);
 	ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
 	spin_unlock(&ctx->completion_lock);
 	ev_fd->eventfd_async = eventfd_async;
 	ctx->has_evfd = true;
 	/*
 	 * Finish initialising ev_fd before publishing it. The memory comes
 	 * from kmalloc() and is not zeroed, so once rcu_assign_pointer()
 	 * makes it reachable through ctx->io_ev_fd an RCU reader must not be
 	 * able to observe uninitialised ->refs or ->ops.
 	 */
 	atomic_set(&ev_fd->refs, 1);
 	atomic_set(&ev_fd->ops, 0);
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
 	return 0;
 }
 /*
  * Detach the ring's eventfd, if any. ctx->uring_lock is expected to be held
  * (asserted through lockdep in the protected dereference).
  *
  * Clears ctx->has_evfd and ctx->io_ev_fd, then sets IO_EVENTFD_OP_FREE_BIT
  * in ev_fd->ops. io_eventfd_ops is queued through call_rcu() only when no
  * bits were set in ->ops beforehand.
  *
  * Returns 0 if an eventfd was registered, -ENXIO otherwise.
  */
 int io_eventfd_unregister(struct io_ring_ctx *ctx)
 {
 	struct io_ev_fd *ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
 					lockdep_is_held(&ctx->uring_lock));
 	if (!ev_fd)
 		return -ENXIO;
 	ctx->has_evfd = false;
 	rcu_assign_pointer(ctx->io_ev_fd, NULL);
 	if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
 		call_rcu(&ev_fd->rcu, io_eventfd_ops);
 	return 0;
 }
 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
 			   unsigned nr_args)
 {
@ -93,9 +39,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
 	size_t size;
 	int i, ret;
 	if (nr_args > IORING_OP_LAST)
 		nr_args = IORING_OP_LAST;
 	size = struct_size(p, ops, nr_args);
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 	p = kzalloc(size, GFP_KERNEL);
 	if (!p)
 		return -ENOMEM;
@ -108,12 +55,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
 		goto out;
 	p->last_op = IORING_OP_LAST - 1;
 	if (nr_args > IORING_OP_LAST)
 		nr_args = IORING_OP_LAST;
 	for (i = 0; i < nr_args; i++) {
 		p->ops[i].op = i;
-		if (!io_issue_defs[i].not_supported)
+		if (io_uring_op_supported(i))
 			p->ops[i].flags = IO_URING_OP_SUPPORTED;
 	}
 	p->ops_len = i;
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@ -85,31 +85,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 	return 0;
 }
 /*
  * Copy the iovec at position @index of the user array @arg into @dst.
  *
  * With CONFIG_COMPAT and ctx->compat set, @arg is read as an array of
  * struct compat_iovec and the entry is widened into @dst; otherwise it is
  * read as an array of struct iovec.
  *
  * Returns 0 on success or -EFAULT if the user copy fails.
  */
 static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
 		       void __user *arg, unsigned index)
 {
 	struct iovec __user *uvecs = (struct iovec __user *) arg;
 #ifdef CONFIG_COMPAT
 	if (ctx->compat) {
 		struct compat_iovec __user *cuvecs;
 		struct compat_iovec civ;
 		cuvecs = (struct compat_iovec __user *) arg;
 		if (copy_from_user(&civ, &cuvecs[index], sizeof(civ)))
 			return -EFAULT;
 		dst->iov_base = u64_to_user_ptr((u64)civ.iov_base);
 		dst->iov_len = civ.iov_len;
 		return 0;
 	}
 #endif
 	return copy_from_user(dst, &uvecs[index], sizeof(*dst)) ? -EFAULT : 0;
 }
 static int io_buffer_validate(struct iovec *iov)
 {
 	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
@ -249,7 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 		ret = io_run_task_work_sig(ctx);
 		if (ret < 0) {
-			__set_current_state(TASK_RUNNING);
+			finish_wait(&ctx->rsrc_quiesce_wq, &we);
 			mutex_lock(&ctx->uring_lock);
 			if (list_empty(&ctx->rsrc_ref_list))
 				ret = 0;
@ -257,7 +232,6 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 		}
 		schedule();
 		__set_current_state(TASK_RUNNING);
 		mutex_lock(&ctx->uring_lock);
 		ret = 0;
 	} while (!list_empty(&ctx->rsrc_ref_list));
@ -420,8 +394,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 				   struct io_uring_rsrc_update2 *up,
 				   unsigned int nr_args)
 {
 	struct iovec __user *uvec = u64_to_user_ptr(up->data);
 	u64 __user *tags = u64_to_user_ptr(up->tags);
-	struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
+	struct iovec fast_iov, *iov;
 	struct page *last_hpage = NULL;
 	__u32 done;
 	int i, err;
@ -435,21 +410,23 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		struct io_mapped_ubuf *imu;
 		u64 tag = 0;
-		err = io_copy_iov(ctx, &iov, iovs, done);
+		iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat);
-		if (err)
+		if (IS_ERR(iov)) {
 			err = PTR_ERR(iov);
 			break;
 		}
 		if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
 			err = -EFAULT;
 			break;
 		}
-		err = io_buffer_validate(&iov);
+		err = io_buffer_validate(iov);
 		if (err)
 			break;
-		if (!iov.iov_base && tag) {
+		if (!iov->iov_base && tag) {
 			err = -EINVAL;
 			break;
 		}
-		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
+		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
 		if (err)
 			break;
@ -971,8 +948,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 {
 	struct page *last_hpage = NULL;
 	struct io_rsrc_data *data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec = (struct iovec * __user) arg;
 	int i, ret;
 	struct iovec iov;
 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@ -989,24 +967,27 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return ret;
 	}
 	if (!arg)
 		memset(iov, 0, sizeof(*iov));
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
 		if (arg) {
-			ret = io_copy_iov(ctx, &iov, arg, i);
+			iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat);
 			if (IS_ERR(iov)) {
 				ret = PTR_ERR(iov);
 				break;
 			}
 			ret = io_buffer_validate(iov);
 			if (ret)
 				break;
 			ret = io_buffer_validate(&iov);
 			if (ret)
 				break;
 		} else {
 			memset(&iov, 0, sizeof(iov));
 		}
-		if (!iov.iov_base && *io_get_tag_slot(data, i)) {
+		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
 			ret = -EINVAL;
 			break;
 		}
-		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
+		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
 					     &last_hpage);
 		if (ret)
 			break;
--- a/kernel/signal.c
+++ b/kernel/signal.c
@ -2600,6 +2600,14 @@ static void do_freezer_trap(void)
 	spin_unlock_irq(&current->sighand->siglock);
 	cgroup_enter_frozen();
 	schedule();
 	/*
 	 * We could've been woken by task_work, run it to clear
 	 * TIF_NOTIFY_SIGNAL. The caller will retry if necessary.
 	 */
 	clear_notify_signal();
 	if (unlikely(task_work_pending(current)))
 		task_work_run();
 }
 static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
--- a/net/socket.c
+++ b/net/socket.c
@ -1822,6 +1822,20 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
 	return __sys_socketpair(family, type, protocol, usockvec);
 }
 /*
  * Bind @sock to the kernel-space @address of length @addrlen.
  *
  * The security_socket_bind() hook runs first; the socket's ->bind()
  * operation is only called if the hook returns 0.
  *
  * Returns the hook's error if it is non-zero, otherwise the result of
  * ->bind().
  */
 int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
 		      int addrlen)
 {
 	struct sockaddr *addr = (struct sockaddr *)address;
 	int err = security_socket_bind(sock, addr, addrlen);
 	if (err)
 		return err;
 	return READ_ONCE(sock->ops)->bind(sock, addr, addrlen);
 }
 /*
 *	Bind a name to a socket. Nothing much to do here since it's
 *	the protocol's responsibility to handle the local address.
@ -1839,15 +1853,8 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (sock) {
 		err = move_addr_to_kernel(umyaddr, addrlen, &address);
-		if (!err) {
+		if (!err)
-			err = security_socket_bind(sock,
+			err = __sys_bind_socket(sock, &address, addrlen);
 						   (struct sockaddr *)&address,
 						   addrlen);
 			if (!err)
 				err = READ_ONCE(sock->ops)->bind(sock,
 						      (struct sockaddr *)
 						      &address, addrlen);
 		}
 		fput_light(sock->file, fput_needed);
 	}
 	return err;
@ -1863,23 +1870,28 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
 *	necessary for a listen, and if that works, we mark the socket as
 *	ready for listening.
 */
 /*
  * Put @sock into the listening state with the given @backlog.
  *
  * The backlog is capped at the sysctl_somaxconn value of the socket's
  * network namespace. The security_socket_listen() hook then runs, and the
  * socket's ->listen() operation is only called if the hook returns 0.
  *
  * Returns the hook's error if it is non-zero, otherwise the result of
  * ->listen().
  */
 int __sys_listen_socket(struct socket *sock, int backlog)
 {
 	int max_backlog = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
 	int err;
 	/*
 	 * The compare is unsigned, so a negative backlog converts to a large
 	 * value and is capped as well (for a non-negative limit).
 	 */
 	if ((unsigned int)backlog > max_backlog)
 		backlog = max_backlog;
 	err = security_socket_listen(sock, backlog);
 	if (err)
 		return err;
 	return READ_ONCE(sock->ops)->listen(sock, backlog);
 }
 int __sys_listen(int fd, int backlog)
 {
 	struct socket *sock;
 	int err, fput_needed;
 	int somaxconn;
 	sock = sockfd_lookup_light(fd, &err, &fput_needed);
 	if (sock) {
-		somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
+		err = __sys_listen_socket(sock, backlog);
 		if ((unsigned int)backlog > somaxconn)
 			backlog = somaxconn;
 		err = security_socket_listen(sock, backlog);
 		if (!err)
 			err = READ_ONCE(sock->ops)->listen(sock, backlog);
 		fput_light(sock->file, fput_needed);
 	}
 	return err;