2022-05-25 11:59:19 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/file.h>
|
2023-12-01 00:57:35 +00:00
|
|
|
#include <linux/io_uring/cmd.h>
|
2024-04-09 21:05:53 +00:00
|
|
|
#include <linux/io_uring/net.h>
|
2022-07-15 19:16:22 +00:00
|
|
|
#include <linux/security.h>
|
2022-09-30 06:27:39 +00:00
|
|
|
#include <linux/nospec.h>
|
2024-02-12 23:42:36 +00:00
|
|
|
#include <net/sock.h>
|
2022-05-25 11:59:19 +00:00
|
|
|
|
|
|
|
#include <uapi/linux/io_uring.h>
|
io_uring/cmd: fix breakage in SOCKET_URING_OP_SIOC* implementation
In 8e9fad0e70b7 "io_uring: Add io_uring command support for sockets"
you've got an include of asm-generic/ioctls.h done in io_uring/uring_cmd.c.
That had been done for the sake of this chunk -
+ ret = prot->ioctl(sk, SIOCINQ, &arg);
+ if (ret)
+ return ret;
+ return arg;
+ case SOCKET_URING_OP_SIOCOUTQ:
+ ret = prot->ioctl(sk, SIOCOUTQ, &arg);
SIOC{IN,OUT}Q are defined to symbols (FIONREAD and TIOCOUTQ) that come from
ioctls.h, all right, but the values vary by the architecture.
FIONREAD is
0x467F on mips
0x4004667F on alpha, powerpc and sparc
0x8004667F on sh and xtensa
0x541B everywhere else
TIOCOUTQ is
0x7472 on mips
0x40047473 on alpha, powerpc and sparc
0x80047473 on sh and xtensa
0x5411 everywhere else
->ioctl() expects the same values it would've gotten from userland; all
places where we compare with SIOC{IN,OUT}Q are using asm/ioctls.h, so
they pick the correct values. io_uring_cmd_sock(), OTOH, ends up
passing the default ones.
Fixes: 8e9fad0e70b7 ("io_uring: Add io_uring command support for sockets")
Cc: <stable@vger.kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20231214213408.GT1674809@ZenIV
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2023-12-14 21:34:08 +00:00
|
|
|
#include <asm/ioctls.h>
|
2022-05-25 11:59:19 +00:00
|
|
|
|
|
|
|
#include "io_uring.h"
|
2024-03-20 21:19:44 +00:00
|
|
|
#include "alloc_cache.h"
|
2022-09-30 06:27:38 +00:00
|
|
|
#include "rsrc.h"
|
2022-05-25 11:59:19 +00:00
|
|
|
#include "uring_cmd.h"
|
|
|
|
|
2024-03-19 02:41:58 +00:00
|
|
|
static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
|
|
|
struct uring_cache *cache = req->async_data;
|
|
|
|
|
|
|
|
if (issue_flags & IO_URING_F_UNLOCKED)
|
|
|
|
return;
|
2024-03-20 21:19:44 +00:00
|
|
|
if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) {
|
2024-03-19 02:41:58 +00:00
|
|
|
ioucmd->sqe = NULL;
|
|
|
|
req->async_data = NULL;
|
|
|
|
req->flags &= ~REQ_F_ASYNC_DATA;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-03-18 22:00:23 +00:00
|
|
|
/*
 * Walk the ring's list of cancelable uring_cmd requests and call the file's
 * ->uring_cmd() handler with IO_URING_F_CANCEL for every matching entry.
 * Unless @cancel_all is set, only requests whose ->tctx equals @tctx are
 * considered. The caller must hold ctx->uring_lock. Completions are flushed
 * via io_submit_flush_completions() before returning.
 *
 * Returns true if at least one command was asked to cancel.
 */
bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
				   struct io_uring_task *tctx, bool cancel_all)
{
	const unsigned int cancel_flags = IO_URING_F_CANCEL |
					  IO_URING_F_COMPLETE_DEFER;
	struct hlist_node *next;
	struct io_kiocb *req;
	bool found = false;

	lockdep_assert_held(&ctx->uring_lock);

	hlist_for_each_entry_safe(req, next, &ctx->cancelable_uring_cmd,
				  hash_node) {
		struct io_uring_cmd *cmd;

		if (!cancel_all && req->tctx != tctx)
			continue;

		cmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
		if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
			continue;

		/* ->sqe isn't available if no async data */
		if (!req_has_async_data(req))
			cmd->sqe = NULL;
		req->file->f_op->uring_cmd(cmd, cancel_flags);
		found = true;
	}

	io_submit_flush_completions(ctx);
	return found;
}
|
|
|
|
|
2023-09-28 12:43:25 +00:00
|
|
|
static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd,
|
|
|
|
unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
|
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
|
|
|
|
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE))
|
|
|
|
return;
|
|
|
|
|
|
|
|
cmd->flags &= ~IORING_URING_CMD_CANCELABLE;
|
|
|
|
io_ring_submit_lock(ctx, issue_flags);
|
|
|
|
hlist_del(&req->hash_node);
|
|
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark this command as concelable, then io_uring_try_cancel_uring_cmd()
|
|
|
|
* will try to cancel this issued command by sending ->uring_cmd() with
|
|
|
|
* issue_flags of IO_URING_F_CANCEL.
|
|
|
|
*
|
|
|
|
* The command is guaranteed to not be done when calling ->uring_cmd()
|
|
|
|
* with IO_URING_F_CANCEL, but it is driver's responsibility to deal
|
|
|
|
* with race between io_uring canceling and normal completion.
|
|
|
|
*/
|
|
|
|
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
|
|
|
|
unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
struct io_kiocb *req = cmd_to_io_kiocb(cmd);
|
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
|
|
|
|
if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
|
|
|
|
cmd->flags |= IORING_URING_CMD_CANCELABLE;
|
|
|
|
io_ring_submit_lock(ctx, issue_flags);
|
|
|
|
hlist_add_head(&req->hash_node, &ctx->cancelable_uring_cmd);
|
|
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
|
|
|
|
|
2023-03-27 15:38:15 +00:00
|
|
|
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
|
2022-05-25 11:59:19 +00:00
|
|
|
{
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
2024-11-04 16:12:04 +00:00
|
|
|
unsigned int flags = IO_URING_F_COMPLETE_DEFER;
|
|
|
|
|
|
|
|
if (current->flags & (PF_EXITING | PF_KTHREAD))
|
|
|
|
flags |= IO_URING_F_TASK_DEAD;
|
2024-03-18 22:00:25 +00:00
|
|
|
|
2024-03-18 22:00:30 +00:00
|
|
|
/* task_work executor checks the deffered list completion */
|
2024-11-04 16:12:04 +00:00
|
|
|
ioucmd->task_work_cb(ioucmd, flags);
|
2022-05-25 11:59:19 +00:00
|
|
|
}
|
|
|
|
|
2023-05-15 12:54:42 +00:00
|
|
|
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
|
|
|
|
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
|
|
|
|
unsigned flags)
|
2022-05-25 11:59:19 +00:00
|
|
|
{
|
|
|
|
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
|
|
|
|
|
|
|
|
ioucmd->task_work_cb = task_work_cb;
|
|
|
|
req->io_task_work.func = io_uring_cmd_work;
|
2023-05-15 12:54:42 +00:00
|
|
|
__io_req_task_work_add(req, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
|
|
|
|
|
2022-05-25 11:59:19 +00:00
|
|
|
/* Store both extra fields of the request's big CQE. */
static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
					  u64 extra1, u64 extra2)
{
	req->big_cqe.extra2 = extra2;
	req->big_cqe.extra1 = extra1;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called by consumers of io_uring_cmd, if they originally returned
|
|
|
|
* -EIOCBQUEUED upon receiving the command.
|
|
|
|
*/
|
2024-12-03 10:31:05 +00:00
|
|
|
void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2,
|
2023-03-21 02:01:25 +00:00
|
|
|
unsigned issue_flags)
|
2022-05-25 11:59:19 +00:00
|
|
|
{
|
|
|
|
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
|
|
|
|
|
2023-09-28 12:43:25 +00:00
|
|
|
io_uring_cmd_del_cancelable(ioucmd, issue_flags);
|
|
|
|
|
2022-05-25 11:59:19 +00:00
|
|
|
if (ret < 0)
|
|
|
|
req_set_fail(req);
|
|
|
|
|
2022-08-03 12:07:57 +00:00
|
|
|
io_req_set_res(req, ret, 0);
|
2022-05-25 11:59:19 +00:00
|
|
|
if (req->ctx->flags & IORING_SETUP_CQE32)
|
|
|
|
io_req_set_cqe32_extra(req, res2, 0);
|
2024-03-19 02:41:58 +00:00
|
|
|
io_req_uring_cleanup(req, issue_flags);
|
2023-04-12 18:07:36 +00:00
|
|
|
if (req->ctx->flags & IORING_SETUP_IOPOLL) {
|
2022-08-23 16:14:41 +00:00
|
|
|
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
|
|
|
|
smp_store_release(&req->iopoll_completed, 1);
|
2024-03-18 22:00:25 +00:00
|
|
|
} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
|
|
|
|
if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED))
|
|
|
|
return;
|
2024-03-18 22:00:24 +00:00
|
|
|
io_req_complete_defer(req);
|
2023-04-12 18:07:36 +00:00
|
|
|
} else {
|
2024-03-18 22:00:24 +00:00
|
|
|
req->io_task_work.func = io_req_task_complete;
|
|
|
|
io_req_task_work_add(req);
|
2023-04-12 18:07:36 +00:00
|
|
|
}
|
2022-05-25 11:59:19 +00:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
|
|
|
|
|
2024-03-19 02:41:58 +00:00
|
|
|
static int io_uring_cmd_prep_setup(struct io_kiocb *req,
|
|
|
|
const struct io_uring_sqe *sqe)
|
2022-05-25 11:59:19 +00:00
|
|
|
{
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
2024-03-19 02:41:58 +00:00
|
|
|
struct uring_cache *cache;
|
2022-05-25 11:59:19 +00:00
|
|
|
|
2024-12-16 20:46:11 +00:00
|
|
|
cache = io_uring_alloc_async_data(&req->ctx->uring_cache, req, NULL);
|
|
|
|
if (!cache)
|
io_uring/uring_cmd: defer SQE copying until it's needed
The previous commit turned on async data for uring_cmd, and did the
basic conversion of setting everything up on the prep side. However, for
a lot of use cases, -EIOCBQUEUED will get returned on issue, as the
operation got successfully queued. For that case, a persistent SQE isn't
needed, as it's just used for issue.
Unless execution goes async immediately, defer copying the double SQE
until it's necessary.
This greatly reduces the overhead of such commands, as evidenced by
a perf diff from before and after this change:
10.60% -8.58% [kernel.vmlinux] [k] io_uring_cmd_prep
where the prep side drops from 10.60% to ~2%, which is more expected.
Performance also rises from ~113M IOPS to ~122M IOPS, bringing us back
to where it was before the async command prep.
Tested-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-03-20 21:23:47 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
if (!(req->flags & REQ_F_FORCE_ASYNC)) {
|
|
|
|
/* defer memcpy until we need it */
|
|
|
|
ioucmd->sqe = sqe;
|
2024-03-19 02:41:58 +00:00
|
|
|
return 0;
|
|
|
|
}
|
io_uring/uring_cmd: defer SQE copying until it's needed
The previous commit turned on async data for uring_cmd, and did the
basic conversion of setting everything up on the prep side. However, for
a lot of use cases, -EIOCBQUEUED will get returned on issue, as the
operation got successfully queued. For that case, a persistent SQE isn't
needed, as it's just used for issue.
Unless execution goes async immediately, defer copying the double SQE
until it's necessary.
This greatly reduces the overhead of such commands, as evidenced by
a perf diff from before and after this change:
10.60% -8.58% [kernel.vmlinux] [k] io_uring_cmd_prep
where the prep side drops from 10.60% to ~2%, which is more expected.
Performance also rises from ~113M IOPS to ~122M IOPS, bringing us back
to where it was before the async command prep.
Tested-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-03-20 21:23:47 +00:00
|
|
|
|
|
|
|
memcpy(req->async_data, sqe, uring_sqe_size(req->ctx));
|
|
|
|
ioucmd->sqe = req->async_data;
|
|
|
|
return 0;
|
2022-05-25 11:59:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
|
|
{
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
2022-05-25 11:59:19 +00:00
|
|
|
|
2022-09-30 06:27:39 +00:00
|
|
|
if (sqe->__pad1)
|
2022-05-25 11:59:19 +00:00
|
|
|
return -EINVAL;
|
2022-09-30 06:27:39 +00:00
|
|
|
|
|
|
|
ioucmd->flags = READ_ONCE(sqe->uring_cmd_flags);
|
2023-09-28 12:43:24 +00:00
|
|
|
if (ioucmd->flags & ~IORING_URING_CMD_MASK)
|
2022-09-30 06:27:39 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
|
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
2024-10-27 15:08:31 +00:00
|
|
|
struct io_rsrc_node *node;
|
2024-11-11 10:13:18 +00:00
|
|
|
u16 index = READ_ONCE(sqe->buf_index);
|
2022-09-30 06:27:39 +00:00
|
|
|
|
2024-11-11 10:13:18 +00:00
|
|
|
node = io_rsrc_node_lookup(&ctx->buf_table, index);
|
2024-10-27 15:08:31 +00:00
|
|
|
if (unlikely(!node))
|
2022-09-30 06:27:39 +00:00
|
|
|
return -EFAULT;
|
2024-10-16 21:48:38 +00:00
|
|
|
/*
|
|
|
|
* Pi node upfront, prior to io_uring_cmd_import_fixed()
|
|
|
|
* being called. This prevents destruction of the mapped buffer
|
|
|
|
* we'll need at actual import time.
|
|
|
|
*/
|
2024-11-07 11:01:36 +00:00
|
|
|
io_req_assign_buf_node(req, node);
|
2022-09-30 06:27:39 +00:00
|
|
|
}
|
2022-05-25 11:59:19 +00:00
|
|
|
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
|
2024-03-19 02:41:58 +00:00
|
|
|
|
|
|
|
return io_uring_cmd_prep_setup(req, sqe);
|
2022-05-25 11:59:19 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
|
|
|
|
{
|
2022-08-11 07:11:15 +00:00
|
|
|
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
|
2022-05-25 11:59:19 +00:00
|
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
|
|
struct file *file = req->file;
|
|
|
|
int ret;
|
|
|
|
|
2023-03-08 16:26:13 +00:00
|
|
|
if (!file->f_op->uring_cmd)
|
2022-05-25 11:59:19 +00:00
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
2022-07-15 19:16:22 +00:00
|
|
|
ret = security_uring_cmd(ioucmd);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2022-05-25 11:59:19 +00:00
|
|
|
if (ctx->flags & IORING_SETUP_SQE128)
|
|
|
|
issue_flags |= IO_URING_F_SQE128;
|
|
|
|
if (ctx->flags & IORING_SETUP_CQE32)
|
|
|
|
issue_flags |= IO_URING_F_CQE32;
|
2023-10-16 13:47:43 +00:00
|
|
|
if (ctx->compat)
|
|
|
|
issue_flags |= IO_URING_F_COMPAT;
|
2022-08-23 16:14:41 +00:00
|
|
|
if (ctx->flags & IORING_SETUP_IOPOLL) {
|
2023-03-08 16:26:13 +00:00
|
|
|
if (!file->f_op->uring_cmd_iopoll)
|
|
|
|
return -EOPNOTSUPP;
|
2022-05-25 11:59:19 +00:00
|
|
|
issue_flags |= IO_URING_F_IOPOLL;
|
2022-08-23 16:14:41 +00:00
|
|
|
req->iopoll_completed = 0;
|
|
|
|
}
|
2022-05-25 11:59:19 +00:00
|
|
|
|
|
|
|
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
|
io_uring/uring_cmd: defer SQE copying until it's needed
The previous commit turned on async data for uring_cmd, and did the
basic conversion of setting everything up on the prep side. However, for
a lot of use cases, -EIOCBQUEUED will get returned on issue, as the
operation got successfully queued. For that case, a persistent SQE isn't
needed, as it's just used for issue.
Unless execution goes async immediately, defer copying the double SQE
until it's necessary.
This greatly reduces the overhead of such commands, as evidenced by
a perf diff from before and after this change:
10.60% -8.58% [kernel.vmlinux] [k] io_uring_cmd_prep
where the prep side drops from 10.60% to ~2%, which is more expected.
Performance also rises from ~113M IOPS to ~122M IOPS, bringing us back
to where it was before the async command prep.
Tested-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-03-20 21:23:47 +00:00
|
|
|
if (ret == -EAGAIN) {
|
|
|
|
struct uring_cache *cache = req->async_data;
|
|
|
|
|
|
|
|
if (ioucmd->sqe != (void *) cache)
|
|
|
|
memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx));
|
|
|
|
return -EAGAIN;
|
|
|
|
} else if (ret == -EIOCBQUEUED) {
|
|
|
|
return -EIOCBQUEUED;
|
|
|
|
}
|
2022-05-25 11:59:19 +00:00
|
|
|
|
2024-03-19 02:41:58 +00:00
|
|
|
if (ret < 0)
|
|
|
|
req_set_fail(req);
|
|
|
|
io_req_uring_cleanup(req, issue_flags);
|
|
|
|
io_req_set_res(req, ret, 0);
|
2024-07-24 11:16:19 +00:00
|
|
|
return IOU_OK;
|
2022-05-25 11:59:19 +00:00
|
|
|
}
|
2022-09-30 06:27:38 +00:00
|
|
|
|
|
|
|
/*
 * Set up @iter over the fixed buffer that was assigned to the request at
 * prep time, for the user range starting at @ubuf with length @len and
 * direction @rw.
 *
 * Returns the result of io_import_fixed(), or -EFAULT if the request has
 * no buffer node assigned.
 */
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
			      struct iov_iter *iter, void *ioucmd)
{
	struct io_rsrc_node *node = cmd_to_io_kiocb(ioucmd)->buf_node;

	/* Must have had rsrc_node assigned at prep time */
	if (!node)
		return -EFAULT;

	return io_import_fixed(rw, iter, node->buf, ubuf, len);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
|
2023-06-27 13:44:24 +00:00
|
|
|
|
2024-09-11 16:34:37 +00:00
|
|
|
/* Queue the request backing @ioucmd via io_req_queue_iowq(). */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
	io_req_queue_iowq(cmd_to_io_kiocb(ioucmd));
}
|
|
|
|
|
2023-10-16 13:47:47 +00:00
|
|
|
static inline int io_uring_cmd_getsockopt(struct socket *sock,
|
|
|
|
struct io_uring_cmd *cmd,
|
|
|
|
unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
|
|
|
|
int optlen, optname, level, err;
|
|
|
|
void __user *optval;
|
|
|
|
|
|
|
|
level = READ_ONCE(cmd->sqe->level);
|
|
|
|
if (level != SOL_SOCKET)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval));
|
|
|
|
optname = READ_ONCE(cmd->sqe->optname);
|
|
|
|
optlen = READ_ONCE(cmd->sqe->optlen);
|
|
|
|
|
|
|
|
err = do_sock_getsockopt(sock, compat, level, optname,
|
|
|
|
USER_SOCKPTR(optval),
|
|
|
|
KERNEL_SOCKPTR(&optlen));
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
/* On success, return optlen */
|
|
|
|
return optlen;
|
|
|
|
}
|
|
|
|
|
2023-10-16 13:47:48 +00:00
|
|
|
static inline int io_uring_cmd_setsockopt(struct socket *sock,
|
|
|
|
struct io_uring_cmd *cmd,
|
|
|
|
unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
bool compat = !!(issue_flags & IO_URING_F_COMPAT);
|
|
|
|
int optname, optlen, level;
|
|
|
|
void __user *optval;
|
|
|
|
sockptr_t optval_s;
|
|
|
|
|
|
|
|
optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval));
|
|
|
|
optname = READ_ONCE(cmd->sqe->optname);
|
|
|
|
optlen = READ_ONCE(cmd->sqe->optlen);
|
|
|
|
level = READ_ONCE(cmd->sqe->level);
|
|
|
|
optval_s = USER_SOCKPTR(optval);
|
|
|
|
|
|
|
|
return do_sock_setsockopt(sock, compat, level, optname, optval_s,
|
|
|
|
optlen);
|
|
|
|
}
|
|
|
|
|
2023-10-16 13:47:46 +00:00
|
|
|
#if defined(CONFIG_NET)
|
2023-06-27 13:44:24 +00:00
|
|
|
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
|
|
|
{
|
|
|
|
struct socket *sock = cmd->file->private_data;
|
|
|
|
struct sock *sk = sock->sk;
|
|
|
|
struct proto *prot = READ_ONCE(sk->sk_prot);
|
|
|
|
int ret, arg = 0;
|
|
|
|
|
|
|
|
if (!prot || !prot->ioctl)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
switch (cmd->sqe->cmd_op) {
|
|
|
|
case SOCKET_URING_OP_SIOCINQ:
|
|
|
|
ret = prot->ioctl(sk, SIOCINQ, &arg);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
return arg;
|
|
|
|
case SOCKET_URING_OP_SIOCOUTQ:
|
|
|
|
ret = prot->ioctl(sk, SIOCOUTQ, &arg);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
return arg;
|
2023-10-16 13:47:47 +00:00
|
|
|
case SOCKET_URING_OP_GETSOCKOPT:
|
|
|
|
return io_uring_cmd_getsockopt(sock, cmd, issue_flags);
|
2023-10-16 13:47:48 +00:00
|
|
|
case SOCKET_URING_OP_SETSOCKOPT:
|
|
|
|
return io_uring_cmd_setsockopt(sock, cmd, issue_flags);
|
2023-06-27 13:44:24 +00:00
|
|
|
default:
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
|
2023-10-16 13:47:46 +00:00
|
|
|
#endif
|