2022-06-13 13:12:45 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
#ifndef IOU_RSRC_H
|
|
|
|
#define IOU_RSRC_H
|
|
|
|
|
2023-04-04 12:39:57 +00:00
|
|
|
/*
 * Fixed at 32. NOTE(review): by its name this is presumably the cap on
 * cached resource-node allocations; the user of the constant is not
 * visible in this header - confirm.
 */
#define IO_NODE_ALLOC_CACHE_MAX 32
|
|
|
|
|
2022-06-13 13:12:45 +00:00
|
|
|
/*
 * Tag table geometry, all derived from PAGE_SHIFT:
 *   SHIFT = PAGE_SHIFT - 3
 *   MAX   = 1U << SHIFT   (a power of two)
 *   MASK  = MAX - 1       (the low SHIFT bits set)
 * NOTE(review): the subtraction of 3 looks like log2 of an 8-byte tag, which
 * would make MAX the number of u64 tags that fit in one page - confirm
 * against the users of these macros.
 */
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
|
|
|
|
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
|
|
|
|
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
|
|
|
|
|
|
|
|
/*
 * The two resource kinds, numbered 0 and 1.
 * NOTE(review): presumably the values carried in io_rsrc_node.type and
 * passed as the type argument of io_rsrc_node_alloc() - confirm at callers.
 */
enum {
|
|
|
|
IORING_RSRC_FILE = 0,
|
|
|
|
IORING_RSRC_BUFFER = 1,
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * One reference-counted resource node.
 *
 * @type:     small integer kind; presumably IORING_RSRC_FILE or
 *            IORING_RSRC_BUFFER - TODO confirm at the allocation site.
 * @refs:     plain (non-atomic) int count. The inline helpers in this header
 *            bump it with ++ and drop it with --; io_put_rsrc_node() calls
 *            io_free_rsrc_node() when the count reaches zero.
 * @tag:      64-bit tag value; how it is consumed is not visible here.
 * @file_ptr: union member, an unsigned long.
 * @buf:      union member sharing storage with @file_ptr, a pointer to a
 *            struct io_mapped_ubuf. NOTE(review): which member is valid
 *            presumably follows @type - confirm.
 *
 * NOTE(review): the dated lines, bare | rows and commit-message text inside
 * this definition are blame-view residue, not C; they are left untouched.
 */
struct io_rsrc_node {
|
2024-11-07 11:01:35 +00:00
|
|
|
unsigned char type;
|
2023-04-04 12:39:49 +00:00
|
|
|
int refs;
|
2022-06-13 13:12:45 +00:00
|
|
|
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
u64 tag;
|
|
|
|
union {
|
|
|
|
unsigned long file_ptr;
|
|
|
|
struct io_mapped_ubuf *buf;
|
|
|
|
};
|
2024-10-25 14:54:28 +00:00
|
|
|
};
|
|
|
|
|
2022-06-19 01:44:33 +00:00
|
|
|
/*
 * Descriptor that io_rsrc_node.buf points at and that io_import_fixed()
 * takes as its imu argument.
 *
 * @ubuf:        64-bit value; presumably the user address where the buffer
 *               starts - TODO confirm.
 * @len:         unsigned length; presumably in bytes - TODO confirm.
 * @nr_bvecs:    element count of @bvec (per the __counted_by annotation).
 * @folio_shift: NOTE(review): presumably log2 of the folio size backing
 *               each bvec entry - confirm.
 * @refs:        refcount_t reference count on this descriptor.
 * @acct_pages:  NOTE(review): presumably the page count charged through
 *               __io_account_mem() - confirm.
 * @bvec:        flexible array member, bounded by @nr_bvecs.
 */
struct io_mapped_ubuf {
|
|
|
|
u64 ubuf;
|
2024-09-15 14:53:45 +00:00
|
|
|
unsigned int len;
|
2022-06-19 01:44:33 +00:00
|
|
|
unsigned int nr_bvecs;
|
2024-07-31 09:01:32 +00:00
|
|
|
unsigned int folio_shift;
|
2024-09-11 19:54:32 +00:00
|
|
|
refcount_t refs;
|
2024-09-15 14:53:45 +00:00
|
|
|
unsigned long acct_pages;
|
2023-08-17 21:21:47 +00:00
|
|
|
struct bio_vec bvec[] __counted_by(nr_bvecs);
|
2022-06-19 01:44:33 +00:00
|
|
|
};
|
|
|
|
|
2024-07-31 09:01:33 +00:00
|
|
|
/*
 * Folio layout summary handed to io_check_coalesce_buffer() through its
 * data argument. All four members are unsigned int.
 * NOTE(review): whether the callee fills this in or only reads it is not
 * visible in this header - confirm against the definition.
 */
struct io_imu_folio_data {
|
|
|
|
/* Head folio can be partially included in the fixed buf */
|
|
|
|
unsigned int nr_pages_head;
|
|
|
|
/* For non-head/tail folios, has to be fully included */
|
|
|
|
unsigned int nr_pages_mid;
|
|
|
|
unsigned int folio_shift;
|
2024-11-29 13:34:23 +00:00
|
|
|
unsigned int nr_folios;
|
2024-07-31 09:01:33 +00:00
|
|
|
};
|
|
|
|
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
/*
 * Node and table lifecycle entry points; all four are defined outside this
 * header. io_free_rsrc_node() is what io_put_rsrc_node() invokes once a
 * node reference count reaches zero.
 * NOTE(review): the type argument is presumably IORING_RSRC_FILE or
 * IORING_RSRC_BUFFER, and the int returned by io_rsrc_data_alloc() is
 * presumably 0 or a negative errno - confirm against the definitions.
 */
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
|
2024-11-07 11:01:34 +00:00
|
|
|
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node);
|
|
|
|
void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data);
|
2024-10-26 20:50:13 +00:00
|
|
|
int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
|
2022-06-13 13:12:45 +00:00
|
|
|
|
2022-06-20 00:25:59 +00:00
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): judging by the signature this presumably sets up iter over
 * the len bytes starting at buf_addr inside the mapped buffer imu, for
 * transfer direction ddir, returning 0 or a negative errno - confirm.
 */
int io_import_fixed(int ddir, struct iov_iter *iter,
|
|
|
|
struct io_mapped_ubuf *imu,
|
|
|
|
u64 buf_addr, size_t len);
|
2022-06-13 13:12:45 +00:00
|
|
|
|
2024-09-14 14:51:15 +00:00
|
|
|
/*
 * Defined outside this header. arg is a userspace pointer (__user); the
 * layout it points at is not visible here.
 */
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
|
2022-06-13 13:12:45 +00:00
|
|
|
/*
 * Register/unregister entry points for buffers and files, defined outside
 * this header. In the register variants arg and tags are userspace
 * pointers (__user) and tags points at u64 values.
 * NOTE(review): nr_args is presumably the element count behind arg and
 * tags, and the int results presumably 0 or a negative errno - confirm.
 */
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
|
|
|
|
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
|
|
|
|
unsigned int nr_args, u64 __user *tags);
|
|
|
|
int io_sqe_files_unregister(struct io_ring_ctx *ctx);
|
|
|
|
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
|
|
|
|
unsigned nr_args, u64 __user *tags);
|
|
|
|
|
|
|
|
/*
 * Generic registration and update entry points, defined outside this
 * header. arg is a userspace pointer (__user) in all three.
 * NOTE(review): size is presumably the byte size of the structure behind
 * arg and type presumably IORING_RSRC_FILE or IORING_RSRC_BUFFER - confirm
 * against the definitions.
 */
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
|
|
|
|
unsigned nr_args);
|
|
|
|
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
|
|
|
|
unsigned size, unsigned type);
|
|
|
|
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
|
|
|
|
unsigned int size, unsigned int type);
|
|
|
|
|
2024-11-29 13:34:23 +00:00
|
|
|
/*
 * Defined outside this header. Takes an array of nr_pages page pointers
 * plus a struct io_imu_folio_data and returns a bool.
 * NOTE(review): presumably true means the pages can be coalesced into
 * folios as described by data - confirm against the definition.
 */
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
|
|
|
|
struct io_imu_folio_data *data);
|
|
|
|
|
2024-10-27 15:08:31 +00:00
|
|
|
static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data,
|
|
|
|
int index)
|
|
|
|
{
|
|
|
|
if (index < data->nr)
|
|
|
|
return data->nodes[array_index_nospec(index, data->nr)];
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-11-07 11:01:34 +00:00
|
|
|
static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
2024-10-30 15:51:58 +00:00
|
|
|
if (node && !--node->refs)
|
2024-11-07 11:01:34 +00:00
|
|
|
io_free_rsrc_node(ctx, node);
|
2024-01-11 20:34:33 +00:00
|
|
|
}
|
|
|
|
|
2024-11-07 11:01:34 +00:00
|
|
|
static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
|
|
|
|
struct io_rsrc_data *data, int index)
|
2024-10-29 15:02:38 +00:00
|
|
|
{
|
|
|
|
struct io_rsrc_node *node = data->nodes[index];
|
|
|
|
|
|
|
|
if (!node)
|
|
|
|
return false;
|
2024-11-07 11:01:34 +00:00
|
|
|
io_put_rsrc_node(ctx, node);
|
2024-10-29 15:02:38 +00:00
|
|
|
data->nodes[index] = NULL;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need for specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
|
2022-06-13 13:12:45 +00:00
|
|
|
{
|
2024-11-03 15:46:07 +00:00
|
|
|
if (req->file_node) {
|
2024-11-07 11:01:34 +00:00
|
|
|
io_put_rsrc_node(req->ctx, req->file_node);
|
2024-11-03 15:46:07 +00:00
|
|
|
req->file_node = NULL;
|
|
|
|
}
|
|
|
|
if (req->flags & REQ_F_BUF_NODE) {
|
2024-11-07 11:01:34 +00:00
|
|
|
io_put_rsrc_node(req->ctx, req->buf_node);
|
2024-11-03 15:46:07 +00:00
|
|
|
req->buf_node = NULL;
|
|
|
|
}
|
2022-06-13 13:12:45 +00:00
|
|
|
}
|
|
|
|
|
2024-11-03 15:46:07 +00:00
|
|
|
static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
|
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.
As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.
At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.
Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.
Outside of removing the stall in resource reclaim described above, it
has the following advantages:
1) It's a lot simpler than the previous scheme, and easier to follow.
No need to specific quiesce handling anymore.
2) There are no resource node allocations in the fast path, all of that
happens at resource registration time.
3) The structs related to resource handling can all get simplified
quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
go away completely.
4) Handling of resource tags is much simpler, and doesn't require
persistent storage as it can simply get assigned up front at
registration time. Just copy them in one-by-one at registration time
and assign to the resource node.
The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.
With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2024-10-26 01:27:39 +00:00
|
|
|
struct io_rsrc_node *node)
|
2023-04-11 11:06:07 +00:00
|
|
|
{
|
2024-10-30 15:51:58 +00:00
|
|
|
node->refs++;
|
2024-11-03 15:46:07 +00:00
|
|
|
*dst_node = node;
|
2023-04-11 11:06:07 +00:00
|
|
|
}
|
|
|
|
|
2024-11-07 11:01:36 +00:00
|
|
|
static inline void io_req_assign_buf_node(struct io_kiocb *req,
|
|
|
|
struct io_rsrc_node *node)
|
|
|
|
{
|
|
|
|
io_req_assign_rsrc_node(&req->buf_node, node);
|
|
|
|
req->flags |= REQ_F_BUF_NODE;
|
|
|
|
}
|
|
|
|
|
2022-09-01 10:54:02 +00:00
|
|
|
/*
 * Request handlers defined outside this header: one takes the request plus
 * issue_flags, the other the request plus the submission queue entry.
 * NOTE(review): presumably the issue and prep steps of a files-update
 * opcode - confirm against the opcode table.
 */
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
|
|
|
|
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
2022-07-25 09:52:05 +00:00
|
|
|
|
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): presumably the charging counterpart of __io_unaccount_mem(),
 * adding nr_pages to user->locked_vm and returning 0 or a negative errno -
 * confirm against the definition.
 */
int __io_account_mem(struct user_struct *user, unsigned long nr_pages);
|
|
|
|
|
|
|
|
static inline void __io_unaccount_mem(struct user_struct *user,
|
|
|
|
unsigned long nr_pages)
|
|
|
|
{
|
|
|
|
atomic_long_sub(nr_pages, &user->locked_vm);
|
|
|
|
}
|
|
|
|
|
2022-06-13 13:12:45 +00:00
|
|
|
#endif
|