mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2024-12-29 17:23:36 +00:00
io_uring/register: add IORING_REGISTER_RESIZE_RINGS
Once a ring has been created, the size of the CQ and SQ rings are fixed. Usually this isn't a problem on the SQ ring side, as it merely controls the available number of requests that can be submitted in a single system call, and there's rarely a need to change that. For the CQ ring, it's a different story. For most efficient use of io_uring, it's important that the CQ ring never overflows. This means that applications must size it for the worst case scenario, which can be wasteful. Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize the existing rings. It takes a struct io_uring_params argument, the same one which is used to setup the ring initially, and resizes rings according to the sizes given. Certain properties are always inherited from the original ring setup, like SQE128/CQE32 and other setup options. The implementation only allows flag associated with how the CQ ring is sized and clamped. Existing unconsumed SQE and CQE entries are copied as part of the process. If either the SQ or CQ resized destination ring cannot hold the entries already present in the source rings, then the operation is failed with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new submissions, and the internal mapping holds the completion lock as well across moving CQ ring state. To prevent races between mmap and ring resizing, add a mutex that's solely used to serialize ring resize and mmap. mmap_sem can't be used here, as as fork'ed process may be doing mmaps on the ring as well. The ctx->resize_lock is held across mmap operations, and the resize will grab it before swapping out the already mapped new data. Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
parent
d090bffab6
commit
79cfe9e59c
@ -415,6 +415,13 @@ struct io_ring_ctx {
|
||||
/* protected by ->completion_lock */
|
||||
unsigned evfd_last_cq_tail;
|
||||
|
||||
/*
|
||||
* Protection for resize vs mmap races - both the mmap and resize
|
||||
* side will need to grab this lock, to prevent either side from
|
||||
* being run concurrently with the other.
|
||||
*/
|
||||
struct mutex resize_lock;
|
||||
|
||||
/*
|
||||
* If IORING_SETUP_NO_MMAP is used, then the below holds
|
||||
* the gup'ed pages for the two rings, and the sqes.
|
||||
|
@ -615,6 +615,11 @@ enum io_uring_register_op {
|
||||
/* send MSG_RING without having a ring */
|
||||
IORING_REGISTER_SEND_MSG_RING = 31,
|
||||
|
||||
/* 32 reserved for zc rx */
|
||||
|
||||
/* resize CQ ring */
|
||||
IORING_REGISTER_RESIZE_RINGS = 33,
|
||||
|
||||
/* this goes last */
|
||||
IORING_REGISTER_LAST,
|
||||
|
||||
|
@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
|
||||
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
|
||||
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
|
||||
io_napi_init(ctx);
|
||||
mutex_init(&ctx->resize_lock);
|
||||
|
||||
return ctx;
|
||||
|
||||
|
@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
unsigned int npages;
|
||||
void *ptr;
|
||||
|
||||
guard(mutex)(&ctx->resize_lock);
|
||||
|
||||
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
|
||||
if (IS_ERR(ptr))
|
||||
return PTR_ERR(ptr);
|
||||
@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
unsigned long len, unsigned long pgoff,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = filp->private_data;
|
||||
void *ptr;
|
||||
|
||||
/*
|
||||
@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
|
||||
if (addr)
|
||||
return -EINVAL;
|
||||
|
||||
guard(mutex)(&ctx->resize_lock);
|
||||
|
||||
ptr = io_uring_validate_mmap_request(filp, pgoff, len);
|
||||
if (IS_ERR(ptr))
|
||||
return -ENOMEM;
|
||||
@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
|
||||
unsigned long len, unsigned long pgoff,
|
||||
unsigned long flags)
|
||||
{
|
||||
struct io_ring_ctx *ctx = file->private_data;
|
||||
void *ptr;
|
||||
|
||||
guard(mutex)(&ctx->resize_lock);
|
||||
|
||||
ptr = io_uring_validate_mmap_request(file, pgoff, len);
|
||||
if (IS_ERR(ptr))
|
||||
return PTR_ERR(ptr);
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "napi.h"
|
||||
#include "eventfd.h"
|
||||
#include "msg_ring.h"
|
||||
#include "memmap.h"
|
||||
|
||||
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
|
||||
IORING_REGISTER_LAST + IORING_OP_LAST)
|
||||
@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* State to maintain until we can swap. Both new and old state, used for
|
||||
* either mapping or freeing.
|
||||
*/
|
||||
struct io_ring_ctx_rings {
|
||||
unsigned short n_ring_pages;
|
||||
unsigned short n_sqe_pages;
|
||||
struct page **ring_pages;
|
||||
struct page **sqe_pages;
|
||||
struct io_uring_sqe *sq_sqes;
|
||||
struct io_rings *rings;
|
||||
};
|
||||
|
||||
static void io_register_free_rings(struct io_uring_params *p,
|
||||
struct io_ring_ctx_rings *r)
|
||||
{
|
||||
if (!(p->flags & IORING_SETUP_NO_MMAP)) {
|
||||
io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
|
||||
true);
|
||||
io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
|
||||
true);
|
||||
} else {
|
||||
io_pages_free(&r->ring_pages, r->n_ring_pages);
|
||||
io_pages_free(&r->sqe_pages, r->n_sqe_pages);
|
||||
vunmap(r->rings);
|
||||
vunmap(r->sq_sqes);
|
||||
}
|
||||
}
|
||||
|
||||
#define swap_old(ctx, o, n, field) \
|
||||
do { \
|
||||
(o).field = (ctx)->field; \
|
||||
(ctx)->field = (n).field; \
|
||||
} while (0)
|
||||
|
||||
#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
|
||||
#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
|
||||
IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
|
||||
|
||||
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
|
||||
{
|
||||
struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
|
||||
size_t size, sq_array_offset;
|
||||
struct io_uring_params p;
|
||||
unsigned i, tail;
|
||||
void *ptr;
|
||||
int ret;
|
||||
|
||||
/* for single issuer, must be owner resizing */
|
||||
if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
|
||||
current != ctx->submitter_task)
|
||||
return -EEXIST;
|
||||
if (copy_from_user(&p, arg, sizeof(p)))
|
||||
return -EFAULT;
|
||||
if (p.flags & ~RESIZE_FLAGS)
|
||||
return -EINVAL;
|
||||
|
||||
/* properties that are always inherited */
|
||||
p.flags |= (ctx->flags & COPY_FLAGS);
|
||||
|
||||
ret = io_uring_fill_params(p.sq_entries, &p);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
||||
/* nothing to do, but copy params back */
|
||||
if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
|
||||
if (copy_to_user(arg, &p, sizeof(p)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
}
|
||||
|
||||
size = rings_size(p.flags, p.sq_entries, p.cq_entries,
|
||||
&sq_array_offset);
|
||||
if (size == SIZE_MAX)
|
||||
return -EOVERFLOW;
|
||||
|
||||
if (!(p.flags & IORING_SETUP_NO_MMAP))
|
||||
n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
|
||||
else
|
||||
n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
|
||||
p.cq_off.user_addr, size);
|
||||
if (IS_ERR(n.rings))
|
||||
return PTR_ERR(n.rings);
|
||||
|
||||
n.rings->sq_ring_mask = p.sq_entries - 1;
|
||||
n.rings->cq_ring_mask = p.cq_entries - 1;
|
||||
n.rings->sq_ring_entries = p.sq_entries;
|
||||
n.rings->cq_ring_entries = p.cq_entries;
|
||||
|
||||
if (copy_to_user(arg, &p, sizeof(p))) {
|
||||
io_register_free_rings(&p, &n);
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
if (p.flags & IORING_SETUP_SQE128)
|
||||
size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
|
||||
else
|
||||
size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
|
||||
if (size == SIZE_MAX) {
|
||||
io_register_free_rings(&p, &n);
|
||||
return -EOVERFLOW;
|
||||
}
|
||||
|
||||
if (!(p.flags & IORING_SETUP_NO_MMAP))
|
||||
ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
|
||||
else
|
||||
ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
|
||||
p.sq_off.user_addr,
|
||||
size);
|
||||
if (IS_ERR(ptr)) {
|
||||
io_register_free_rings(&p, &n);
|
||||
return PTR_ERR(ptr);
|
||||
}
|
||||
|
||||
/*
|
||||
* If using SQPOLL, park the thread
|
||||
*/
|
||||
if (ctx->sq_data) {
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
io_sq_thread_park(ctx->sq_data);
|
||||
mutex_lock(&ctx->uring_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* We'll do the swap. Grab the ctx->resize_lock, which will exclude
|
||||
* any new mmap's on the ring fd. Clear out existing mappings to prevent
|
||||
* mmap from seeing them, as we'll unmap them. Any attempt to mmap
|
||||
* existing rings beyond this point will fail. Not that it could proceed
|
||||
* at this point anyway, as the io_uring mmap side needs go grab the
|
||||
* ctx->resize_lock as well. Likewise, hold the completion lock over the
|
||||
* duration of the actual swap.
|
||||
*/
|
||||
mutex_lock(&ctx->resize_lock);
|
||||
spin_lock(&ctx->completion_lock);
|
||||
o.rings = ctx->rings;
|
||||
ctx->rings = NULL;
|
||||
o.sq_sqes = ctx->sq_sqes;
|
||||
ctx->sq_sqes = NULL;
|
||||
|
||||
/*
|
||||
* Now copy SQ and CQ entries, if any. If either of the destination
|
||||
* rings can't hold what is already there, then fail the operation.
|
||||
*/
|
||||
n.sq_sqes = ptr;
|
||||
tail = o.rings->sq.tail;
|
||||
if (tail - o.rings->sq.head > p.sq_entries)
|
||||
goto overflow;
|
||||
for (i = o.rings->sq.head; i < tail; i++) {
|
||||
unsigned src_head = i & (ctx->sq_entries - 1);
|
||||
unsigned dst_head = i & n.rings->sq_ring_mask;
|
||||
|
||||
n.sq_sqes[dst_head] = o.sq_sqes[src_head];
|
||||
}
|
||||
n.rings->sq.head = o.rings->sq.head;
|
||||
n.rings->sq.tail = o.rings->sq.tail;
|
||||
|
||||
tail = o.rings->cq.tail;
|
||||
if (tail - o.rings->cq.head > p.cq_entries) {
|
||||
overflow:
|
||||
/* restore old rings, and return -EOVERFLOW via cleanup path */
|
||||
ctx->rings = o.rings;
|
||||
ctx->sq_sqes = o.sq_sqes;
|
||||
to_free = &n;
|
||||
ret = -EOVERFLOW;
|
||||
goto out;
|
||||
}
|
||||
for (i = o.rings->cq.head; i < tail; i++) {
|
||||
unsigned src_head = i & (ctx->cq_entries - 1);
|
||||
unsigned dst_head = i & n.rings->cq_ring_mask;
|
||||
|
||||
n.rings->cqes[dst_head] = o.rings->cqes[src_head];
|
||||
}
|
||||
n.rings->cq.head = o.rings->cq.head;
|
||||
n.rings->cq.tail = o.rings->cq.tail;
|
||||
/* invalidate cached cqe refill */
|
||||
ctx->cqe_cached = ctx->cqe_sentinel = NULL;
|
||||
|
||||
n.rings->sq_dropped = o.rings->sq_dropped;
|
||||
n.rings->sq_flags = o.rings->sq_flags;
|
||||
n.rings->cq_flags = o.rings->cq_flags;
|
||||
n.rings->cq_overflow = o.rings->cq_overflow;
|
||||
|
||||
/* all done, store old pointers and assign new ones */
|
||||
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
|
||||
ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
|
||||
|
||||
ctx->sq_entries = p.sq_entries;
|
||||
ctx->cq_entries = p.cq_entries;
|
||||
|
||||
ctx->rings = n.rings;
|
||||
ctx->sq_sqes = n.sq_sqes;
|
||||
swap_old(ctx, o, n, n_ring_pages);
|
||||
swap_old(ctx, o, n, n_sqe_pages);
|
||||
swap_old(ctx, o, n, ring_pages);
|
||||
swap_old(ctx, o, n, sqe_pages);
|
||||
to_free = &o;
|
||||
ret = 0;
|
||||
out:
|
||||
spin_unlock(&ctx->completion_lock);
|
||||
mutex_unlock(&ctx->resize_lock);
|
||||
io_register_free_rings(&p, to_free);
|
||||
|
||||
if (ctx->sq_data)
|
||||
io_sq_thread_unpark(ctx->sq_data);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
void __user *arg, unsigned nr_args)
|
||||
__releases(ctx->uring_lock)
|
||||
@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
||||
break;
|
||||
ret = io_register_clone_buffers(ctx, arg);
|
||||
break;
|
||||
case IORING_REGISTER_RESIZE_RINGS:
|
||||
ret = -EINVAL;
|
||||
if (!arg || nr_args != 1)
|
||||
break;
|
||||
ret = io_register_resize_rings(ctx, arg);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user