io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring

Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

This requires a bit of effort on the mmap lookup side, as the ctx
uring_lock isn't held, which otherwise protects buffer_lists from being
torn down, and it's not safe to grab from mmap context, as that would
introduce an ABBA deadlock between the mmap lock and the ctx uring_lock.
Instead, look up the buffer_list under RCU, as the list is RCU freed
already. Use the existing reference count to determine whether it's
possible to safely grab a reference to it (e.g. if it's not zero already),
and drop that reference when done with the mapping. If the mmap
reference is the last one, the buffer_list and the associated memory can
go away, since the vma insertion has references to the inserted pages at
that point.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Jens Axboe 2024-03-12 20:24:21 -06:00
parent e270bfd22a
commit 87585b0575
5 changed files with 46 additions and 156 deletions

View File

@ -372,9 +372,6 @@ struct io_ring_ctx {
struct list_head io_buffers_cache; struct list_head io_buffers_cache;
/* deferred free list, protected by ->uring_lock */
struct hlist_head io_buf_list;
/* Keep this last, we don't need it for the fast path */ /* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq; struct wait_queue_head poll_wq;
struct io_restriction restrictions; struct io_restriction restrictions;

View File

@ -303,7 +303,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->sqd_list);
INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache); INIT_LIST_HEAD(&ctx->io_buffers_cache);
INIT_HLIST_HEAD(&ctx->io_buf_list);
ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
sizeof(struct io_rsrc_node)); sizeof(struct io_rsrc_node));
ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
@ -2598,15 +2597,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
} }
static void io_pages_unmap(void *ptr, struct page ***pages, void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
unsigned short *npages) bool put_pages)
{ {
bool do_vunmap = false; bool do_vunmap = false;
if (!ptr) if (!ptr)
return; return;
if (*npages) { if (put_pages && *npages) {
struct page **to_free = *pages; struct page **to_free = *pages;
int i; int i;
@ -2628,14 +2627,6 @@ static void io_pages_unmap(void *ptr, struct page ***pages,
*npages = 0; *npages = 0;
} }
void io_mem_free(void *ptr)
{
if (!ptr)
return;
folio_put(virt_to_folio(ptr));
}
static void io_pages_free(struct page ***pages, int npages) static void io_pages_free(struct page ***pages, int npages)
{ {
struct page **page_array = *pages; struct page **page_array = *pages;
@ -2730,8 +2721,10 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
static void io_rings_free(struct io_ring_ctx *ctx) static void io_rings_free(struct io_ring_ctx *ctx)
{ {
if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); true);
io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
true);
} else { } else {
io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
ctx->n_ring_pages = 0; ctx->n_ring_pages = 0;
@ -2788,8 +2781,8 @@ err:
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
static void *io_pages_map(struct page ***out_pages, unsigned short *npages, void *io_pages_map(struct page ***out_pages, unsigned short *npages,
size_t size) size_t size)
{ {
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
struct page **pages; struct page **pages;
@ -2819,17 +2812,6 @@ done:
return ret; return ret;
} }
void *io_mem_alloc(size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
void *ret;
ret = (void *) __get_free_pages(gfp, get_order(size));
if (ret)
return ret;
return ERR_PTR(-ENOMEM);
}
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset) unsigned int cq_entries, size_t *sq_offset)
{ {
@ -2926,7 +2908,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
ctx->mm_account = NULL; ctx->mm_account = NULL;
} }
io_rings_free(ctx); io_rings_free(ctx);
io_kbuf_mmap_list_free(ctx);
percpu_ref_exit(&ctx->refs); percpu_ref_exit(&ctx->refs);
free_uid(ctx->user); free_uid(ctx->user);
@ -3396,10 +3377,8 @@ static void *io_uring_validate_mmap_request(struct file *file,
{ {
struct io_ring_ctx *ctx = file->private_data; struct io_ring_ctx *ctx = file->private_data;
loff_t offset = pgoff << PAGE_SHIFT; loff_t offset = pgoff << PAGE_SHIFT;
struct page *page;
void *ptr;
switch (offset & IORING_OFF_MMAP_MASK) { switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
case IORING_OFF_SQ_RING: case IORING_OFF_SQ_RING:
case IORING_OFF_CQ_RING: case IORING_OFF_CQ_RING:
/* Don't allow mmap if the ring was setup without it */ /* Don't allow mmap if the ring was setup without it */
@ -3414,6 +3393,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
case IORING_OFF_PBUF_RING: { case IORING_OFF_PBUF_RING: {
struct io_buffer_list *bl; struct io_buffer_list *bl;
unsigned int bgid; unsigned int bgid;
void *ptr;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
bl = io_pbuf_get_bl(ctx, bgid); bl = io_pbuf_get_bl(ctx, bgid);
@ -3421,17 +3401,11 @@ static void *io_uring_validate_mmap_request(struct file *file,
return bl; return bl;
ptr = bl->buf_ring; ptr = bl->buf_ring;
io_put_bl(ctx, bl); io_put_bl(ctx, bl);
break; return ptr;
} }
default:
return ERR_PTR(-EINVAL);
} }
page = virt_to_head_page(ptr); return ERR_PTR(-EINVAL);
if (sz > page_size(page))
return ERR_PTR(-EINVAL);
return ptr;
} }
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
@ -3450,7 +3424,6 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
struct io_ring_ctx *ctx = file->private_data; struct io_ring_ctx *ctx = file->private_data;
size_t sz = vma->vm_end - vma->vm_start; size_t sz = vma->vm_end - vma->vm_start;
long offset = vma->vm_pgoff << PAGE_SHIFT; long offset = vma->vm_pgoff << PAGE_SHIFT;
unsigned long pfn;
void *ptr; void *ptr;
ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
@ -3465,10 +3438,11 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
case IORING_OFF_SQES: case IORING_OFF_SQES:
return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
ctx->n_sqe_pages); ctx->n_sqe_pages);
case IORING_OFF_PBUF_RING:
return io_pbuf_mmap(file, vma);
} }
pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return -EINVAL;
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
} }
static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,

View File

@ -109,8 +109,10 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all); bool cancel_all);
void *io_mem_alloc(size_t size); void *io_pages_map(struct page ***out_pages, unsigned short *npages,
void io_mem_free(void *ptr); size_t size);
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
bool put_pages);
enum { enum {
IO_EVENTFD_OP_SIGNAL_BIT, IO_EVENTFD_OP_SIGNAL_BIT,

View File

@ -32,25 +32,12 @@ struct io_provide_buf {
__u16 bid; __u16 bid;
}; };
struct io_buf_free {
struct hlist_node list;
void *mem;
size_t size;
int inuse;
};
static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid)
{
return xa_load(&ctx->io_bl_xa, bgid);
}
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
unsigned int bgid) unsigned int bgid)
{ {
lockdep_assert_held(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock);
return __io_buffer_get_list(ctx, bgid); return xa_load(&ctx->io_bl_xa, bgid);
} }
static int io_buffer_add_list(struct io_ring_ctx *ctx, static int io_buffer_add_list(struct io_ring_ctx *ctx,
@ -191,24 +178,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
return ret; return ret;
} }
/*
* Mark the given mapped range as free for reuse
*/
static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
struct io_buf_free *ibf;
hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
if (bl->buf_ring == ibf->mem) {
ibf->inuse = 0;
return;
}
}
/* can't happen... */
WARN_ON_ONCE(1);
}
static int __io_remove_buffers(struct io_ring_ctx *ctx, static int __io_remove_buffers(struct io_ring_ctx *ctx,
struct io_buffer_list *bl, unsigned nbufs) struct io_buffer_list *bl, unsigned nbufs)
{ {
@ -220,23 +189,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->is_buf_ring) { if (bl->is_buf_ring) {
i = bl->buf_ring->tail - bl->head; i = bl->buf_ring->tail - bl->head;
if (bl->is_mmap) { if (bl->buf_nr_pages) {
/*
* io_kbuf_list_free() will free the page(s) at
* ->release() time.
*/
io_kbuf_mark_free(ctx, bl);
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
int j; int j;
for (j = 0; j < bl->buf_nr_pages; j++) if (!bl->is_mmap) {
unpin_user_page(bl->buf_pages[j]); for (j = 0; j < bl->buf_nr_pages; j++)
kvfree(bl->buf_pages); unpin_user_page(bl->buf_pages[j]);
vunmap(bl->buf_ring); }
bl->buf_pages = NULL; io_pages_unmap(bl->buf_ring, &bl->buf_pages,
bl->buf_nr_pages = 0; &bl->buf_nr_pages, bl->is_mmap);
bl->is_mmap = 0;
} }
/* make sure it's seen as empty */ /* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list); INIT_LIST_HEAD(&bl->buf_list);
@ -537,63 +499,18 @@ error_unpin:
return ret; return ret;
} }
/*
* See if we have a suitable region that we can reuse, rather than allocate
* both a new io_buf_free and mem region again. We leave it on the list as
* even a reused entry will need freeing at ring release.
*/
static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
size_t ring_size)
{
struct io_buf_free *ibf, *best = NULL;
size_t best_dist;
hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
size_t dist;
if (ibf->inuse || ibf->size < ring_size)
continue;
dist = ibf->size - ring_size;
if (!best || dist < best_dist) {
best = ibf;
if (!dist)
break;
best_dist = dist;
}
}
return best;
}
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
struct io_uring_buf_reg *reg, struct io_uring_buf_reg *reg,
struct io_buffer_list *bl) struct io_buffer_list *bl)
{ {
struct io_buf_free *ibf;
size_t ring_size; size_t ring_size;
void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
/* Reuse existing entry, if we can */ bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
ibf = io_lookup_buf_free_entry(ctx, ring_size); if (!bl->buf_ring)
if (!ibf) { return -ENOMEM;
ptr = io_mem_alloc(ring_size);
if (IS_ERR(ptr))
return PTR_ERR(ptr);
/* Allocate and store deferred free entry */
ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
if (!ibf) {
io_mem_free(ptr);
return -ENOMEM;
}
ibf->mem = ptr;
ibf->size = ring_size;
hlist_add_head(&ibf->list, &ctx->io_buf_list);
}
ibf->inuse = 1;
bl->buf_ring = ibf->mem;
bl->is_buf_ring = 1; bl->is_buf_ring = 1;
bl->is_mmap = 1; bl->is_mmap = 1;
return 0; return 0;
@ -741,18 +658,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
/* int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
* Called at or after ->release(), free the mmap'ed buffers that we used
* for memory mapped provided buffer rings.
*/
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
{ {
struct io_buf_free *ibf; struct io_ring_ctx *ctx = file->private_data;
struct hlist_node *tmp; loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
struct io_buffer_list *bl;
int bgid, ret;
hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
hlist_del(&ibf->list); bl = io_pbuf_get_bl(ctx, bgid);
io_mem_free(ibf->mem); if (IS_ERR(bl))
kfree(ibf); return PTR_ERR(bl);
}
ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
io_put_bl(ctx, bl);
return ret;
} }

View File

@ -55,8 +55,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
@ -64,6 +62,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
unsigned long bgid); unsigned long bgid);
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{ {