io_uring/kbuf: use region api for pbuf rings

Convert internal parts of the provided buffer ring management to the
region API. It's the last non-region mapped ring we have, so it also
kills a bunch of now unused memmap.c helpers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/6c40cf7beaa648558acd4d84bc0fb3279a35d74b.1732886067.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
This commit is contained in:
Pavel Begunkov 2024-11-29 13:34:38 +00:00 committed by Jens Axboe
parent be7984ed8a
commit ff4afde8a6
4 changed files with 74 additions and 241 deletions

View File

@ -351,17 +351,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->flags & IOBL_BUF_RING) { if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head; i = bl->buf_ring->tail - bl->head;
if (bl->buf_nr_pages) { io_free_region(ctx, &bl->region);
int j;
if (!(bl->flags & IOBL_MMAP)) {
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
}
io_pages_unmap(bl->buf_ring, &bl->buf_pages,
&bl->buf_nr_pages, bl->flags & IOBL_MMAP);
bl->flags &= ~IOBL_MMAP;
}
/* make sure it's seen as empty */ /* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list); INIT_LIST_HEAD(&bl->buf_list);
bl->flags &= ~IOBL_BUF_RING; bl->flags &= ~IOBL_BUF_RING;
@ -614,75 +604,14 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK; return IOU_OK;
} }
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
struct io_uring_buf_ring *br = NULL;
struct page **pages;
int nr_pages, ret;
pages = io_pin_pages(reg->ring_addr,
flex_array_size(br, bufs, reg->ring_entries),
&nr_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!br) {
ret = -ENOMEM;
goto error_unpin;
}
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
* is set and we must guarantee that the kernel and user side align
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
* the application mmap's the provided ring buffer. Fail the request
* if we, by chance, don't end up with aligned addresses. The app
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
ret = -EINVAL;
goto error_unpin;
}
#endif
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->buf_ring = br;
bl->flags |= IOBL_BUF_RING;
bl->flags &= ~IOBL_MMAP;
return 0;
error_unpin:
unpin_user_pages(pages, nr_pages);
kvfree(pages);
vunmap(br);
return ret;
}
static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
size_t ring_size;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
if (IS_ERR(bl->buf_ring)) {
bl->buf_ring = NULL;
return -ENOMEM;
}
bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
return 0;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{ {
struct io_uring_buf_reg reg; struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL; struct io_buffer_list *bl, *free_bl = NULL;
struct io_uring_region_desc rd;
struct io_uring_buf_ring *br;
unsigned long mmap_offset;
unsigned long ring_size;
int ret; int ret;
lockdep_assert_held(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock);
@ -694,19 +623,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL; return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL; return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
} else {
if (reg.ring_addr)
return -EINVAL;
}
if (!is_power_of_2(reg.ring_entries)) if (!is_power_of_2(reg.ring_entries))
return -EINVAL; return -EINVAL;
/* cannot disambiguate full vs empty due to head/tail size */ /* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536) if (reg.ring_entries >= 65536)
return -EINVAL; return -EINVAL;
@ -722,21 +640,47 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOMEM; return -ENOMEM;
} }
if (!(reg.flags & IOU_PBUF_RING_MMAP)) mmap_offset = reg.bgid << IORING_OFF_PBUF_SHIFT;
ret = io_pin_pbuf_ring(&reg, bl); ring_size = flex_array_size(br, bufs, reg.ring_entries);
else
ret = io_alloc_pbuf_ring(ctx, &reg, bl);
if (!ret) { memset(&rd, 0, sizeof(rd));
bl->nr_entries = reg.ring_entries; rd.size = PAGE_ALIGN(ring_size);
bl->mask = reg.ring_entries - 1; if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (reg.flags & IOU_PBUF_RING_INC) rd.user_addr = reg.ring_addr;
bl->flags |= IOBL_INC; rd.flags |= IORING_MEM_REGION_TYPE_USER;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
} }
ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
if (ret)
goto fail;
br = io_region_get_ptr(&bl->region);
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
* is set and we must guarantee that the kernel and user side align
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
* the application mmap's the provided ring buffer. Fail the request
* if we, by chance, don't end up with aligned addresses. The app
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
ret = -EINVAL;
goto fail;
}
#endif
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
bl->flags |= IOBL_BUF_RING;
bl->buf_ring = br;
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
fail:
io_free_region(ctx, &bl->region);
kfree(free_bl); kfree(free_bl);
return ret; return ret;
} }
@ -794,32 +738,18 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
return 0; return 0;
} }
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
unsigned long bgid) unsigned int bgid)
{ {
struct io_buffer_list *bl; struct io_buffer_list *bl;
bl = xa_load(&ctx->io_bl_xa, bgid);
/* must be a mmap'able buffer ring and have pages */
if (bl && bl->flags & IOBL_MMAP)
return bl;
return ERR_PTR(-EINVAL);
}
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct io_ring_ctx *ctx = file->private_data;
loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
struct io_buffer_list *bl;
int bgid;
lockdep_assert_held(&ctx->mmap_lock); lockdep_assert_held(&ctx->mmap_lock);
bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = xa_load(&ctx->io_bl_xa, bgid);
bl = io_pbuf_get_bl(ctx, bgid); if (!bl || !(bl->flags & IOBL_BUF_RING))
if (IS_ERR(bl)) return NULL;
return PTR_ERR(bl); if (WARN_ON_ONCE(!io_region_is_set(&bl->region)))
return NULL;
return io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); return &bl->region;
} }

View File

@ -3,15 +3,13 @@
#define IOU_KBUF_H #define IOU_KBUF_H
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
#include <linux/io_uring_types.h>
enum { enum {
/* ring mapped provided buffers */ /* ring mapped provided buffers */
IOBL_BUF_RING = 1, IOBL_BUF_RING = 1,
/* ring mapped provided buffers, but mmap'ed by application */
IOBL_MMAP = 2,
/* buffers are consumed incrementally rather than always fully */ /* buffers are consumed incrementally rather than always fully */
IOBL_INC = 4, IOBL_INC = 2,
}; };
struct io_buffer_list { struct io_buffer_list {
@ -21,10 +19,7 @@ struct io_buffer_list {
*/ */
union { union {
struct list_head buf_list; struct list_head buf_list;
struct { struct io_uring_buf_ring *buf_ring;
struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
};
}; };
__u16 bgid; __u16 bgid;
@ -35,6 +30,8 @@ struct io_buffer_list {
__u16 mask; __u16 mask;
__u16 flags; __u16 flags;
struct io_mapped_region region;
}; };
struct io_buffer { struct io_buffer {
@ -81,9 +78,8 @@ void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
unsigned long bgid); unsigned int bgid);
int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
{ {

View File

@ -36,90 +36,6 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
return page_address(page); return page_address(page);
} }
static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
gfp_t gfp)
{
void *ret;
int i;
for (i = 0; i < nr_pages; i++) {
pages[i] = alloc_page(gfp);
if (!pages[i])
goto err;
}
ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (ret)
return ret;
err:
while (i--)
put_page(pages[i]);
return ERR_PTR(-ENOMEM);
}
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
size_t size)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
struct page **pages;
int nr_pages;
void *ret;
nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
if (!pages)
return ERR_PTR(-ENOMEM);
ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
if (!IS_ERR(ret))
goto done;
if (nr_pages == 1)
goto fail;
ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
if (!IS_ERR(ret)) {
done:
*out_pages = pages;
*npages = nr_pages;
return ret;
}
fail:
kvfree(pages);
*out_pages = NULL;
*npages = 0;
return ret;
}
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
bool put_pages)
{
bool do_vunmap = false;
if (!ptr)
return;
if (put_pages && *npages) {
struct page **to_free = *pages;
int i;
/*
* Only did vmap for the non-compound multiple page case.
* For the compound page, we just need to put the head.
*/
if (PageCompound(to_free[0]))
*npages = 1;
else if (*npages > 1)
do_vunmap = true;
for (i = 0; i < *npages; i++)
put_page(to_free[i]);
}
if (do_vunmap)
vunmap(ptr);
kvfree(*pages);
*pages = NULL;
*npages = 0;
}
struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages)
{ {
unsigned long start, end, nr_pages; unsigned long start, end, nr_pages;
@ -374,16 +290,14 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
return ERR_PTR(-EFAULT); return ERR_PTR(-EFAULT);
return ctx->sq_sqes; return ctx->sq_sqes;
case IORING_OFF_PBUF_RING: { case IORING_OFF_PBUF_RING: {
struct io_buffer_list *bl; struct io_mapped_region *region;
unsigned int bgid; unsigned int bgid;
void *ptr;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
bl = io_pbuf_get_bl(ctx, bgid); region = io_pbuf_get_region(ctx, bgid);
if (IS_ERR(bl)) if (!region)
return bl; return ERR_PTR(-EINVAL);
ptr = bl->buf_ring; return io_region_validate_mmap(ctx, region);
return ptr;
} }
case IORING_MAP_OFF_PARAM_REGION: case IORING_MAP_OFF_PARAM_REGION:
return io_region_validate_mmap(ctx, &ctx->param_region); return io_region_validate_mmap(ctx, &ctx->param_region);
@ -392,15 +306,6 @@ static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
struct page **pages, int npages)
{
unsigned long nr_pages = npages;
vm_flags_set(vma, VM_DONTEXPAND);
return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
}
#ifdef CONFIG_MMU #ifdef CONFIG_MMU
static int io_region_mmap(struct io_ring_ctx *ctx, static int io_region_mmap(struct io_ring_ctx *ctx,
@ -435,8 +340,17 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit);
case IORING_OFF_SQES: case IORING_OFF_SQES:
return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX);
case IORING_OFF_PBUF_RING: case IORING_OFF_PBUF_RING: {
return io_pbuf_mmap(file, vma); struct io_mapped_region *region;
unsigned int bgid;
bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
region = io_pbuf_get_region(ctx, bgid);
if (!region)
return -EINVAL;
return io_region_mmap(ctx, region, vma, UINT_MAX);
}
case IORING_MAP_OFF_PARAM_REGION: case IORING_MAP_OFF_PARAM_REGION:
return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX);
} }

View File

@ -4,13 +4,6 @@
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
struct page **pages, int npages);
void *io_pages_map(struct page ***out_pages, unsigned short *npages,
size_t size);
void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
bool put_pages);
#ifndef CONFIG_MMU #ifndef CONFIG_MMU
unsigned int io_uring_nommu_mmap_capabilities(struct file *file); unsigned int io_uring_nommu_mmap_capabilities(struct file *file);