mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-01 10:43:43 +00:00
0701db7439
Rather than allocate an io_rsrc_node for an empty/sparse buffer entry, add a const entry that can be used for that. This just needs checking for writing the tag, and the put check needs to check for that sparse node rather than NULL for validity. This avoids allocating rsrc nodes for sparse buffer entries. Signed-off-by: Jens Axboe <axboe@kernel.dk>
1115 lines
26 KiB
C
1115 lines
26 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
#include <linux/kernel.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/file.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/nospec.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/compat.h>
|
|
#include <linux/io_uring.h>
|
|
|
|
#include <uapi/linux/io_uring.h>
|
|
|
|
#include "io_uring.h"
|
|
#include "openclose.h"
|
|
#include "rsrc.h"
|
|
#include "memmap.h"
|
|
#include "register.h"
|
|
|
|
struct io_rsrc_update {
|
|
struct file *file;
|
|
u64 arg;
|
|
u32 nr_args;
|
|
u32 offset;
|
|
};
|
|
|
|
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
|
|
struct iovec *iov, struct page **last_hpage);
|
|
|
|
/* only define max */
|
|
#define IORING_MAX_FIXED_FILES (1U << 20)
|
|
#define IORING_MAX_REG_BUFFERS (1U << 14)
|
|
|
|
static const struct io_mapped_ubuf dummy_ubuf = {
|
|
/* set invalid range, so io_import_fixed() fails meeting it */
|
|
.ubuf = -1UL,
|
|
.len = UINT_MAX,
|
|
};
|
|
|
|
const struct io_rsrc_node empty_node = {
|
|
.type = IORING_RSRC_BUFFER,
|
|
.buf = (struct io_mapped_ubuf *) &dummy_ubuf,
|
|
};
|
|
|
|
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
|
|
{
|
|
unsigned long page_limit, cur_pages, new_pages;
|
|
|
|
if (!nr_pages)
|
|
return 0;
|
|
|
|
/* Don't allow more pages than we can safely lock */
|
|
page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
|
|
|
cur_pages = atomic_long_read(&user->locked_vm);
|
|
do {
|
|
new_pages = cur_pages + nr_pages;
|
|
if (new_pages > page_limit)
|
|
return -ENOMEM;
|
|
} while (!atomic_long_try_cmpxchg(&user->locked_vm,
|
|
&cur_pages, new_pages));
|
|
return 0;
|
|
}
|
|
|
|
static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
|
|
{
|
|
if (ctx->user)
|
|
__io_unaccount_mem(ctx->user, nr_pages);
|
|
|
|
if (ctx->mm_account)
|
|
atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm);
|
|
}
|
|
|
|
static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
|
|
{
|
|
int ret;
|
|
|
|
if (ctx->user) {
|
|
ret = __io_account_mem(ctx->user, nr_pages);
|
|
if (ret)
|
|
return ret;
|
|
}
|
|
|
|
if (ctx->mm_account)
|
|
atomic64_add(nr_pages, &ctx->mm_account->pinned_vm);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int io_buffer_validate(struct iovec *iov)
|
|
{
|
|
unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
|
|
|
|
/*
|
|
* Don't impose further limits on the size and buffer
|
|
* constraints here, we'll -EINVAL later when IO is
|
|
* submitted if they are wrong.
|
|
*/
|
|
if (!iov->iov_base)
|
|
return iov->iov_len ? -EFAULT : 0;
|
|
if (!iov->iov_len)
|
|
return -EFAULT;
|
|
|
|
/* arbitrary limit, but we need something */
|
|
if (iov->iov_len > SZ_1G)
|
|
return -EFAULT;
|
|
|
|
if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
|
|
return -EOVERFLOW;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
|
|
{
|
|
unsigned int i;
|
|
|
|
if (node->buf != &dummy_ubuf) {
|
|
struct io_mapped_ubuf *imu = node->buf;
|
|
|
|
if (!refcount_dec_and_test(&imu->refs))
|
|
return;
|
|
for (i = 0; i < imu->nr_bvecs; i++)
|
|
unpin_user_page(imu->bvec[i].bv_page);
|
|
if (imu->acct_pages)
|
|
io_unaccount_mem(ctx, imu->acct_pages);
|
|
kvfree(imu);
|
|
}
|
|
}
|
|
|
|
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
|
|
{
|
|
struct io_rsrc_node *node;
|
|
|
|
node = kzalloc(sizeof(*node), GFP_KERNEL);
|
|
if (node) {
|
|
node->ctx = ctx;
|
|
node->refs = 1;
|
|
node->type = type;
|
|
}
|
|
return node;
|
|
}
|
|
|
|
static void io_rsrc_data_free(struct io_rsrc_data *data)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < data->nr; i++) {
|
|
struct io_rsrc_node *node = data->nodes[i];
|
|
|
|
if (node)
|
|
io_put_rsrc_node(node);
|
|
}
|
|
kvfree(data->nodes);
|
|
kfree(data);
|
|
}
|
|
|
|
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr,
|
|
struct io_rsrc_data **pdata)
|
|
{
|
|
struct io_rsrc_data *data;
|
|
|
|
data = kzalloc(sizeof(*data), GFP_KERNEL);
|
|
if (!data)
|
|
return -ENOMEM;
|
|
|
|
data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
|
|
GFP_KERNEL | __GFP_ZERO);
|
|
if (!data->nodes) {
|
|
kfree(data);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
data->nr = nr;
|
|
*pdata = data;
|
|
return 0;
|
|
}
|
|
|
|
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
|
|
struct io_uring_rsrc_update2 *up,
|
|
unsigned nr_args)
|
|
{
|
|
u64 __user *tags = u64_to_user_ptr(up->tags);
|
|
__s32 __user *fds = u64_to_user_ptr(up->data);
|
|
int fd, i, err = 0;
|
|
unsigned int done;
|
|
|
|
if (!ctx->file_data)
|
|
return -ENXIO;
|
|
if (up->offset + nr_args > ctx->nr_user_files)
|
|
return -EINVAL;
|
|
|
|
for (done = 0; done < nr_args; done++) {
|
|
u64 tag = 0;
|
|
|
|
if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) ||
|
|
copy_from_user(&fd, &fds[done], sizeof(fd))) {
|
|
err = -EFAULT;
|
|
break;
|
|
}
|
|
if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) {
|
|
err = -EINVAL;
|
|
break;
|
|
}
|
|
if (fd == IORING_REGISTER_FILES_SKIP)
|
|
continue;
|
|
|
|
i = array_index_nospec(up->offset + done, ctx->nr_user_files);
|
|
if (ctx->file_table.nodes[i]) {
|
|
io_put_rsrc_node(ctx->file_table.nodes[i]);
|
|
ctx->file_table.nodes[i] = NULL;
|
|
io_file_bitmap_clear(&ctx->file_table, i);
|
|
}
|
|
if (fd != -1) {
|
|
struct file *file = fget(fd);
|
|
struct io_rsrc_node *node;
|
|
|
|
if (!file) {
|
|
err = -EBADF;
|
|
break;
|
|
}
|
|
/*
|
|
* Don't allow io_uring instances to be registered.
|
|
*/
|
|
if (io_is_uring_fops(file)) {
|
|
fput(file);
|
|
err = -EBADF;
|
|
break;
|
|
}
|
|
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
|
|
if (!node) {
|
|
err = -ENOMEM;
|
|
fput(file);
|
|
break;
|
|
}
|
|
ctx->file_table.nodes[i] = node;
|
|
if (tag)
|
|
node->tag = tag;
|
|
io_fixed_file_set(node, file);
|
|
io_file_bitmap_set(&ctx->file_table, i);
|
|
}
|
|
}
|
|
return done ? done : err;
|
|
}
|
|
|
|
static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
|
|
struct io_uring_rsrc_update2 *up,
|
|
unsigned int nr_args)
|
|
{
|
|
u64 __user *tags = u64_to_user_ptr(up->tags);
|
|
struct iovec fast_iov, *iov;
|
|
struct page *last_hpage = NULL;
|
|
struct iovec __user *uvec;
|
|
u64 user_data = up->data;
|
|
__u32 done;
|
|
int i, err;
|
|
|
|
if (!ctx->buf_data)
|
|
return -ENXIO;
|
|
if (up->offset + nr_args > ctx->nr_user_bufs)
|
|
return -EINVAL;
|
|
|
|
for (done = 0; done < nr_args; done++) {
|
|
struct io_rsrc_node *node;
|
|
u64 tag = 0;
|
|
|
|
uvec = u64_to_user_ptr(user_data);
|
|
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
|
|
if (IS_ERR(iov)) {
|
|
err = PTR_ERR(iov);
|
|
break;
|
|
}
|
|
if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
|
|
err = -EFAULT;
|
|
break;
|
|
}
|
|
err = io_buffer_validate(iov);
|
|
if (err)
|
|
break;
|
|
if (!iov->iov_base && tag) {
|
|
err = -EINVAL;
|
|
break;
|
|
}
|
|
i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
|
|
node = io_sqe_buffer_register(ctx, iov, &last_hpage);
|
|
if (IS_ERR(node)) {
|
|
err = PTR_ERR(node);
|
|
break;
|
|
}
|
|
if (ctx->user_bufs[i])
|
|
io_put_rsrc_node(ctx->user_bufs[i]);
|
|
|
|
ctx->user_bufs[i] = node;
|
|
if (tag)
|
|
node->tag = tag;
|
|
if (ctx->compat)
|
|
user_data += sizeof(struct compat_iovec);
|
|
else
|
|
user_data += sizeof(struct iovec);
|
|
}
|
|
return done ? done : err;
|
|
}
|
|
|
|
static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
|
|
struct io_uring_rsrc_update2 *up,
|
|
unsigned nr_args)
|
|
{
|
|
__u32 tmp;
|
|
|
|
lockdep_assert_held(&ctx->uring_lock);
|
|
|
|
if (check_add_overflow(up->offset, nr_args, &tmp))
|
|
return -EOVERFLOW;
|
|
|
|
switch (type) {
|
|
case IORING_RSRC_FILE:
|
|
return __io_sqe_files_update(ctx, up, nr_args);
|
|
case IORING_RSRC_BUFFER:
|
|
return __io_sqe_buffers_update(ctx, up, nr_args);
|
|
}
|
|
return -EINVAL;
|
|
}
|
|
|
|
int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg,
|
|
unsigned nr_args)
|
|
{
|
|
struct io_uring_rsrc_update2 up;
|
|
|
|
if (!nr_args)
|
|
return -EINVAL;
|
|
memset(&up, 0, sizeof(up));
|
|
if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
|
|
return -EFAULT;
|
|
if (up.resv || up.resv2)
|
|
return -EINVAL;
|
|
return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
|
|
}
|
|
|
|
int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
|
|
unsigned size, unsigned type)
|
|
{
|
|
struct io_uring_rsrc_update2 up;
|
|
|
|
if (size != sizeof(up))
|
|
return -EINVAL;
|
|
if (copy_from_user(&up, arg, sizeof(up)))
|
|
return -EFAULT;
|
|
if (!up.nr || up.resv || up.resv2)
|
|
return -EINVAL;
|
|
return __io_register_rsrc_update(ctx, type, &up, up.nr);
|
|
}
|
|
|
|
__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
|
|
unsigned int size, unsigned int type)
|
|
{
|
|
struct io_uring_rsrc_register rr;
|
|
|
|
/* keep it extendible */
|
|
if (size != sizeof(rr))
|
|
return -EINVAL;
|
|
|
|
memset(&rr, 0, sizeof(rr));
|
|
if (copy_from_user(&rr, arg, size))
|
|
return -EFAULT;
|
|
if (!rr.nr || rr.resv2)
|
|
return -EINVAL;
|
|
if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
|
|
return -EINVAL;
|
|
|
|
switch (type) {
|
|
case IORING_RSRC_FILE:
|
|
if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
|
|
break;
|
|
return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data),
|
|
rr.nr, u64_to_user_ptr(rr.tags));
|
|
case IORING_RSRC_BUFFER:
|
|
if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data)
|
|
break;
|
|
return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data),
|
|
rr.nr, u64_to_user_ptr(rr.tags));
|
|
}
|
|
return -EINVAL;
|
|
}
|
|
|
|
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|
{
|
|
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
|
|
|
|
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
|
|
return -EINVAL;
|
|
if (sqe->rw_flags || sqe->splice_fd_in)
|
|
return -EINVAL;
|
|
|
|
up->offset = READ_ONCE(sqe->off);
|
|
up->nr_args = READ_ONCE(sqe->len);
|
|
if (!up->nr_args)
|
|
return -EINVAL;
|
|
up->arg = READ_ONCE(sqe->addr);
|
|
return 0;
|
|
}
|
|
|
|
static int io_files_update_with_index_alloc(struct io_kiocb *req,
|
|
unsigned int issue_flags)
|
|
{
|
|
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
|
|
__s32 __user *fds = u64_to_user_ptr(up->arg);
|
|
unsigned int done;
|
|
struct file *file;
|
|
int ret, fd;
|
|
|
|
if (!req->ctx->file_data)
|
|
return -ENXIO;
|
|
|
|
for (done = 0; done < up->nr_args; done++) {
|
|
if (copy_from_user(&fd, &fds[done], sizeof(fd))) {
|
|
ret = -EFAULT;
|
|
break;
|
|
}
|
|
|
|
file = fget(fd);
|
|
if (!file) {
|
|
ret = -EBADF;
|
|
break;
|
|
}
|
|
ret = io_fixed_fd_install(req, issue_flags, file,
|
|
IORING_FILE_INDEX_ALLOC);
|
|
if (ret < 0)
|
|
break;
|
|
if (copy_to_user(&fds[done], &ret, sizeof(ret))) {
|
|
__io_close_fixed(req->ctx, issue_flags, ret);
|
|
ret = -EFAULT;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (done)
|
|
return done;
|
|
return ret;
|
|
}
|
|
|
|
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
|
|
{
|
|
struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
|
|
struct io_ring_ctx *ctx = req->ctx;
|
|
struct io_uring_rsrc_update2 up2;
|
|
int ret;
|
|
|
|
up2.offset = up->offset;
|
|
up2.data = up->arg;
|
|
up2.nr = 0;
|
|
up2.tags = 0;
|
|
up2.resv = 0;
|
|
up2.resv2 = 0;
|
|
|
|
if (up->offset == IORING_FILE_INDEX_ALLOC) {
|
|
ret = io_files_update_with_index_alloc(req, issue_flags);
|
|
} else {
|
|
io_ring_submit_lock(ctx, issue_flags);
|
|
ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE,
|
|
&up2, up->nr_args);
|
|
io_ring_submit_unlock(ctx, issue_flags);
|
|
}
|
|
|
|
if (ret < 0)
|
|
req_set_fail(req);
|
|
io_req_set_res(req, ret, 0);
|
|
return IOU_OK;
|
|
}
|
|
|
|
void io_free_rsrc_node(struct io_rsrc_node *node)
|
|
{
|
|
struct io_ring_ctx *ctx = node->ctx;
|
|
|
|
lockdep_assert_held(&ctx->uring_lock);
|
|
|
|
if (node->tag)
|
|
io_post_aux_cqe(node->ctx, node->tag, 0, 0);
|
|
|
|
switch (node->type) {
|
|
case IORING_RSRC_FILE:
|
|
if (io_slot_file(node))
|
|
fput(io_slot_file(node));
|
|
break;
|
|
case IORING_RSRC_BUFFER:
|
|
if (node->buf)
|
|
io_buffer_unmap(node->ctx, node);
|
|
break;
|
|
default:
|
|
WARN_ON_ONCE(1);
|
|
break;
|
|
}
|
|
|
|
kfree(node);
|
|
}
|
|
|
|
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
|
|
{
|
|
int i;
|
|
|
|
lockdep_assert_held(&ctx->uring_lock);
|
|
|
|
for (i = 0; i < ctx->nr_user_files; i++) {
|
|
struct io_rsrc_node *node = ctx->file_table.nodes[i];
|
|
|
|
if (node) {
|
|
io_put_rsrc_node(node);
|
|
io_file_bitmap_clear(&ctx->file_table, i);
|
|
ctx->file_table.nodes[i] = NULL;
|
|
}
|
|
}
|
|
|
|
io_free_file_tables(&ctx->file_table);
|
|
io_file_table_set_alloc_range(ctx, 0, 0);
|
|
io_rsrc_data_free(ctx->file_data);
|
|
ctx->file_data = NULL;
|
|
ctx->nr_user_files = 0;
|
|
}
|
|
|
|
int io_sqe_files_unregister(struct io_ring_ctx *ctx)
|
|
{
|
|
if (!ctx->file_data)
|
|
return -ENXIO;
|
|
|
|
__io_sqe_files_unregister(ctx);
|
|
return 0;
|
|
}
|
|
|
|
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
|
|
unsigned nr_args, u64 __user *tags)
|
|
{
|
|
__s32 __user *fds = (__s32 __user *) arg;
|
|
struct file *file;
|
|
int fd, ret;
|
|
unsigned i;
|
|
|
|
if (ctx->file_data)
|
|
return -EBUSY;
|
|
if (!nr_args)
|
|
return -EINVAL;
|
|
if (nr_args > IORING_MAX_FIXED_FILES)
|
|
return -EMFILE;
|
|
if (nr_args > rlimit(RLIMIT_NOFILE))
|
|
return -EMFILE;
|
|
ret = io_rsrc_data_alloc(ctx, nr_args, &ctx->file_data);
|
|
if (ret)
|
|
return ret;
|
|
|
|
if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
|
|
io_rsrc_data_free(ctx->file_data);
|
|
ctx->file_data = NULL;
|
|
return -ENOMEM;
|
|
}
|
|
|
|
for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
|
|
struct io_rsrc_node *node;
|
|
u64 tag = 0;
|
|
|
|
ret = -EFAULT;
|
|
if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
|
|
goto fail;
|
|
if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
|
|
goto fail;
|
|
/* allow sparse sets */
|
|
if (!fds || fd == -1) {
|
|
ret = -EINVAL;
|
|
if (tag)
|
|
goto fail;
|
|
continue;
|
|
}
|
|
|
|
file = fget(fd);
|
|
ret = -EBADF;
|
|
if (unlikely(!file))
|
|
goto fail;
|
|
|
|
/*
|
|
* Don't allow io_uring instances to be registered.
|
|
*/
|
|
if (io_is_uring_fops(file)) {
|
|
fput(file);
|
|
goto fail;
|
|
}
|
|
ret = -ENOMEM;
|
|
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
|
|
if (!node) {
|
|
fput(file);
|
|
goto fail;
|
|
}
|
|
if (tag)
|
|
node->tag = tag;
|
|
ctx->file_table.nodes[i] = node;
|
|
io_fixed_file_set(node, file);
|
|
io_file_bitmap_set(&ctx->file_table, i);
|
|
}
|
|
|
|
/* default it to the whole table */
|
|
io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
|
|
return 0;
|
|
fail:
|
|
__io_sqe_files_unregister(ctx);
|
|
return ret;
|
|
}
|
|
|
|
static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
|
|
{
|
|
unsigned int i;
|
|
|
|
lockdep_assert_held(&ctx->uring_lock);
|
|
|
|
for (i = 0; i < ctx->nr_user_bufs; i++) {
|
|
if (ctx->user_bufs[i]) {
|
|
io_put_rsrc_node(ctx->user_bufs[i]);
|
|
ctx->user_bufs[i] = NULL;
|
|
}
|
|
}
|
|
kvfree(ctx->user_bufs);
|
|
ctx->user_bufs = NULL;
|
|
io_rsrc_data_free(ctx->buf_data);
|
|
ctx->buf_data = NULL;
|
|
ctx->nr_user_bufs = 0;
|
|
}
|
|
|
|
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
|
|
{
|
|
if (!ctx->buf_data)
|
|
return -ENXIO;
|
|
|
|
__io_sqe_buffers_unregister(ctx);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Not super efficient, but this is just a registration time. And we do cache
|
|
* the last compound head, so generally we'll only do a full search if we don't
|
|
* match that one.
|
|
*
|
|
* We check if the given compound head page has already been accounted, to
|
|
* avoid double accounting it. This allows us to account the full size of the
|
|
* page, not just the constituent pages of a huge page.
|
|
*/
|
|
static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
|
|
int nr_pages, struct page *hpage)
|
|
{
|
|
int i, j;
|
|
|
|
/* check current page array */
|
|
for (i = 0; i < nr_pages; i++) {
|
|
if (!PageCompound(pages[i]))
|
|
continue;
|
|
if (compound_head(pages[i]) == hpage)
|
|
return true;
|
|
}
|
|
|
|
/* check previously registered pages */
|
|
for (i = 0; i < ctx->nr_user_bufs; i++) {
|
|
struct io_rsrc_node *node = ctx->user_bufs[i];
|
|
struct io_mapped_ubuf *imu = node->buf;
|
|
|
|
for (j = 0; j < imu->nr_bvecs; j++) {
|
|
if (!PageCompound(imu->bvec[j].bv_page))
|
|
continue;
|
|
if (compound_head(imu->bvec[j].bv_page) == hpage)
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
|
|
int nr_pages, struct io_mapped_ubuf *imu,
|
|
struct page **last_hpage)
|
|
{
|
|
int i, ret;
|
|
|
|
imu->acct_pages = 0;
|
|
for (i = 0; i < nr_pages; i++) {
|
|
if (!PageCompound(pages[i])) {
|
|
imu->acct_pages++;
|
|
} else {
|
|
struct page *hpage;
|
|
|
|
hpage = compound_head(pages[i]);
|
|
if (hpage == *last_hpage)
|
|
continue;
|
|
*last_hpage = hpage;
|
|
if (headpage_already_acct(ctx, pages, i, hpage))
|
|
continue;
|
|
imu->acct_pages += page_size(hpage) >> PAGE_SHIFT;
|
|
}
|
|
}
|
|
|
|
if (!imu->acct_pages)
|
|
return 0;
|
|
|
|
ret = io_account_mem(ctx, imu->acct_pages);
|
|
if (ret)
|
|
imu->acct_pages = 0;
|
|
return ret;
|
|
}
|
|
|
|
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
|
|
struct io_imu_folio_data *data, int nr_folios)
|
|
{
|
|
struct page **page_array = *pages, **new_array = NULL;
|
|
int nr_pages_left = *nr_pages, i, j;
|
|
|
|
/* Store head pages only*/
|
|
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
|
|
GFP_KERNEL);
|
|
if (!new_array)
|
|
return false;
|
|
|
|
new_array[0] = compound_head(page_array[0]);
|
|
/*
|
|
* The pages are bound to the folio, it doesn't
|
|
* actually unpin them but drops all but one reference,
|
|
* which is usually put down by io_buffer_unmap().
|
|
* Note, needs a better helper.
|
|
*/
|
|
if (data->nr_pages_head > 1)
|
|
unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
|
|
|
|
j = data->nr_pages_head;
|
|
nr_pages_left -= data->nr_pages_head;
|
|
for (i = 1; i < nr_folios; i++) {
|
|
unsigned int nr_unpin;
|
|
|
|
new_array[i] = page_array[j];
|
|
nr_unpin = min_t(unsigned int, nr_pages_left - 1,
|
|
data->nr_pages_mid - 1);
|
|
if (nr_unpin)
|
|
unpin_user_pages(&page_array[j+1], nr_unpin);
|
|
j += data->nr_pages_mid;
|
|
nr_pages_left -= data->nr_pages_mid;
|
|
}
|
|
kvfree(page_array);
|
|
*pages = new_array;
|
|
*nr_pages = nr_folios;
|
|
return true;
|
|
}
|
|
|
|
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
|
|
struct io_imu_folio_data *data)
|
|
{
|
|
struct page **page_array = *pages;
|
|
struct folio *folio = page_folio(page_array[0]);
|
|
unsigned int count = 1, nr_folios = 1;
|
|
int i;
|
|
|
|
if (*nr_pages <= 1)
|
|
return false;
|
|
|
|
data->nr_pages_mid = folio_nr_pages(folio);
|
|
if (data->nr_pages_mid == 1)
|
|
return false;
|
|
|
|
data->folio_shift = folio_shift(folio);
|
|
/*
|
|
* Check if pages are contiguous inside a folio, and all folios have
|
|
* the same page count except for the head and tail.
|
|
*/
|
|
for (i = 1; i < *nr_pages; i++) {
|
|
if (page_folio(page_array[i]) == folio &&
|
|
page_array[i] == page_array[i-1] + 1) {
|
|
count++;
|
|
continue;
|
|
}
|
|
|
|
if (nr_folios == 1) {
|
|
if (folio_page_idx(folio, page_array[i-1]) !=
|
|
data->nr_pages_mid - 1)
|
|
return false;
|
|
|
|
data->nr_pages_head = count;
|
|
} else if (count != data->nr_pages_mid) {
|
|
return false;
|
|
}
|
|
|
|
folio = page_folio(page_array[i]);
|
|
if (folio_size(folio) != (1UL << data->folio_shift) ||
|
|
folio_page_idx(folio, page_array[i]) != 0)
|
|
return false;
|
|
|
|
count = 1;
|
|
nr_folios++;
|
|
}
|
|
if (nr_folios == 1)
|
|
data->nr_pages_head = count;
|
|
|
|
return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
|
|
}
|
|
|
|
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
|
|
struct iovec *iov,
|
|
struct page **last_hpage)
|
|
{
|
|
struct io_mapped_ubuf *imu = NULL;
|
|
struct page **pages = NULL;
|
|
struct io_rsrc_node *node;
|
|
unsigned long off;
|
|
size_t size;
|
|
int ret, nr_pages, i;
|
|
struct io_imu_folio_data data;
|
|
bool coalesced;
|
|
|
|
node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
|
|
if (!node)
|
|
return ERR_PTR(-ENOMEM);
|
|
node->buf = NULL;
|
|
|
|
ret = -ENOMEM;
|
|
pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
|
|
&nr_pages);
|
|
if (IS_ERR(pages)) {
|
|
ret = PTR_ERR(pages);
|
|
pages = NULL;
|
|
goto done;
|
|
}
|
|
|
|
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
|
|
coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
|
|
|
|
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
|
|
if (!imu)
|
|
goto done;
|
|
|
|
ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
|
|
if (ret) {
|
|
unpin_user_pages(pages, nr_pages);
|
|
goto done;
|
|
}
|
|
|
|
size = iov->iov_len;
|
|
/* store original address for later verification */
|
|
imu->ubuf = (unsigned long) iov->iov_base;
|
|
imu->len = iov->iov_len;
|
|
imu->nr_bvecs = nr_pages;
|
|
imu->folio_shift = PAGE_SHIFT;
|
|
if (coalesced)
|
|
imu->folio_shift = data.folio_shift;
|
|
refcount_set(&imu->refs, 1);
|
|
off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
|
|
node->buf = imu;
|
|
ret = 0;
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
size_t vec_len;
|
|
|
|
vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
|
|
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
|
|
off = 0;
|
|
size -= vec_len;
|
|
}
|
|
done:
|
|
if (ret) {
|
|
kvfree(imu);
|
|
if (node)
|
|
io_put_rsrc_node(node);
|
|
node = ERR_PTR(ret);
|
|
}
|
|
kvfree(pages);
|
|
return node;
|
|
}
|
|
|
|
static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
|
|
{
|
|
ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
|
|
return ctx->user_bufs ? 0 : -ENOMEM;
|
|
}
|
|
|
|
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
|
|
unsigned int nr_args, u64 __user *tags)
|
|
{
|
|
struct page *last_hpage = NULL;
|
|
struct io_rsrc_data *data;
|
|
struct iovec fast_iov, *iov = &fast_iov;
|
|
const struct iovec __user *uvec;
|
|
int i, ret;
|
|
|
|
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
|
|
|
|
if (ctx->user_bufs)
|
|
return -EBUSY;
|
|
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
|
|
return -EINVAL;
|
|
ret = io_rsrc_data_alloc(ctx, nr_args, &data);
|
|
if (ret)
|
|
return ret;
|
|
ret = io_buffers_map_alloc(ctx, nr_args);
|
|
if (ret) {
|
|
io_rsrc_data_free(data);
|
|
return ret;
|
|
}
|
|
|
|
if (!arg)
|
|
memset(iov, 0, sizeof(*iov));
|
|
|
|
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
|
|
struct io_rsrc_node *node;
|
|
u64 tag = 0;
|
|
|
|
if (arg) {
|
|
uvec = (struct iovec __user *) arg;
|
|
iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
|
|
if (IS_ERR(iov)) {
|
|
ret = PTR_ERR(iov);
|
|
break;
|
|
}
|
|
ret = io_buffer_validate(iov);
|
|
if (ret)
|
|
break;
|
|
if (ctx->compat)
|
|
arg += sizeof(struct compat_iovec);
|
|
else
|
|
arg += sizeof(struct iovec);
|
|
}
|
|
|
|
if (tags) {
|
|
if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
|
|
ret = -EFAULT;
|
|
break;
|
|
}
|
|
if (tag && !iov->iov_base) {
|
|
ret = -EINVAL;
|
|
break;
|
|
}
|
|
}
|
|
|
|
node = io_sqe_buffer_register(ctx, iov, &last_hpage);
|
|
if (IS_ERR(node)) {
|
|
ret = PTR_ERR(node);
|
|
break;
|
|
}
|
|
if (tag)
|
|
node->tag = tag;
|
|
ctx->user_bufs[i] = node;
|
|
}
|
|
|
|
WARN_ON_ONCE(ctx->buf_data);
|
|
|
|
ctx->buf_data = data;
|
|
if (ret)
|
|
__io_sqe_buffers_unregister(ctx);
|
|
return ret;
|
|
}
|
|
|
|
int io_import_fixed(int ddir, struct iov_iter *iter,
|
|
struct io_mapped_ubuf *imu,
|
|
u64 buf_addr, size_t len)
|
|
{
|
|
u64 buf_end;
|
|
size_t offset;
|
|
|
|
if (WARN_ON_ONCE(!imu))
|
|
return -EFAULT;
|
|
if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
|
|
return -EFAULT;
|
|
/* not inside the mapped region */
|
|
if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
|
|
return -EFAULT;
|
|
|
|
/*
|
|
* Might not be a start of buffer, set size appropriately
|
|
* and advance us to the beginning.
|
|
*/
|
|
offset = buf_addr - imu->ubuf;
|
|
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
|
|
|
|
if (offset) {
|
|
/*
|
|
* Don't use iov_iter_advance() here, as it's really slow for
|
|
* using the latter parts of a big fixed buffer - it iterates
|
|
* over each segment manually. We can cheat a bit here, because
|
|
* we know that:
|
|
*
|
|
* 1) it's a BVEC iter, we set it up
|
|
* 2) all bvecs are the same in size, except potentially the
|
|
* first and last bvec
|
|
*
|
|
* So just find our index, and adjust the iterator afterwards.
|
|
* If the offset is within the first bvec (or the whole first
|
|
* bvec, just use iov_iter_advance(). This makes it easier
|
|
* since we can just skip the first segment, which may not
|
|
* be folio_size aligned.
|
|
*/
|
|
const struct bio_vec *bvec = imu->bvec;
|
|
|
|
if (offset < bvec->bv_len) {
|
|
iter->count -= offset;
|
|
iter->iov_offset = offset;
|
|
} else {
|
|
unsigned long seg_skip;
|
|
|
|
/* skip first vec */
|
|
offset -= bvec->bv_len;
|
|
seg_skip = 1 + (offset >> imu->folio_shift);
|
|
|
|
iter->bvec += seg_skip;
|
|
iter->nr_segs -= seg_skip;
|
|
iter->count -= bvec->bv_len + offset;
|
|
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
|
|
{
|
|
struct io_rsrc_node **user_bufs;
|
|
struct io_rsrc_data *data;
|
|
int i, ret, nbufs;
|
|
|
|
/*
|
|
* Drop our own lock here. We'll setup the data we need and reference
|
|
* the source buffers, then re-grab, check, and assign at the end.
|
|
*/
|
|
mutex_unlock(&ctx->uring_lock);
|
|
|
|
mutex_lock(&src_ctx->uring_lock);
|
|
ret = -ENXIO;
|
|
nbufs = src_ctx->nr_user_bufs;
|
|
if (!nbufs)
|
|
goto out_unlock;
|
|
ret = io_rsrc_data_alloc(ctx, nbufs, &data);
|
|
if (ret)
|
|
goto out_unlock;
|
|
|
|
ret = -ENOMEM;
|
|
user_bufs = kvmalloc_array(nbufs, sizeof(struct io_rsrc_node *),
|
|
GFP_KERNEL | __GFP_ZERO);
|
|
if (!user_bufs)
|
|
goto out_free_data;
|
|
|
|
for (i = 0; i < nbufs; i++) {
|
|
struct io_rsrc_node *src_node = src_ctx->user_bufs[i];
|
|
struct io_rsrc_node *dst_node;
|
|
|
|
if (src_node == rsrc_empty_node) {
|
|
dst_node = rsrc_empty_node;
|
|
} else {
|
|
dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
|
|
if (!dst_node)
|
|
goto out_put_free;
|
|
|
|
refcount_inc(&src_node->buf->refs);
|
|
dst_node->buf = src_node->buf;
|
|
}
|
|
user_bufs[i] = dst_node;
|
|
}
|
|
|
|
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
|
|
mutex_unlock(&src_ctx->uring_lock);
|
|
mutex_lock(&ctx->uring_lock);
|
|
if (!ctx->user_bufs) {
|
|
ctx->user_bufs = user_bufs;
|
|
ctx->buf_data = data;
|
|
ctx->nr_user_bufs = nbufs;
|
|
return 0;
|
|
}
|
|
|
|
mutex_unlock(&ctx->uring_lock);
|
|
mutex_lock(&src_ctx->uring_lock);
|
|
/* someone raced setting up buffers, dump ours */
|
|
ret = -EBUSY;
|
|
i = nbufs;
|
|
out_put_free:
|
|
while (i--) {
|
|
io_buffer_unmap(src_ctx, user_bufs[i]);
|
|
kfree(user_bufs[i]);
|
|
}
|
|
kvfree(user_bufs);
|
|
out_free_data:
|
|
io_rsrc_data_free(data);
|
|
out_unlock:
|
|
mutex_unlock(&src_ctx->uring_lock);
|
|
mutex_lock(&ctx->uring_lock);
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Copy the registered buffers from the source ring whose file descriptor
|
|
* is given in the src_fd to the current ring. This is identical to registering
|
|
* the buffers with ctx, except faster as mappings already exist.
|
|
*
|
|
* Since the memory is already accounted once, don't account it again.
|
|
*/
|
|
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
|
|
{
|
|
struct io_uring_clone_buffers buf;
|
|
bool registered_src;
|
|
struct file *file;
|
|
int ret;
|
|
|
|
if (ctx->user_bufs || ctx->nr_user_bufs)
|
|
return -EBUSY;
|
|
if (copy_from_user(&buf, arg, sizeof(buf)))
|
|
return -EFAULT;
|
|
if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
|
|
return -EINVAL;
|
|
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
|
|
return -EINVAL;
|
|
|
|
registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
|
|
file = io_uring_register_get_file(buf.src_fd, registered_src);
|
|
if (IS_ERR(file))
|
|
return PTR_ERR(file);
|
|
ret = io_clone_buffers(ctx, file->private_data);
|
|
if (!registered_src)
|
|
fput(file);
|
|
return ret;
|
|
}
|