mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-01 02:33:57 +00:00
btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)
Add an io_uring command for encoded reads, using the same interface as the existing BTRFS_IOC_ENCODED_READ ioctl. btrfs_uring_encoded_read() is an io_uring version of btrfs_ioctl_encoded_read(), which validates the user input and calls btrfs_encoded_read() to read the appropriate metadata. If we determine that we need to read an extent from disk, we call btrfs_encoded_read_regular_fill_pages() through btrfs_uring_read_extent() to prepare the bio. The existing btrfs_encoded_read_regular_fill_pages() is changed so that if it is passed a valid uring_ctx, rather than waking up any waiting threads it calls btrfs_uring_read_extent_endio(). This in turn copies the read data back to userspace, and calls io_uring_cmd_done() to complete the io_uring command. Because we're potentially doing a non-blocking read, btrfs_uring_read_extent() doesn't clean up after itself if it returns -EIOCBQUEUED. Instead, it allocates a priv struct, populates the fields there that we will need to unlock the inode and free our allocations, and defers this to the btrfs_uring_read_finished() that gets called when the bio completes. Signed-off-by: Mark Harmstone <maharmstone@fb.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
parent
68d3b27e05
commit
34310c442e
@ -613,7 +613,7 @@ int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
|
||||
int compress_type);
|
||||
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
u64 disk_bytenr, u64 disk_io_size,
|
||||
struct page **pages);
|
||||
struct page **pages, void *uring_ctx);
|
||||
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
|
||||
struct btrfs_ioctl_encoded_io_args *encoded,
|
||||
struct extent_state **cached_state,
|
||||
|
@ -3710,6 +3710,7 @@ const struct file_operations btrfs_file_operations = {
|
||||
.compat_ioctl = btrfs_compat_ioctl,
|
||||
#endif
|
||||
.remap_file_range = btrfs_remap_file_range,
|
||||
.uring_cmd = btrfs_uring_cmd,
|
||||
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
|
||||
};
|
||||
|
||||
|
@ -9056,6 +9056,7 @@ static ssize_t btrfs_encoded_read_inline(
|
||||
|
||||
struct btrfs_encoded_read_private {
|
||||
wait_queue_head_t wait;
|
||||
void *uring_ctx;
|
||||
atomic_t pending;
|
||||
blk_status_t status;
|
||||
};
|
||||
@ -9075,14 +9076,22 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
|
||||
*/
|
||||
WRITE_ONCE(priv->status, bbio->bio.bi_status);
|
||||
}
|
||||
if (!atomic_dec_return(&priv->pending))
|
||||
wake_up(&priv->wait);
|
||||
if (atomic_dec_return(&priv->pending) == 0) {
|
||||
int err = blk_status_to_errno(READ_ONCE(priv->status));
|
||||
|
||||
if (priv->uring_ctx) {
|
||||
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
|
||||
kfree(priv);
|
||||
} else {
|
||||
wake_up(&priv->wait);
|
||||
}
|
||||
}
|
||||
bio_put(&bbio->bio);
|
||||
}
|
||||
|
||||
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
u64 disk_bytenr, u64 disk_io_size,
|
||||
struct page **pages)
|
||||
struct page **pages, void *uring_ctx)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
||||
struct btrfs_encoded_read_private *priv;
|
||||
@ -9097,6 +9106,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
init_waitqueue_head(&priv->wait);
|
||||
atomic_set(&priv->pending, 1);
|
||||
priv->status = 0;
|
||||
priv->uring_ctx = uring_ctx;
|
||||
|
||||
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
|
||||
btrfs_encoded_read_endio, priv);
|
||||
@ -9125,12 +9135,23 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
atomic_inc(&priv->pending);
|
||||
btrfs_submit_bbio(bbio, 0);
|
||||
|
||||
if (atomic_dec_return(&priv->pending))
|
||||
io_wait_event(priv->wait, !atomic_read(&priv->pending));
|
||||
/* See btrfs_encoded_read_endio() for ordering. */
|
||||
ret = blk_status_to_errno(READ_ONCE(priv->status));
|
||||
kfree(priv);
|
||||
return ret;
|
||||
if (uring_ctx) {
|
||||
if (atomic_dec_return(&priv->pending) == 0) {
|
||||
ret = blk_status_to_errno(READ_ONCE(priv->status));
|
||||
btrfs_uring_read_extent_endio(uring_ctx, ret);
|
||||
kfree(priv);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return -EIOCBQUEUED;
|
||||
} else {
|
||||
if (atomic_dec_return(&priv->pending) != 0)
|
||||
io_wait_event(priv->wait, !atomic_read(&priv->pending));
|
||||
/* See btrfs_encoded_read_endio() for ordering. */
|
||||
ret = blk_status_to_errno(READ_ONCE(priv->status));
|
||||
kfree(priv);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
|
||||
@ -9158,7 +9179,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
|
||||
}
|
||||
|
||||
ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
|
||||
disk_io_size, pages);
|
||||
disk_io_size, pages, NULL);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
|
302
fs/btrfs/ioctl.c
302
fs/btrfs/ioctl.c
@ -29,6 +29,7 @@
|
||||
#include <linux/fileattr.h>
|
||||
#include <linux/fsverity.h>
|
||||
#include <linux/sched/xacct.h>
|
||||
#include <linux/io_uring/cmd.h>
|
||||
#include "ctree.h"
|
||||
#include "disk-io.h"
|
||||
#include "export.h"
|
||||
@ -4719,6 +4720,307 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
|
||||
* contains the fields in btrfs_uring_read_extent that are necessary to finish
|
||||
* off and cleanup the I/O in btrfs_uring_read_finished.
|
||||
*/
|
||||
struct btrfs_uring_priv {
|
||||
struct io_uring_cmd *cmd;
|
||||
struct page **pages;
|
||||
unsigned long nr_pages;
|
||||
struct kiocb iocb;
|
||||
struct iovec *iov;
|
||||
struct iov_iter iter;
|
||||
struct extent_state *cached_state;
|
||||
u64 count;
|
||||
u64 start;
|
||||
u64 lockend;
|
||||
int err;
|
||||
bool compressed;
|
||||
};
|
||||
|
||||
static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
||||
{
|
||||
struct btrfs_uring_priv *priv = *io_uring_cmd_to_pdu(cmd, struct btrfs_uring_priv *);
|
||||
struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
|
||||
struct extent_io_tree *io_tree = &inode->io_tree;
|
||||
unsigned long index;
|
||||
u64 cur;
|
||||
size_t page_offset;
|
||||
ssize_t ret;
|
||||
|
||||
if (priv->err) {
|
||||
ret = priv->err;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (priv->compressed) {
|
||||
index = 0;
|
||||
page_offset = 0;
|
||||
} else {
|
||||
index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
|
||||
page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
|
||||
}
|
||||
cur = 0;
|
||||
while (cur < priv->count) {
|
||||
size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset);
|
||||
|
||||
if (copy_page_to_iter(priv->pages[index], page_offset, bytes,
|
||||
&priv->iter) != bytes) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
index++;
|
||||
cur += bytes;
|
||||
page_offset = 0;
|
||||
}
|
||||
ret = priv->count;
|
||||
|
||||
out:
|
||||
unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
|
||||
io_uring_cmd_done(cmd, ret, 0, issue_flags);
|
||||
add_rchar(current, ret);
|
||||
|
||||
for (index = 0; index < priv->nr_pages; index++)
|
||||
__free_page(priv->pages[index]);
|
||||
|
||||
kfree(priv->pages);
|
||||
kfree(priv->iov);
|
||||
kfree(priv);
|
||||
}
|
||||
|
||||
void btrfs_uring_read_extent_endio(void *ctx, int err)
|
||||
{
|
||||
struct btrfs_uring_priv *priv = ctx;
|
||||
|
||||
priv->err = err;
|
||||
|
||||
*io_uring_cmd_to_pdu(priv->cmd, struct btrfs_uring_priv *) = priv;
|
||||
io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
|
||||
}
|
||||
|
||||
static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
|
||||
u64 start, u64 lockend,
|
||||
struct extent_state *cached_state,
|
||||
u64 disk_bytenr, u64 disk_io_size,
|
||||
size_t count, bool compressed,
|
||||
struct iovec *iov, struct io_uring_cmd *cmd)
|
||||
{
|
||||
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
|
||||
struct extent_io_tree *io_tree = &inode->io_tree;
|
||||
struct page **pages;
|
||||
struct btrfs_uring_priv *priv = NULL;
|
||||
unsigned long nr_pages;
|
||||
int ret;
|
||||
|
||||
nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
|
||||
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
|
||||
if (!pages)
|
||||
return -ENOMEM;
|
||||
ret = btrfs_alloc_page_array(nr_pages, pages, 0);
|
||||
if (ret) {
|
||||
ret = -ENOMEM;
|
||||
goto out_fail;
|
||||
}
|
||||
|
||||
priv = kmalloc(sizeof(*priv), GFP_NOFS);
|
||||
if (!priv) {
|
||||
ret = -ENOMEM;
|
||||
goto out_fail;
|
||||
}
|
||||
|
||||
priv->iocb = *iocb;
|
||||
priv->iov = iov;
|
||||
priv->iter = *iter;
|
||||
priv->count = count;
|
||||
priv->cmd = cmd;
|
||||
priv->cached_state = cached_state;
|
||||
priv->compressed = compressed;
|
||||
priv->nr_pages = nr_pages;
|
||||
priv->pages = pages;
|
||||
priv->start = start;
|
||||
priv->lockend = lockend;
|
||||
priv->err = 0;
|
||||
|
||||
ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
|
||||
disk_io_size, pages, priv);
|
||||
if (ret && ret != -EIOCBQUEUED)
|
||||
goto out_fail;
|
||||
|
||||
/*
|
||||
* If we return -EIOCBQUEUED, we're deferring the cleanup to
|
||||
* btrfs_uring_read_finished(), which will handle unlocking the extent
|
||||
* and inode and freeing the allocations.
|
||||
*/
|
||||
|
||||
return -EIOCBQUEUED;
|
||||
|
||||
out_fail:
|
||||
unlock_extent(io_tree, start, lockend, &cached_state);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
kfree(priv);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
||||
{
|
||||
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
|
||||
size_t copy_end;
|
||||
struct btrfs_ioctl_encoded_io_args args = { 0 };
|
||||
int ret;
|
||||
u64 disk_bytenr, disk_io_size;
|
||||
struct file *file;
|
||||
struct btrfs_inode *inode;
|
||||
struct btrfs_fs_info *fs_info;
|
||||
struct extent_io_tree *io_tree;
|
||||
struct iovec iovstack[UIO_FASTIOV];
|
||||
struct iovec *iov = iovstack;
|
||||
struct iov_iter iter;
|
||||
loff_t pos;
|
||||
struct kiocb kiocb;
|
||||
struct extent_state *cached_state = NULL;
|
||||
u64 start, lockend;
|
||||
void __user *sqe_addr;
|
||||
|
||||
if (!capable(CAP_SYS_ADMIN)) {
|
||||
ret = -EPERM;
|
||||
goto out_acct;
|
||||
}
|
||||
file = cmd->file;
|
||||
inode = BTRFS_I(file->f_inode);
|
||||
fs_info = inode->root->fs_info;
|
||||
io_tree = &inode->io_tree;
|
||||
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
|
||||
|
||||
if (issue_flags & IO_URING_F_COMPAT) {
|
||||
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
|
||||
struct btrfs_ioctl_encoded_io_args_32 args32;
|
||||
|
||||
copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
|
||||
if (copy_from_user(&args32, sqe_addr, copy_end)) {
|
||||
ret = -EFAULT;
|
||||
goto out_acct;
|
||||
}
|
||||
args.iov = compat_ptr(args32.iov);
|
||||
args.iovcnt = args32.iovcnt;
|
||||
args.offset = args32.offset;
|
||||
args.flags = args32.flags;
|
||||
#else
|
||||
return -ENOTTY;
|
||||
#endif
|
||||
} else {
|
||||
copy_end = copy_end_kernel;
|
||||
if (copy_from_user(&args, sqe_addr, copy_end)) {
|
||||
ret = -EFAULT;
|
||||
goto out_acct;
|
||||
}
|
||||
}
|
||||
|
||||
if (args.flags != 0)
|
||||
return -EINVAL;
|
||||
|
||||
ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
|
||||
&iov, &iter);
|
||||
if (ret < 0)
|
||||
goto out_acct;
|
||||
|
||||
if (iov_iter_count(&iter) == 0) {
|
||||
ret = 0;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
pos = args.offset;
|
||||
ret = rw_verify_area(READ, file, &pos, args.len);
|
||||
if (ret < 0)
|
||||
goto out_free;
|
||||
|
||||
init_sync_kiocb(&kiocb, file);
|
||||
kiocb.ki_pos = pos;
|
||||
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
kiocb.ki_flags |= IOCB_NOWAIT;
|
||||
|
||||
start = ALIGN_DOWN(pos, fs_info->sectorsize);
|
||||
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
|
||||
|
||||
ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
|
||||
&disk_bytenr, &disk_io_size);
|
||||
if (ret < 0 && ret != -EIOCBQUEUED)
|
||||
goto out_free;
|
||||
|
||||
file_accessed(file);
|
||||
|
||||
if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
|
||||
sizeof(args) - copy_end_kernel)) {
|
||||
if (ret == -EIOCBQUEUED) {
|
||||
unlock_extent(io_tree, start, lockend, &cached_state);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
}
|
||||
ret = -EFAULT;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
if (ret == -EIOCBQUEUED) {
|
||||
u64 count;
|
||||
|
||||
/*
|
||||
* If we've optimized things by storing the iovecs on the stack,
|
||||
* undo this.
|
||||
*/
|
||||
if (!iov) {
|
||||
iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
|
||||
if (!iov) {
|
||||
unlock_extent(io_tree, start, lockend, &cached_state);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
ret = -ENOMEM;
|
||||
goto out_acct;
|
||||
}
|
||||
|
||||
memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
|
||||
}
|
||||
|
||||
count = min_t(u64, iov_iter_count(&iter), disk_io_size);
|
||||
|
||||
/* Match ioctl by not returning past EOF if uncompressed. */
|
||||
if (!args.compression)
|
||||
count = min_t(u64, count, args.len);
|
||||
|
||||
ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
|
||||
cached_state, disk_bytenr,
|
||||
disk_io_size, count,
|
||||
args.compression, iov, cmd);
|
||||
|
||||
goto out_acct;
|
||||
}
|
||||
|
||||
out_free:
|
||||
kfree(iov);
|
||||
|
||||
out_acct:
|
||||
if (ret > 0)
|
||||
add_rchar(current, ret);
|
||||
inc_syscr(current);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
|
||||
{
|
||||
switch (cmd->cmd_op) {
|
||||
case BTRFS_IOC_ENCODED_READ:
|
||||
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
|
||||
case BTRFS_IOC_ENCODED_READ_32:
|
||||
#endif
|
||||
return btrfs_uring_encoded_read(cmd, issue_flags);
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
long btrfs_ioctl(struct file *file, unsigned int
|
||||
cmd, unsigned long arg)
|
||||
{
|
||||
|
@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
|
||||
int __pure btrfs_is_empty_uuid(const u8 *uuid);
|
||||
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_ioctl_balance_args *bargs);
|
||||
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
|
||||
void btrfs_uring_read_extent_endio(void *ctx, int err);
|
||||
|
||||
#endif
|
||||
|
@ -5669,7 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
|
||||
ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
|
||||
disk_bytenr, disk_num_bytes,
|
||||
sctx->send_buf_pages +
|
||||
(data_offset >> PAGE_SHIFT));
|
||||
(data_offset >> PAGE_SHIFT),
|
||||
NULL);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user