From 2b3933b1e0a0a4b758fbc164bb31db0c113a7e2c Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 23 Sep 2024 10:13:11 -0700 Subject: [PATCH 01/30] fuse: enable dynamic configuration of fuse max pages limit (FUSE_MAX_MAX_PAGES) Introduce the capability to dynamically configure the max pages limit (FUSE_MAX_MAX_PAGES) through a sysctl. This allows system administrators to dynamically set the maximum number of pages that can be used for servicing requests in fuse. Previously, this is gated by FUSE_MAX_MAX_PAGES which is statically set to 256 pages. One result of this is that the buffer size for a write request is limited to 1 MiB on a 4k-page system. The default value for this sysctl is the original limit (256 pages). $ sysctl -a | grep max_pages_limit fs.fuse.max_pages_limit = 256 $ sysctl -n fs.fuse.max_pages_limit 256 $ echo 1024 | sudo tee /proc/sys/fs/fuse/max_pages_limit 1024 $ sysctl -n fs.fuse.max_pages_limit 1024 $ echo 65536 | sudo tee /proc/sys/fs/fuse/max_pages_limit tee: /proc/sys/fs/fuse/max_pages_limit: Invalid argument $ echo 0 | sudo tee /proc/sys/fs/fuse/max_pages_limit tee: /proc/sys/fs/fuse/max_pages_limit: Invalid argument $ echo 65535 | sudo tee /proc/sys/fs/fuse/max_pages_limit 65535 $ sysctl -n fs.fuse.max_pages_limit 65535 Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Reviewed-by: Sweet Tea Dorminy Signed-off-by: Miklos Szeredi --- Documentation/admin-guide/sysctl/fs.rst | 10 +++++++ fs/fuse/Makefile | 1 + fs/fuse/fuse_i.h | 14 +++++++-- fs/fuse/inode.c | 11 ++++++- fs/fuse/ioctl.c | 4 ++- fs/fuse/sysctl.c | 40 +++++++++++++++++++++++++ 6 files changed, 75 insertions(+), 5 deletions(-) create mode 100644 fs/fuse/sysctl.c diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst index 47499a1742bd..fa25d7e718b3 100644 --- a/Documentation/admin-guide/sysctl/fs.rst +++ b/Documentation/admin-guide/sysctl/fs.rst @@ -332,3 +332,13 @@ Each "watch" costs roughly 90 bytes on a 32-bit kernel, and roughly 160 bytes on a 64-bit one. The current default value for ``max_user_watches`` is 4% of the available low memory, divided by the "watch" cost in bytes. + +5. /proc/sys/fs/fuse - Configuration options for FUSE filesystems +===================================================================== + +This directory contains the following configuration options for FUSE +filesystems: + +``/proc/sys/fs/fuse/max_pages_limit`` is a read/write file for +setting/getting the maximum number of pages that can be used for servicing +requests in FUSE. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index ce0ff7a9007b..2c372180d631 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -14,5 +14,6 @@ fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_SYSCTL) += sysctl.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e6cc3d552b13..7ff00bae4a84 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -35,9 +35,6 @@ /** Default max number of pages that can be used in a single read request */ #define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 -/** Maximum of max_pages received in init_out */ -#define FUSE_MAX_MAX_PAGES 256 - /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN @@ -47,6 +44,9 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/** Maximum of max_pages received in init_out */ +extern unsigned int fuse_max_pages_limit; + /** List of active connections */ extern struct list_head fuse_conn_list; @@ -1480,4 +1480,12 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); +#ifdef CONFIG_SYSCTL +extern int fuse_sysctl_register(void); +extern void fuse_sysctl_unregister(void); +#else +#define fuse_sysctl_register() (0) +#define fuse_sysctl_unregister() do { } while (0) +#endif /* CONFIG_SYSCTL */ + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd3321e29a3e..f1779ff3f8d1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -35,6 +35,8 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); +unsigned int fuse_max_pages_limit = 256; + unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); @@ -944,7 +946,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; - fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + fc->max_pages_limit = fuse_max_pages_limit; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); @@ -2063,8 +2065,14 @@ static int __init fuse_fs_init(void) if (err) goto out3; + err = fuse_sysctl_register(); + if (err) + goto out4; + return 0; + out4: + unregister_filesystem(&fuse_fs_type); out3: unregister_fuseblk(); out2: @@ -2075,6 +2083,7 @@ static int __init fuse_fs_init(void) static void fuse_fs_cleanup(void) { + fuse_sysctl_unregister(); unregister_filesystem(&fuse_fs_type); unregister_fuseblk(); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 572ce8a82ceb..a6c8ee551635 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -10,6 +10,8 @@ #include #include +#define FUSE_VERITY_ENABLE_ARG_MAX_PAGES 256 + static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, struct fuse_ioctl_out *outarg) { @@ -140,7 +142,7 @@ static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov, { struct fsverity_enable_arg enable; struct fsverity_enable_arg __user *uarg = (void __user *)arg; - const __u32 max_buffer_len = FUSE_MAX_MAX_PAGES * PAGE_SIZE; + const __u32 max_buffer_len = FUSE_VERITY_ENABLE_ARG_MAX_PAGES * PAGE_SIZE; if (copy_from_user(&enable, uarg, sizeof(enable))) return -EFAULT; diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c new file mode 100644 index 000000000000..b272bb333005 --- /dev/null +++ b/fs/fuse/sysctl.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/fuse/fuse_sysctl.c + * + * Sysctl interface to fuse parameters + */ +#include + +#include "fuse_i.h" + +static struct ctl_table_header *fuse_table_header; + +/* Bound by fuse_init_out max_pages, which is a u16 */ +static unsigned int sysctl_fuse_max_pages_limit = 65535; + +static struct ctl_table fuse_sysctl_table[] = { + { + .procname = "max_pages_limit", + .data = &fuse_max_pages_limit, + .maxlen = sizeof(fuse_max_pages_limit), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &sysctl_fuse_max_pages_limit, + }, +}; + +int fuse_sysctl_register(void) +{ + fuse_table_header = register_sysctl("fs/fuse", fuse_sysctl_table); + if (!fuse_table_header) + return -ENOMEM; + return 0; +} + +void fuse_sysctl_unregister(void) +{ + unregister_sysctl_table(fuse_table_header); + fuse_table_header = NULL; +} From cc23d537e56153560bb2f88fd826675a5a8c6af6 Mon Sep 17 00:00:00 2001 From: yangyun Date: Mon, 26 Aug 2024 21:06:12 +0800 Subject: [PATCH 02/30] fuse: remove useless IOCB_DIRECT in fuse_direct_read/write_iter Commit 23c94e1cdcbf ("fuse: Switch to using async direct IO for FOPEN_DIRECT_IO") gave the async direct IO code path in the fuse_direct_read_iter() and fuse_direct_write_iter(). But since these two functions are only called under FOPEN_DIRECT_IO is set, it seems that we can also use the async direct IO even the flag IOCB_DIRECT is not set to enjoy the async direct IO method. Also move the definition of fuse_io_priv to where it is used in fuse_ direct_write_iter. Signed-off-by: yangyun Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index dafdf766b1d5..5d240d8f1d5a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1650,7 +1650,7 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t res; - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, to); } else { struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); @@ -1664,7 +1664,6 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; bool exclusive; @@ -1672,9 +1671,11 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) res = generic_write_checks(iocb, from); if (res > 0) { task_io_account_write(res); - if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + if (!is_sync_kiocb(iocb)) { res = fuse_direct_IO(iocb, from); } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_write_update_attr(inode, iocb->ki_pos, res); From 41748675c0bf252b3c5f600a95830f0936d366c1 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 31 Aug 2024 17:37:49 +0800 Subject: [PATCH 03/30] virtiofs: use pages instead of pointer for kernel direct IO When trying to insert a 10MB kernel module kept in a virtio-fs with cache disabled, the following warning was reported: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 404 at mm/page_alloc.c:4551 ...... Modules linked in: CPU: 1 PID: 404 Comm: insmod Not tainted 6.9.0-rc5+ #123 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:__alloc_pages+0x2bf/0x380 ...... Call Trace: ? __warn+0x8e/0x150 ? __alloc_pages+0x2bf/0x380 __kmalloc_large_node+0x86/0x160 __kmalloc+0x33c/0x480 virtio_fs_enqueue_req+0x240/0x6d0 virtio_fs_wake_pending_and_unlock+0x7f/0x190 queue_request_and_unlock+0x55/0x60 fuse_simple_request+0x152/0x2b0 fuse_direct_io+0x5d2/0x8c0 fuse_file_read_iter+0x121/0x160 __kernel_read+0x151/0x2d0 kernel_read+0x45/0x50 kernel_read_file+0x1a9/0x2a0 init_module_from_file+0x6a/0xe0 idempotent_init_module+0x175/0x230 __x64_sys_finit_module+0x5d/0xb0 x64_sys_call+0x1c3/0x9e0 do_syscall_64+0x3d/0xc0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 ...... ---[ end trace 0000000000000000 ]--- The warning is triggered as follows: 1) syscall finit_module() handles the module insertion and it invokes kernel_read_file() to read the content of the module first. 2) kernel_read_file() allocates a 10MB buffer by using vmalloc() and passes it to kernel_read(). kernel_read() constructs a kvec iter by using iov_iter_kvec() and passes it to fuse_file_read_iter(). 3) virtio-fs disables the cache, so fuse_file_read_iter() invokes fuse_direct_io(). As for now, the maximal read size for kvec iter is only limited by fc->max_read. For virtio-fs, max_read is UINT_MAX, so fuse_direct_io() doesn't split the 10MB buffer. It saves the address and the size of the 10MB-sized buffer in out_args[0] of a fuse request and passes the fuse request to virtio_fs_wake_pending_and_unlock(). 4) virtio_fs_wake_pending_and_unlock() uses virtio_fs_enqueue_req() to queue the request. Because virtiofs need DMA-able address, so virtio_fs_enqueue_req() uses kmalloc() to allocate a bounce buffer for all fuse args, copies these args into the bounce buffer and passed the physical address of the bounce buffer to virtiofsd. The total length of these fuse args for the passed fuse request is about 10MB, so copy_args_to_argbuf() invokes kmalloc() with a 10MB size parameter and it triggers the warning in __alloc_pages(): if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp)) return NULL; 5) virtio_fs_enqueue_req() will retry the memory allocation in a kworker, but it won't help, because kmalloc() will always return NULL due to the abnormal size and finit_module() will hang forever. A feasible solution is to limit the value of max_read for virtio-fs, so the length passed to kmalloc() will be limited. However it will affect the maximal read size for normal read. And for virtio-fs write initiated from kernel, it has the similar problem but now there is no way to limit fc->max_write in kernel. So instead of limiting both the values of max_read and max_write in kernel, introducing use_pages_for_kvec_io in fuse_conn and setting it as true in virtiofs. When use_pages_for_kvec_io is enabled, fuse will use pages instead of pointer to pass the KVEC_IO data. After switching to pages for KVEC_IO data, these pages will be used for DMA through virtio-fs. If these pages are backed by vmalloc(), {flush|invalidate}_kernel_vmap_range() are necessary to flush or invalidate the cache before the DMA operation. So add two new fields in fuse_args_pages to record the base address of vmalloc area and the condition indicating whether invalidation is needed. Perform the flush in fuse_get_user_pages() for write operations and the invalidation in fuse_release_user_pages() for read operations. It may seem necessary to introduce another field in fuse_conn to indicate that these KVEC_IO pages are used for DMA, However, considering that virtio-fs is currently the only user of use_pages_for_kvec_io, just reuse use_pages_for_kvec_io to indicate that these pages will be used for DMA. Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") Signed-off-by: Hou Tao Tested-by: Jingbo Xu Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 62 +++++++++++++++++++++++++++++++-------------- fs/fuse/fuse_i.h | 6 +++++ fs/fuse/virtio_fs.c | 1 + 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5d240d8f1d5a..1943945c7370 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -645,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; @@ -656,6 +656,9 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, if (ap->args.is_pinned) unpin_user_page(ap->pages[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -754,25 +757,29 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - - fuse_release_user_pages(&ia->ap, io->should_dirty); + size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else if (ia->write.in.size != ia->write.out.size) { - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; + nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -1468,24 +1475,37 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages) + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. + * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - ap->args.in_args[1].value = (void *) user_addr; - else - ap->args.out_args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } } while (nbytes < *nbytesp && ap->num_pages < max_pages) { @@ -1514,6 +1534,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + + ap->args.invalidate_vmap = !write && flush_or_invalidate; ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) @@ -1582,7 +1606,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages); + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; @@ -1596,7 +1620,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); fuse_io_free(ia); } ia = NULL; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7ff00bae4a84..671daa4d07ad 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -309,9 +309,12 @@ struct fuse_args { bool may_block:1; bool is_ext:1; bool is_pinned:1; + bool invalidate_vmap:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); + /* Used for kvec iter backed by vmalloc address */ + void *vmap_base; }; struct fuse_args_pages { @@ -857,6 +860,9 @@ struct fuse_conn { /** Passthrough support for read/write IO */ unsigned int passthrough:1; + /* Use pages instead of pointer for kernel I/O */ + unsigned int use_pages_for_kvec_io:1; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 6404a189e989..d220e28e755f 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1691,6 +1691,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; + fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, From 86b74eb5a11e878151eb429c3810f1dcda090b8c Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Sat, 31 Aug 2024 17:37:50 +0800 Subject: [PATCH 04/30] virtiofs: use GFP_NOFS when enqueuing request through kworker When invoking virtio_fs_enqueue_req() through kworker, both the allocation of the sg array and the bounce buffer still use GFP_ATOMIC. Considering the size of the sg array may be greater than PAGE_SIZE, use GFP_NOFS instead of GFP_ATOMIC to lower the possibility of memory allocation failure and to avoid unnecessarily depleting the atomic reserves. GFP_NOFS is not passed to virtio_fs_enqueue_req() directly, GFP_KERNEL and memalloc_nofs_{save|restore} helpers are used instead. It may seem OK to pass GFP_NOFS to virtio_fs_enqueue_req() as well when queuing the request for the first time, but this is not the case. The reason is that fuse_request_queue_background() may call ->queue_request_and_unlock() while holding fc->bg_lock, which is a spin-lock. Therefore, still use GFP_ATOMIC for it. Signed-off-by: Hou Tao Reviewed-by: Jingbo Xu Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index d220e28e755f..f68527891929 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -97,7 +97,8 @@ struct virtio_fs_req_work { }; static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight); + struct fuse_req *req, bool in_flight, + gfp_t gfp); static const struct constant_table dax_param_enums[] = { {"always", FUSE_DAX_ALWAYS }, @@ -575,6 +576,8 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) /* Dispatch pending requests */ while (1) { + unsigned int flags; + spin_lock(&fsvq->lock); req = list_first_entry_or_null(&fsvq->queued_reqs, struct fuse_req, list); @@ -585,7 +588,9 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - ret = virtio_fs_enqueue_req(fsvq, req, true); + flags = memalloc_nofs_save(); + ret = virtio_fs_enqueue_req(fsvq, req, true, GFP_KERNEL); + memalloc_nofs_restore(flags); if (ret < 0) { if (ret == -ENOSPC) { spin_lock(&fsvq->lock); @@ -686,7 +691,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) } /* Allocate and copy args into req->argbuf */ -static int copy_args_to_argbuf(struct fuse_req *req) +static int copy_args_to_argbuf(struct fuse_req *req, gfp_t gfp) { struct fuse_args *args = req->args; unsigned int offset = 0; @@ -700,7 +705,7 @@ static int copy_args_to_argbuf(struct fuse_req *req) len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + fuse_len_args(num_out, args->out_args); - req->argbuf = kmalloc(len, GFP_ATOMIC); + req->argbuf = kmalloc(len, gfp); if (!req->argbuf) return -ENOMEM; @@ -1366,7 +1371,8 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, /* Add a request to a virtqueue and kick the device */ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, - struct fuse_req *req, bool in_flight) + struct fuse_req *req, bool in_flight, + gfp_t gfp) { /* requests need at least 4 elements */ struct scatterlist *stack_sgs[6]; @@ -1387,8 +1393,8 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, /* Does the sglist fit on the stack? */ total_sgs = sg_count_fuse_req(req); if (total_sgs > ARRAY_SIZE(stack_sgs)) { - sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); - sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), gfp); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), gfp); if (!sgs || !sg) { ret = -ENOMEM; goto out; @@ -1396,7 +1402,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, } /* Use a bounce buffer since stack args cannot be mapped */ - ret = copy_args_to_argbuf(req); + ret = copy_args_to_argbuf(req, gfp); if (ret < 0) goto out; @@ -1490,7 +1496,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) queue_id); fsvq = &fs->vqs[queue_id]; - ret = virtio_fs_enqueue_req(fsvq, req, false); + ret = virtio_fs_enqueue_req(fsvq, req, false, GFP_ATOMIC); if (ret < 0) { if (ret == -ENOSPC) { /* From aaa32429da09a9afa0f54a197733d757334ed169 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:09 -0400 Subject: [PATCH 05/30] fuse: use fuse_range_is_writeback() instead of iterating pages fuse_send_readpages() waits for writeback on each page. This can be replaced by a single call to fuse_range_is_writeback(). [SzM: split this off from "fuse: convert readahead to use folios"] Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1943945c7370..f495ddd9e9ad 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -992,12 +992,17 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int i, max_pages, nr_pages = 0; + pgoff_t first = readahead_index(rac); + pgoff_t last = first + readahead_count(rac) - 1; if (fuse_is_bad(inode)) return; + wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); + max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1024,8 +1029,6 @@ static void fuse_readahead(struct readahead_control *rac) ap = &ia->ap; nr_pages = __readahead_batch(rac, ap->pages, nr_pages); for (i = 0; i < nr_pages; i++) { - fuse_wait_on_page_writeback(inode, - readahead_index(rac) + i); ap->descs[i].length = PAGE_SIZE; } ap->num_pages = nr_pages; From 3eab9d7bc2f4ae7f3f9c9c7852ff61600df79856 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:09 -0400 Subject: [PATCH 06/30] fuse: convert readahead to use folios Currently we're using the __readahead_batch() helper which populates our fuse_args_pages->pages array with pages. Convert this to use the newer folio based pattern which is to call readahead_folio() to get the next folio in the read ahead batch. I've updated the code to use things like folio_size() and to take into account larger folio sizes, but this is purely to make that eventual work easier to do, we currently will not get large folios so this is more future proofing than actual support. [SzM: remove check for readahead_folio() won't return NULL (at least for now) so remove ugly assign in conditional.] Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f495ddd9e9ad..2e7ecbf8e427 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -945,7 +945,6 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, struct folio *folio = page_folio(ap->pages[i]); folio_end_read(folio, !err); - folio_put(folio); } if (ia->ff) fuse_file_put(ia->ff, false); @@ -994,7 +993,7 @@ static void fuse_readahead(struct readahead_control *rac) struct inode *inode = rac->mapping->host; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - unsigned int i, max_pages, nr_pages = 0; + unsigned int max_pages, nr_pages; pgoff_t first = readahead_index(rac); pgoff_t last = first + readahead_count(rac) - 1; @@ -1006,9 +1005,22 @@ static void fuse_readahead(struct readahead_control *rac) max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); - for (;;) { + /* + * This is only accurate the first time through, since readahead_folio() + * doesn't update readahead_count() from the previous folio until the + * next call. Grab nr_pages here so we know how many pages we're going + * to have to process. This means that we will exit here with + * readahead_count() == folio_nr_pages(last_folio), but we will have + * consumed all of the folios, and read_pages() will call + * readahead_folio() again which will clean up the rac. + */ + nr_pages = readahead_count(rac); + + while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; + struct folio *folio; + unsigned cur_pages = min(max_pages, nr_pages); if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1018,21 +1030,19 @@ static void fuse_readahead(struct readahead_control *rac) */ break; - nr_pages = readahead_count(rac) - nr_pages; - if (nr_pages > max_pages) - nr_pages = max_pages; - if (nr_pages == 0) - break; - ia = fuse_io_alloc(NULL, nr_pages); + ia = fuse_io_alloc(NULL, cur_pages); if (!ia) return; ap = &ia->ap; - nr_pages = __readahead_batch(rac, ap->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - ap->descs[i].length = PAGE_SIZE; + + while (ap->num_pages < cur_pages) { + folio = readahead_folio(rac); + ap->pages[ap->num_pages] = &folio->page; + ap->descs[ap->num_pages].length = folio_size(folio); + ap->num_pages++; } - ap->num_pages = nr_pages; fuse_send_readpages(ia, rac->file); + nr_pages -= cur_pages; } } From 785d06afc840922cced0c4e90f99209210dd6bd9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:10 -0400 Subject: [PATCH 07/30] fuse: convert fuse_send_write_pages to use folios Convert this to grab the folio from the fuse_args_pages and use the appropriate folio related functions. Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Joanne Koong Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2e7ecbf8e427..be6d07751ee5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1175,23 +1175,23 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, offset = ap->descs[0].offset; count = ia->write.out.size; for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; + struct folio *folio = page_folio(ap->pages[i]); if (err) { - ClearPageUptodate(page); + folio_clear_uptodate(folio); } else { - if (count >= PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; + if (count >= folio_size(folio) - offset) + count -= folio_size(folio) - offset; else { if (short_write) - ClearPageUptodate(page); + folio_clear_uptodate(folio); count = 0; } offset = 0; } if (ia->write.page_locked && (i == ap->num_pages - 1)) - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return err; From 9bafbe7ae01321eb1345daf0975355f890c975cf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:11 -0400 Subject: [PATCH 08/30] fuse: convert fuse_fill_write_pages to use folios Convert this to grab the folio directly, and update all the helpers to use the folio related functions. Reviewed-by: Joanne Koong Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index be6d07751ee5..3e9a88af917c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1213,7 +1213,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, do { size_t tmp; - struct page *page; + struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; size_t bytes = min_t(size_t, PAGE_SIZE - offset, iov_iter_count(ii)); @@ -1225,25 +1225,27 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (fault_in_iov_iter_readable(ii, bytes)) break; - err = -ENOMEM; - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; + } if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + flush_dcache_folio(folio); - tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); - flush_dcache_page(page); + tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + flush_dcache_folio(folio); if (!tmp) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto again; } err = 0; - ap->pages[ap->num_pages] = page; + ap->pages[ap->num_pages] = &folio->page; ap->descs[ap->num_pages].length = tmp; ap->num_pages++; @@ -1255,10 +1257,10 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, /* If we copied full page, mark it uptodate */ if (tmp == PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); - if (PageUptodate(page)) { - unlock_page(page); + if (folio_test_uptodate(folio)) { + folio_unlock(folio); } else { ia->write.page_locked = true; break; From 184b6eb3645ad9e0e5ea8a1ac9e6a4fd501a4b45 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:12 -0400 Subject: [PATCH 09/30] fuse: convert fuse_page_mkwrite to use folios Convert this to grab the folio directly, and update all the helpers to use the folio related functions. Reviewed-by: Joanne Koong Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3e9a88af917c..add051e4ad30 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -483,6 +483,16 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); } +static void fuse_wait_on_folio_writeback(struct inode *inode, + struct folio *folio) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + pgoff_t last = folio_next_index(folio) - 1; + + wait_event(fi->page_waitq, + !fuse_range_is_writeback(inode, folio_index(folio), last)); +} + /* * Wait for all pending writepages on the inode to finish. * @@ -2558,17 +2568,17 @@ static void fuse_vma_close(struct vm_area_struct *vma) */ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct inode *inode = file_inode(vmf->vma->vm_file); file_update_time(vmf->vma->vm_file); - lock_page(page); - if (page->mapping != inode->i_mapping) { - unlock_page(page); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { + folio_unlock(folio); return VM_FAULT_NOPAGE; } - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); return VM_FAULT_LOCKED; } From e6befec5e901e06dd6c7c456a4e20d2529efb014 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:13 -0400 Subject: [PATCH 10/30] fuse: use kiocb_modified in buffered write path This combines the file_remove_privs() and file_update_time() call into one call. Signed-off-by: Josef Bacik Reviewed-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index add051e4ad30..5ae8784efe02 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1462,11 +1462,7 @@ writethrough: task_io_account_write(count); - err = file_remove_privs(file); - if (err) - goto out; - - err = file_update_time(file); + err = kiocb_modified(iocb); if (err) goto out; From 65fe891d9005a41de2fccfd5ced3c0bf6f1e3bcd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:14 -0400 Subject: [PATCH 11/30] fuse: convert fuse_do_readpage to use folios Now that the buffered write path is using folios, convert fuse_do_readpage() to take a folio instead of a page, update it to use the appropriate folio helpers, and update the callers to pass in the folio directly instead of a page. Signed-off-by: Josef Bacik Reviewed-by: Joanne Koong Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ae8784efe02..efb5e5ccbe33 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -865,12 +865,13 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, } } -static int fuse_do_readpage(struct file *file, struct page *page) +static int fuse_do_readfolio(struct file *file, struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = page_offset(page); + loff_t pos = folio_pos(folio); struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct page *page = &folio->page; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, @@ -882,11 +883,11 @@ static int fuse_do_readpage(struct file *file, struct page *page) u64 attr_ver; /* - * Page writeback can extend beyond the lifetime of the - * page-cache page, so make sure we read a properly synced - * page. + * With the temporary pages that are used to complete writeback, we can + * have writeback that extends beyond the lifetime of the folio. So + * make sure we read a properly synced folio. */ - fuse_wait_on_page_writeback(inode, page->index); + fuse_wait_on_folio_writeback(inode, folio); attr_ver = fuse_get_attr_version(fm->fc); @@ -904,25 +905,24 @@ static int fuse_do_readpage(struct file *file, struct page *page) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } static int fuse_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; int err; err = -EIO; if (fuse_is_bad(inode)) goto out; - err = fuse_do_readpage(file, page); + err = fuse_do_readfolio(file, folio); fuse_invalidate_atime(inode); out: - unlock_page(page); + folio_unlock(folio); return err; } @@ -2475,7 +2475,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, &folio->page); + err = fuse_do_readfolio(file, folio); if (err) goto cleanup; success: From 6930b8dac19ee86282222ea1cb559ee0602e4877 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:15 -0400 Subject: [PATCH 12/30] fuse: convert fuse_writepage_need_send to take a folio fuse_writepage_need_send is called by fuse_writepages_fill() which already has a folio. Change fuse_writepage_need_send() to take a folio instead, add a helper to check if the folio range is under writeback and use this, as well as the appropriate folio helpers in the rest of the function. Update fuse_writepage_need_send() to pass in the folio directly. Reviewed-by: Joanne Koong Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index efb5e5ccbe33..d4b5d46f7bce 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -483,14 +483,19 @@ static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); } +static inline bool fuse_folio_is_writeback(struct inode *inode, + struct folio *folio) +{ + pgoff_t last = folio_next_index(folio) - 1; + return fuse_range_is_writeback(inode, folio_index(folio), last); +} + static void fuse_wait_on_folio_writeback(struct inode *inode, struct folio *folio) { struct fuse_inode *fi = get_fuse_inode(inode); - pgoff_t last = folio_next_index(folio) - 1; - wait_event(fi->page_waitq, - !fuse_range_is_writeback(inode, folio_index(folio), last)); + wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); } /* @@ -2288,7 +2293,7 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, return false; } -static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, +static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { @@ -2300,7 +2305,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, * the pages are faulted with get_user_pages(), and then after the read * completed. */ - if (fuse_page_is_writeback(data->inode, page->index)) + if (fuse_folio_is_writeback(data->inode, folio)) return true; /* Reached max pages */ @@ -2312,7 +2317,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) + if (data->orig_pages[ap->num_pages - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2341,7 +2346,7 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } - if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) { + if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; } From 949d67ac2eff129f1dbe2d6dc69f51f4f64281f2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:16 -0400 Subject: [PATCH 13/30] fuse: use the folio based vmstat helpers In order to make it easier to switch to folios in the fuse_args_pages update the places where we update the vmstat counters for writeback to use the folio related helpers. On the inc side this is easy as we already have the folio, on the dec side we have to page_folio() the pages for now. Reviewed-by: Joanne Koong Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d4b5d46f7bce..ce67af163c9a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1820,12 +1820,12 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct page *page) +static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) { struct backing_dev_info *bdi = inode_to_bdi(inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK_TEMP); + node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } @@ -1837,7 +1837,7 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) int i; for (i = 0; i < ap->num_pages; i++) - fuse_writepage_finish_stat(inode, ap->pages[i]); + fuse_writepage_finish_stat(inode, page_folio(ap->pages[i])); wake_up(&fi->page_waitq); } @@ -1892,7 +1892,8 @@ __acquires(fi->lock) for (aux = wpa->next; aux; aux = next) { next = aux->next; aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, aux->ia.ap.pages[0]); + fuse_writepage_finish_stat(aux->inode, + page_folio(aux->ia.ap.pages[0])); fuse_writepage_free(aux); } @@ -2112,7 +2113,7 @@ static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struc ap->descs[page_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2286,7 +2287,8 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, spin_unlock(&fi->lock); if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, new_ap->pages[0]); + fuse_writepage_finish_stat(new_wpa->inode, + page_folio(new_ap->pages[0])); fuse_writepage_free(new_wpa); } From 71e10dc2f561b1f7cef5152a865813339e96d575 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:17 -0400 Subject: [PATCH 14/30] fuse: convert fuse_retrieve to use folios We're just looking for pages in a mapping, use a folio and the folio lookup function directly instead of using the page helper. Reviewed-by: Joanne Koong Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1f64ae6d7a69..a332cb799967 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1756,15 +1756,15 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; while (num && ap->num_pages < num_pages) { - struct page *page; + struct folio *folio; unsigned int this_num; - page = find_get_page(mapping, index); - if (!page) + folio = filemap_get_folio(mapping, index); + if (IS_ERR(folio)) break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - ap->pages[ap->num_pages] = page; + ap->pages[ap->num_pages] = &folio->page; ap->descs[ap->num_pages].offset = offset; ap->descs[ap->num_pages].length = this_num; ap->num_pages++; From 8807f117be9d15088003e63bfaf0533355371ee8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 30 Sep 2024 09:45:18 -0400 Subject: [PATCH 15/30] fuse: convert fuse_notify_store to use folios This function creates pages in an inode and copies data into them, update the function to use a folio instead of a page, and use the appropriate folio helpers. [SzM: use filemap_grab_folio()] [Hau Tao: The third argument of folio_zero_range() should be the length to be zeroed, not the total length. Fix it by using folio_zero_segment() instead in fuse_notify_store()] Reviewed-by: Joanne Koong Signed-off-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a332cb799967..e25804097ffb 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1654,24 +1654,25 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { + struct folio *folio; struct page *page; unsigned int this_num; - err = -ENOMEM; - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) + folio = filemap_grab_folio(mapping, index); + err = PTR_ERR(folio); + if (IS_ERR(folio)) goto out_iput; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + page = &folio->page; + this_num = min_t(unsigned, num, folio_size(folio) - offset); err = fuse_copy_page(cs, &page, offset, this_num, 0); - if (!PageUptodate(page) && !err && offset == 0 && - (this_num == PAGE_SIZE || file_size == end)) { - zero_user_segment(page, this_num, PAGE_SIZE); - SetPageUptodate(page); + if (!folio_test_uptodate(folio) && !err && offset == 0 && + (this_num == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, this_num, folio_size(folio)); + folio_mark_uptodate(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (err) goto out_iput; From a669c2df36db5fa7a2674ec5ae10548760702f99 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:17:57 -0700 Subject: [PATCH 16/30] fuse: support folios in struct fuse_args_pages and fuse_copy_pages() This adds support in struct fuse_args_pages and fuse_copy_pages() for using folios instead of pages for transferring data. Both folios and pages must be supported right now in struct fuse_args_pages and fuse_copy_pages() until all request types have been converted to use folios. Once all have been converted, then struct fuse_args_pages and fuse_copy_pages() will only support folios. Right now in fuse, all folios are one page (large folios are not yet supported). As such, copying folio->page is sufficient for copying the entire folio in fuse_copy_pages(). No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 40 ++++++++++++++++++++++++++++++++-------- fs/fuse/fuse_i.h | 22 +++++++++++++++++++--- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e25804097ffb..70da1184a3f6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1028,17 +1028,41 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, struct fuse_req *req = cs->req; struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); + if (ap->uses_folios) { + for (i = 0; i < ap->num_folios && (nbytes || zeroing); i++) { + int err; + unsigned int offset = ap->folio_descs[i].offset; + unsigned int count = min(nbytes, ap->folio_descs[i].length); + struct page *orig, *pagep; - for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { - int err; - unsigned int offset = ap->descs[i].offset; - unsigned int count = min(nbytes, ap->descs[i].length); + orig = pagep = &ap->folios[i]->page; - err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); - if (err) - return err; + err = fuse_copy_page(cs, &pagep, offset, count, zeroing); + if (err) + return err; - nbytes -= count; + nbytes -= count; + + /* + * fuse_copy_page may have moved a page from a pipe + * instead of copying into our given page, so update + * the folios if it was replaced. + */ + if (pagep != orig) + ap->folios[i] = page_folio(pagep); + } + } else { + for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { + int err; + unsigned int offset = ap->descs[i].offset; + unsigned int count = min(nbytes, ap->descs[i].length); + + err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); + if (err) + return err; + + nbytes -= count; + } } return 0; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 671daa4d07ad..24a3da8400d1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -291,6 +291,12 @@ struct fuse_page_desc { unsigned int offset; }; +/** FUSE folio descriptor */ +struct fuse_folio_desc { + unsigned int length; + unsigned int offset; +}; + struct fuse_args { uint64_t nodeid; uint32_t opcode; @@ -319,9 +325,19 @@ struct fuse_args { struct fuse_args_pages { struct fuse_args args; - struct page **pages; - struct fuse_page_desc *descs; - unsigned int num_pages; + union { + struct { + struct page **pages; + struct fuse_page_desc *descs; + unsigned int num_pages; + }; + struct { + struct folio **folios; + struct fuse_folio_desc *folio_descs; + unsigned int num_folios; + }; + }; + bool uses_folios; }; struct fuse_release_args { From 29279e1d4284a29cdd4af11e9a19800b8fda2962 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:17:58 -0700 Subject: [PATCH 17/30] fuse: add support in virtio for requests using folios Until all requests have been converted to use folios instead of pages, virtio will need to support both types. Once all requests have been converted, then virtio will support just folios. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 87 +++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 31 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index f68527891929..9f0d98cc20d3 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -766,6 +766,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct fuse_args_pages *ap; unsigned int len, i, thislen; struct page *page; + struct folio *folio; /* * TODO verify that server properly follows FUSE protocol @@ -777,15 +778,29 @@ static void virtio_fs_request_complete(struct fuse_req *req, if (args->out_pages && args->page_zeroing) { len = args->out_args[args->out_numargs - 1].size; ap = container_of(args, typeof(*ap), args); - for (i = 0; i < ap->num_pages; i++) { - thislen = ap->descs[i].length; - if (len < thislen) { - WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); - len = 0; - } else { - len -= thislen; + if (ap->uses_folios) { + for (i = 0; i < ap->num_folios; i++) { + thislen = ap->folio_descs[i].length; + if (len < thislen) { + WARN_ON(ap->folio_descs[i].offset); + folio = ap->folios[i]; + folio_zero_segment(folio, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } else { + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } } } } @@ -1272,16 +1287,22 @@ static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *r } /* Count number of scatter-gather elements required */ -static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, - unsigned int num_pages, - unsigned int total_len) +static unsigned int sg_count_fuse_pages(struct fuse_args_pages *ap, + unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { - this_len = min(page_descs[i].length, total_len); - total_len -= this_len; + if (ap->uses_folios) { + for (i = 0; i < ap->num_folios && total_len; i++) { + this_len = min(ap->folio_descs[i].length, total_len); + total_len -= this_len; + } + } else { + for (i = 0; i < ap->num_pages && total_len; i++) { + this_len = min(ap->descs[i].length, total_len); + total_len -= this_len; + } } return i; @@ -1299,8 +1320,7 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->in_pages) { size = args->in_args[args->in_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_pages(ap, size); } if (!test_bit(FR_ISREPLY, &req->flags)) @@ -1313,28 +1333,35 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_pages) { size = args->out_args[args->out_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, - size); + total_sgs += sg_count_fuse_pages(ap, size); } return total_sgs; } -/* Add pages to scatter-gather list and return number of elements used */ +/* Add pages/folios to scatter-gather list and return number of elements used */ static unsigned int sg_init_fuse_pages(struct scatterlist *sg, - struct page **pages, - struct fuse_page_desc *page_descs, - unsigned int num_pages, + struct fuse_args_pages *ap, unsigned int total_len) { unsigned int i; unsigned int this_len; - for (i = 0; i < num_pages && total_len; i++) { - sg_init_table(&sg[i], 1); - this_len = min(page_descs[i].length, total_len); - sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); - total_len -= this_len; + if (ap->uses_folios) { + for (i = 0; i < ap->num_folios && total_len; i++) { + sg_init_table(&sg[i], 1); + this_len = min(ap->folio_descs[i].length, total_len); + sg_set_folio(&sg[i], ap->folios[i], this_len, + ap->folio_descs[i].offset); + total_len -= this_len; + } + } else { + for (i = 0; i < ap->num_pages && total_len; i++) { + sg_init_table(&sg[i], 1); + this_len = min(ap->descs[i].length, total_len); + sg_set_page(&sg[i], ap->pages[i], this_len, ap->descs[i].offset); + total_len -= this_len; + } } return i; @@ -1358,9 +1385,7 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, sg_init_one(&sg[total_sgs++], argbuf, len); if (argpages) - total_sgs += sg_init_fuse_pages(&sg[total_sgs], - ap->pages, ap->descs, - ap->num_pages, + total_sgs += sg_init_fuse_pages(&sg[total_sgs], ap, args[numargs - 1].size); if (len_used) From ee80369a8aa850a992e93127bd16023fe1425010 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:17:59 -0700 Subject: [PATCH 18/30] fuse: convert cuse to use folios Convert cuse requests to use a folio instead of a page. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 0b2da7b7e2ad..eed78e303139 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -303,8 +303,8 @@ struct cuse_init_args { struct fuse_args_pages ap; struct cuse_init_in in; struct cuse_init_out out; - struct page *page; - struct fuse_page_desc desc; + struct folio *folio; + struct fuse_folio_desc desc; }; /** @@ -326,7 +326,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = &ia->out; - struct page *page = ap->pages[0]; + struct folio *folio = ap->folios[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; @@ -343,7 +343,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, + rc = cuse_parse_devinfo(folio_address(folio), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -411,7 +411,7 @@ static void cuse_process_init_reply(struct fuse_mount *fm, kobject_uevent(&dev->kobj, KOBJ_ADD); out: kfree(ia); - __free_page(page); + folio_put(folio); return; err_cdev: @@ -429,7 +429,7 @@ err: static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct page *page; + struct folio *folio; struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -437,13 +437,14 @@ static int cuse_send_init(struct cuse_conn *cc) BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); rc = -ENOMEM; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) goto err; ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (!ia) - goto err_free_page; + goto err_free_folio; ap = &ia->ap; ia->in.major = FUSE_KERNEL_VERSION; @@ -459,18 +460,19 @@ static int cuse_send_init(struct cuse_conn *cc) ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; ap->args.out_argvar = true; ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &ia->page; - ap->descs = &ia->desc; - ia->page = page; + ap->uses_folios = true; + ap->num_folios = 1; + ap->folios = &ia->folio; + ap->folio_descs = &ia->desc; + ia->folio = folio; ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); -err_free_page: - __free_page(page); +err_free_folio: + folio_put(folio); } err: return rc; From c1e4862b135954dd59596fbd454321ca4109b67e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:00 -0700 Subject: [PATCH 19/30] fuse: convert readlink to use folios Convert readlink requests to use a folio instead of a page. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 54104dd48af7..a08c532068d0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1585,14 +1585,15 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct page *page) +static int fuse_readlink_page(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { - .num_pages = 1, - .pages = &page, - .descs = &desc, + .uses_folios = true, + .num_folios = 1, + .folios = &folio, + .folio_descs = &desc, }; char *link; ssize_t res; @@ -1614,7 +1615,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) if (WARN_ON(res >= PAGE_SIZE)) return -EIO; - link = page_address(page); + link = folio_address(folio); link[res] = '\0'; return 0; @@ -1624,7 +1625,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); - struct page *page; + struct folio *folio; int err; err = -EIO; @@ -1638,20 +1639,20 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!dentry) goto out_err; - page = alloc_page(GFP_KERNEL); + folio = folio_alloc(GFP_KERNEL, 0); err = -ENOMEM; - if (!page) + if (!folio) goto out_err; - err = fuse_readlink_page(inode, page); + err = fuse_readlink_page(inode, folio); if (err) { - __free_page(page); + folio_put(folio); goto out_err; } - set_delayed_call(callback, page_put_link, page); + set_delayed_call(callback, page_put_link, &folio->page); - return page_address(page); + return folio_address(folio); out_err: return ERR_PTR(err); @@ -2231,7 +2232,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, &folio->page); + int err = fuse_readlink_page(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); From 02b78c7a7a0c72aee6f600a167e6adee9417ac0e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:01 -0700 Subject: [PATCH 20/30] fuse: convert readdir to use folios Convert readdir requests to use a folio instead of a page. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/readdir.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 0377b6dc24c8..fd0eff1b9f2d 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -331,24 +331,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct page *page; + struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; - struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; u64 attr_version = 0; bool locked; - page = alloc_page(GFP_KERNEL); - if (!page) + folio = folio_alloc(GFP_KERNEL, 0); + if (!folio) return -ENOMEM; plus = fuse_use_readdirplus(inode, ctx); ap->args.out_pages = true; - ap->num_pages = 1; - ap->pages = &page; - ap->descs = &desc; + ap->uses_folios = true; + ap->num_folios = 1; + ap->folios = &folio; + ap->folio_descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, @@ -367,15 +368,15 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(page_address(page), res, + res = parse_dirplusfile(folio_address(folio), res, file, ctx, attr_version); } else { - res = parse_dirfile(page_address(page), res, file, + res = parse_dirfile(folio_address(folio), res, file, ctx); } } - __free_page(page); + folio_put(folio); fuse_invalidate_atime(inode); return res; } From 51b025301824f16d51243aa505709d678f2e059e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:02 -0700 Subject: [PATCH 21/30] fuse: convert reads to use folios Convert read requests to use folios instead of pages. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 67 ++++++++++++++++++++++++++++++++---------------- fs/fuse/fuse_i.h | 12 +++++++++ 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ce67af163c9a..ece1c0319e35 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -760,12 +760,37 @@ static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, return ia; } +static struct fuse_io_args *fuse_io_folios_alloc(struct fuse_io_priv *io, + unsigned int nfolios) +{ + struct fuse_io_args *ia; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (ia) { + ia->io = io; + ia->ap.uses_folios = true; + ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, + &ia->ap.folio_descs); + if (!ia->ap.folios) { + kfree(ia); + ia = NULL; + } + } + return ia; +} + static void fuse_io_free(struct fuse_io_args *ia) { kfree(ia->ap.pages); kfree(ia); } +static void fuse_io_folios_free(struct fuse_io_args *ia) +{ + kfree(ia->ap.folios); + kfree(ia); +} + static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, int err) { @@ -865,7 +890,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, * reached the client fs yet. So the hole is not present there. */ if (!fc->writeback_cache) { - loff_t pos = page_offset(ap->pages[0]) + num_read; + loff_t pos = folio_pos(ap->folios[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } @@ -875,14 +900,14 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = folio_pos(folio); - struct fuse_page_desc desc = { .length = PAGE_SIZE }; - struct page *page = &folio->page; + struct fuse_folio_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, - .ap.num_pages = 1, - .ap.pages = &page, - .ap.descs = &desc, + .ap.uses_folios = true, + .ap.num_folios = 1, + .ap.folios = &folio, + .ap.folio_descs = &desc, }; ssize_t res; u64 attr_ver; @@ -941,8 +966,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < ap->num_pages; i++) - mapping = ap->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_folios; i++) + mapping = ap->folios[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -956,15 +981,12 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_pages; i++) { - struct folio *folio = page_folio(ap->pages[i]); - - folio_end_read(folio, !err); - } + for (i = 0; i < ap->num_folios; i++) + folio_end_read(ap->folios[i], !err); if (ia->ff) fuse_file_put(ia->ff, false); - fuse_io_free(ia); + fuse_io_folios_free(ia); } static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) @@ -972,8 +994,9 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; - loff_t pos = page_offset(ap->pages[0]); - size_t count = ap->num_pages << PAGE_SHIFT; + loff_t pos = folio_pos(ap->folios[0]); + /* Currently, all folios in FUSE are one page */ + size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -984,7 +1007,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { count--; - ap->descs[ap->num_pages - 1].length--; + ap->folio_descs[ap->num_folios - 1].length--; } WARN_ON((loff_t) (pos + count) < 0); @@ -1045,16 +1068,16 @@ static void fuse_readahead(struct readahead_control *rac) */ break; - ia = fuse_io_alloc(NULL, cur_pages); + ia = fuse_io_folios_alloc(NULL, cur_pages); if (!ia) return; ap = &ia->ap; - while (ap->num_pages < cur_pages) { + while (ap->num_folios < cur_pages) { folio = readahead_folio(rac); - ap->pages[ap->num_pages] = &folio->page; - ap->descs[ap->num_pages].length = folio_size(folio); - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->folio_descs[ap->num_folios].length = folio_size(folio); + ap->num_folios++; } fuse_send_readpages(ia, rac->file); nr_pages -= cur_pages; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24a3da8400d1..b6877064c071 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1029,6 +1029,18 @@ static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, return pages; } +static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags, + struct fuse_folio_desc **desc) +{ + struct folio **folios; + + folios = kzalloc(nfolios * (sizeof(struct folio *) + + sizeof(struct fuse_folio_desc)), flags); + *desc = (void *) (folios + nfolios); + + return folios; +} + static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, unsigned int index, unsigned int nr_pages) From f2ef459bab7326f4800ec2098cf073fbda2185af Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:03 -0700 Subject: [PATCH 22/30] fuse: convert writes (non-writeback) to use folios Convert non-writeback write requests to use folios instead of pages. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 33 ++++++++++++++++++--------------- fs/fuse/fuse_i.h | 2 +- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ece1c0319e35..b3d5b7b5da52 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1197,8 +1197,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, bool short_write; int err; - for (i = 0; i < ap->num_pages; i++) - fuse_wait_on_page_writeback(inode, ap->pages[i]->index); + for (i = 0; i < ap->num_folios; i++) + fuse_wait_on_folio_writeback(inode, ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1210,10 +1210,10 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, err = -EIO; short_write = ia->write.out.size < count; - offset = ap->descs[0].offset; + offset = ap->folio_descs[0].offset; count = ia->write.out.size; - for (i = 0; i < ap->num_pages; i++) { - struct folio *folio = page_folio(ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) { + struct folio *folio = ap->folios[i]; if (err) { folio_clear_uptodate(folio); @@ -1227,7 +1227,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, } offset = 0; } - if (ia->write.page_locked && (i == ap->num_pages - 1)) + if (ia->write.folio_locked && (i == ap->num_folios - 1)) folio_unlock(folio); folio_put(folio); } @@ -1243,11 +1243,12 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); + unsigned int nr_pages = 0; size_t count = 0; int err; ap->args.in_pages = true; - ap->descs[0].offset = offset; + ap->folio_descs[0].offset = offset; do { size_t tmp; @@ -1283,9 +1284,10 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, } err = 0; - ap->pages[ap->num_pages] = &folio->page; - ap->descs[ap->num_pages].length = tmp; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->folio_descs[ap->num_folios].length = tmp; + ap->num_folios++; + nr_pages++; count += tmp; pos += tmp; @@ -1300,13 +1302,13 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (folio_test_uptodate(folio)) { folio_unlock(folio); } else { - ia->write.page_locked = true; + ia->write.folio_locked = true; break; } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - ap->num_pages < max_pages && offset == 0); + nr_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1340,8 +1342,9 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); - if (!ap->pages) { + ap->uses_folios = true; + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->folio_descs); + if (!ap->folios) { err = -ENOMEM; break; } @@ -1363,7 +1366,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) err = -EIO; } } - kfree(ap->pages); + kfree(ap->folios); } while (!err && iov_iter_count(ii)); fuse_write_update_attr(inode, pos, res); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index b6877064c071..201b08562b6b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1096,7 +1096,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; - bool page_locked; + bool folio_locked; } write; }; struct fuse_args_pages ap; From ac1cf6e3bbe3dd371bd61a423437c1f67bba8b2a Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:04 -0700 Subject: [PATCH 23/30] fuse: convert ioctls to use folios Convert ioctl requests to use folios instead of pages. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 10 ++++++++++ fs/fuse/ioctl.c | 32 ++++++++++++++++---------------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 201b08562b6b..c1c7def8ee4b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1051,6 +1051,16 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, descs[i].length = PAGE_SIZE - descs[i].offset; } +static inline void fuse_folio_descs_length_init(struct fuse_folio_desc *descs, + unsigned int index, + unsigned int nr_folios) +{ + int i; + + for (i = index; i < index + nr_folios; i++) + descs[i].length = PAGE_SIZE - descs[i].offset; +} + static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) { /* Need RCU protection to prevent use after free after the decrement */ diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index a6c8ee551635..1c77d8a27950 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -251,12 +251,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); + ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.folio_descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!ap.pages || !iov_page) + if (!ap.folios || !iov_page) goto out; - fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); + fuse_folio_descs_length_init(ap.folio_descs, 0, fm->fc->max_pages); /* * If restricted, initialize IO parameters as encoded in @cmd. @@ -306,14 +306,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fm->fc->max_pages) goto out; - while (ap.num_pages < max_pages) { - ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!ap.pages[ap.num_pages]) + ap.uses_folios = true; + while (ap.num_folios < max_pages) { + ap.folios[ap.num_folios] = folio_alloc(GFP_KERNEL | __GFP_HIGHMEM, 0); + if (!ap.folios[ap.num_folios]) goto out; - ap.num_pages++; + ap.num_folios++; } - /* okay, let's send it to the client */ ap.args.opcode = FUSE_IOCTL; ap.args.nodeid = ff->nodeid; @@ -327,8 +327,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, ITER_SOURCE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_from_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } @@ -366,7 +366,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_local_page(ap.pages[0]); + vaddr = kmap_local_folio(ap.folios[0], 0); err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -394,17 +394,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, ITER_DEST, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { - c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_folios); i++) { + c = copy_folio_to_iter(ap.folios[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: free_page((unsigned long) iov_page); - while (ap.num_pages) - __free_page(ap.pages[--ap.num_pages]); - kfree(ap.pages); + while (ap.num_folios) + folio_put(ap.folios[--ap.num_folios]); + kfree(ap.folios); return err ? err : outarg.result; } From 448895df0366041366a84861350ce471446bf560 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:05 -0700 Subject: [PATCH 24/30] fuse: convert retrieves to use folios Convert retrieve requests to use folios instead of pages. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 70da1184a3f6..563a0bfa0e95 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1728,7 +1728,7 @@ static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_retrieve_args *ra = container_of(args, typeof(*ra), ap.args); - release_pages(ra->ap.pages, ra->ap.num_pages); + release_pages(ra->ap.folios, ra->ap.num_folios); kfree(ra); } @@ -1742,7 +1742,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages; + unsigned int num_pages, cur_pages = 0; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1761,15 +1761,16 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); + args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->folio_descs[0])); ra = kzalloc(args_size, GFP_KERNEL); if (!ra) return -ENOMEM; ap = &ra->ap; - ap->pages = (void *) (ra + 1); - ap->descs = (void *) (ap->pages + num_pages); + ap->folios = (void *) (ra + 1); + ap->folio_descs = (void *) (ap->folios + num_pages); + ap->uses_folios = true; args = &ap->args; args->nodeid = outarg->nodeid; @@ -1780,7 +1781,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num && ap->num_pages < num_pages) { + while (num && cur_pages < num_pages) { struct folio *folio; unsigned int this_num; @@ -1789,10 +1790,11 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - ap->pages[ap->num_pages] = &folio->page; - ap->descs[ap->num_pages].offset = offset; - ap->descs[ap->num_pages].length = this_num; - ap->num_pages++; + ap->folios[ap->num_folios] = folio; + ap->folio_descs[ap->num_folios].offset = offset; + ap->folio_descs[ap->num_folios].length = this_num; + ap->num_folios++; + cur_pages++; offset = 0; num -= this_num; From cbe9c115b7441dd790540436118eee4626ec9979 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:06 -0700 Subject: [PATCH 25/30] fuse: convert writebacks to use folios Convert writeback requests to use folios instead of pages. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 126 +++++++++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 62 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b3d5b7b5da52..99af3c39e529 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -436,7 +436,7 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); WARN_ON(get_fuse_inode(wpa->inode) != fi); curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_pages) + if (idx_from >= curr_index + wpa->ia.ap.num_folios) n = n->rb_right; else if (idx_to < curr_index) n = n->rb_left; @@ -1837,12 +1837,12 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_pages; i++) - __free_page(ap->pages[i]); + for (i = 0; i < ap->num_folios; i++) + folio_put(ap->folios[i]); fuse_file_put(wpa->ia.ff, false); - kfree(ap->pages); + kfree(ap->folios); kfree(wpa); } @@ -1862,8 +1862,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_inode *fi = get_fuse_inode(inode); int i; - for (i = 0; i < ap->num_pages; i++) - fuse_writepage_finish_stat(inode, page_folio(ap->pages[i])); + for (i = 0; i < ap->num_folios; i++) + fuse_writepage_finish_stat(inode, ap->folios[i]); wake_up(&fi->page_waitq); } @@ -1878,7 +1878,8 @@ __acquires(fi->lock) struct fuse_inode *fi = get_fuse_inode(wpa->inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_args *args = &wpa->ia.ap.args; - __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + /* Currently, all folios in FUSE are one page */ + __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; int err; fi->writectr++; @@ -1919,7 +1920,7 @@ __acquires(fi->lock) next = aux->next; aux->next = NULL; fuse_writepage_finish_stat(aux->inode, - page_folio(aux->ia.ap.pages[0])); + aux->ia.ap.folios[0]); fuse_writepage_free(aux); } @@ -1954,11 +1955,11 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, struct fuse_writepage_args *wpa) { pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; + pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; - WARN_ON(!wpa->ia.ap.num_pages); + WARN_ON(!wpa->ia.ap.num_folios); while (*p) { struct fuse_writepage_args *curr; pgoff_t curr_index; @@ -1969,7 +1970,7 @@ static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, WARN_ON(curr->inode != wpa->inode); curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + curr->ia.ap.num_pages) + if (idx_from >= curr_index + curr->ia.ap.num_folios) p = &(*p)->rb_right; else if (idx_to < curr_index) p = &(*p)->rb_left; @@ -2101,9 +2102,10 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) wpa = kzalloc(sizeof(*wpa), GFP_NOFS); if (wpa) { ap = &wpa->ia.ap; - ap->num_pages = 0; - ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); - if (!ap->pages) { + ap->num_folios = 0; + ap->uses_folios = true; + ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->folio_descs); + if (!ap->folios) { kfree(wpa); wpa = NULL; } @@ -2127,16 +2129,16 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t page_index) + struct folio *tmp_folio, uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; folio_copy(tmp_folio, folio); - ap->pages[page_index] = &tmp_folio->page; - ap->descs[page_index].offset = 0; - ap->descs[page_index].length = PAGE_SIZE; + ap->folios[folio_index] = tmp_folio; + ap->folio_descs[folio_index].offset = 0; + ap->folio_descs[folio_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); @@ -2193,7 +2195,7 @@ static int fuse_writepage_locked(struct folio *folio) goto err_writepage_args; ap = &wpa->ia.ap; - ap->num_pages = 1; + ap->num_folios = 1; folio_start_writeback(folio); fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); @@ -2221,32 +2223,32 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct page **orig_pages; - unsigned int max_pages; + struct folio **orig_folios; + unsigned int max_folios; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) { struct fuse_args_pages *ap = &data->wpa->ia.ap; struct fuse_conn *fc = get_fuse_conn(data->inode); - struct page **pages; - struct fuse_page_desc *descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, data->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int nfolios = min_t(unsigned int, + max_t(unsigned int, data->max_folios * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), fc->max_pages); - WARN_ON(npages <= data->max_pages); + WARN_ON(nfolios <= data->max_folios); - pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); - if (!pages) + folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); + if (!folios) return false; - memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); - memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); - kfree(ap->pages); - ap->pages = pages; - ap->descs = descs; - data->max_pages = npages; + memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); + memcpy(descs, ap->folio_descs, sizeof(struct fuse_folio_desc) * ap->num_folios); + kfree(ap->folios); + ap->folios = folios; + ap->folio_descs = descs; + data->max_folios = nfolios; return true; } @@ -2256,7 +2258,7 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = wpa->ia.ap.num_pages; + int num_folios = wpa->ia.ap.num_folios; int i; spin_lock(&fi->lock); @@ -2264,8 +2266,8 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - for (i = 0; i < num_pages; i++) - end_page_writeback(data->orig_pages[i]); + for (i = 0; i < num_folios; i++) + folio_end_writeback(data->orig_folios[i]); } /* @@ -2276,15 +2278,15 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * swapping the new temp page with the old one. */ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct page *page) + struct folio *folio) { struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); struct fuse_writepage_args *tmp; struct fuse_writepage_args *old_wpa; struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_ap->num_pages != 0); - new_ap->num_pages = 1; + WARN_ON(new_ap->num_folios != 0); + new_ap->num_folios = 1; spin_lock(&fi->lock); old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); @@ -2298,9 +2300,9 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, WARN_ON(tmp->inode != new_wpa->inode); curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == page->index) { - WARN_ON(tmp->ia.ap.num_pages != 1); - swap(tmp->ia.ap.pages[0], new_ap->pages[0]); + if (curr_index == folio->index) { + WARN_ON(tmp->ia.ap.num_folios != 1); + swap(tmp->ia.ap.folios[0], new_ap->folios[0]); break; } } @@ -2314,7 +2316,7 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, if (tmp) { fuse_writepage_finish_stat(new_wpa->inode, - page_folio(new_ap->pages[0])); + folio); fuse_writepage_free(new_wpa); } @@ -2325,7 +2327,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { - WARN_ON(!ap->num_pages); + WARN_ON(!ap->num_folios); /* * Being under writeback is unlikely but possible. For example direct @@ -2337,19 +2339,19 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, return true; /* Reached max pages */ - if (ap->num_pages == fc->max_pages) + if (ap->num_folios == fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) + if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) return true; /* Discontinuity */ - if (data->orig_pages[ap->num_pages - 1]->index + 1 != folio_index(folio)) + if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) return true; /* Need to grow the pages array? If so, did the expansion fail? */ - if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) + if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data)) return true; return false; @@ -2393,7 +2395,7 @@ static int fuse_writepages_fill(struct folio *folio, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_pages. + * request to the fi->writepages list and increment ap->num_folios. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ @@ -2405,13 +2407,13 @@ static int fuse_writepages_fill(struct folio *folio, goto out_unlock; } fuse_file_get(wpa->ia.ff); - data->max_pages = 1; + data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_pages); - data->orig_pages[ap->num_pages] = &folio->page; + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); + data->orig_folios[ap->num_folios] = folio; err = 0; if (data->wpa) { @@ -2420,9 +2422,9 @@ static int fuse_writepages_fill(struct folio *folio, * fuse_page_is_writeback(). */ spin_lock(&fi->lock); - ap->num_pages++; + ap->num_folios++; spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, &folio->page)) { + } else if (fuse_writepage_add(wpa, folio)) { data->wpa = wpa; } else { folio_end_writeback(folio); @@ -2454,21 +2456,21 @@ static int fuse_writepages(struct address_space *mapping, data.ff = NULL; err = -ENOMEM; - data.orig_pages = kcalloc(fc->max_pages, - sizeof(struct page *), - GFP_NOFS); - if (!data.orig_pages) + data.orig_folios = kcalloc(fc->max_pages, + sizeof(struct folio *), + GFP_NOFS); + if (!data.orig_folios) goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { - WARN_ON(!data.wpa->ia.ap.num_pages); + WARN_ON(!data.wpa->ia.ap.num_folios); fuse_writepages_send(&data); } if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_pages); + kfree(data.orig_folios); out: return err; } From 7fce207af5ec074a9a50e90eb866b17ca4a90f06 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:07 -0700 Subject: [PATCH 26/30] mm/writeback: add folio_mark_dirty_lock() Add a new convenience helper folio_mark_dirty_lock() that grabs the folio lock before calling folio_mark_dirty(). Refactor set_page_dirty_lock() to directly use folio_mark_dirty_lock(). Signed-off-by: Joanne Koong Signed-off-by: Miklos Szeredi --- include/linux/mm.h | 1 + mm/folio-compat.c | 6 ++++++ mm/page-writeback.c | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ecf63d2b0582..446d7096c48f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2539,6 +2539,7 @@ struct kvec; struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); +bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 80746182e9e8..1d1832e2a599 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -52,6 +52,12 @@ bool set_page_dirty(struct page *page) } EXPORT_SYMBOL(set_page_dirty); +int set_page_dirty_lock(struct page *page) +{ + return folio_mark_dirty_lock(page_folio(page)); +} +EXPORT_SYMBOL(set_page_dirty_lock); + bool clear_page_dirty_for_io(struct page *page) { return folio_clear_dirty_for_io(page_folio(page)); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fcd4c1439cb9..db00a66d8b84 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2914,25 +2914,25 @@ bool folio_mark_dirty(struct folio *folio) EXPORT_SYMBOL(folio_mark_dirty); /* - * set_page_dirty() is racy if the caller has no reference against - * page->mapping->host, and if the page is unlocked. This is because another - * CPU could truncate the page off the mapping and then free the mapping. + * folio_mark_dirty() is racy if the caller has no reference against + * folio->mapping->host, and if the folio is unlocked. This is because another + * CPU could truncate the folio off the mapping and then free the mapping. * - * Usually, the page _is_ locked, or the caller is a user-space process which + * Usually, the folio _is_ locked, or the caller is a user-space process which * holds a reference on the inode by having an open file. * - * In other cases, the page should be locked before running set_page_dirty(). + * In other cases, the folio should be locked before running folio_mark_dirty(). */ -int set_page_dirty_lock(struct page *page) +bool folio_mark_dirty_lock(struct folio *folio) { - int ret; + bool ret; - lock_page(page); - ret = set_page_dirty(page); - unlock_page(page); + folio_lock(folio); + ret = folio_mark_dirty(folio); + folio_unlock(folio); return ret; } -EXPORT_SYMBOL(set_page_dirty_lock); +EXPORT_SYMBOL(folio_mark_dirty_lock); /* * This cancels just the dirty bit on the kernel page itself, it does NOT From 3b97c3652d9128ab7f8c9b8adec6108611fdb153 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:08 -0700 Subject: [PATCH 27/30] fuse: convert direct io to use folios Convert direct io requests to use folios instead of pages. No functional changes. Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 82 +++++++++++++++++++++--------------------------- fs/fuse/fuse_i.h | 22 ------------- 2 files changed, 36 insertions(+), 68 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 99af3c39e529..14af8c41fc83 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -665,11 +665,11 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, { unsigned int i; - for (i = 0; i < ap->num_pages; i++) { + for (i = 0; i < ap->num_folios; i++) { if (should_dirty) - set_page_dirty_lock(ap->pages[i]); + folio_mark_dirty_lock(ap->folios[i]); if (ap->args.is_pinned) - unpin_user_page(ap->pages[i]); + unpin_folio(ap->folios[i]); } if (nres > 0 && ap->args.invalidate_vmap) @@ -742,24 +742,6 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) kref_put(&io->refcnt, fuse_io_release); } -static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, - unsigned int npages) -{ - struct fuse_io_args *ia; - - ia = kzalloc(sizeof(*ia), GFP_KERNEL); - if (ia) { - ia->io = io; - ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, - &ia->ap.descs); - if (!ia->ap.pages) { - kfree(ia); - ia = NULL; - } - } - return ia; -} - static struct fuse_io_args *fuse_io_folios_alloc(struct fuse_io_priv *io, unsigned int nfolios) { @@ -779,12 +761,6 @@ static struct fuse_io_args *fuse_io_folios_alloc(struct fuse_io_priv *io, return ia; } -static void fuse_io_free(struct fuse_io_args *ia) -{ - kfree(ia->ap.pages); - kfree(ia); -} - static void fuse_io_folios_free(struct fuse_io_args *ia) { kfree(ia->ap.folios); @@ -821,7 +797,7 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); fuse_aio_complete(io, err, pos); - fuse_io_free(ia); + fuse_io_folios_free(ia); } static ssize_t fuse_async_req_send(struct fuse_mount *fm, @@ -1531,6 +1507,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, bool use_pages_for_kvec_io) { bool flush_or_invalidate = false; + unsigned int nr_pages = 0; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; @@ -1560,15 +1537,23 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, } } - while (nbytes < *nbytesp && ap->num_pages < max_pages) { - unsigned npages; - size_t start; - struct page **pt_pages; + /* + * Until there is support for iov_iter_extract_folios(), we have to + * manually extract pages using iov_iter_extract_pages() and then + * copy that to a folios array. + */ + struct page **pages = kzalloc(max_pages * sizeof(struct page *), + GFP_KERNEL); + if (!pages) + return -ENOMEM; - pt_pages = &ap->pages[ap->num_pages]; - ret = iov_iter_extract_pages(ii, &pt_pages, + while (nbytes < *nbytesp && nr_pages < max_pages) { + unsigned nfolios, i; + size_t start; + + ret = iov_iter_extract_pages(ii, &pages, *nbytesp - nbytes, - max_pages - ap->num_pages, + max_pages - nr_pages, 0, &start); if (ret < 0) break; @@ -1576,15 +1561,20 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; ret += start; - npages = DIV_ROUND_UP(ret, PAGE_SIZE); + /* Currently, all folios in FUSE are one page */ + nfolios = DIV_ROUND_UP(ret, PAGE_SIZE); - ap->descs[ap->num_pages].offset = start; - fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); + ap->folio_descs[ap->num_folios].offset = start; + fuse_folio_descs_length_init(ap->folio_descs, ap->num_folios, nfolios); + for (i = 0; i < nfolios; i++) + ap->folios[i + ap->num_folios] = page_folio(pages[i]); - ap->num_pages += npages; - ap->descs[ap->num_pages - 1].length -= + ap->num_folios += nfolios; + ap->folio_descs[ap->num_folios - 1].length -= (PAGE_SIZE - ret) & (PAGE_SIZE - 1); + nr_pages += nfolios; } + kfree(pages); if (write && flush_or_invalidate) flush_kernel_vmap_range(ap->args.vmap_base, nbytes); @@ -1624,14 +1614,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; max_pages = iov_iter_npages(iter, fc->max_pages); - ia = fuse_io_alloc(io, max_pages); + ia = fuse_io_folios_alloc(io, max_pages); if (!ia) return -ENOMEM; if (fopen_direct_io && fc->direct_io_allow_mmap) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { - fuse_io_free(ia); + fuse_io_folios_free(ia); return res; } } @@ -1646,7 +1636,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (fopen_direct_io && write) { res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); if (res) { - fuse_io_free(ia); + fuse_io_folios_free(ia); return res; } } @@ -1673,7 +1663,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!io->async || nres < 0) { fuse_release_user_pages(&ia->ap, nres, io->should_dirty); - fuse_io_free(ia); + fuse_io_folios_free(ia); } ia = NULL; if (nres < 0) { @@ -1692,13 +1682,13 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (count) { max_pages = iov_iter_npages(iter, fc->max_pages); - ia = fuse_io_alloc(io, max_pages); + ia = fuse_io_folios_alloc(io, max_pages); if (!ia) break; } } if (ia) - fuse_io_free(ia); + fuse_io_folios_free(ia); if (res > 0) *ppos = pos; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c1c7def8ee4b..d26d278da886 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1017,18 +1017,6 @@ static inline bool fuse_is_bad(struct inode *inode) return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); } -static inline struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) -{ - struct page **pages; - - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) (pages + npages); - - return pages; -} - static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags, struct fuse_folio_desc **desc) { @@ -1041,16 +1029,6 @@ static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags return folios; } -static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, - unsigned int index, - unsigned int nr_pages) -{ - int i; - - for (i = index; i < index + nr_pages; i++) - descs[i].length = PAGE_SIZE - descs[i].offset; -} - static inline void fuse_folio_descs_length_init(struct fuse_folio_desc *descs, unsigned int index, unsigned int nr_folios) From 68bfb7eb7f7de355d5b3812c25a2a36e9eead97b Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 24 Oct 2024 10:18:09 -0700 Subject: [PATCH 28/30] fuse: remove pages for requests and exclusively use folios All fuse requests use folios instead of pages for transferring data. Remove pages from the requests and exclusively use folios. No functional changes. [SzM: rename back folio_descs -> descs, etc.] Signed-off-by: Joanne Koong Reviewed-by: Josef Bacik Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 3 +- fs/fuse/dev.c | 57 ++++++++++----------------- fs/fuse/dir.c | 3 +- fs/fuse/file.c | 58 +++++++++++++--------------- fs/fuse/fuse_i.h | 22 ++--------- fs/fuse/ioctl.c | 5 +-- fs/fuse/readdir.c | 3 +- fs/fuse/virtio_fs.c | 93 +++++++++++++++++---------------------------- 8 files changed, 90 insertions(+), 154 deletions(-) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index eed78e303139..b39844d75a80 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -460,10 +460,9 @@ static int cuse_send_init(struct cuse_conn *cc) ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; ap->args.out_argvar = true; ap->args.out_pages = true; - ap->uses_folios = true; ap->num_folios = 1; ap->folios = &ia->folio; - ap->folio_descs = &ia->desc; + ap->descs = &ia->desc; ia->folio = folio; ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 563a0bfa0e95..29fc61a072ba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1028,41 +1028,27 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, struct fuse_req *req = cs->req; struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); - if (ap->uses_folios) { - for (i = 0; i < ap->num_folios && (nbytes || zeroing); i++) { - int err; - unsigned int offset = ap->folio_descs[i].offset; - unsigned int count = min(nbytes, ap->folio_descs[i].length); - struct page *orig, *pagep; + for (i = 0; i < ap->num_folios && (nbytes || zeroing); i++) { + int err; + unsigned int offset = ap->descs[i].offset; + unsigned int count = min(nbytes, ap->descs[i].length); + struct page *orig, *pagep; - orig = pagep = &ap->folios[i]->page; + orig = pagep = &ap->folios[i]->page; - err = fuse_copy_page(cs, &pagep, offset, count, zeroing); - if (err) - return err; + err = fuse_copy_page(cs, &pagep, offset, count, zeroing); + if (err) + return err; - nbytes -= count; + nbytes -= count; - /* - * fuse_copy_page may have moved a page from a pipe - * instead of copying into our given page, so update - * the folios if it was replaced. - */ - if (pagep != orig) - ap->folios[i] = page_folio(pagep); - } - } else { - for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { - int err; - unsigned int offset = ap->descs[i].offset; - unsigned int count = min(nbytes, ap->descs[i].length); - - err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); - if (err) - return err; - - nbytes -= count; - } + /* + * fuse_copy_page may have moved a page from a pipe instead of + * copying into our given page, so update the folios if it was + * replaced. + */ + if (pagep != orig) + ap->folios[i] = page_folio(pagep); } return 0; } @@ -1761,7 +1747,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->folio_descs[0])); + args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); ra = kzalloc(args_size, GFP_KERNEL); if (!ra) @@ -1769,8 +1755,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, ap = &ra->ap; ap->folios = (void *) (ra + 1); - ap->folio_descs = (void *) (ap->folios + num_pages); - ap->uses_folios = true; + ap->descs = (void *) (ap->folios + num_pages); args = &ap->args; args->nodeid = outarg->nodeid; @@ -1791,8 +1776,8 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, this_num = min_t(unsigned, num, PAGE_SIZE - offset); ap->folios[ap->num_folios] = folio; - ap->folio_descs[ap->num_folios].offset = offset; - ap->folio_descs[ap->num_folios].length = this_num; + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = this_num; ap->num_folios++; cur_pages++; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index a08c532068d0..b8a4608e31af 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1590,10 +1590,9 @@ static int fuse_readlink_page(struct inode *inode, struct folio *folio) struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { - .uses_folios = true, .num_folios = 1, .folios = &folio, - .folio_descs = &desc, + .descs = &desc, }; char *link; ssize_t res; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 14af8c41fc83..88d0946b5bc9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -742,7 +742,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) kref_put(&io->refcnt, fuse_io_release); } -static struct fuse_io_args *fuse_io_folios_alloc(struct fuse_io_priv *io, +static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, unsigned int nfolios) { struct fuse_io_args *ia; @@ -750,9 +750,8 @@ static struct fuse_io_args *fuse_io_folios_alloc(struct fuse_io_priv *io, ia = kzalloc(sizeof(*ia), GFP_KERNEL); if (ia) { ia->io = io; - ia->ap.uses_folios = true; ia->ap.folios = fuse_folios_alloc(nfolios, GFP_KERNEL, - &ia->ap.folio_descs); + &ia->ap.descs); if (!ia->ap.folios) { kfree(ia); ia = NULL; @@ -761,7 +760,7 @@ static struct fuse_io_args *fuse_io_folios_alloc(struct fuse_io_priv *io, return ia; } -static void fuse_io_folios_free(struct fuse_io_args *ia) +static void fuse_io_free(struct fuse_io_args *ia) { kfree(ia->ap.folios); kfree(ia); @@ -797,7 +796,7 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); fuse_aio_complete(io, err, pos); - fuse_io_folios_free(ia); + fuse_io_free(ia); } static ssize_t fuse_async_req_send(struct fuse_mount *fm, @@ -880,10 +879,9 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, - .ap.uses_folios = true, .ap.num_folios = 1, .ap.folios = &folio, - .ap.folio_descs = &desc, + .ap.descs = &desc, }; ssize_t res; u64 attr_ver; @@ -962,7 +960,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, if (ia->ff) fuse_file_put(ia->ff, false); - fuse_io_folios_free(ia); + fuse_io_free(ia); } static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) @@ -983,7 +981,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) /* Don't overflow end offset */ if (pos + (count - 1) == LLONG_MAX) { count--; - ap->folio_descs[ap->num_folios - 1].length--; + ap->descs[ap->num_folios - 1].length--; } WARN_ON((loff_t) (pos + count) < 0); @@ -1044,7 +1042,7 @@ static void fuse_readahead(struct readahead_control *rac) */ break; - ia = fuse_io_folios_alloc(NULL, cur_pages); + ia = fuse_io_alloc(NULL, cur_pages); if (!ia) return; ap = &ia->ap; @@ -1052,7 +1050,7 @@ static void fuse_readahead(struct readahead_control *rac) while (ap->num_folios < cur_pages) { folio = readahead_folio(rac); ap->folios[ap->num_folios] = folio; - ap->folio_descs[ap->num_folios].length = folio_size(folio); + ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; } fuse_send_readpages(ia, rac->file); @@ -1186,7 +1184,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, err = -EIO; short_write = ia->write.out.size < count; - offset = ap->folio_descs[0].offset; + offset = ap->descs[0].offset; count = ia->write.out.size; for (i = 0; i < ap->num_folios; i++) { struct folio *folio = ap->folios[i]; @@ -1224,7 +1222,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, int err; ap->args.in_pages = true; - ap->folio_descs[0].offset = offset; + ap->descs[0].offset = offset; do { size_t tmp; @@ -1261,7 +1259,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, err = 0; ap->folios[ap->num_folios] = folio; - ap->folio_descs[ap->num_folios].length = tmp; + ap->descs[ap->num_folios].length = tmp; ap->num_folios++; nr_pages++; @@ -1318,8 +1316,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - ap->uses_folios = true; - ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->folio_descs); + ap->folios = fuse_folios_alloc(nr_pages, GFP_KERNEL, &ap->descs); if (!ap->folios) { err = -ENOMEM; break; @@ -1564,13 +1561,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, /* Currently, all folios in FUSE are one page */ nfolios = DIV_ROUND_UP(ret, PAGE_SIZE); - ap->folio_descs[ap->num_folios].offset = start; - fuse_folio_descs_length_init(ap->folio_descs, ap->num_folios, nfolios); + ap->descs[ap->num_folios].offset = start; + fuse_folio_descs_length_init(ap->descs, ap->num_folios, nfolios); for (i = 0; i < nfolios; i++) ap->folios[i + ap->num_folios] = page_folio(pages[i]); ap->num_folios += nfolios; - ap->folio_descs[ap->num_folios - 1].length -= + ap->descs[ap->num_folios - 1].length -= (PAGE_SIZE - ret) & (PAGE_SIZE - 1); nr_pages += nfolios; } @@ -1614,14 +1611,14 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, bool fopen_direct_io = ff->open_flags & FOPEN_DIRECT_IO; max_pages = iov_iter_npages(iter, fc->max_pages); - ia = fuse_io_folios_alloc(io, max_pages); + ia = fuse_io_alloc(io, max_pages); if (!ia) return -ENOMEM; if (fopen_direct_io && fc->direct_io_allow_mmap) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { - fuse_io_folios_free(ia); + fuse_io_free(ia); return res; } } @@ -1636,7 +1633,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (fopen_direct_io && write) { res = invalidate_inode_pages2_range(mapping, idx_from, idx_to); if (res) { - fuse_io_folios_free(ia); + fuse_io_free(ia); return res; } } @@ -1663,7 +1660,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!io->async || nres < 0) { fuse_release_user_pages(&ia->ap, nres, io->should_dirty); - fuse_io_folios_free(ia); + fuse_io_free(ia); } ia = NULL; if (nres < 0) { @@ -1682,13 +1679,13 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (count) { max_pages = iov_iter_npages(iter, fc->max_pages); - ia = fuse_io_folios_alloc(io, max_pages); + ia = fuse_io_alloc(io, max_pages); if (!ia) break; } } if (ia) - fuse_io_folios_free(ia); + fuse_io_free(ia); if (res > 0) *ppos = pos; @@ -2093,8 +2090,7 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) if (wpa) { ap = &wpa->ia.ap; ap->num_folios = 0; - ap->uses_folios = true; - ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->folio_descs); + ap->folios = fuse_folios_alloc(1, GFP_NOFS, &ap->descs); if (!ap->folios) { kfree(wpa); wpa = NULL; @@ -2127,8 +2123,8 @@ static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struc folio_copy(tmp_folio, folio); ap->folios[folio_index] = tmp_folio; - ap->folio_descs[folio_index].offset = 0; - ap->folio_descs[folio_index].length = PAGE_SIZE; + ap->descs[folio_index].offset = 0; + ap->descs[folio_index].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); @@ -2234,10 +2230,10 @@ static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) return false; memcpy(folios, ap->folios, sizeof(struct folio *) * ap->num_folios); - memcpy(descs, ap->folio_descs, sizeof(struct fuse_folio_desc) * ap->num_folios); + memcpy(descs, ap->descs, sizeof(struct fuse_folio_desc) * ap->num_folios); kfree(ap->folios); ap->folios = folios; - ap->folio_descs = descs; + ap->descs = descs; data->max_folios = nfolios; return true; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d26d278da886..d35c37ccf9b5 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -285,12 +285,6 @@ struct fuse_arg { void *value; }; -/** FUSE page descriptor */ -struct fuse_page_desc { - unsigned int length; - unsigned int offset; -}; - /** FUSE folio descriptor */ struct fuse_folio_desc { unsigned int length; @@ -325,19 +319,9 @@ struct fuse_args { struct fuse_args_pages { struct fuse_args args; - union { - struct { - struct page **pages; - struct fuse_page_desc *descs; - unsigned int num_pages; - }; - struct { - struct folio **folios; - struct fuse_folio_desc *folio_descs; - unsigned int num_folios; - }; - }; - bool uses_folios; + struct folio **folios; + struct fuse_folio_desc *descs; + unsigned int num_folios; }; struct fuse_release_args { diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 1c77d8a27950..2d9abf48828f 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -251,12 +251,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.folio_descs); + ap.folios = fuse_folios_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!ap.folios || !iov_page) goto out; - fuse_folio_descs_length_init(ap.folio_descs, 0, fm->fc->max_pages); + fuse_folio_descs_length_init(ap.descs, 0, fm->fc->max_pages); /* * If restricted, initialize IO parameters as encoded in @cmd. @@ -306,7 +306,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fm->fc->max_pages) goto out; - ap.uses_folios = true; while (ap.num_folios < max_pages) { ap.folios[ap.num_folios] = folio_alloc(GFP_KERNEL | __GFP_HIGHMEM, 0); if (!ap.folios[ap.num_folios]) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index fd0eff1b9f2d..a2deeada4a5a 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -346,10 +346,9 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) plus = fuse_use_readdirplus(inode, ctx); ap->args.out_pages = true; - ap->uses_folios = true; ap->num_folios = 1; ap->folios = &folio; - ap->folio_descs = &desc; + ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 9f0d98cc20d3..d88d3fc5306a 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -765,7 +765,6 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; - struct page *page; struct folio *folio; /* @@ -778,29 +777,15 @@ static void virtio_fs_request_complete(struct fuse_req *req, if (args->out_pages && args->page_zeroing) { len = args->out_args[args->out_numargs - 1].size; ap = container_of(args, typeof(*ap), args); - if (ap->uses_folios) { - for (i = 0; i < ap->num_folios; i++) { - thislen = ap->folio_descs[i].length; - if (len < thislen) { - WARN_ON(ap->folio_descs[i].offset); - folio = ap->folios[i]; - folio_zero_segment(folio, len, thislen); - len = 0; - } else { - len -= thislen; - } - } - } else { - for (i = 0; i < ap->num_pages; i++) { - thislen = ap->descs[i].length; - if (len < thislen) { - WARN_ON(ap->descs[i].offset); - page = ap->pages[i]; - zero_user_segment(page, len, thislen); - len = 0; - } else { - len -= thislen; - } + for (i = 0; i < ap->num_folios; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + folio = ap->folios[i]; + folio_zero_segment(folio, len, thislen); + len = 0; + } else { + len -= thislen; } } } @@ -1287,22 +1272,16 @@ static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *r } /* Count number of scatter-gather elements required */ -static unsigned int sg_count_fuse_pages(struct fuse_args_pages *ap, - unsigned int total_len) +static unsigned int sg_count_fuse_folios(struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - if (ap->uses_folios) { - for (i = 0; i < ap->num_folios && total_len; i++) { - this_len = min(ap->folio_descs[i].length, total_len); - total_len -= this_len; - } - } else { - for (i = 0; i < ap->num_pages && total_len; i++) { - this_len = min(ap->descs[i].length, total_len); - total_len -= this_len; - } + for (i = 0; i < num_folios && total_len; i++) { + this_len = min(folio_descs[i].length, total_len); + total_len -= this_len; } return i; @@ -1320,7 +1299,8 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->in_pages) { size = args->in_args[args->in_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap, size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } if (!test_bit(FR_ISREPLY, &req->flags)) @@ -1333,35 +1313,28 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_pages) { size = args->out_args[args->out_numargs - 1].size; - total_sgs += sg_count_fuse_pages(ap, size); + total_sgs += sg_count_fuse_folios(ap->descs, ap->num_folios, + size); } return total_sgs; } -/* Add pages/folios to scatter-gather list and return number of elements used */ -static unsigned int sg_init_fuse_pages(struct scatterlist *sg, - struct fuse_args_pages *ap, - unsigned int total_len) +/* Add folios to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_folios(struct scatterlist *sg, + struct folio **folios, + struct fuse_folio_desc *folio_descs, + unsigned int num_folios, + unsigned int total_len) { unsigned int i; unsigned int this_len; - if (ap->uses_folios) { - for (i = 0; i < ap->num_folios && total_len; i++) { - sg_init_table(&sg[i], 1); - this_len = min(ap->folio_descs[i].length, total_len); - sg_set_folio(&sg[i], ap->folios[i], this_len, - ap->folio_descs[i].offset); - total_len -= this_len; - } - } else { - for (i = 0; i < ap->num_pages && total_len; i++) { - sg_init_table(&sg[i], 1); - this_len = min(ap->descs[i].length, total_len); - sg_set_page(&sg[i], ap->pages[i], this_len, ap->descs[i].offset); - total_len -= this_len; - } + for (i = 0; i < num_folios && total_len; i++) { + sg_init_table(&sg[i], 1); + this_len = min(folio_descs[i].length, total_len); + sg_set_folio(&sg[i], folios[i], this_len, folio_descs[i].offset); + total_len -= this_len; } return i; @@ -1385,8 +1358,10 @@ static unsigned int sg_init_fuse_args(struct scatterlist *sg, sg_init_one(&sg[total_sgs++], argbuf, len); if (argpages) - total_sgs += sg_init_fuse_pages(&sg[total_sgs], ap, - args[numargs - 1].size); + total_sgs += sg_init_fuse_folios(&sg[total_sgs], + ap->folios, ap->descs, + ap->num_folios, + args[numargs - 1].size); if (len_used) *len_used = len; From 69eb56f69efb866c791cc87fd7bf62adf2ffcbb3 Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Mon, 18 Nov 2024 18:16:00 +0800 Subject: [PATCH 29/30] fuse: check attributes staleness on fuse_iget() Function fuse_direntplus_link() might call fuse_iget() to initialize a new fuse_inode and change its attributes. If fi->attr_version is always initialized with 0, even if the attributes returned by the FUSE_READDIR request is staled, as the new fi->attr_version is 0, fuse_change_attributes will still set the staled attributes to inode. This wrong behaviour may cause file size inconsistency even when there is no changes from server-side. To reproduce the issue, consider the following 2 programs (A and B) are running concurrently, A B ---------------------------------- -------------------------------- { /fusemnt/dir/f is a file path in a fuse mount, the size of f is 0. } readdir(/fusemnt/dir) start //Daemon set size 0 to f direntry fallocate(f, 1024) stat(f) // B see size 1024 echo 2 > /proc/sys/vm/drop_caches readdir(/fusemnt/dir) reply to kernel Kernel set 0 to the I_NEW inode stat(f) // B see size 0 In the above case, only program B is modifying the file size, however, B observes file size changing between the 2 'readonly' stat() calls. To fix this issue, we should make sure readdirplus still follows the rule of attr_version staleness checking even if the fi->attr_version is lost due to inode eviction. To identify this situation, the new fc->evict_ctr is used to record whether the eviction of inodes occurs during the readdirplus request processing. If it does, the result of readdirplus may be inaccurate; otherwise, the result of readdirplus can be trusted. Although this may still lead to incorrect invalidation, considering the relatively low frequency of evict occurrences, it should be acceptable. Link: https://lore.kernel.org/lkml/20230711043405.66256-2-zhangjiachen.jaycee@bytedance.com/ Link: https://lore.kernel.org/lkml/20241114070905.48901-1-zhangtianci.1997@bytedance.com/ Reported-by: Jiachen Zhang Suggested-by: Miklos Szeredi Signed-off-by: Zhang Tianci Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 11 +++++----- fs/fuse/fuse_i.h | 14 ++++++++++-- fs/fuse/inode.c | 56 +++++++++++++++++++++++++++++++++++++---------- fs/fuse/readdir.c | 15 ++++++++----- 4 files changed, 71 insertions(+), 25 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b8a4608e31af..494ac372ace0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -366,7 +366,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; - u64 attr_version; + u64 attr_version, evict_ctr; int err; *inode = NULL; @@ -381,6 +381,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out; attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); err = fuse_simple_request(fm, &args); @@ -398,7 +399,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), - attr_version); + attr_version, evict_ctr); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); @@ -691,7 +692,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, ff->nodeid = outentry.nodeid; ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, - &outentry.attr, ATTR_TIMEOUT(&outentry), 0); + &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); @@ -822,7 +823,7 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, - &outarg.attr, ATTR_TIMEOUT(&outarg), 0); + &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; @@ -2028,7 +2029,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), - fuse_get_cache_mask(inode)); + fuse_get_cache_mask(inode), 0); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d35c37ccf9b5..74744c6f2860 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -890,6 +890,9 @@ struct fuse_conn { /** Version counter for attribute changes */ atomic64_t attr_version; + /** Version counter for evict inode */ + atomic64_t evict_ctr; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -984,6 +987,11 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc) return atomic64_read(&fc->attr_version); } +static inline u64 fuse_get_evict_ctr(struct fuse_conn *fc) +{ + return atomic64_read(&fc->evict_ctr); +} + static inline bool fuse_stale_inode(const struct inode *inode, int generation, struct fuse_attr *attr) { @@ -1043,7 +1051,8 @@ extern const struct dentry_operations fuse_root_dentry_operations; */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version); + u64 attr_valid, u64 attr_version, + u64 evict_ctr); int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode); @@ -1133,7 +1142,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask); + u64 attr_valid, u32 cache_mask, + u64 evict_ctr); u32 fuse_get_cache_mask(struct inode *inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f1779ff3f8d1..3ce4f4e81d09 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,6 +175,14 @@ static void fuse_evict_inode(struct inode *inode) fuse_cleanup_submount_lookup(fc, fi->submount_lookup); fi->submount_lookup = NULL; } + /* + * Evict of non-deleted inode may race with outstanding + * LOOKUP/READDIRPLUS requests and result in inconsistency when + * the request finishes. Deal with that here by bumping a + * counter that can be compared to the starting value. + */ + if (inode->i_nlink > 0) + atomic64_inc(&fc->evict_ctr); } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { WARN_ON(fi->iocachectr != 0); @@ -208,17 +216,30 @@ static ino_t fuse_squash_ino(u64 ino64) void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_statx *sx, - u64 attr_valid, u32 cache_mask) + u64 attr_valid, u32 cache_mask, + u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); lockdep_assert_held(&fi->lock); + /* + * Clear basic stats from invalid mask. + * + * Don't do this if this is coming from a fuse_iget() call and there + * might have been a racing evict which would've invalidated the result + * if the attr_version would've been preserved. + * + * !evict_ctr -> this is create + * fi->attr_version != 0 -> this is not a new inode + * evict_ctr == fuse_get_evict_ctr() -> no evicts while during request + */ + if (!evict_ctr || fi->attr_version || evict_ctr == fuse_get_evict_ctr(fc)) + set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; - /* Clear basic stats from invalid mask */ - set_mask_bits(&fi->inval_mask, STATX_BASIC_STATS, 0); inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -297,9 +318,9 @@ u32 fuse_get_cache_mask(struct inode *inode) return STATX_MTIME | STATX_CTIME | STATX_SIZE; } -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - struct fuse_statx *sx, - u64 attr_valid, u64 attr_version) +static void fuse_change_attributes_i(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version, u64 evict_ctr) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -333,7 +354,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode_get_mtime(inode); - fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); + fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask, + evict_ctr); oldsize = inode->i_size; /* @@ -374,6 +396,13 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fuse_dax_dontcache(inode, attr->flags); } +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + struct fuse_statx *sx, u64 attr_valid, + u64 attr_version) +{ + fuse_change_attributes_i(inode, attr, sx, attr_valid, attr_version, 0); +} + static void fuse_init_submount_lookup(struct fuse_submount_lookup *sl, u64 nodeid) { @@ -428,7 +457,8 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp) struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) + u64 attr_valid, u64 attr_version, + u64 evict_ctr) { struct inode *inode; struct fuse_inode *fi; @@ -489,8 +519,8 @@ retry: fi->nlookup++; spin_unlock(&fi->lock); done: - fuse_change_attributes(inode, attr, NULL, attr_valid, attr_version); - + fuse_change_attributes_i(inode, attr, NULL, attr_valid, attr_version, + evict_ctr); return inode; } @@ -942,6 +972,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->initialized = 0; fc->connected = 1; atomic64_set(&fc->attr_version, 1); + atomic64_set(&fc->evict_ctr, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); @@ -1003,7 +1034,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0, 0); } struct fuse_inode_handle { @@ -1612,7 +1643,8 @@ static int fuse_fill_super_submount(struct super_block *sb, return -ENOMEM; fuse_fill_attr_from_inode(&root_attr, parent_fi); - root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0, + fuse_get_evict_ctr(fm->fc)); /* * This inode is just a duplicate, so it is not looked up and * its nlookup should not be incremented. fuse_iget() does diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index a2deeada4a5a..17ce9636a2b1 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -149,7 +149,7 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, static int fuse_direntplus_link(struct file *file, struct fuse_direntplus *direntplus, - u64 attr_version) + u64 attr_version, u64 evict_ctr) { struct fuse_entry_out *o = &direntplus->entry_out; struct fuse_dirent *dirent = &direntplus->dirent; @@ -233,7 +233,7 @@ retry: } else { inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, &o->attr, ATTR_TIMEOUT(o), - attr_version); + attr_version, evict_ctr); if (!inode) inode = ERR_PTR(-ENOMEM); @@ -284,7 +284,8 @@ static void fuse_force_forget(struct file *file, u64 nodeid) } static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, - struct dir_context *ctx, u64 attr_version) + struct dir_context *ctx, u64 attr_version, + u64 evict_ctr) { struct fuse_direntplus *direntplus; struct fuse_dirent *dirent; @@ -319,7 +320,7 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, buf += reclen; nbytes -= reclen; - ret = fuse_direntplus_link(file, direntplus, attr_version); + ret = fuse_direntplus_link(file, direntplus, attr_version, evict_ctr); if (ret) fuse_force_forget(file, direntplus->entry_out.nodeid); } @@ -337,7 +338,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; struct fuse_folio_desc desc = { .length = PAGE_SIZE }; - u64 attr_version = 0; + u64 attr_version = 0, evict_ctr = 0; bool locked; folio = folio_alloc(GFP_KERNEL, 0); @@ -351,6 +352,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); + evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -368,7 +370,8 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { res = parse_dirplusfile(folio_address(folio), res, - file, ctx, attr_version); + file, ctx, attr_version, + evict_ctr); } else { res = parse_dirfile(folio_address(folio), res, file, ctx); From d1dfb5f52ffc4a142d88da5c0ed0514f3602c4b8 Mon Sep 17 00:00:00 2001 From: Asahi Lina Date: Wed, 13 Nov 2024 04:55:32 +0900 Subject: [PATCH 30/30] virtiofs: dax: remove ->writepages() callback When using FUSE DAX with virtiofs, cache coherency is managed by the host. Disk persistence is handled via fsync() and friends, which are passed directly via the FUSE layer to the host. Therefore, there's no need to do dax_writeback_mapping_range(). All that ends up doing is a cache flush operation, which is not caught by KVM and doesn't do much, since the host and guest are already cache-coherent. Since dax_writeback_mapping_range() checks that the inode block size is equal to PAGE_SIZE, this fixes a spurious WARN when virtiofs is used with a mismatched guest PAGE_SIZE and virtiofs backing FS block size (this happens, for example, when it's a tmpfs and the host and guest have a different PAGE_SIZE). FUSE DAX does not require any particular FS block size, since it always performs DAX mappings in aligned 2MiB blocks. See discussion in [1]. [1] https://lore.kernel.org/lkml/20241101-dax-page-size-v1-1-eedbd0c6b08f@asahilina.net/T/#u [SzM: remove the empty callback] Suggested-by: Dan Williams Signed-off-by: Asahi Lina Acked-by: Dan Williams Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..9abbc2f2894f 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -774,16 +774,6 @@ out: return ret; } -static int fuse_dax_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - - struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); - - return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); -} - static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, bool write) { @@ -1323,7 +1313,6 @@ bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) } static const struct address_space_operations fuse_dax_file_aops = { - .writepages = fuse_dax_writepages, .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, };