linux-next/fs/netfs/objects.c

235 lines
6.6 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/* Object lifetime handling and tracing.
*
* Copyright (C) 2022 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/slab.h>
#include <linux/mempool.h>
#include <linux/delay.h>
#include "internal.h"
/*
* Allocate an I/O request and initialise it.
*/
struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct file *file,
loff_t start, size_t len,
enum netfs_io_origin origin)
{
static atomic_t debug_ids;
netfs: Add a netfs inode context Add a netfs_i_context struct that should be included in the network filesystem's own inode struct wrapper, directly after the VFS's inode struct, e.g.: struct my_inode { struct { /* These must be contiguous */ struct inode vfs_inode; struct netfs_i_context netfs_ctx; }; }; The netfs_i_context struct so far contains a single field for the network filesystem to use - the cache cookie: struct netfs_i_context { ... struct fscache_cookie *cache; }; Three functions are provided to help with this: (1) void netfs_i_context_init(struct inode *inode, const struct netfs_request_ops *ops); Initialise the netfs context and set the operations. (2) struct netfs_i_context *netfs_i_context(struct inode *inode); Find the netfs context from the VFS inode. (3) struct inode *netfs_inode(struct netfs_i_context *ctx); Find the VFS inode from the netfs context. Changes ======= ver #4) - Fix netfs_is_cache_enabled() to check cookie->cache_priv to see if a cache is present[3]. - Fix netfs_skip_folio_read() to zero out all of the page, not just some of it[3]. ver #3) - Split out the bit to move ceph cap-getting on readahead into ceph_init_request()[1]. - Stick in a comment to the netfs inode structs indicating the contiguity requirements[2]. ver #2) - Adjust documentation to match. - Use "#if IS_ENABLED()" in netfs_i_cookie(), not "#ifdef". - Move the cap check from ceph_readahead() to ceph_init_request() to be called from netfslib. - Remove ceph_readahead() and use netfs_readahead() directly instead. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/8af0d47f17d89c06bbf602496dd845f2b0bf25b3.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/beaf4f6a6c2575ed489adb14b257253c868f9a5c.camel@kernel.org/ [2] Link: https://lore.kernel.org/r/3536452.1647421585@warthog.procyon.org.uk/ [3] Link: https://lore.kernel.org/r/164622984545.3564931.15691742939278418580.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/164678213320.1200972.16807551936267647470.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/164692909854.2099075.9535537286264248057.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/306388.1647595110@warthog.procyon.org.uk/ # v4
2021-06-29 21:37:05 +00:00
struct inode *inode = file ? file_inode(file) : mapping->host;
netfs: Fix gcc-12 warning by embedding vfs inode in netfs_i_context While randstruct was satisfied with using an open-coded "void *" offset cast for the netfs_i_context <-> inode casting, __builtin_object_size() as used by FORTIFY_SOURCE was not as easily fooled. This was causing the following complaint[1] from gcc v12: In file included from include/linux/string.h:253, from include/linux/ceph/ceph_debug.h:7, from fs/ceph/inode.c:2: In function 'fortify_memset_chk', inlined from 'netfs_i_context_init' at include/linux/netfs.h:326:2, inlined from 'ceph_alloc_inode' at fs/ceph/inode.c:463:2: include/linux/fortify-string.h:242:25: warning: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Wattribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix this by embedding a struct inode into struct netfs_i_context (which should perhaps be renamed to struct netfs_inode). The struct inode vfs_inode fields are then removed from the 9p, afs, ceph and cifs inode structs and vfs_inode is then simply changed to "netfs.inode" in those filesystems. Further, rename netfs_i_context to netfs_inode, get rid of the netfs_inode() function that converted a netfs_i_context pointer to an inode pointer (that can now be done with &ctx->inode) and rename the netfs_i_context() function to netfs_inode() (which is now a wrapper around container_of()). Most of the changes were done with: perl -p -i -e 's/vfs_inode/netfs.inode/'g \ `git grep -l 'vfs_inode' -- fs/{9p,afs,ceph,cifs}/*.[ch]` Kees suggested doing it with a pair structure[2] and a special declarator to insert that into the network filesystem's inode wrapper[3], but I think it's cleaner to embed it - and then it doesn't matter if struct randomisation reorders things. Dave Chinner suggested using a filesystem-specific VFS_I() function in each filesystem to convert that filesystem's own inode wrapper struct into the VFS inode struct[4]. Version #2: - Fix a couple of missed name changes due to a disabled cifs option. - Rename nfs_i_context to nfs_inode - Use "netfs" instead of "nic" as the member name in per-fs inode wrapper structs. [ This also undoes commit 507160f46c55 ("netfs: gcc-12: temporarily disable '-Wattribute-warning' for now") that is no longer needed ] Fixes: bc899ee1c898 ("netfs: Add a netfs inode context") Reported-by: Jeff Layton <jlayton@kernel.org> Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: Kees Cook <keescook@chromium.org> Reviewed-by: Xiubo Li <xiubli@redhat.com> cc: Jonathan Corbet <corbet@lwn.net> cc: Eric Van Hensbergen <ericvh@gmail.com> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Steve French <smfrench@gmail.com> cc: William Kucharski <william.kucharski@oracle.com> cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> cc: Dave Chinner <david@fromorbit.com> cc: linux-doc@vger.kernel.org cc: v9fs-developer@lists.sourceforge.net cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: samba-technical@lists.samba.org cc: linux-fsdevel@vger.kernel.org cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/r/d2ad3a3d7bdd794c6efb562d2f2b655fb67756b9.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/20220517210230.864239-1-keescook@chromium.org/ [2] Link: https://lore.kernel.org/r/20220518202212.2322058-1-keescook@chromium.org/ [3] Link: https://lore.kernel.org/r/20220524101205.GI2306852@dread.disaster.area/ [4] Link: https://lore.kernel.org/r/165296786831.3591209.12111293034669289733.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/165305805651.4094995.7763502506786714216.stgit@warthog.procyon.org.uk # v2 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2022-06-09 20:46:04 +00:00
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool;
struct kmem_cache *cache = mempool->pool_data;
bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
origin == NETFS_DIO_READ ||
origin == NETFS_DIO_WRITE);
bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
for (;;) {
rreq = mempool_alloc(mempool, GFP_KERNEL);
if (rreq)
break;
msleep(10);
}
memset(rreq, 0, kmem_cache_size(cache));
rreq->start = start;
rreq->len = len;
rreq->upper_len = len;
rreq->origin = origin;
netfs: Add a netfs inode context Add a netfs_i_context struct that should be included in the network filesystem's own inode struct wrapper, directly after the VFS's inode struct, e.g.: struct my_inode { struct { /* These must be contiguous */ struct inode vfs_inode; struct netfs_i_context netfs_ctx; }; }; The netfs_i_context struct so far contains a single field for the network filesystem to use - the cache cookie: struct netfs_i_context { ... struct fscache_cookie *cache; }; Three functions are provided to help with this: (1) void netfs_i_context_init(struct inode *inode, const struct netfs_request_ops *ops); Initialise the netfs context and set the operations. (2) struct netfs_i_context *netfs_i_context(struct inode *inode); Find the netfs context from the VFS inode. (3) struct inode *netfs_inode(struct netfs_i_context *ctx); Find the VFS inode from the netfs context. Changes ======= ver #4) - Fix netfs_is_cache_enabled() to check cookie->cache_priv to see if a cache is present[3]. - Fix netfs_skip_folio_read() to zero out all of the page, not just some of it[3]. ver #3) - Split out the bit to move ceph cap-getting on readahead into ceph_init_request()[1]. - Stick in a comment to the netfs inode structs indicating the contiguity requirements[2]. ver #2) - Adjust documentation to match. - Use "#if IS_ENABLED()" in netfs_i_cookie(), not "#ifdef". - Move the cap check from ceph_readahead() to ceph_init_request() to be called from netfslib. - Remove ceph_readahead() and use netfs_readahead() directly instead. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/8af0d47f17d89c06bbf602496dd845f2b0bf25b3.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/beaf4f6a6c2575ed489adb14b257253c868f9a5c.camel@kernel.org/ [2] Link: https://lore.kernel.org/r/3536452.1647421585@warthog.procyon.org.uk/ [3] Link: https://lore.kernel.org/r/164622984545.3564931.15691742939278418580.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/164678213320.1200972.16807551936267647470.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/164692909854.2099075.9535537286264248057.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/306388.1647595110@warthog.procyon.org.uk/ # v4
2021-06-29 21:37:05 +00:00
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
netfs: Add a netfs inode context Add a netfs_i_context struct that should be included in the network filesystem's own inode struct wrapper, directly after the VFS's inode struct, e.g.: struct my_inode { struct { /* These must be contiguous */ struct inode vfs_inode; struct netfs_i_context netfs_ctx; }; }; The netfs_i_context struct so far contains a single field for the network filesystem to use - the cache cookie: struct netfs_i_context { ... struct fscache_cookie *cache; }; Three functions are provided to help with this: (1) void netfs_i_context_init(struct inode *inode, const struct netfs_request_ops *ops); Initialise the netfs context and set the operations. (2) struct netfs_i_context *netfs_i_context(struct inode *inode); Find the netfs context from the VFS inode. (3) struct inode *netfs_inode(struct netfs_i_context *ctx); Find the VFS inode from the netfs context. Changes ======= ver #4) - Fix netfs_is_cache_enabled() to check cookie->cache_priv to see if a cache is present[3]. - Fix netfs_skip_folio_read() to zero out all of the page, not just some of it[3]. ver #3) - Split out the bit to move ceph cap-getting on readahead into ceph_init_request()[1]. - Stick in a comment to the netfs inode structs indicating the contiguity requirements[2]. ver #2) - Adjust documentation to match. - Use "#if IS_ENABLED()" in netfs_i_cookie(), not "#ifdef". - Move the cap check from ceph_readahead() to ceph_init_request() to be called from netfslib. - Remove ceph_readahead() and use netfs_readahead() directly instead. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/8af0d47f17d89c06bbf602496dd845f2b0bf25b3.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/beaf4f6a6c2575ed489adb14b257253c868f9a5c.camel@kernel.org/ [2] Link: https://lore.kernel.org/r/3536452.1647421585@warthog.procyon.org.uk/ [3] Link: https://lore.kernel.org/r/164622984545.3564931.15691742939278418580.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/164678213320.1200972.16807551936267647470.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/164692909854.2099075.9535537286264248057.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/306388.1647595110@warthog.procyon.org.uk/ # v4
2021-06-29 21:37:05 +00:00
rreq->inode = inode;
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
netfs: New writeback implementation The current netfslib writeback implementation creates writeback requests of contiguous folio data and then separately tiles subrequests over the space twice, once for the server and once for the cache. This creates a few issues: (1) Every time there's a discontiguity or a change between writing to only one destination or writing to both, it must create a new request. This makes it harder to do vectored writes. (2) The folios don't have the writeback mark removed until the end of the request - and a request could be hundreds of megabytes. (3) In future, I want to support a larger cache granularity, which will require aggregation of some folios that contain unmodified data (which only need to go to the cache) and some which contain modifications (which need to be uploaded and stored to the cache) - but, currently, these are treated as discontiguous. There's also a move to get everyone to use writeback_iter() to extract writable folios from the pagecache. That said, currently writeback_iter() has some issues that make it less than ideal: (1) there's no way to cancel the iteration, even if you find a "temporary" error that means the current folio and all subsequent folios are going to fail; (2) there's no way to filter the folios being written back - something that will impact Ceph with it's ordered snap system; (3) and if you get a folio you can't immediately deal with (say you need to flush the preceding writes), you are left with a folio hanging in the locked state for the duration, when really we should unlock it and relock it later. In this new implementation, I use writeback_iter() to pump folios, progressively creating two parallel, but separate streams and cleaning up the finished folios as the subrequests complete. Either or both streams can contain gaps, and the subrequests in each stream can be of variable size, don't need to align with each other and don't need to align with the folios. Indeed, subrequests can cross folio boundaries, may cover several folios or a folio may be spanned by multiple folios, e.g.: +---+---+-----+-----+---+----------+ Folios: | | | | | | | +---+---+-----+-----+---+----------+ +------+------+ +----+----+ Upload: | | |.....| | | +------+------+ +----+----+ +------+------+------+------+------+ Cache: | | | | | | +------+------+------+------+------+ The progressive subrequest construction permits the algorithm to be preparing both the next upload to the server and the next write to the cache whilst the previous ones are already in progress. Throttling can be applied to control the rate of production of subrequests - and, in any case, we probably want to write them to the server in ascending order, particularly if the file will be extended. Content crypto can also be prepared at the same time as the subrequests and run asynchronously, with the prepped requests being stalled until the crypto catches up with them. This might also be useful for transport crypto, but that happens at a lower layer, so probably would be harder to pull off. The algorithm is split into three parts: (1) The issuer. This walks through the data, packaging it up, encrypting it and creating subrequests. The part of this that generates subrequests only deals with file positions and spans and so is usable for DIO/unbuffered writes as well as buffered writes. (2) The collector. This asynchronously collects completed subrequests, unlocks folios, frees crypto buffers and performs any retries. This runs in a work queue so that the issuer can return to the caller for writeback (so that the VM can have its kswapd thread back) or async writes. (3) The retryer. This pauses the issuer, waits for all outstanding subrequests to complete and then goes through the failed subrequests to reissue them. This may involve reprepping them (with cifs, the credits must be renegotiated, and a subrequest may need splitting), and doing RMW for content crypto if there's a conflicting change on the server. [!] Note that some of the functions are prefixed with "new_" to avoid clashes with existing functions. These will be renamed in a later patch that cuts over to the new algorithm. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Eric Van Hensbergen <ericvh@kernel.org> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org
2024-03-18 16:52:05 +00:00
rreq->wsize = INT_MAX;
spin_lock_init(&rreq->lock);
INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
INIT_LIST_HEAD(&rreq->subrequests);
INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
netfs: Replace PG_fscache by setting folio->private and marking dirty When dirty data is being written to the cache, setting/waiting on/clearing the fscache flag is always done in tandem with setting/waiting on/clearing the writeback flag. The netfslib buffered write routines wait on and set both flags and the write request cleanup clears both flags, so the fscache flag is almost superfluous. The reason it isn't superfluous is because the fscache flag is also used to indicate that data just read from the server is being written to the cache. The flag is used to prevent a race involving overlapping direct-I/O writes to the cache. Change this to indicate that a page is in need of being copied to the cache by placing a magic value in folio->private and marking the folios dirty. Then when the writeback code sees a folio marked in this way, it only writes it to the cache and not to the server. If a folio that has this magic value set is modified, the value is just replaced and the folio will then be uplodaded too. With this, PG_fscache is no longer required by the netfslib core, 9p and afs. Ceph and nfs, however, still need to use the old PG_fscache-based tracking. To deal with this, a flag, NETFS_ICTX_USE_PGPRIV2, now has to be set on the flags in the netfs_inode struct for those filesystems. This reenables the use of PG_fscache in that inode. 9p and afs use the netfslib write helpers so get switched over; cifs, for the moment, does page-by-page manual access to the cache, so doesn't use PG_fscache and is unaffected. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Matthew Wilcox (Oracle) <willy@infradead.org> cc: Eric Van Hensbergen <ericvh@kernel.org> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Xiubo Li <xiubli@redhat.com> cc: Steve French <sfrench@samba.org> cc: Paulo Alcantara <pc@manguebit.com> cc: Ronnie Sahlberg <ronniesahlberg@gmail.com> cc: Shyam Prasad N <sprasad@microsoft.com> cc: Tom Talpey <tom@talpey.com> cc: Bharath SM <bharathsm@microsoft.com> cc: Trond Myklebust <trond.myklebust@hammerspace.com> cc: Anna Schumaker <anna@kernel.org> cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org
2024-03-19 10:00:09 +00:00
if (cached) {
__set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
netfs: Replace PG_fscache by setting folio->private and marking dirty When dirty data is being written to the cache, setting/waiting on/clearing the fscache flag is always done in tandem with setting/waiting on/clearing the writeback flag. The netfslib buffered write routines wait on and set both flags and the write request cleanup clears both flags, so the fscache flag is almost superfluous. The reason it isn't superfluous is because the fscache flag is also used to indicate that data just read from the server is being written to the cache. The flag is used to prevent a race involving overlapping direct-I/O writes to the cache. Change this to indicate that a page is in need of being copied to the cache by placing a magic value in folio->private and marking the folios dirty. Then when the writeback code sees a folio marked in this way, it only writes it to the cache and not to the server. If a folio that has this magic value set is modified, the value is just replaced and the folio will then be uplodaded too. With this, PG_fscache is no longer required by the netfslib core, 9p and afs. Ceph and nfs, however, still need to use the old PG_fscache-based tracking. To deal with this, a flag, NETFS_ICTX_USE_PGPRIV2, now has to be set on the flags in the netfs_inode struct for those filesystems. This reenables the use of PG_fscache in that inode. 9p and afs use the netfslib write helpers so get switched over; cifs, for the moment, does page-by-page manual access to the cache, so doesn't use PG_fscache and is unaffected. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Matthew Wilcox (Oracle) <willy@infradead.org> cc: Eric Van Hensbergen <ericvh@kernel.org> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: Ilya Dryomov <idryomov@gmail.com> cc: Xiubo Li <xiubli@redhat.com> cc: Steve French <sfrench@samba.org> cc: Paulo Alcantara <pc@manguebit.com> cc: Ronnie Sahlberg <ronniesahlberg@gmail.com> cc: Shyam Prasad N <sprasad@microsoft.com> cc: Tom Talpey <tom@talpey.com> cc: Bharath SM <bharathsm@microsoft.com> cc: Trond Myklebust <trond.myklebust@hammerspace.com> cc: Anna Schumaker <anna@kernel.org> cc: netfs@lists.linux.dev cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: ceph-devel@vger.kernel.org cc: linux-cifs@vger.kernel.org cc: linux-nfs@vger.kernel.org cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org
2024-03-19 10:00:09 +00:00
if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags))
/* Filesystem uses deprecated PG_private_2 marking. */
__set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
}
if (file && file->f_flags & O_NONBLOCK)
__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
return ERR_PTR(ret);
}
}
trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
}
void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what)
{
int r;
__refcount_inc(&rreq->ref, &r);
trace_netfs_rreq_ref(rreq->debug_id, r + 1, what);
}
void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
{
struct netfs_io_subrequest *subreq;
netfs: New writeback implementation The current netfslib writeback implementation creates writeback requests of contiguous folio data and then separately tiles subrequests over the space twice, once for the server and once for the cache. This creates a few issues: (1) Every time there's a discontiguity or a change between writing to only one destination or writing to both, it must create a new request. This makes it harder to do vectored writes. (2) The folios don't have the writeback mark removed until the end of the request - and a request could be hundreds of megabytes. (3) In future, I want to support a larger cache granularity, which will require aggregation of some folios that contain unmodified data (which only need to go to the cache) and some which contain modifications (which need to be uploaded and stored to the cache) - but, currently, these are treated as discontiguous. There's also a move to get everyone to use writeback_iter() to extract writable folios from the pagecache. That said, currently writeback_iter() has some issues that make it less than ideal: (1) there's no way to cancel the iteration, even if you find a "temporary" error that means the current folio and all subsequent folios are going to fail; (2) there's no way to filter the folios being written back - something that will impact Ceph with it's ordered snap system; (3) and if you get a folio you can't immediately deal with (say you need to flush the preceding writes), you are left with a folio hanging in the locked state for the duration, when really we should unlock it and relock it later. In this new implementation, I use writeback_iter() to pump folios, progressively creating two parallel, but separate streams and cleaning up the finished folios as the subrequests complete. Either or both streams can contain gaps, and the subrequests in each stream can be of variable size, don't need to align with each other and don't need to align with the folios. Indeed, subrequests can cross folio boundaries, may cover several folios or a folio may be spanned by multiple folios, e.g.: +---+---+-----+-----+---+----------+ Folios: | | | | | | | +---+---+-----+-----+---+----------+ +------+------+ +----+----+ Upload: | | |.....| | | +------+------+ +----+----+ +------+------+------+------+------+ Cache: | | | | | | +------+------+------+------+------+ The progressive subrequest construction permits the algorithm to be preparing both the next upload to the server and the next write to the cache whilst the previous ones are already in progress. Throttling can be applied to control the rate of production of subrequests - and, in any case, we probably want to write them to the server in ascending order, particularly if the file will be extended. Content crypto can also be prepared at the same time as the subrequests and run asynchronously, with the prepped requests being stalled until the crypto catches up with them. This might also be useful for transport crypto, but that happens at a lower layer, so probably would be harder to pull off. The algorithm is split into three parts: (1) The issuer. This walks through the data, packaging it up, encrypting it and creating subrequests. The part of this that generates subrequests only deals with file positions and spans and so is usable for DIO/unbuffered writes as well as buffered writes. (2) The collector. This asynchronously collects completed subrequests, unlocks folios, frees crypto buffers and performs any retries. This runs in a work queue so that the issuer can return to the caller for writeback (so that the VM can have its kswapd thread back) or async writes. (3) The retryer. This pauses the issuer, waits for all outstanding subrequests to complete and then goes through the failed subrequests to reissue them. This may involve reprepping them (with cifs, the credits must be renegotiated, and a subrequest may need splitting), and doing RMW for content crypto if there's a conflicting change on the server. [!] Note that some of the functions are prefixed with "new_" to avoid clashes with existing functions. These will be renamed in a later patch that cuts over to the new algorithm. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Eric Van Hensbergen <ericvh@kernel.org> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org
2024-03-18 16:52:05 +00:00
struct netfs_io_stream *stream;
int s;
while (!list_empty(&rreq->subrequests)) {
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, was_async,
netfs_sreq_trace_put_clear);
}
netfs: New writeback implementation The current netfslib writeback implementation creates writeback requests of contiguous folio data and then separately tiles subrequests over the space twice, once for the server and once for the cache. This creates a few issues: (1) Every time there's a discontiguity or a change between writing to only one destination or writing to both, it must create a new request. This makes it harder to do vectored writes. (2) The folios don't have the writeback mark removed until the end of the request - and a request could be hundreds of megabytes. (3) In future, I want to support a larger cache granularity, which will require aggregation of some folios that contain unmodified data (which only need to go to the cache) and some which contain modifications (which need to be uploaded and stored to the cache) - but, currently, these are treated as discontiguous. There's also a move to get everyone to use writeback_iter() to extract writable folios from the pagecache. That said, currently writeback_iter() has some issues that make it less than ideal: (1) there's no way to cancel the iteration, even if you find a "temporary" error that means the current folio and all subsequent folios are going to fail; (2) there's no way to filter the folios being written back - something that will impact Ceph with it's ordered snap system; (3) and if you get a folio you can't immediately deal with (say you need to flush the preceding writes), you are left with a folio hanging in the locked state for the duration, when really we should unlock it and relock it later. In this new implementation, I use writeback_iter() to pump folios, progressively creating two parallel, but separate streams and cleaning up the finished folios as the subrequests complete. Either or both streams can contain gaps, and the subrequests in each stream can be of variable size, don't need to align with each other and don't need to align with the folios. Indeed, subrequests can cross folio boundaries, may cover several folios or a folio may be spanned by multiple folios, e.g.: +---+---+-----+-----+---+----------+ Folios: | | | | | | | +---+---+-----+-----+---+----------+ +------+------+ +----+----+ Upload: | | |.....| | | +------+------+ +----+----+ +------+------+------+------+------+ Cache: | | | | | | +------+------+------+------+------+ The progressive subrequest construction permits the algorithm to be preparing both the next upload to the server and the next write to the cache whilst the previous ones are already in progress. Throttling can be applied to control the rate of production of subrequests - and, in any case, we probably want to write them to the server in ascending order, particularly if the file will be extended. Content crypto can also be prepared at the same time as the subrequests and run asynchronously, with the prepped requests being stalled until the crypto catches up with them. This might also be useful for transport crypto, but that happens at a lower layer, so probably would be harder to pull off. The algorithm is split into three parts: (1) The issuer. This walks through the data, packaging it up, encrypting it and creating subrequests. The part of this that generates subrequests only deals with file positions and spans and so is usable for DIO/unbuffered writes as well as buffered writes. (2) The collector. This asynchronously collects completed subrequests, unlocks folios, frees crypto buffers and performs any retries. This runs in a work queue so that the issuer can return to the caller for writeback (so that the VM can have its kswapd thread back) or async writes. (3) The retryer. This pauses the issuer, waits for all outstanding subrequests to complete and then goes through the failed subrequests to reissue them. This may involve reprepping them (with cifs, the credits must be renegotiated, and a subrequest may need splitting), and doing RMW for content crypto if there's a conflicting change on the server. [!] Note that some of the functions are prefixed with "new_" to avoid clashes with existing functions. These will be renamed in a later patch that cuts over to the new algorithm. Signed-off-by: David Howells <dhowells@redhat.com> Reviewed-by: Jeff Layton <jlayton@kernel.org> cc: Eric Van Hensbergen <ericvh@kernel.org> cc: Latchesar Ionkov <lucho@ionkov.net> cc: Dominique Martinet <asmadeus@codewreck.org> cc: Christian Schoenebeck <linux_oss@crudebyte.com> cc: Marc Dionne <marc.dionne@auristor.com> cc: v9fs@lists.linux.dev cc: linux-afs@lists.infradead.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org
2024-03-18 16:52:05 +00:00
for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) {
stream = &rreq->io_streams[s];
while (!list_empty(&stream->subrequests)) {
subreq = list_first_entry(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, was_async,
netfs_sreq_trace_put_clear);
}
}
}
static void netfs_free_request_rcu(struct rcu_head *rcu)
{
struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu);
mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
netfs_stat_d(&netfs_n_rh_rreq);
}
static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
unsigned int i;
netfs: Add a netfs inode context Add a netfs_i_context struct that should be included in the network filesystem's own inode struct wrapper, directly after the VFS's inode struct, e.g.: struct my_inode { struct { /* These must be contiguous */ struct inode vfs_inode; struct netfs_i_context netfs_ctx; }; }; The netfs_i_context struct so far contains a single field for the network filesystem to use - the cache cookie: struct netfs_i_context { ... struct fscache_cookie *cache; }; Three functions are provided to help with this: (1) void netfs_i_context_init(struct inode *inode, const struct netfs_request_ops *ops); Initialise the netfs context and set the operations. (2) struct netfs_i_context *netfs_i_context(struct inode *inode); Find the netfs context from the VFS inode. (3) struct inode *netfs_inode(struct netfs_i_context *ctx); Find the VFS inode from the netfs context. Changes ======= ver #4) - Fix netfs_is_cache_enabled() to check cookie->cache_priv to see if a cache is present[3]. - Fix netfs_skip_folio_read() to zero out all of the page, not just some of it[3]. ver #3) - Split out the bit to move ceph cap-getting on readahead into ceph_init_request()[1]. - Stick in a comment to the netfs inode structs indicating the contiguity requirements[2]. ver #2) - Adjust documentation to match. - Use "#if IS_ENABLED()" in netfs_i_cookie(), not "#ifdef". - Move the cap check from ceph_readahead() to ceph_init_request() to be called from netfslib. - Remove ceph_readahead() and use netfs_readahead() directly instead. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Jeff Layton <jlayton@kernel.org> cc: linux-cachefs@redhat.com Link: https://lore.kernel.org/r/8af0d47f17d89c06bbf602496dd845f2b0bf25b3.camel@kernel.org/ [1] Link: https://lore.kernel.org/r/beaf4f6a6c2575ed489adb14b257253c868f9a5c.camel@kernel.org/ [2] Link: https://lore.kernel.org/r/3536452.1647421585@warthog.procyon.org.uk/ [3] Link: https://lore.kernel.org/r/164622984545.3564931.15691742939278418580.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/164678213320.1200972.16807551936267647470.stgit@warthog.procyon.org.uk/ # v2 Link: https://lore.kernel.org/r/164692909854.2099075.9535537286264248057.stgit@warthog.procyon.org.uk/ # v3 Link: https://lore.kernel.org/r/306388.1647595110@warthog.procyon.org.uk/ # v4
2021-06-29 21:37:05 +00:00
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
netfs_proc_del_rreq(rreq);
netfs_clear_subrequests(rreq, false);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
if (rreq->direct_bv) {
for (i = 0; i < rreq->direct_bv_count; i++) {
if (rreq->direct_bv[i].bv_page) {
if (rreq->direct_bv_unpin)
unpin_user_page(rreq->direct_bv[i].bv_page);
}
}
kvfree(rreq->direct_bv);
}
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
enum netfs_rreq_ref_trace what)
{
unsigned int debug_id;
bool dead;
int r;
if (rreq) {
debug_id = rreq->debug_id;
dead = __refcount_dec_and_test(&rreq->ref, &r);
trace_netfs_rreq_ref(debug_id, r - 1, what);
if (dead) {
if (was_async) {
rreq->work.func = netfs_free_request;
if (!queue_work(system_unbound_wq, &rreq->work))
BUG();
} else {
netfs_free_request(&rreq->work);
}
}
}
}
/*
* Allocate and partially initialise an I/O request structure.
*/
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool;
struct kmem_cache *cache = mempool->pool_data;
for (;;) {
subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool,
GFP_KERNEL);
if (subreq)
break;
msleep(10);
}
memset(subreq, 0, kmem_cache_size(cache));
INIT_WORK(&subreq->work, NULL);
INIT_LIST_HEAD(&subreq->rreq_link);
refcount_set(&subreq->ref, 2);
subreq->rreq = rreq;
subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
netfs_stat(&netfs_n_rh_sreq);
return subreq;
}
void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
enum netfs_sreq_ref_trace what)
{
int r;
__refcount_inc(&subreq->ref, &r);
trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index, r + 1,
what);
}
static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
bool was_async)
{
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
if (rreq->netfs_ops->free_subrequest)
rreq->netfs_ops->free_subrequest(subreq);
mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
}
void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async,
enum netfs_sreq_ref_trace what)
{
unsigned int debug_index = subreq->debug_index;
unsigned int debug_id = subreq->rreq->debug_id;
bool dead;
int r;
dead = __refcount_dec_and_test(&subreq->ref, &r);
trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what);
if (dead)
netfs_free_subrequest(subreq, was_async);
}