Merge branch 'vfs.all' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git

Author: Stephen Rothwell
Date:   2024-12-20 09:19:26 +11:00
Commit: cd07c43f9b

126 changed files with 6278 additions and 2617 deletions


@ -12,21 +12,10 @@ returns a list of extents.
Request Basics Request Basics
-------------- --------------
A fiemap request is encoded within struct fiemap:: A fiemap request is encoded within struct fiemap:
struct fiemap {
__u64 fm_start; /* logical offset (inclusive) at
* which to start mapping (in) */
__u64 fm_length; /* logical length of mapping which
* userspace cares about (in) */
__u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */
__u32 fm_mapped_extents; /* number of extents that were
* mapped (out) */
__u32 fm_extent_count; /* size of fm_extents array (in) */
__u32 fm_reserved;
struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
};
.. kernel-doc:: include/uapi/linux/fiemap.h
:identifiers: fiemap
fm_start, and fm_length specify the logical range within the file fm_start, and fm_length specify the logical range within the file
which the process would like mappings for. Extents returned mirror which the process would like mappings for. Extents returned mirror
@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR
If this flag is set, the extents returned will describe the inodes If this flag is set, the extents returned will describe the inodes
extended attribute lookup tree, instead of its data tree. extended attribute lookup tree, instead of its data tree.
FIEMAP_FLAG_CACHE
This flag requests caching of the extents.
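
As a point of reference for the request side described above, the following is a minimal illustrative sketch (not part of this patch) of how userspace might issue a fiemap request through the FS_IOC_FIEMAP ioctl. FS_IOC_FIEMAP, FIEMAP_FLAG_SYNC and FIEMAP_MAX_OFFSET are the standard UAPI symbols; the function name request_extents() and the 32-extent buffer size are arbitrary choices for the example:

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int request_extents(const char *path)
{
	struct fiemap *fm;
	int fd, ret;

	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;

	/* Header plus room for 32 entries in fm_extents[]. */
	fm = calloc(1, sizeof(*fm) + 32 * sizeof(struct fiemap_extent));
	if (!fm) {
		close(fd);
		return -1;
	}

	fm->fm_start = 0;			/* logical start of the mapping */
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map through to EOF */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data before mapping */
	fm->fm_extent_count = 32;		/* size of fm_extents[] */

	ret = ioctl(fd, FS_IOC_FIEMAP, fm);
	if (ret == 0)
		printf("%u extents mapped\n", fm->fm_mapped_extents);

	free(fm);
	close(fd);
	return ret;
}
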
Extent Mapping Extent Mapping
-------------- --------------
@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST
flag set (see the next section on extent flags). flag set (see the next section on extent flags).
Each extent is described by a single fiemap_extent structure as Each extent is described by a single fiemap_extent structure as
returned in fm_extents:: returned in fm_extents:
struct fiemap_extent { .. kernel-doc:: include/uapi/linux/fiemap.h
__u64 fe_logical; /* logical offset in bytes for the start of :identifiers: fiemap_extent
* the extent */
__u64 fe_physical; /* physical offset in bytes for the start
* of the extent */
__u64 fe_length; /* length in bytes for the extent */
__u64 fe_reserved64[2];
__u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */
__u32 fe_reserved[3];
};
All offsets and lengths are in bytes and mirror those on disk. It is valid All offsets and lengths are in bytes and mirror those on disk. It is valid
for an extents logical offset to start before the request or its logical for an extents logical offset to start before the request or its logical
@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED
userspace would be highly inefficient, the kernel will try to merge most userspace would be highly inefficient, the kernel will try to merge most
adjacent blocks into 'extents'. adjacent blocks into 'extents'.
FIEMAP_EXTENT_SHARED
This flag is set to indicate that this extent's space is shared with other files.
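
To make the extent descriptions above concrete, here is a short illustrative walk over the returned fm_extents[] array (not taken from this patch; print_extents() is an invented name) that picks out a few of the flags discussed in this section:

#include <stdio.h>
#include <linux/fiemap.h>

/* Walk a fiemap reply that has already been filled in by FS_IOC_FIEMAP. */
void print_extents(const struct fiemap *fm)
{
	unsigned int i;

	for (i = 0; i < fm->fm_mapped_extents; i++) {
		const struct fiemap_extent *fe = &fm->fm_extents[i];

		printf("logical %llu physical %llu length %llu%s%s%s\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length,
		       (fe->fe_flags & FIEMAP_EXTENT_UNWRITTEN) ? " [unwritten]" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_SHARED) ? " [shared]" : "",
		       (fe->fe_flags & FIEMAP_EXTENT_LAST) ? " [last]" : "");
	}
}
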
VFS -> File System Implementation VFS -> File System Implementation
--------------------------------- ---------------------------------
@ -191,14 +176,10 @@ each discovered extent::
u64 len); u64 len);
->fiemap is passed struct fiemap_extent_info which describes the ->fiemap is passed struct fiemap_extent_info which describes the
fiemap request:: fiemap request:
struct fiemap_extent_info { .. kernel-doc:: include/linux/fiemap.h
unsigned int fi_flags; /* Flags as passed from user */ :identifiers: fiemap_extent_info
unsigned int fi_extents_mapped; /* Number of mapped extents */
unsigned int fi_extents_max; /* Size of fiemap_extent array */
struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
};
It is intended that the file system should not need to access any of this It is intended that the file system should not need to access any of this
structure directly. Filesystem handlers should be tolerant to signals and return structure directly. Filesystem handlers should be tolerant to signals and return
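
For orientation, here is a rough sketch, not part of this patch, of what a trivial ->fiemap handler could look like for a hypothetical filesystem that keeps each file in one contiguous on-disk run. The examplefs_* names and the 1 MiB-per-file layout are invented; fiemap_prep() and fiemap_fill_next_extent() are the existing in-kernel helpers that validate the request and copy extents back toward userspace:

#include <linux/fs.h>
#include <linux/fiemap.h>

/* Hypothetical helper: byte offset of the inode's single on-disk run. */
static u64 examplefs_data_start(struct inode *inode)
{
	return (u64)inode->i_ino * 1048576ULL;	/* made-up layout: 1MiB per file */
}

static int examplefs_fiemap(struct inode *inode,
			    struct fiemap_extent_info *fieinfo,
			    u64 start, u64 len)
{
	u64 isize = i_size_read(inode);
	int ret;

	/* Validate the flags and honour FIEMAP_FLAG_SYNC on our behalf. */
	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
	if (ret)
		return ret;

	if (start >= isize)
		return 0;
	if (start + len > isize)
		len = isize - start;

	/*
	 * Report a single extent covering the requested range; the helper
	 * returns 1 once the caller's fm_extents[] array is full and a
	 * negative error code on failure.
	 */
	ret = fiemap_fill_next_extent(fieinfo, start,
				      examplefs_data_start(inode) + start,
				      len, FIEMAP_EXTENT_LAST);
	return ret < 0 ? ret : 0;
}
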


@ -527,11 +527,6 @@ There are some functions to help manage credentials:
This gets a reference on a live set of credentials, returning a pointer to This gets a reference on a live set of credentials, returning a pointer to
that set of credentials. that set of credentials.
- ``struct cred *get_new_cred(struct cred *cred);``
This gets a reference on a set of credentials that is under construction
and is thus still mutable, returning a pointer to that set of credentials.
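
As a small aside to the get_cred() description above, here is a hedged illustration (not part of this change; the example_* names are invented) of taking a reference on the current task's credentials and balancing it with put_cred():

#include <linux/cred.h>

/* Pin the current task's live credential set; the caller must balance this
 * with put_cred() once the reference is no longer needed.
 */
static const struct cred *example_pin_current_creds(void)
{
	return get_cred(current_cred());
}

static void example_unpin_creds(const struct cred *cred)
{
	put_cred(cred);
}
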
Open File Credentials Open File Credentials
===================== =====================


@ -12387,6 +12387,13 @@ F: Documentation/kbuild/kconfig*
F: scripts/Kconfig.include F: scripts/Kconfig.include
F: scripts/kconfig/ F: scripts/kconfig/
KCORE
M: Omar Sandoval <osandov@osandov.com>
L: linux-debuggers@vger.kernel.org
S: Maintained
F: fs/proc/kcore.c
F: include/linux/kcore.h
KCOV KCOV
R: Dmitry Vyukov <dvyukov@google.com> R: Dmitry Vyukov <dvyukov@google.com>
R: Andrey Konovalov <andreyknvl@gmail.com> R: Andrey Konovalov <andreyknvl@gmail.com>


@ -249,7 +249,7 @@ static struct file *open_file_as_root(const char *filename, int flags, umode_t m
fp = file_open_root(&root, filename, flags, mode); fp = file_open_root(&root, filename, flags, mode);
path_put(&root); path_put(&root);
revert_creds(old_cred); put_cred(revert_creds(old_cred));
return fp; return fp;
} }


@ -79,11 +79,13 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (pos + total >= i_size_read(rreq->inode)) if (pos + total >= i_size_read(rreq->inode))
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
if (!err && total) {
if (!err) __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
subreq->transferred += total; subreq->transferred += total;
}
netfs_read_subreq_terminated(subreq, err, false); subreq->error = err;
netfs_read_subreq_terminated(subreq);
} }
/** /**


@ -11,6 +11,7 @@ kafs-y := \
cmservice.o \ cmservice.o \
dir.o \ dir.o \
dir_edit.o \ dir_edit.o \
dir_search.o \
dir_silly.o \ dir_silly.o \
dynroot.o \ dynroot.o \
file.o \ file.o \


@ -41,7 +41,7 @@ static void afs_volume_init_callback(struct afs_volume *volume)
list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) { list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) { if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
queue_work(system_unbound_wq, &vnode->cb_work); queue_work(system_unbound_wq, &vnode->cb_work);
} }
} }
@ -79,7 +79,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
_enter(""); _enter("");
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) { if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_cb_break)) {
vnode->cb_break++; vnode->cb_break++;
vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
afs_clear_permits(vnode); afs_clear_permits(vnode);

File diff suppressed because it is too large.


@ -10,6 +10,7 @@
#include <linux/namei.h> #include <linux/namei.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/iversion.h> #include <linux/iversion.h>
#include <linux/folio_queue.h>
#include "internal.h" #include "internal.h"
#include "xdr_fs.h" #include "xdr_fs.h"
@ -105,23 +106,57 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
} }
/* /*
* Get a new directory folio. * Get a specific block, extending the directory storage to cover it as needed.
*/ */
static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) static union afs_xdr_dir_block *afs_dir_get_block(struct afs_dir_iter *iter, size_t block)
{ {
struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio_queue *fq;
struct afs_vnode *dvnode = iter->dvnode;
struct folio *folio; struct folio *folio;
size_t blpos = block * AFS_DIR_BLOCK_SIZE;
size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos;
int ret;
folio = __filemap_get_folio(mapping, index, if (dvnode->directory_size < blend) {
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, size_t cur_size = dvnode->directory_size;
mapping->gfp_mask);
if (IS_ERR(folio)) { ret = netfs_alloc_folioq_buffer(
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); NULL, &dvnode->directory, &cur_size, blend,
return NULL; mapping_gfp_mask(dvnode->netfs.inode.i_mapping));
dvnode->directory_size = cur_size;
if (ret < 0)
goto fail;
} }
if (!folio_test_private(folio))
folio_attach_private(folio, (void *)1); fq = iter->fq;
return folio; if (!fq)
fq = dvnode->directory;
/* Search the folio queue for the folio containing the block... */
for (; fq; fq = fq->next) {
for (int s = iter->fq_slot; s < folioq_count(fq); s++) {
size_t fsize = folioq_folio_size(fq, s);
if (blend <= fpos + fsize) {
/* ... and then return the mapped block. */
folio = folioq_folio(fq, s);
if (WARN_ON_ONCE(folio_pos(folio) != fpos))
goto fail;
iter->fq = fq;
iter->fq_slot = s;
iter->fpos = fpos;
return kmap_local_folio(folio, blpos - fpos);
}
fpos += fsize;
}
iter->fq_slot = 0;
}
fail:
iter->fq = NULL;
iter->fq_slot = 0;
afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block);
return NULL;
} }
/* /*
@ -209,9 +244,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
{ {
union afs_xdr_dir_block *meta, *block; union afs_xdr_dir_block *meta, *block;
union afs_xdr_dirent *de; union afs_xdr_dirent *de;
struct folio *folio0, *folio; struct afs_dir_iter iter = { .dvnode = vnode };
unsigned int need_slots, nr_blocks, b; unsigned int nr_blocks, b, entry;
pgoff_t index;
loff_t i_size; loff_t i_size;
int slot; int slot;
@ -220,20 +254,17 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
i_size = i_size_read(&vnode->netfs.inode); i_size = i_size_read(&vnode->netfs.inode);
if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) { (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_bad_size);
return; return;
} }
folio0 = afs_dir_get_folio(vnode, 0); meta = afs_dir_get_block(&iter, 0);
if (!folio0) { if (!meta)
_leave(" [fgp]");
return; return;
}
/* Work out how many slots we're going to need. */ /* Work out how many slots we're going to need. */
need_slots = afs_dir_calc_slots(name->len); iter.nr_slots = afs_dir_calc_slots(name->len);
meta = kmap_local_folio(folio0, 0);
if (i_size == 0) if (i_size == 0)
goto new_directory; goto new_directory;
nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
@ -245,22 +276,21 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
/* If the directory extended into a new folio, then we need to /* If the directory extended into a new folio, then we need to
* tack a new folio on the end. * tack a new folio on the end.
*/ */
index = b / AFS_DIR_BLOCKS_PER_PAGE;
if (nr_blocks >= AFS_DIR_MAX_BLOCKS) if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
goto error; goto error_too_many_blocks;
if (index >= folio_nr_pages(folio0)) {
folio = afs_dir_get_folio(vnode, index);
if (!folio)
goto error;
} else {
folio = folio0;
}
block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio)); /* Lower dir blocks have a counter in the header we can check. */
if (b < AFS_DIR_BLOCKS_WITH_CTR &&
meta->meta.alloc_ctrs[b] < iter.nr_slots)
continue;
block = afs_dir_get_block(&iter, b);
if (!block)
goto error;
/* Abandon the edit if we got a callback break. */ /* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto invalidated; goto already_invalidated;
_debug("block %u: %2u %3u %u", _debug("block %u: %2u %3u %u",
b, b,
@ -275,31 +305,23 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE); afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE);
} }
/* Only lower dir blocks have a counter in the header. */ /* We need to try and find one or more consecutive slots to
if (b >= AFS_DIR_BLOCKS_WITH_CTR || * hold the entry.
meta->meta.alloc_ctrs[b] >= need_slots) { */
/* We need to try and find one or more consecutive slot = afs_find_contig_bits(block, iter.nr_slots);
* slots to hold the entry. if (slot >= 0) {
*/ _debug("slot %u", slot);
slot = afs_find_contig_bits(block, need_slots); goto found_space;
if (slot >= 0) {
_debug("slot %u", slot);
goto found_space;
}
} }
kunmap_local(block); kunmap_local(block);
if (folio != folio0) {
folio_unlock(folio);
folio_put(folio);
}
} }
/* There are no spare slots of sufficient size, yet the operation /* There are no spare slots of sufficient size, yet the operation
* succeeded. Download the directory again. * succeeded. Download the directory again.
*/ */
trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name); trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_no_slots);
goto out_unmap; goto out_unmap;
new_directory: new_directory:
@ -307,8 +329,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
i_size = AFS_DIR_BLOCK_SIZE; i_size = AFS_DIR_BLOCK_SIZE;
afs_set_i_size(vnode, i_size); afs_set_i_size(vnode, i_size);
slot = AFS_DIR_RESV_BLOCKS0; slot = AFS_DIR_RESV_BLOCKS0;
folio = folio0; block = afs_dir_get_block(&iter, 0);
block = kmap_local_folio(folio, 0);
nr_blocks = 1; nr_blocks = 1;
b = 0; b = 0;
@ -326,41 +347,39 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
de->u.name[name->len] = 0; de->u.name[name->len] = 0;
/* Adjust the bitmap. */ /* Adjust the bitmap. */
afs_set_contig_bits(block, slot, need_slots); afs_set_contig_bits(block, slot, iter.nr_slots);
kunmap_local(block);
if (folio != folio0) {
folio_unlock(folio);
folio_put(folio);
}
/* Adjust the allocation counter. */ /* Adjust the allocation counter. */
if (b < AFS_DIR_BLOCKS_WITH_CTR) if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] -= need_slots; meta->meta.alloc_ctrs[b] -= iter.nr_slots;
/* Adjust the hash chain. */
entry = b * AFS_DIR_SLOTS_PER_BLOCK + slot;
iter.bucket = afs_dir_hash_name(name);
de->u.hash_next = meta->meta.hashtable[iter.bucket];
meta->meta.hashtable[iter.bucket] = htons(entry);
kunmap_local(block);
inode_inc_iversion_raw(&vnode->netfs.inode); inode_inc_iversion_raw(&vnode->netfs.inode);
afs_stat_v(vnode, n_dir_cr); afs_stat_v(vnode, n_dir_cr);
_debug("Insert %s in %u[%u]", name->name, b, slot); _debug("Insert %s in %u[%u]", name->name, b, slot);
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
out_unmap: out_unmap:
kunmap_local(meta); kunmap_local(meta);
folio_unlock(folio0);
folio_put(folio0);
_leave(""); _leave("");
return; return;
invalidated: already_invalidated:
trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
kunmap_local(block); kunmap_local(block);
if (folio != folio0) {
folio_unlock(folio);
folio_put(folio);
}
goto out_unmap; goto out_unmap;
error_too_many_blocks:
afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_too_many_blocks);
error: error:
trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name); trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
goto out_unmap; goto out_unmap;
} }
@ -374,13 +393,14 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
void afs_edit_dir_remove(struct afs_vnode *vnode, void afs_edit_dir_remove(struct afs_vnode *vnode,
struct qstr *name, enum afs_edit_dir_reason why) struct qstr *name, enum afs_edit_dir_reason why)
{ {
union afs_xdr_dir_block *meta, *block; union afs_xdr_dir_block *meta, *block, *pblock;
union afs_xdr_dirent *de; union afs_xdr_dirent *de, *pde;
struct folio *folio0, *folio; struct afs_dir_iter iter = { .dvnode = vnode };
unsigned int need_slots, nr_blocks, b; struct afs_fid fid;
pgoff_t index; unsigned int b, slot, entry;
loff_t i_size; loff_t i_size;
int slot; __be16 next;
int found;
_enter(",,{%d,%s},", name->len, name->name); _enter(",,{%d,%s},", name->len, name->name);
@ -388,81 +408,95 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
if (i_size < AFS_DIR_BLOCK_SIZE || if (i_size < AFS_DIR_BLOCK_SIZE ||
i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
(i_size & (AFS_DIR_BLOCK_SIZE - 1))) { (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size);
return;
}
nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
folio0 = afs_dir_get_folio(vnode, 0);
if (!folio0) {
_leave(" [fgp]");
return; return;
} }
/* Work out how many slots we're going to discard. */ if (!afs_dir_init_iter(&iter, name))
need_slots = afs_dir_calc_slots(name->len); return;
meta = kmap_local_folio(folio0, 0); meta = afs_dir_find_block(&iter, 0);
if (!meta)
return;
/* Find a block that has sufficient slots available. Each folio /* Find the entry in the blob. */
* contains two or more directory blocks. found = afs_dir_search_bucket(&iter, name, &fid);
*/ if (found < 0) {
for (b = 0; b < nr_blocks; b++) { /* Didn't find the dirent to clobber. Re-download. */
index = b / AFS_DIR_BLOCKS_PER_PAGE; trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent,
if (index >= folio_nr_pages(folio0)) { 0, 0, 0, 0, name->name);
folio = afs_dir_get_folio(vnode, index); afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name);
if (!folio) goto out_unmap;
goto error;
} else {
folio = folio0;
}
block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
/* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto invalidated;
if (b > AFS_DIR_BLOCKS_WITH_CTR ||
meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) {
slot = afs_dir_scan_block(block, name, b);
if (slot >= 0)
goto found_dirent;
}
kunmap_local(block);
if (folio != folio0) {
folio_unlock(folio);
folio_put(folio);
}
} }
/* Didn't find the dirent to clobber. Download the directory again. */ entry = found;
trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent, b = entry / AFS_DIR_SLOTS_PER_BLOCK;
0, 0, 0, 0, name->name); slot = entry % AFS_DIR_SLOTS_PER_BLOCK;
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
goto out_unmap;
found_dirent: block = afs_dir_find_block(&iter, b);
if (!block)
goto error;
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto already_invalidated;
/* Check and clear the entry. */
de = &block->dirents[slot]; de = &block->dirents[slot];
if (de->u.valid != 1)
goto error_unmap;
trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot, trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot,
ntohl(de->u.vnode), ntohl(de->u.unique), ntohl(de->u.vnode), ntohl(de->u.unique),
name->name); name->name);
memset(de, 0, sizeof(*de) * need_slots);
/* Adjust the bitmap. */ /* Adjust the bitmap. */
afs_clear_contig_bits(block, slot, need_slots); afs_clear_contig_bits(block, slot, iter.nr_slots);
kunmap_local(block);
if (folio != folio0) {
folio_unlock(folio);
folio_put(folio);
}
/* Adjust the allocation counter. */ /* Adjust the allocation counter. */
if (b < AFS_DIR_BLOCKS_WITH_CTR) if (b < AFS_DIR_BLOCKS_WITH_CTR)
meta->meta.alloc_ctrs[b] += need_slots; meta->meta.alloc_ctrs[b] += iter.nr_slots;
/* Clear the constituent entries. */
next = de->u.hash_next;
memset(de, 0, sizeof(*de) * iter.nr_slots);
kunmap_local(block);
/* Adjust the hash chain: if iter->prev_entry is 0, the hashtable head
* index is previous; otherwise it's slot number of the previous entry.
*/
if (!iter.prev_entry) {
__be16 prev_next = meta->meta.hashtable[iter.bucket];
if (unlikely(prev_next != htons(entry))) {
pr_warn("%llx:%llx:%x: not head of chain b=%x p=%x,%x e=%x %*s",
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
iter.bucket, iter.prev_entry, prev_next, entry,
name->len, name->name);
goto error;
}
meta->meta.hashtable[iter.bucket] = next;
} else {
unsigned int pb = iter.prev_entry / AFS_DIR_SLOTS_PER_BLOCK;
unsigned int ps = iter.prev_entry % AFS_DIR_SLOTS_PER_BLOCK;
__be16 prev_next;
pblock = afs_dir_find_block(&iter, pb);
if (!pblock)
goto error;
pde = &pblock->dirents[ps];
prev_next = pde->u.hash_next;
if (prev_next != htons(entry)) {
kunmap_local(pblock);
pr_warn("%llx:%llx:%x: not prev in chain b=%x p=%x,%x e=%x %*s",
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
iter.bucket, iter.prev_entry, prev_next, entry,
name->len, name->name);
goto error;
}
pde->u.hash_next = next;
kunmap_local(pblock);
}
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
afs_stat_v(vnode, n_dir_rm); afs_stat_v(vnode, n_dir_rm);
@ -470,26 +504,20 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
out_unmap: out_unmap:
kunmap_local(meta); kunmap_local(meta);
folio_unlock(folio0);
folio_put(folio0);
_leave(""); _leave("");
return; return;
invalidated: already_invalidated:
kunmap_local(block);
trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval,
0, 0, 0, 0, name->name); 0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
kunmap_local(block);
if (folio != folio0) {
folio_unlock(folio);
folio_put(folio);
}
goto out_unmap; goto out_unmap;
error_unmap:
kunmap_local(block);
error: error:
trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error, trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error,
0, 0, 0, 0, name->name); 0, 0, 0, 0, name->name);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
goto out_unmap; goto out_unmap;
} }
@ -502,9 +530,8 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
{ {
union afs_xdr_dir_block *block; union afs_xdr_dir_block *block;
union afs_xdr_dirent *de; union afs_xdr_dirent *de;
struct folio *folio; struct afs_dir_iter iter = { .dvnode = vnode };
unsigned int nr_blocks, b; unsigned int nr_blocks, b;
pgoff_t index;
loff_t i_size; loff_t i_size;
int slot; int slot;
@ -512,39 +539,35 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
i_size = i_size_read(&vnode->netfs.inode); i_size = i_size_read(&vnode->netfs.inode);
if (i_size < AFS_DIR_BLOCK_SIZE) { if (i_size < AFS_DIR_BLOCK_SIZE) {
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size);
return; return;
} }
nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
/* Find a block that has sufficient slots available. Each folio /* Find a block that has sufficient slots available. Each folio
* contains two or more directory blocks. * contains two or more directory blocks.
*/ */
for (b = 0; b < nr_blocks; b++) { for (b = 0; b < nr_blocks; b++) {
index = b / AFS_DIR_BLOCKS_PER_PAGE; block = afs_dir_get_block(&iter, b);
folio = afs_dir_get_folio(vnode, index); if (!block)
if (!folio)
goto error; goto error;
block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
/* Abandon the edit if we got a callback break. */ /* Abandon the edit if we got a callback break. */
if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
goto invalidated; goto already_invalidated;
slot = afs_dir_scan_block(block, &dotdot_name, b); slot = afs_dir_scan_block(block, &dotdot_name, b);
if (slot >= 0) if (slot >= 0)
goto found_dirent; goto found_dirent;
kunmap_local(block); kunmap_local(block);
folio_unlock(folio);
folio_put(folio);
} }
/* Didn't find the dirent to clobber. Download the directory again. */ /* Didn't find the dirent to clobber. Download the directory again. */
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd, trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
0, 0, 0, 0, ".."); 0, 0, 0, 0, "..");
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd);
goto out; goto out;
found_dirent: found_dirent:
@ -556,26 +579,70 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
ntohl(de->u.vnode), ntohl(de->u.unique), ".."); ntohl(de->u.vnode), ntohl(de->u.unique), "..");
kunmap_local(block); kunmap_local(block);
folio_unlock(folio); netfs_single_mark_inode_dirty(&vnode->netfs.inode);
folio_put(folio);
inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
out: out:
_leave(""); _leave("");
return; return;
invalidated: already_invalidated:
kunmap_local(block); kunmap_local(block);
folio_unlock(folio);
folio_put(folio);
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval, trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
0, 0, 0, 0, ".."); 0, 0, 0, 0, "..");
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
goto out; goto out;
error: error:
trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error, trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
0, 0, 0, 0, ".."); 0, 0, 0, 0, "..");
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
goto out; goto out;
} }
/*
* Initialise a new directory. We need to fill in the "." and ".." entries.
*/
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_dvnode)
{
union afs_xdr_dir_block *meta;
struct afs_dir_iter iter = { .dvnode = dvnode };
union afs_xdr_dirent *de;
unsigned int slot = AFS_DIR_RESV_BLOCKS0;
loff_t i_size;
i_size = i_size_read(&dvnode->netfs.inode);
if (i_size != AFS_DIR_BLOCK_SIZE) {
afs_invalidate_dir(dvnode, afs_dir_invalid_edit_add_bad_size);
return;
}
meta = afs_dir_get_block(&iter, 0);
if (!meta)
return;
afs_edit_init_block(meta, meta, 0);
de = &meta->dirents[slot];
de->u.valid = 1;
de->u.vnode = htonl(dvnode->fid.vnode);
de->u.unique = htonl(dvnode->fid.unique);
memcpy(de->u.name, ".", 2);
trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot,
dvnode->fid.vnode, dvnode->fid.unique, ".");
slot++;
de = &meta->dirents[slot];
de->u.valid = 1;
de->u.vnode = htonl(parent_dvnode->fid.vnode);
de->u.unique = htonl(parent_dvnode->fid.unique);
memcpy(de->u.name, "..", 3);
trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot,
parent_dvnode->fid.vnode, parent_dvnode->fid.unique, "..");
afs_set_contig_bits(meta, AFS_DIR_RESV_BLOCKS0, 2);
meta->meta.alloc_ctrs[0] -= 2;
kunmap_local(meta);
netfs_single_mark_inode_dirty(&dvnode->netfs.inode);
set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
set_bit(AFS_VNODE_DIR_READ, &dvnode->flags);
}

fs/afs/dir_search.c (new file, 227 lines)

@ -0,0 +1,227 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Search a directory's hash table.
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* https://tools.ietf.org/html/draft-keiser-afs3-directory-object-00
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/iversion.h>
#include "internal.h"
#include "afs_fs.h"
#include "xdr_fs.h"
/*
* Calculate the name hash.
*/
unsigned int afs_dir_hash_name(const struct qstr *name)
{
const unsigned char *p = name->name;
unsigned int hash = 0, i;
int bucket;
for (i = 0; i < name->len; i++)
hash = (hash * 173) + p[i];
bucket = hash & (AFS_DIR_HASHTBL_SIZE - 1);
if (hash > INT_MAX) {
bucket = AFS_DIR_HASHTBL_SIZE - bucket;
bucket &= (AFS_DIR_HASHTBL_SIZE - 1);
}
return bucket;
}
/*
* Reset a directory iterator.
*/
static bool afs_dir_reset_iter(struct afs_dir_iter *iter)
{
unsigned long long i_size = i_size_read(&iter->dvnode->netfs.inode);
unsigned int nblocks;
/* Work out the maximum number of steps we can take. */
nblocks = umin(i_size / AFS_DIR_BLOCK_SIZE, AFS_DIR_MAX_BLOCKS);
if (!nblocks)
return false;
iter->loop_check = nblocks * (AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS);
iter->prev_entry = 0; /* Hash head is previous */
return true;
}
/*
* Initialise a directory iterator for looking up a name.
*/
bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name)
{
iter->nr_slots = afs_dir_calc_slots(name->len);
iter->bucket = afs_dir_hash_name(name);
return afs_dir_reset_iter(iter);
}
/*
* Get a specific block.
*/
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block)
{
struct folio_queue *fq = iter->fq;
struct afs_vnode *dvnode = iter->dvnode;
struct folio *folio;
size_t blpos = block * AFS_DIR_BLOCK_SIZE;
size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos;
int slot = iter->fq_slot;
_enter("%zx,%d", block, slot);
if (iter->block) {
kunmap_local(iter->block);
iter->block = NULL;
}
if (dvnode->directory_size < blend)
goto fail;
if (!fq || blpos < fpos) {
fq = dvnode->directory;
slot = 0;
fpos = 0;
}
/* Search the folio queue for the folio containing the block... */
for (; fq; fq = fq->next) {
for (; slot < folioq_count(fq); slot++) {
size_t fsize = folioq_folio_size(fq, slot);
if (blend <= fpos + fsize) {
/* ... and then return the mapped block. */
folio = folioq_folio(fq, slot);
if (WARN_ON_ONCE(folio_pos(folio) != fpos))
goto fail;
iter->fq = fq;
iter->fq_slot = slot;
iter->fpos = fpos;
iter->block = kmap_local_folio(folio, blpos - fpos);
return iter->block;
}
fpos += fsize;
}
slot = 0;
}
fail:
iter->fq = NULL;
iter->fq_slot = 0;
afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block);
return NULL;
}
/*
* Search through a directory bucket.
*/
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
struct afs_fid *_fid)
{
const union afs_xdr_dir_block *meta;
unsigned int entry;
int ret = -ESTALE;
meta = afs_dir_find_block(iter, 0);
if (!meta)
return -ESTALE;
entry = ntohs(meta->meta.hashtable[iter->bucket & (AFS_DIR_HASHTBL_SIZE - 1)]);
_enter("%x,%x", iter->bucket, entry);
while (entry) {
const union afs_xdr_dir_block *block;
const union afs_xdr_dirent *dire;
unsigned int blnum = entry / AFS_DIR_SLOTS_PER_BLOCK;
unsigned int slot = entry % AFS_DIR_SLOTS_PER_BLOCK;
unsigned int resv = (blnum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);
_debug("search %x", entry);
if (slot < resv) {
kdebug("slot out of range h=%x rs=%2x sl=%2x-%2x",
iter->bucket, resv, slot, slot + iter->nr_slots - 1);
goto bad;
}
block = afs_dir_find_block(iter, blnum);
if (!block)
goto bad;
dire = &block->dirents[slot];
if (slot + iter->nr_slots <= AFS_DIR_SLOTS_PER_BLOCK &&
memcmp(dire->u.name, name->name, name->len) == 0 &&
dire->u.name[name->len] == '\0') {
_fid->vnode = ntohl(dire->u.vnode);
_fid->unique = ntohl(dire->u.unique);
ret = entry;
goto found;
}
iter->prev_entry = entry;
entry = ntohs(dire->u.hash_next);
if (!--iter->loop_check) {
kdebug("dir chain loop h=%x", iter->bucket);
goto bad;
}
}
ret = -ENOENT;
found:
if (iter->block) {
kunmap_local(iter->block);
iter->block = NULL;
}
bad:
if (ret == -ESTALE)
afs_invalidate_dir(iter->dvnode, afs_dir_invalid_iter_stale);
_leave(" = %d", ret);
return ret;
}
/*
* Search the appropriate hash chain in the contents of an AFS directory.
*/
int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version)
{
struct afs_dir_iter iter = { .dvnode = dvnode, };
int ret, retry_limit = 3;
_enter("{%lu},,,", dvnode->netfs.inode.i_ino);
if (!afs_dir_init_iter(&iter, name))
return -ENOENT;
do {
if (--retry_limit < 0) {
pr_warn("afs_read_dir(): Too many retries\n");
ret = -ESTALE;
break;
}
ret = afs_read_dir(dvnode, NULL);
if (ret < 0) {
if (ret != -ESTALE)
break;
if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) {
ret = -ESTALE;
break;
}
continue;
}
*_dir_version = inode_peek_iversion_raw(&dvnode->netfs.inode);
ret = afs_dir_search_bucket(&iter, name, _fid);
up_read(&dvnode->validate_lock);
if (ret == -ESTALE)
afs_dir_reset_iter(&iter);
} while (ret == -ESTALE);
_leave(" = %d", ret);
return ret;
}


@ -20,7 +20,6 @@
#include "internal.h" #include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_symlink_read_folio(struct file *file, struct folio *folio);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos, static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@ -61,13 +60,6 @@ const struct address_space_operations afs_file_aops = {
.writepages = afs_writepages, .writepages = afs_writepages,
}; };
const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
.release_folio = netfs_release_folio,
.invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
};
static const struct vm_operations_struct afs_vm_ops = { static const struct vm_operations_struct afs_vm_ops = {
.open = afs_vm_open, .open = afs_vm_open,
.close = afs_vm_close, .close = afs_vm_close,
@ -208,49 +200,12 @@ int afs_release(struct inode *inode, struct file *file)
return ret; return ret;
} }
/*
* Allocate a new read record.
*/
struct afs_read *afs_alloc_read(gfp_t gfp)
{
struct afs_read *req;
req = kzalloc(sizeof(struct afs_read), gfp);
if (req)
refcount_set(&req->usage, 1);
return req;
}
/*
* Dispose of a ref to a read record.
*/
void afs_put_read(struct afs_read *req)
{
if (refcount_dec_and_test(&req->usage)) {
if (req->cleanup)
req->cleanup(req);
key_put(req->key);
kfree(req);
}
}
static void afs_fetch_data_notify(struct afs_operation *op) static void afs_fetch_data_notify(struct afs_operation *op)
{ {
struct afs_read *req = op->fetch.req; struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct netfs_io_subrequest *subreq = req->subreq;
int error = afs_op_error(op);
req->error = error; subreq->error = afs_op_error(op);
if (subreq) { netfs_read_subreq_terminated(subreq);
subreq->rreq->i_size = req->file_size;
if (req->pos + req->actual_len >= req->file_size)
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
netfs_read_subreq_terminated(subreq, error, false);
req->subreq = NULL;
} else if (req->done) {
req->done(req);
}
} }
static void afs_fetch_data_success(struct afs_operation *op) static void afs_fetch_data_success(struct afs_operation *op)
@ -260,7 +215,7 @@ static void afs_fetch_data_success(struct afs_operation *op)
_enter("op=%08x", op->debug_id); _enter("op=%08x", op->debug_id);
afs_vnode_commit_status(op, &op->file[0]); afs_vnode_commit_status(op, &op->file[0]);
afs_stat_v(vnode, n_fetches); afs_stat_v(vnode, n_fetches);
atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); atomic_long_add(op->fetch.subreq->transferred, &op->net->n_fetch_bytes);
afs_fetch_data_notify(op); afs_fetch_data_notify(op);
} }
@ -270,107 +225,188 @@ static void afs_fetch_data_aborted(struct afs_operation *op)
afs_fetch_data_notify(op); afs_fetch_data_notify(op);
} }
static void afs_fetch_data_put(struct afs_operation *op) const struct afs_operation_ops afs_fetch_data_operation = {
{
op->fetch.req->error = afs_op_error(op);
afs_put_read(op->fetch.req);
}
static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_afs_rpc = afs_fs_fetch_data, .issue_afs_rpc = afs_fs_fetch_data,
.issue_yfs_rpc = yfs_fs_fetch_data, .issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success, .success = afs_fetch_data_success,
.aborted = afs_fetch_data_aborted, .aborted = afs_fetch_data_aborted,
.failed = afs_fetch_data_notify, .failed = afs_fetch_data_notify,
.put = afs_fetch_data_put,
}; };
static void afs_issue_read_call(struct afs_operation *op)
{
op->call_responded = false;
op->call_error = 0;
op->call_abort_code = 0;
if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags))
yfs_fs_fetch_data(op);
else
afs_fs_fetch_data(op);
}
static void afs_end_read(struct afs_operation *op)
{
if (op->call_responded && op->server)
set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
if (!afs_op_error(op))
afs_fetch_data_success(op);
else if (op->cumul_error.aborted)
afs_fetch_data_aborted(op);
else
afs_fetch_data_notify(op);
afs_end_vnode_operation(op);
afs_put_operation(op);
}
/*
* Perform I/O processing on an asynchronous call. The work item carries a ref
* to the call struct that we either need to release or to pass on.
*/
static void afs_read_receive(struct afs_call *call)
{
struct afs_operation *op = call->op;
enum afs_call_state state;
_enter("");
state = READ_ONCE(call->state);
if (state == AFS_CALL_COMPLETE)
return;
trace_afs_read_recv(op, call);
while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) {
WRITE_ONCE(call->need_attention, false);
afs_deliver_to_call(call);
state = READ_ONCE(call->state);
}
if (state < AFS_CALL_COMPLETE) {
netfs_read_subreq_progress(op->fetch.subreq);
if (rxrpc_kernel_check_life(call->net->socket, call->rxcall))
return;
/* rxrpc terminated the call. */
afs_set_call_complete(call, call->error, call->abort_code);
}
op->call_abort_code = call->abort_code;
op->call_error = call->error;
op->call_responded = call->responded;
op->call = NULL;
call->op = NULL;
afs_put_call(call);
/* If the call failed, then we need to crank the server rotation
* handle and try the next.
*/
if (afs_select_fileserver(op)) {
afs_issue_read_call(op);
return;
}
afs_end_read(op);
}
void afs_fetch_data_async_rx(struct work_struct *work)
{
struct afs_call *call = container_of(work, struct afs_call, async_work);
afs_read_receive(call);
afs_put_call(call);
}
void afs_fetch_data_immediate_cancel(struct afs_call *call)
{
if (call->async) {
afs_get_call(call, afs_call_trace_wake);
if (!queue_work(afs_async_calls, &call->async_work))
afs_deferred_put_call(call);
flush_work(&call->async_work);
}
}
/* /*
* Fetch file data from the volume. * Fetch file data from the volume.
*/ */
int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) static void afs_issue_read(struct netfs_io_subrequest *subreq)
{ {
struct afs_operation *op; struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct key *key = subreq->rreq->netfs_priv;
_enter("%s{%llx:%llu.%u},%x,,,", _enter("%s{%llx:%llu.%u},%x,,,",
vnode->volume->name, vnode->volume->name,
vnode->fid.vid, vnode->fid.vid,
vnode->fid.vnode, vnode->fid.vnode,
vnode->fid.unique, vnode->fid.unique,
key_serial(req->key)); key_serial(key));
op = afs_alloc_operation(req->key, vnode->volume); op = afs_alloc_operation(key, vnode->volume);
if (IS_ERR(op)) { if (IS_ERR(op)) {
if (req->subreq) subreq->error = PTR_ERR(op);
netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false); netfs_read_subreq_terminated(subreq);
return PTR_ERR(op); return;
} }
afs_op_set_vnode(op, 0, vnode); afs_op_set_vnode(op, 0, vnode);
op->fetch.req = afs_get_read(req); op->fetch.subreq = subreq;
op->ops = &afs_fetch_data_operation; op->ops = &afs_fetch_data_operation;
return afs_do_sync_operation(op);
}
static void afs_read_worker(struct work_struct *work)
{
struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct afs_read *fsreq;
fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
return netfs_read_subreq_terminated(subreq, -ENOMEM, false);
fsreq->subreq = subreq;
fsreq->pos = subreq->start + subreq->transferred;
fsreq->len = subreq->len - subreq->transferred;
fsreq->key = key_get(subreq->rreq->netfs_priv);
fsreq->vnode = vnode;
fsreq->iter = &subreq->io_iter;
trace_netfs_sreq(subreq, netfs_sreq_trace_submit); trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
}
static void afs_issue_read(struct netfs_io_subrequest *subreq) if (subreq->rreq->origin == NETFS_READAHEAD ||
{ subreq->rreq->iocb) {
INIT_WORK(&subreq->work, afs_read_worker); op->flags |= AFS_OPERATION_ASYNC;
queue_work(system_long_wq, &subreq->work);
}
static int afs_symlink_read_folio(struct file *file, struct folio *folio) if (!afs_begin_vnode_operation(op)) {
{ subreq->error = afs_put_operation(op);
struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); netfs_read_subreq_terminated(subreq);
struct afs_read *fsreq; return;
int ret; }
fsreq = afs_alloc_read(GFP_NOFS); if (!afs_select_fileserver(op)) {
if (!fsreq) afs_end_read(op);
return -ENOMEM; return;
}
fsreq->pos = folio_pos(folio); afs_issue_read_call(op);
fsreq->len = folio_size(folio); } else {
fsreq->vnode = vnode; afs_do_sync_operation(op);
fsreq->iter = &fsreq->def_iter; }
iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);
ret = afs_fetch_data(fsreq->vnode, fsreq);
if (ret == 0)
folio_mark_uptodate(folio);
folio_unlock(folio);
return ret;
} }
static int afs_init_request(struct netfs_io_request *rreq, struct file *file) static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{ {
struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
if (file) if (file)
rreq->netfs_priv = key_get(afs_file_key(file)); rreq->netfs_priv = key_get(afs_file_key(file));
rreq->rsize = 256 * 1024; rreq->rsize = 256 * 1024;
rreq->wsize = 256 * 1024 * 1024; rreq->wsize = 256 * 1024 * 1024;
switch (rreq->origin) {
case NETFS_READ_SINGLE:
if (!file) {
struct key *key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key))
return PTR_ERR(key);
rreq->netfs_priv = key;
}
break;
case NETFS_WRITEBACK:
case NETFS_WRITETHROUGH:
case NETFS_UNBUFFERED_WRITE:
case NETFS_DIO_WRITE:
if (S_ISREG(rreq->inode->i_mode))
rreq->io_streams[0].avail = true;
break;
case NETFS_WRITEBACK_SINGLE:
default:
break;
}
return 0; return 0;
} }


@ -49,6 +49,105 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
return op; return op;
} }
struct afs_io_locker {
struct list_head link;
struct task_struct *task;
unsigned long have_lock;
};
/*
* Unlock the I/O lock on a vnode.
*/
static void afs_unlock_for_io(struct afs_vnode *vnode)
{
struct afs_io_locker *locker;
spin_lock(&vnode->lock);
locker = list_first_entry_or_null(&vnode->io_lock_waiters,
struct afs_io_locker, link);
if (locker) {
list_del(&locker->link);
smp_store_release(&locker->have_lock, 1);
smp_mb__after_atomic(); /* Store have_lock before task state */
wake_up_process(locker->task);
} else {
clear_bit(AFS_VNODE_IO_LOCK, &vnode->flags);
}
spin_unlock(&vnode->lock);
}
/*
* Lock the I/O lock on a vnode uninterruptibly. We can't use an ordinary
* mutex as lockdep will complain if we unlock it in the wrong thread.
*/
static void afs_lock_for_io(struct afs_vnode *vnode)
{
struct afs_io_locker myself = { .task = current, };
spin_lock(&vnode->lock);
if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) {
spin_unlock(&vnode->lock);
return;
}
list_add_tail(&myself.link, &vnode->io_lock_waiters);
spin_unlock(&vnode->lock);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (smp_load_acquire(&myself.have_lock))
break;
schedule();
}
__set_current_state(TASK_RUNNING);
}
/*
* Lock the I/O lock on a vnode interruptibly. We can't use an ordinary mutex
* as lockdep will complain if we unlock it in the wrong thread.
*/
static int afs_lock_for_io_interruptible(struct afs_vnode *vnode)
{
struct afs_io_locker myself = { .task = current, };
int ret = 0;
spin_lock(&vnode->lock);
if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) {
spin_unlock(&vnode->lock);
return 0;
}
list_add_tail(&myself.link, &vnode->io_lock_waiters);
spin_unlock(&vnode->lock);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (smp_load_acquire(&myself.have_lock) ||
signal_pending(current))
break;
schedule();
}
__set_current_state(TASK_RUNNING);
/* If we got a signal, try to transfer the lock onto the next
* waiter.
*/
if (unlikely(signal_pending(current))) {
spin_lock(&vnode->lock);
if (myself.have_lock) {
spin_unlock(&vnode->lock);
afs_unlock_for_io(vnode);
} else {
list_del(&myself.link);
spin_unlock(&vnode->lock);
}
ret = -ERESTARTSYS;
}
return ret;
}
/* /*
* Lock the vnode(s) being operated upon. * Lock the vnode(s) being operated upon.
*/ */
@ -60,7 +159,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
_enter(""); _enter("");
if (op->flags & AFS_OPERATION_UNINTR) { if (op->flags & AFS_OPERATION_UNINTR) {
mutex_lock(&vnode->io_lock); afs_lock_for_io(vnode);
op->flags |= AFS_OPERATION_LOCK_0; op->flags |= AFS_OPERATION_LOCK_0;
_leave(" = t [1]"); _leave(" = t [1]");
return true; return true;
@ -72,7 +171,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
if (vnode2 > vnode) if (vnode2 > vnode)
swap(vnode, vnode2); swap(vnode, vnode2);
if (mutex_lock_interruptible(&vnode->io_lock) < 0) { if (afs_lock_for_io_interruptible(vnode) < 0) {
afs_op_set_error(op, -ERESTARTSYS); afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP; op->flags |= AFS_OPERATION_STOP;
_leave(" = f [I 0]"); _leave(" = f [I 0]");
@ -81,10 +180,10 @@ static bool afs_get_io_locks(struct afs_operation *op)
op->flags |= AFS_OPERATION_LOCK_0; op->flags |= AFS_OPERATION_LOCK_0;
if (vnode2) { if (vnode2) {
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { if (afs_lock_for_io_interruptible(vnode2) < 0) {
afs_op_set_error(op, -ERESTARTSYS); afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP; op->flags |= AFS_OPERATION_STOP;
mutex_unlock(&vnode->io_lock); afs_unlock_for_io(vnode);
op->flags &= ~AFS_OPERATION_LOCK_0; op->flags &= ~AFS_OPERATION_LOCK_0;
_leave(" = f [I 1]"); _leave(" = f [I 1]");
return false; return false;
@ -104,9 +203,9 @@ static void afs_drop_io_locks(struct afs_operation *op)
_enter(""); _enter("");
if (op->flags & AFS_OPERATION_LOCK_1) if (op->flags & AFS_OPERATION_LOCK_1)
mutex_unlock(&vnode2->io_lock); afs_unlock_for_io(vnode2);
if (op->flags & AFS_OPERATION_LOCK_0) if (op->flags & AFS_OPERATION_LOCK_0)
mutex_unlock(&vnode->io_lock); afs_unlock_for_io(vnode);
} }
static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp, static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp,
@ -157,7 +256,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op)
/* /*
* Tidy up a filesystem cursor and unlock the vnode. * Tidy up a filesystem cursor and unlock the vnode.
*/ */
static void afs_end_vnode_operation(struct afs_operation *op) void afs_end_vnode_operation(struct afs_operation *op)
{ {
_enter(""); _enter("");


@ -301,19 +301,19 @@ void afs_fs_fetch_status(struct afs_operation *op)
static int afs_deliver_fs_fetch_data(struct afs_call *call) static int afs_deliver_fs_fetch_data(struct afs_call *call)
{ {
struct afs_operation *op = call->op; struct afs_operation *op = call->op;
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp; const __be32 *bp;
size_t count_before; size_t count_before;
int ret; int ret;
_enter("{%u,%zu,%zu/%llu}", _enter("{%u,%zu,%zu/%llu}",
call->unmarshall, call->iov_len, iov_iter_count(call->iter), call->unmarshall, call->iov_len, iov_iter_count(call->iter),
req->actual_len); call->remaining);
switch (call->unmarshall) { switch (call->unmarshall) {
case 0: case 0:
req->actual_len = 0; call->remaining = 0;
call->unmarshall++; call->unmarshall++;
if (call->operation_ID == FSFETCHDATA64) { if (call->operation_ID == FSFETCHDATA64) {
afs_extract_to_tmp64(call); afs_extract_to_tmp64(call);
@ -323,8 +323,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
} }
fallthrough; fallthrough;
/* Extract the returned data length into /* Extract the returned data length into ->remaining.
* ->actual_len. This may indicate more or less data than was * This may indicate more or less data than was
* requested will be returned. * requested will be returned.
*/ */
case 1: case 1:
@ -333,42 +333,40 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
if (ret < 0) if (ret < 0)
return ret; return ret;
req->actual_len = be64_to_cpu(call->tmp64); call->remaining = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len); _debug("DATA length: %llu", call->remaining);
if (req->actual_len == 0) if (call->remaining == 0)
goto no_more_data; goto no_more_data;
call->iter = req->iter; call->iter = &subreq->io_iter;
call->iov_len = min(req->actual_len, req->len); call->iov_len = umin(call->remaining, subreq->len - subreq->transferred);
call->unmarshall++; call->unmarshall++;
fallthrough; fallthrough;
/* extract the returned data */ /* extract the returned data */
case 2: case 2:
count_before = call->iov_len; count_before = call->iov_len;
_debug("extract data %zu/%llu", count_before, req->actual_len); _debug("extract data %zu/%llu", count_before, call->remaining);
ret = afs_extract_data(call, true); ret = afs_extract_data(call, true);
if (req->subreq) { subreq->transferred += count_before - call->iov_len;
req->subreq->transferred += count_before - call->iov_len; call->remaining -= count_before - call->iov_len;
netfs_read_subreq_progress(req->subreq, false);
}
if (ret < 0) if (ret < 0)
return ret; return ret;
call->iter = &call->def_iter; call->iter = &call->def_iter;
if (req->actual_len <= req->len) if (call->remaining)
goto no_more_data; goto no_more_data;
/* Discard any excess data the server gave us */ /* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len); afs_extract_discard(call, call->remaining);
call->unmarshall = 3; call->unmarshall = 3;
fallthrough; fallthrough;
case 3: case 3:
_debug("extract discard %zu/%llu", _debug("extract discard %zu/%llu",
iov_iter_count(call->iter), req->actual_len - req->len); iov_iter_count(call->iter), call->remaining);
ret = afs_extract_data(call, true); ret = afs_extract_data(call, true);
if (ret < 0) if (ret < 0)
@ -390,8 +388,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
xdr_decode_AFSCallBack(&bp, call, &vp->scb); xdr_decode_AFSCallBack(&bp, call, &vp->scb);
xdr_decode_AFSVolSync(&bp, &op->volsync); xdr_decode_AFSVolSync(&bp, &op->volsync);
req->data_version = vp->scb.status.data_version; if (subreq->start + subreq->transferred >= vp->scb.status.size)
req->file_size = vp->scb.status.size; __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
call->unmarshall++; call->unmarshall++;
fallthrough; fallthrough;
@ -410,14 +408,18 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
static const struct afs_call_type afs_RXFSFetchData = { static const struct afs_call_type afs_RXFSFetchData = {
.name = "FS.FetchData", .name = "FS.FetchData",
.op = afs_FS_FetchData, .op = afs_FS_FetchData,
.async_rx = afs_fetch_data_async_rx,
.deliver = afs_deliver_fs_fetch_data, .deliver = afs_deliver_fs_fetch_data,
.immediate_cancel = afs_fetch_data_immediate_cancel,
.destructor = afs_flat_call_destructor, .destructor = afs_flat_call_destructor,
}; };
static const struct afs_call_type afs_RXFSFetchData64 = { static const struct afs_call_type afs_RXFSFetchData64 = {
.name = "FS.FetchData64", .name = "FS.FetchData64",
.op = afs_FS_FetchData64, .op = afs_FS_FetchData64,
.async_rx = afs_fetch_data_async_rx,
.deliver = afs_deliver_fs_fetch_data, .deliver = afs_deliver_fs_fetch_data,
.immediate_cancel = afs_fetch_data_immediate_cancel,
.destructor = afs_flat_call_destructor, .destructor = afs_flat_call_destructor,
}; };
@ -426,8 +428,8 @@ static const struct afs_call_type afs_RXFSFetchData64 = {
*/ */
static void afs_fs_fetch_data64(struct afs_operation *op) static void afs_fs_fetch_data64(struct afs_operation *op)
{ {
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
struct afs_call *call; struct afs_call *call;
__be32 *bp; __be32 *bp;
@ -437,16 +439,19 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
if (!call) if (!call)
return afs_op_nomem(op); return afs_op_nomem(op);
if (op->flags & AFS_OPERATION_ASYNC)
call->async = true;
/* marshall the parameters */ /* marshall the parameters */
bp = call->request; bp = call->request;
bp[0] = htonl(FSFETCHDATA64); bp[0] = htonl(FSFETCHDATA64);
bp[1] = htonl(vp->fid.vid); bp[1] = htonl(vp->fid.vid);
bp[2] = htonl(vp->fid.vnode); bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique); bp[3] = htonl(vp->fid.unique);
bp[4] = htonl(upper_32_bits(req->pos)); bp[4] = htonl(upper_32_bits(subreq->start + subreq->transferred));
bp[5] = htonl(lower_32_bits(req->pos)); bp[5] = htonl(lower_32_bits(subreq->start + subreq->transferred));
bp[6] = 0; bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len)); bp[7] = htonl(lower_32_bits(subreq->len - subreq->transferred));
call->fid = vp->fid; call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid); trace_afs_make_fs_call(call, &vp->fid);
@ -458,9 +463,9 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
*/ */
void afs_fs_fetch_data(struct afs_operation *op) void afs_fs_fetch_data(struct afs_operation *op)
{ {
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call; struct afs_call *call;
struct afs_read *req = op->fetch.req;
__be32 *bp; __be32 *bp;
if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
@ -472,16 +477,14 @@ void afs_fs_fetch_data(struct afs_operation *op)
if (!call) if (!call)
return afs_op_nomem(op); return afs_op_nomem(op);
req->call_debug_id = call->debug_id;
/* marshall the parameters */ /* marshall the parameters */
bp = call->request; bp = call->request;
bp[0] = htonl(FSFETCHDATA); bp[0] = htonl(FSFETCHDATA);
bp[1] = htonl(vp->fid.vid); bp[1] = htonl(vp->fid.vid);
bp[2] = htonl(vp->fid.vnode); bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique); bp[3] = htonl(vp->fid.unique);
bp[4] = htonl(lower_32_bits(req->pos)); bp[4] = htonl(lower_32_bits(subreq->start + subreq->transferred));
bp[5] = htonl(lower_32_bits(req->len)); bp[5] = htonl(lower_32_bits(subreq->len - subreq->transferred));
call->fid = vp->fid; call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid); trace_afs_make_fs_call(call, &vp->fid);
@ -1733,6 +1736,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
.op = afs_FS_GetCapabilities, .op = afs_FS_GetCapabilities,
.deliver = afs_deliver_fs_get_capabilities, .deliver = afs_deliver_fs_get_capabilities,
.done = afs_fileserver_probe_result, .done = afs_fileserver_probe_result,
.immediate_cancel = afs_fileserver_probe_result,
.destructor = afs_fs_get_capabilities_destructor, .destructor = afs_fs_get_capabilities_destructor,
}; };


@ -25,8 +25,94 @@
#include "internal.h" #include "internal.h"
#include "afs_fs.h" #include "afs_fs.h"
void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op)
{
size_t size = strlen(op->create.symlink) + 1;
size_t dsize = 0;
char *p;
if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size,
mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0)
return;
vnode->directory_size = dsize;
p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0);
memcpy(p, op->create.symlink, size);
kunmap_local(p);
set_bit(AFS_VNODE_DIR_READ, &vnode->flags);
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
}
static void afs_put_link(void *arg)
{
struct folio *folio = virt_to_folio(arg);
kunmap_local(arg);
folio_put(folio);
}
const char *afs_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
struct folio *folio;
char *content;
ssize_t ret;
if (!dentry) {
/* RCU pathwalk. */
if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode))
return ERR_PTR(-ECHILD);
goto good;
}
if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
goto fetch;
ret = afs_validate(vnode, NULL);
if (ret < 0)
return ERR_PTR(ret);
if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
goto good;
fetch:
ret = afs_read_single(vnode, NULL);
if (ret < 0)
return ERR_PTR(ret);
set_bit(AFS_VNODE_DIR_READ, &vnode->flags);
good:
folio = folioq_folio(vnode->directory, 0);
folio_get(folio);
content = kmap_local_folio(folio, 0);
set_delayed_call(callback, afs_put_link, content);
return content;
}
int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
DEFINE_DELAYED_CALL(done);
const char *content;
int len;
content = afs_get_link(dentry, d_inode(dentry), &done);
if (IS_ERR(content)) {
do_delayed_call(&done);
return PTR_ERR(content);
}
len = umin(strlen(content), buflen);
if (copy_to_user(buffer, content, len))
len = -EFAULT;
do_delayed_call(&done);
return len;
}
static const struct inode_operations afs_symlink_inode_operations = { static const struct inode_operations afs_symlink_inode_operations = {
.get_link = page_get_link, .get_link = afs_get_link,
.readlink = afs_readlink,
}; };
static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode) static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
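
afs_get_link() and afs_put_link() above rely on the VFS delayed-call mechanism so the kmapped folio stays pinned until the caller has finished with the link body. A stripped-down sketch of that pattern; the demo_* names are invented, while set_delayed_call(), do_delayed_call(), kmap_local_folio() and the folio refcount helpers are the real APIs:

    #include <linux/delayed_call.h>
    #include <linux/highmem.h>
    #include <linux/mm.h>

    static void demo_put_link(void *arg)
    {
            struct folio *folio = virt_to_folio(arg);

            kunmap_local(arg);      /* undo kmap_local_folio() */
            folio_put(folio);       /* drop the ref taken by the mapper */
    }

    static const char *demo_map_link(struct folio *folio, struct delayed_call *done)
    {
            char *p;

            folio_get(folio);
            p = kmap_local_folio(folio, 0);
            set_delayed_call(done, demo_put_link, p);
            return p;               /* caller runs do_delayed_call(done) when finished */
    }
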
@ -110,7 +196,9 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_op = &afs_dir_inode_operations; inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations; inode->i_fop = &afs_dir_file_operations;
inode->i_mapping->a_ops = &afs_dir_aops; inode->i_mapping->a_ops = &afs_dir_aops;
mapping_set_large_folios(inode->i_mapping); __set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &vnode->netfs.flags);
/* Assume locally cached directory data will be valid. */
__set_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
break; break;
case AFS_FTYPE_SYMLINK: case AFS_FTYPE_SYMLINK:
/* Symlinks with a mode of 0644 are actually mountpoints. */ /* Symlinks with a mode of 0644 are actually mountpoints. */
@ -122,13 +210,13 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFDIR | 0555; inode->i_mode = S_IFDIR | 0555;
inode->i_op = &afs_mntpt_inode_operations; inode->i_op = &afs_mntpt_inode_operations;
inode->i_fop = &afs_mntpt_file_operations; inode->i_fop = &afs_mntpt_file_operations;
inode->i_mapping->a_ops = &afs_symlink_aops;
} else { } else {
inode->i_mode = S_IFLNK | status->mode; inode->i_mode = S_IFLNK | status->mode;
inode->i_op = &afs_symlink_inode_operations; inode->i_op = &afs_symlink_inode_operations;
inode->i_mapping->a_ops = &afs_symlink_aops;
} }
inode->i_mapping->a_ops = &afs_dir_aops;
inode_nohighmem(inode); inode_nohighmem(inode);
mapping_set_release_always(inode->i_mapping);
break; break;
default: default:
dump_vnode(vnode, op->file[0].vnode != vnode ? op->file[0].vnode : NULL); dump_vnode(vnode, op->file[0].vnode != vnode ? op->file[0].vnode : NULL);
@ -140,15 +228,17 @@ static int afs_inode_init_from_status(struct afs_operation *op,
afs_set_netfs_context(vnode); afs_set_netfs_context(vnode);
vnode->invalid_before = status->data_version; vnode->invalid_before = status->data_version;
trace_afs_set_dv(vnode, status->data_version);
inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);
if (!vp->scb.have_cb) { if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver /* it's a symlink we just created (the fileserver
* didn't give us a callback) */ * didn't give us a callback) */
atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); afs_clear_cb_promise(vnode, afs_cb_promise_set_new_symlink);
} else { } else {
vnode->cb_server = op->server; vnode->cb_server = op->server;
atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at); afs_set_cb_promise(vnode, vp->scb.callback.expires_at,
afs_cb_promise_set_new_inode);
} }
write_sequnlock(&vnode->cb_lock); write_sequnlock(&vnode->cb_lock);
@ -207,12 +297,17 @@ static void afs_apply_status(struct afs_operation *op,
if (vp->update_ctime) if (vp->update_ctime)
inode_set_ctime_to_ts(inode, op->ctime); inode_set_ctime_to_ts(inode, op->ctime);
if (vnode->status.data_version != status->data_version) if (vnode->status.data_version != status->data_version) {
trace_afs_set_dv(vnode, status->data_version);
data_changed = true; data_changed = true;
}
vnode->status = *status; vnode->status = *status;
if (vp->dv_before + vp->dv_delta != status->data_version) { if (vp->dv_before + vp->dv_delta != status->data_version) {
trace_afs_dv_mismatch(vnode, vp->dv_before, vp->dv_delta,
status->data_version);
if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) && if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE) atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE)
pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
@ -223,12 +318,10 @@ static void afs_apply_status(struct afs_operation *op,
op->debug_id); op->debug_id);
vnode->invalid_before = status->data_version; vnode->invalid_before = status->data_version;
if (vnode->status.type == AFS_FTYPE_DIR) { if (vnode->status.type == AFS_FTYPE_DIR)
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) afs_invalidate_dir(vnode, afs_dir_invalid_dv_mismatch);
afs_stat_v(vnode, n_inval); else
} else {
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags); set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
change_size = true; change_size = true;
data_changed = true; data_changed = true;
unexpected_jump = true; unexpected_jump = true;
@ -258,6 +351,8 @@ static void afs_apply_status(struct afs_operation *op,
inode_set_ctime_to_ts(inode, t); inode_set_ctime_to_ts(inode, t);
inode_set_atime_to_ts(inode, t); inode_set_atime_to_ts(inode, t);
} }
if (op->ops == &afs_fetch_data_operation)
op->fetch.subreq->rreq->i_size = status->size;
} }
} }
@ -273,7 +368,7 @@ static void afs_apply_callback(struct afs_operation *op,
if (!afs_cb_is_broken(vp->cb_break_before, vnode)) { if (!afs_cb_is_broken(vp->cb_break_before, vnode)) {
if (op->volume->type == AFSVL_RWVOL) if (op->volume->type == AFSVL_RWVOL)
vnode->cb_server = op->server; vnode->cb_server = op->server;
atomic64_set(&vnode->cb_expires_at, cb->expires_at); afs_set_cb_promise(vnode, cb->expires_at, afs_cb_promise_set_apply_cb);
} }
} }
@ -435,7 +530,9 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
} __packed key; } __packed key;
struct afs_vnode_cache_aux aux; struct afs_vnode_cache_aux aux;
if (vnode->status.type != AFS_FTYPE_FILE) { if (vnode->status.type != AFS_FTYPE_FILE &&
vnode->status.type != AFS_FTYPE_DIR &&
vnode->status.type != AFS_FTYPE_SYMLINK) {
vnode->netfs.cache = NULL; vnode->netfs.cache = NULL;
return; return;
} }
@ -637,6 +734,7 @@ int afs_drop_inode(struct inode *inode)
void afs_evict_inode(struct inode *inode) void afs_evict_inode(struct inode *inode)
{ {
struct afs_vnode_cache_aux aux; struct afs_vnode_cache_aux aux;
struct afs_super_info *sbi = AFS_FS_S(inode->i_sb);
struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_vnode *vnode = AFS_FS_I(inode);
_enter("{%llx:%llu.%d}", _enter("{%llx:%llu.%d}",
@ -648,8 +746,22 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
if ((S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) &&
(inode->i_state & I_DIRTY) &&
!sbi->dyn_root) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.for_sync = true,
.range_end = LLONG_MAX,
};
afs_single_writepages(inode->i_mapping, &wbc);
}
netfs_wait_for_outstanding_io(inode); netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data); truncate_inode_pages_final(&inode->i_data);
netfs_free_folioq_buffer(vnode->directory);
afs_set_cache_aux(vnode, &aux); afs_set_cache_aux(vnode, &aux);
netfs_clear_inode_writeback(inode, &aux); netfs_clear_inode_writeback(inode, &aux);
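
The eviction change above flushes a dirty directory or symlink synchronously before its pagecache is torn down. A generic sketch of that kind of pre-eviction flush; demo_flush_before_evict() is invented and uses filemap_fdatawrite_wbc() in place of the AFS-specific afs_single_writepages():

    #include <linux/fs.h>
    #include <linux/pagemap.h>
    #include <linux/writeback.h>

    static void demo_flush_before_evict(struct inode *inode)
    {
            struct writeback_control wbc = {
                    .sync_mode = WB_SYNC_ALL,       /* wait for the writes */
                    .for_sync  = true,
                    .range_end = LLONG_MAX,         /* whole file */
            };

            if (inode->i_state & I_DIRTY)
                    filemap_fdatawrite_wbc(inode->i_mapping, &wbc);
    }
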

View File

@ -163,6 +163,7 @@ struct afs_call {
spinlock_t state_lock; spinlock_t state_lock;
int error; /* error code */ int error; /* error code */
u32 abort_code; /* Remote abort ID or 0 */ u32 abort_code; /* Remote abort ID or 0 */
unsigned long long remaining; /* How much is left to receive */
unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */ unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */
unsigned request_size; /* size of request data */ unsigned request_size; /* size of request data */
unsigned reply_max; /* maximum size of reply */ unsigned reply_max; /* maximum size of reply */
@ -201,11 +202,17 @@ struct afs_call_type {
/* clean up a call */ /* clean up a call */
void (*destructor)(struct afs_call *call); void (*destructor)(struct afs_call *call);
/* Async receive processing function */
void (*async_rx)(struct work_struct *work);
/* Work function */ /* Work function */
void (*work)(struct work_struct *work); void (*work)(struct work_struct *work);
/* Call done function (gets called immediately on success or failure) */ /* Call done function (gets called immediately on success or failure) */
void (*done)(struct afs_call *call); void (*done)(struct afs_call *call);
/* Handle a call being immediately cancelled. */
void (*immediate_cancel)(struct afs_call *call);
}; };
/* /*
@ -232,28 +239,6 @@ static inline struct key *afs_file_key(struct file *file)
return af->key; return af->key;
} }
/*
* Record of an outstanding read operation on a vnode.
*/
struct afs_read {
loff_t pos; /* Where to start reading */
loff_t len; /* How much we're asking for */
loff_t actual_len; /* How much we're actually getting */
loff_t file_size; /* File size returned by server */
struct key *key; /* The key to use to reissue the read */
struct afs_vnode *vnode; /* The file being read into. */
struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */
afs_dataversion_t data_version; /* Version number returned by server */
refcount_t usage;
unsigned int call_debug_id;
unsigned int nr_pages;
int error;
void (*done)(struct afs_read *);
void (*cleanup)(struct afs_read *);
struct iov_iter *iter; /* Iterator representing the buffer */
struct iov_iter def_iter; /* Default iterator */
};
/* /*
* AFS superblock private data * AFS superblock private data
* - there's one superblock per volume * - there's one superblock per volume
@ -702,13 +687,14 @@ struct afs_vnode {
struct afs_file_status status; /* AFS status info for this file */ struct afs_file_status status; /* AFS status info for this file */
afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */
struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */
struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */
struct rw_semaphore validate_lock; /* lock for validating this vnode */ struct rw_semaphore validate_lock; /* lock for validating this vnode */
struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */ struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */
struct key *silly_key; /* Silly rename key */ struct key *silly_key; /* Silly rename key */
spinlock_t wb_lock; /* lock for wb_keys */ spinlock_t wb_lock; /* lock for wb_keys */
spinlock_t lock; /* waitqueue/flags lock */ spinlock_t lock; /* waitqueue/flags lock */
unsigned long flags; unsigned long flags;
#define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */
#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */ #define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */ #define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
@ -719,7 +705,9 @@ struct afs_vnode {
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ #define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */
#define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */
struct folio_queue *directory; /* Directory contents */
struct list_head wb_keys; /* List of keys available for writeback */ struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */ struct list_head pending_locks; /* locks waiting to be granted */
struct list_head granted_locks; /* locks granted on this file */ struct list_head granted_locks; /* locks granted on this file */
@ -728,6 +716,7 @@ struct afs_vnode {
ktime_t locked_at; /* Time at which lock obtained */ ktime_t locked_at; /* Time at which lock obtained */
enum afs_lock_state lock_state : 8; enum afs_lock_state lock_state : 8;
afs_lock_type_t lock_type : 8; afs_lock_type_t lock_type : 8;
unsigned int directory_size; /* Amount of space in ->directory */
/* outstanding callback notification on this file */ /* outstanding callback notification on this file */
struct work_struct cb_work; /* Work for mmap'd files */ struct work_struct cb_work; /* Work for mmap'd files */
@ -907,7 +896,7 @@ struct afs_operation {
bool new_negative; bool new_negative;
} rename; } rename;
struct { struct {
struct afs_read *req; struct netfs_io_subrequest *subreq;
} fetch; } fetch;
struct { struct {
afs_lock_type_t type; afs_lock_type_t type;
@ -959,6 +948,7 @@ struct afs_operation {
#define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */ #define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */
#define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */ #define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */
#define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */ #define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */
#define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */
}; };
/* /*
@ -983,6 +973,21 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
i_size_read(&vnode->netfs.inode), flags); i_size_read(&vnode->netfs.inode), flags);
} }
/*
* Directory iteration management.
*/
struct afs_dir_iter {
struct afs_vnode *dvnode;
union afs_xdr_dir_block *block;
struct folio_queue *fq;
unsigned int fpos;
int fq_slot;
unsigned int loop_check;
u8 nr_slots;
u8 bucket;
unsigned int prev_entry;
};
#include <trace/events/afs.h> #include <trace/events/afs.h>
/*****************************************************************************/ /*****************************************************************************/
@ -1064,8 +1069,13 @@ extern const struct inode_operations afs_dir_inode_operations;
extern const struct address_space_operations afs_dir_aops; extern const struct address_space_operations afs_dir_aops;
extern const struct dentry_operations afs_fs_dentry_operations; extern const struct dentry_operations afs_fs_dentry_operations;
ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file);
ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
__acquires(&dvnode->validate_lock);
extern void afs_d_release(struct dentry *); extern void afs_d_release(struct dentry *);
extern void afs_check_for_remote_deletion(struct afs_operation *); extern void afs_check_for_remote_deletion(struct afs_operation *);
int afs_single_writepages(struct address_space *mapping,
struct writeback_control *wbc);
/* /*
* dir_edit.c * dir_edit.c
@ -1075,6 +1085,18 @@ extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *
extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode, void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
enum afs_edit_dir_reason why); enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);
/*
* dir_search.c
*/
unsigned int afs_dir_hash_name(const struct qstr *name);
bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name);
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block);
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
struct afs_fid *_fid);
int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version);
/* /*
* dir_silly.c * dir_silly.c
@ -1099,24 +1121,17 @@ extern void afs_dynroot_depopulate(struct super_block *);
* file.c * file.c
*/ */
extern const struct address_space_operations afs_file_aops; extern const struct address_space_operations afs_file_aops;
extern const struct address_space_operations afs_symlink_aops;
extern const struct inode_operations afs_file_inode_operations; extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations; extern const struct file_operations afs_file_operations;
extern const struct afs_operation_ops afs_fetch_data_operation;
extern const struct netfs_request_ops afs_req_ops; extern const struct netfs_request_ops afs_req_ops;
extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *); extern void afs_put_wb_key(struct afs_wb_key *);
extern int afs_open(struct inode *, struct file *); extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); void afs_fetch_data_async_rx(struct work_struct *work);
extern struct afs_read *afs_alloc_read(gfp_t); void afs_fetch_data_immediate_cancel(struct afs_call *call);
extern void afs_put_read(struct afs_read *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
{
refcount_inc(&req->usage);
return req;
}
/* /*
* flock.c * flock.c
@ -1168,6 +1183,7 @@ extern void afs_fs_store_acl(struct afs_operation *);
extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *); extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *);
extern int afs_put_operation(struct afs_operation *); extern int afs_put_operation(struct afs_operation *);
extern bool afs_begin_vnode_operation(struct afs_operation *); extern bool afs_begin_vnode_operation(struct afs_operation *);
extern void afs_end_vnode_operation(struct afs_operation *op);
extern void afs_wait_for_operation(struct afs_operation *); extern void afs_wait_for_operation(struct afs_operation *);
extern int afs_do_sync_operation(struct afs_operation *); extern int afs_do_sync_operation(struct afs_operation *);
@ -1205,6 +1221,10 @@ extern void afs_fs_probe_cleanup(struct afs_net *);
*/ */
extern const struct afs_operation_ops afs_fetch_status_operation; extern const struct afs_operation_ops afs_fetch_status_operation;
void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op);
const char *afs_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback);
int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *); extern int afs_ilookup5_test_by_fid(struct inode *, void *);
@ -1336,6 +1356,7 @@ extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *); extern void afs_put_call(struct afs_call *);
void afs_deferred_put_call(struct afs_call *call); void afs_deferred_put_call(struct afs_call *call);
void afs_make_call(struct afs_call *call, gfp_t gfp); void afs_make_call(struct afs_call *call, gfp_t gfp);
void afs_deliver_to_call(struct afs_call *call);
void afs_wait_for_call_to_complete(struct afs_call *call); void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *, extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
const struct afs_call_type *, const struct afs_call_type *,
@ -1346,6 +1367,28 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
extern int afs_extract_data(struct afs_call *, bool); extern int afs_extract_data(struct afs_call *, bool);
extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause); extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause);
static inline struct afs_call *afs_get_call(struct afs_call *call,
enum afs_call_trace why)
{
int r;
__refcount_inc(&call->ref, &r);
trace_afs_call(call->debug_id, why, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
return call;
}
static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why)
{
int r = refcount_read(&call->ref);
trace_afs_call(call->debug_id, why, r,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
}
static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call, static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call,
gfp_t gfp) gfp_t gfp)
{ {
@ -1712,6 +1755,38 @@ static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where)
return -EIO; return -EIO;
} }
/*
* Set the callback promise on a vnode.
*/
static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at,
enum afs_cb_promise_trace trace)
{
atomic64_set(&vnode->cb_expires_at, expires_at);
trace_afs_cb_promise(vnode, trace);
}
/*
* Clear the callback promise on a vnode, returning true if it was promised.
*/
static inline bool afs_clear_cb_promise(struct afs_vnode *vnode,
enum afs_cb_promise_trace trace)
{
trace_afs_cb_promise(vnode, trace);
return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE;
}
/*
* Mark a directory as being invalid.
*/
static inline void afs_invalidate_dir(struct afs_vnode *dvnode,
enum afs_dir_invalid_trace trace)
{
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
trace_afs_dir_invalid(dvnode, trace);
afs_stat_v(dvnode, n_inval);
}
}
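
A short usage sketch for the callback-promise helpers defined above, run under the vnode's cb_lock seqlock as the callers elsewhere in this diff do; demo_apply_promise() is invented, the trace enum values are the ones visible in this series:

    static void demo_apply_promise(struct afs_vnode *vnode, time64_t expires_at)
    {
            write_seqlock(&vnode->cb_lock);
            if (expires_at == AFS_NO_CB_PROMISE)
                    afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change);
            else
                    afs_set_cb_promise(vnode, expires_at,
                                       afs_cb_promise_set_apply_cb);
            write_sequnlock(&vnode->cb_lock);
    }
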
/*****************************************************************************/ /*****************************************************************************/
/* /*
* debug tracing * debug tracing

View File

@ -177,7 +177,7 @@ static int __init afs_init(void)
afs_wq = alloc_workqueue("afs", 0, 0); afs_wq = alloc_workqueue("afs", 0, 0);
if (!afs_wq) if (!afs_wq)
goto error_afs_wq; goto error_afs_wq;
afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0); afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!afs_async_calls) if (!afs_async_calls)
goto error_async; goto error_async;
afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0); afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);
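
The async-call workqueue gains WQ_UNBOUND above, so reply processing is no longer pinned to the CPU that queued it. A minimal sketch of creating such a queue; the demo_* names are invented:

    #include <linux/errno.h>
    #include <linux/init.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *demo_async_wq;

    static int __init demo_init_wq(void)
    {
            /* Unbound + mem-reclaim: work may run on any CPU and the queue
             * has a rescuer so it can make progress under memory pressure.
             */
            demo_async_wq = alloc_workqueue("demo_async",
                                            WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
            return demo_async_wq ? 0 : -ENOMEM;
    }
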

View File

@ -30,7 +30,7 @@ const struct file_operations afs_mntpt_file_operations = {
const struct inode_operations afs_mntpt_inode_operations = { const struct inode_operations afs_mntpt_inode_operations = {
.lookup = afs_mntpt_lookup, .lookup = afs_mntpt_lookup,
.readlink = page_readlink, .readlink = afs_readlink,
.getattr = afs_getattr, .getattr = afs_getattr,
}; };
@ -118,9 +118,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ctx->volnamesz = sizeof(afs_root_volume) - 1; ctx->volnamesz = sizeof(afs_root_volume) - 1;
} else { } else {
/* read the contents of the AFS special symlink */ /* read the contents of the AFS special symlink */
struct page *page; DEFINE_DELAYED_CALL(cleanup);
const char *content;
loff_t size = i_size_read(d_inode(mntpt)); loff_t size = i_size_read(d_inode(mntpt));
char *buf;
if (src_as->cell) if (src_as->cell)
ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt); ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt);
@ -128,16 +128,16 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size < 2 || size > PAGE_SIZE - 1) if (size < 2 || size > PAGE_SIZE - 1)
return -EINVAL; return -EINVAL;
page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); content = afs_get_link(mntpt, d_inode(mntpt), &cleanup);
if (IS_ERR(page)) if (IS_ERR(content)) {
return PTR_ERR(page); do_delayed_call(&cleanup);
return PTR_ERR(content);
}
buf = kmap(page);
ret = -EINVAL; ret = -EINVAL;
if (buf[size - 1] == '.') if (content[size - 1] == '.')
ret = vfs_parse_fs_string(fc, "source", buf, size - 1); ret = vfs_parse_fs_string(fc, "source", content, size - 1);
kunmap(page); do_delayed_call(&cleanup);
put_page(page);
if (ret < 0) if (ret < 0)
return ret; return ret;

View File

@ -99,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
write_seqlock(&vnode->cb_lock); write_seqlock(&vnode->cb_lock);
ASSERTCMP(cb_server, ==, vnode->cb_server); ASSERTCMP(cb_server, ==, vnode->cb_server);
vnode->cb_server = NULL; vnode->cb_server = NULL;
if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_rotate_server))
vnode->cb_break++; vnode->cb_break++;
write_sequnlock(&vnode->cb_lock); write_sequnlock(&vnode->cb_lock);
} }
@ -583,7 +583,7 @@ bool afs_select_fileserver(struct afs_operation *op)
if (vnode->cb_server != server) { if (vnode->cb_server != server) {
vnode->cb_server = server; vnode->cb_server = server;
vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break); vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE); afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change);
} }
retry_server: retry_server:

View File

@ -149,7 +149,8 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->net = net; call->net = net;
call->debug_id = atomic_inc_return(&rxrpc_debug_id); call->debug_id = atomic_inc_return(&rxrpc_debug_id);
refcount_set(&call->ref, 1); refcount_set(&call->ref, 1);
INIT_WORK(&call->async_work, afs_process_async_call); INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call);
INIT_WORK(&call->work, call->type->work);
INIT_WORK(&call->free_work, afs_deferred_free_worker); INIT_WORK(&call->free_work, afs_deferred_free_worker);
init_waitqueue_head(&call->waitq); init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock); spin_lock_init(&call->state_lock);
@ -235,27 +236,12 @@ void afs_deferred_put_call(struct afs_call *call)
schedule_work(&call->free_work); schedule_work(&call->free_work);
} }
static struct afs_call *afs_get_call(struct afs_call *call,
enum afs_call_trace why)
{
int r;
__refcount_inc(&call->ref, &r);
trace_afs_call(call->debug_id, why, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
return call;
}
/* /*
* Queue the call for actual work. * Queue the call for actual work.
*/ */
static void afs_queue_call_work(struct afs_call *call) static void afs_queue_call_work(struct afs_call *call)
{ {
if (call->type->work) { if (call->type->work) {
INIT_WORK(&call->work, call->type->work);
afs_get_call(call, afs_call_trace_work); afs_get_call(call, afs_call_trace_work);
if (!queue_work(afs_wq, &call->work)) if (!queue_work(afs_wq, &call->work))
afs_put_call(call); afs_put_call(call);
@ -430,11 +416,16 @@ void afs_make_call(struct afs_call *call, gfp_t gfp)
return; return;
error_do_abort: error_do_abort:
if (ret != -ECONNABORTED) { if (ret != -ECONNABORTED)
rxrpc_kernel_abort_call(call->net->socket, rxcall, rxrpc_kernel_abort_call(call->net->socket, rxcall,
RX_USER_ABORT, ret, RX_USER_ABORT, ret,
afs_abort_send_data_error); afs_abort_send_data_error);
} else { if (call->async) {
afs_see_call(call, afs_call_trace_async_abort);
return;
}
if (ret == -ECONNABORTED) {
len = 0; len = 0;
iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0); iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall, rxrpc_kernel_recv_data(call->net->socket, rxcall,
@ -445,8 +436,10 @@ void afs_make_call(struct afs_call *call, gfp_t gfp)
call->error = ret; call->error = ret;
trace_afs_call_done(call); trace_afs_call_done(call);
error_kill_call: error_kill_call:
if (call->type->done) if (call->async)
call->type->done(call); afs_see_call(call, afs_call_trace_async_kill);
if (call->type->immediate_cancel)
call->type->immediate_cancel(call);
/* We need to dispose of the extra ref we grabbed for an async call. /* We need to dispose of the extra ref we grabbed for an async call.
* The call, however, might be queued on afs_async_calls and we need to * The call, however, might be queued on afs_async_calls and we need to
@ -501,7 +494,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
/* /*
* deliver messages to a call * deliver messages to a call
*/ */
static void afs_deliver_to_call(struct afs_call *call) void afs_deliver_to_call(struct afs_call *call)
{ {
enum afs_call_state state; enum afs_call_state state;
size_t len; size_t len;
@ -602,7 +595,6 @@ static void afs_deliver_to_call(struct afs_call *call)
abort_code = 0; abort_code = 0;
call_complete: call_complete:
afs_set_call_complete(call, ret, remote_abort); afs_set_call_complete(call, ret, remote_abort);
state = AFS_CALL_COMPLETE;
goto done; goto done;
} }
@ -803,6 +795,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
return -ENOTSUPP; return -ENOTSUPP;
trace_afs_cb_call(call); trace_afs_cb_call(call);
call->work.func = call->type->work;
/* pass responsibility for the remainder of this message off to the /* pass responsibility for the remainder of this message off to the
* cache manager op */ * cache manager op */

View File

@ -663,7 +663,7 @@ static void afs_i_init_once(void *_vnode)
memset(vnode, 0, sizeof(*vnode)); memset(vnode, 0, sizeof(*vnode));
inode_init_once(&vnode->netfs.inode); inode_init_once(&vnode->netfs.inode);
mutex_init(&vnode->io_lock); INIT_LIST_HEAD(&vnode->io_lock_waiters);
init_rwsem(&vnode->validate_lock); init_rwsem(&vnode->validate_lock);
spin_lock_init(&vnode->wb_lock); spin_lock_init(&vnode->wb_lock);
spin_lock_init(&vnode->lock); spin_lock_init(&vnode->lock);
@ -696,6 +696,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
vnode->volume = NULL; vnode->volume = NULL;
vnode->lock_key = NULL; vnode->lock_key = NULL;
vnode->permit_cache = NULL; vnode->permit_cache = NULL;
vnode->directory = NULL;
vnode->directory_size = 0;
vnode->flags = 1 << AFS_VNODE_UNSET; vnode->flags = 1 << AFS_VNODE_UNSET;
vnode->lock_state = AFS_VNODE_LOCK_NONE; vnode->lock_state = AFS_VNODE_LOCK_NONE;

View File

@ -120,22 +120,31 @@
bool afs_check_validity(const struct afs_vnode *vnode) bool afs_check_validity(const struct afs_vnode *vnode)
{ {
const struct afs_volume *volume = vnode->volume; const struct afs_volume *volume = vnode->volume;
enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
time64_t deadline = ktime_get_real_seconds() + 10; time64_t deadline = ktime_get_real_seconds() + 10;
if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
return true; return true;
if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) || if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
atomic64_read(&vnode->cb_expires_at) <= deadline || trace = afs_vnode_invalid_trace_cb_v_break;
volume->cb_expires_at <= deadline || else if (cb_expires_at == AFS_NO_CB_PROMISE)
vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) || trace = afs_vnode_invalid_trace_no_cb_promise;
vnode->cb_scrub != atomic_read(&volume->cb_scrub) || else if (cb_expires_at <= deadline)
test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { trace = afs_vnode_invalid_trace_expired;
_debug("inval"); else if (volume->cb_expires_at <= deadline)
return false; trace = afs_vnode_invalid_trace_vol_expired;
} else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
trace = afs_vnode_invalid_trace_cb_ro_snapshot;
return true; else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
trace = afs_vnode_invalid_trace_cb_scrub;
else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
trace = afs_vnode_invalid_trace_zap_data;
else
return true;
trace_afs_vnode_invalid(vnode, trace);
return false;
} }
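
afs_check_validity() now records why a vnode was judged invalid before returning false; callers are unchanged. For example, an RCU-pathwalk user (as afs_get_link() does earlier in this diff) simply bails out so the VFS retries in ref-walk mode; demo_rcu_check() is an invented illustration:

    static int demo_rcu_check(const struct afs_vnode *vnode)
    {
            if (!afs_check_validity(vnode))
                    return -ECHILD; /* drop out of RCU walk, retry blocking */
            return 0;
    }
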
/* /*

View File

@ -370,6 +370,7 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
.name = "VL.GetCapabilities", .name = "VL.GetCapabilities",
.op = afs_VL_GetCapabilities, .op = afs_VL_GetCapabilities,
.deliver = afs_deliver_vl_get_capabilities, .deliver = afs_deliver_vl_get_capabilities,
.immediate_cancel = afs_vlserver_probe_result,
.done = afs_vlserver_probe_result, .done = afs_vlserver_probe_result,
.destructor = afs_destroy_vl_get_capabilities, .destructor = afs_destroy_vl_get_capabilities,
}; };

View File

@ -179,8 +179,8 @@ void afs_issue_write(struct netfs_io_subrequest *subreq)
*/ */
void afs_begin_writeback(struct netfs_io_request *wreq) void afs_begin_writeback(struct netfs_io_request *wreq)
{ {
afs_get_writeback_key(wreq); if (S_ISREG(wreq->inode->i_mode))
wreq->io_streams[0].avail = true; afs_get_writeback_key(wreq);
} }
/* /*
@ -193,6 +193,18 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st
list_first_entry(&stream->subrequests, list_first_entry(&stream->subrequests,
struct netfs_io_subrequest, rreq_link); struct netfs_io_subrequest, rreq_link);
switch (wreq->origin) {
case NETFS_READAHEAD:
case NETFS_READPAGE:
case NETFS_READ_GAPS:
case NETFS_READ_SINGLE:
case NETFS_READ_FOR_WRITE:
case NETFS_DIO_READ:
return;
default:
break;
}
switch (subreq->error) { switch (subreq->error) {
case -EACCES: case -EACCES:
case -EPERM: case -EPERM:

View File

@ -88,7 +88,7 @@ union afs_xdr_dir_block {
struct { struct {
struct afs_xdr_dir_hdr hdr; struct afs_xdr_dir_hdr hdr;
u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS]; u8 alloc_ctrs[AFS_DIR_BLOCKS_WITH_CTR];
__be16 hashtable[AFS_DIR_HASHTBL_SIZE]; __be16 hashtable[AFS_DIR_HASHTBL_SIZE];
} meta; } meta;

View File

@ -352,19 +352,19 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call)
static int yfs_deliver_fs_fetch_data64(struct afs_call *call) static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
{ {
struct afs_operation *op = call->op; struct afs_operation *op = call->op;
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp; const __be32 *bp;
size_t count_before; size_t count_before;
int ret; int ret;
_enter("{%u,%zu, %zu/%llu}", _enter("{%u,%zu, %zu/%llu}",
call->unmarshall, call->iov_len, iov_iter_count(call->iter), call->unmarshall, call->iov_len, iov_iter_count(call->iter),
req->actual_len); call->remaining);
switch (call->unmarshall) { switch (call->unmarshall) {
case 0: case 0:
req->actual_len = 0; call->remaining = 0;
afs_extract_to_tmp64(call); afs_extract_to_tmp64(call);
call->unmarshall++; call->unmarshall++;
fallthrough; fallthrough;
@ -379,42 +379,39 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
if (ret < 0) if (ret < 0)
return ret; return ret;
req->actual_len = be64_to_cpu(call->tmp64); call->remaining = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len); _debug("DATA length: %llu", call->remaining);
if (req->actual_len == 0) if (call->remaining == 0)
goto no_more_data; goto no_more_data;
call->iter = req->iter; call->iter = &subreq->io_iter;
call->iov_len = min(req->actual_len, req->len); call->iov_len = min(call->remaining, subreq->len - subreq->transferred);
call->unmarshall++; call->unmarshall++;
fallthrough; fallthrough;
/* extract the returned data */ /* extract the returned data */
case 2: case 2:
count_before = call->iov_len; count_before = call->iov_len;
_debug("extract data %zu/%llu", count_before, req->actual_len); _debug("extract data %zu/%llu", count_before, call->remaining);
ret = afs_extract_data(call, true); ret = afs_extract_data(call, true);
if (req->subreq) { subreq->transferred += count_before - call->iov_len;
req->subreq->transferred += count_before - call->iov_len;
netfs_read_subreq_progress(req->subreq, false);
}
if (ret < 0) if (ret < 0)
return ret; return ret;
call->iter = &call->def_iter; call->iter = &call->def_iter;
if (req->actual_len <= req->len) if (call->remaining)
goto no_more_data; goto no_more_data;
/* Discard any excess data the server gave us */ /* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len); afs_extract_discard(call, call->remaining);
call->unmarshall = 3; call->unmarshall = 3;
fallthrough; fallthrough;
case 3: case 3:
_debug("extract discard %zu/%llu", _debug("extract discard %zu/%llu",
iov_iter_count(call->iter), req->actual_len - req->len); iov_iter_count(call->iter), call->remaining);
ret = afs_extract_data(call, true); ret = afs_extract_data(call, true);
if (ret < 0) if (ret < 0)
@ -439,8 +436,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
xdr_decode_YFSCallBack(&bp, call, &vp->scb); xdr_decode_YFSCallBack(&bp, call, &vp->scb);
xdr_decode_YFSVolSync(&bp, &op->volsync); xdr_decode_YFSVolSync(&bp, &op->volsync);
req->data_version = vp->scb.status.data_version; if (subreq->start + subreq->transferred >= vp->scb.status.size)
req->file_size = vp->scb.status.size; __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
call->unmarshall++; call->unmarshall++;
fallthrough; fallthrough;
@ -459,7 +456,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
static const struct afs_call_type yfs_RXYFSFetchData64 = { static const struct afs_call_type yfs_RXYFSFetchData64 = {
.name = "YFS.FetchData64", .name = "YFS.FetchData64",
.op = yfs_FS_FetchData64, .op = yfs_FS_FetchData64,
.async_rx = afs_fetch_data_async_rx,
.deliver = yfs_deliver_fs_fetch_data64, .deliver = yfs_deliver_fs_fetch_data64,
.immediate_cancel = afs_fetch_data_immediate_cancel,
.destructor = afs_flat_call_destructor, .destructor = afs_flat_call_destructor,
}; };
@ -468,14 +467,15 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = {
*/ */
void yfs_fs_fetch_data(struct afs_operation *op) void yfs_fs_fetch_data(struct afs_operation *op)
{ {
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0]; struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
struct afs_call *call; struct afs_call *call;
__be32 *bp; __be32 *bp;
_enter(",%x,{%llx:%llu},%llx,%llx", _enter(",%x,{%llx:%llu},%llx,%zx",
key_serial(op->key), vp->fid.vid, vp->fid.vnode, key_serial(op->key), vp->fid.vid, vp->fid.vnode,
req->pos, req->len); subreq->start + subreq->transferred,
subreq->len - subreq->transferred);
call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64, call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64,
sizeof(__be32) * 2 + sizeof(__be32) * 2 +
@ -487,15 +487,16 @@ void yfs_fs_fetch_data(struct afs_operation *op)
if (!call) if (!call)
return afs_op_nomem(op); return afs_op_nomem(op);
req->call_debug_id = call->debug_id; if (op->flags & AFS_OPERATION_ASYNC)
call->async = true;
/* marshall the parameters */ /* marshall the parameters */
bp = call->request; bp = call->request;
bp = xdr_encode_u32(bp, YFSFETCHDATA64); bp = xdr_encode_u32(bp, YFSFETCHDATA64);
bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_YFSFid(bp, &vp->fid);
bp = xdr_encode_u64(bp, req->pos); bp = xdr_encode_u64(bp, subreq->start + subreq->transferred);
bp = xdr_encode_u64(bp, req->len); bp = xdr_encode_u64(bp, subreq->len - subreq->transferred);
yfs_check_req(call, bp); yfs_check_req(call, bp);
call->fid = vp->fid; call->fid = vp->fid;

View File

@ -176,7 +176,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
!(file->f_mode & FMODE_CAN_ODIRECT)) !(file->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL; return -EINVAL;
old_cred = override_creds_light(ctx->cred); old_cred = override_creds(ctx->cred);
if (is_sync_kiocb(iocb)) { if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags); rwf_t rwf = iocb_to_rw_flags(flags);
@ -197,7 +197,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
backing_aio_cleanup(aio, ret); backing_aio_cleanup(aio, ret);
} }
out: out:
revert_creds_light(old_cred); revert_creds(old_cred);
if (ctx->accessed) if (ctx->accessed)
ctx->accessed(iocb->ki_filp); ctx->accessed(iocb->ki_filp);
@ -233,7 +233,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
*/ */
flags &= ~IOCB_DIO_CALLER_COMP; flags &= ~IOCB_DIO_CALLER_COMP;
old_cred = override_creds_light(ctx->cred); old_cred = override_creds(ctx->cred);
if (is_sync_kiocb(iocb)) { if (is_sync_kiocb(iocb)) {
rwf_t rwf = iocb_to_rw_flags(flags); rwf_t rwf = iocb_to_rw_flags(flags);
@ -264,7 +264,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
backing_aio_cleanup(aio, ret); backing_aio_cleanup(aio, ret);
} }
out: out:
revert_creds_light(old_cred); revert_creds(old_cred);
return ret; return ret;
} }
@ -281,9 +281,9 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
return -EIO; return -EIO;
old_cred = override_creds_light(ctx->cred); old_cred = override_creds(ctx->cred);
ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
revert_creds_light(old_cred); revert_creds(old_cred);
if (ctx->accessed) if (ctx->accessed)
ctx->accessed(iocb->ki_filp); ctx->accessed(iocb->ki_filp);
@ -310,11 +310,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
if (ret) if (ret)
return ret; return ret;
old_cred = override_creds_light(ctx->cred); old_cred = override_creds(ctx->cred);
file_start_write(out); file_start_write(out);
ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
file_end_write(out); file_end_write(out);
revert_creds_light(old_cred); revert_creds(old_cred);
if (ctx->end_write) if (ctx->end_write)
ctx->end_write(iocb, ret); ctx->end_write(iocb, ret);
@ -338,9 +338,9 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
vma_set_file(vma, file); vma_set_file(vma, file);
old_cred = override_creds_light(ctx->cred); old_cred = override_creds(ctx->cred);
ret = call_mmap(vma->vm_file, vma); ret = call_mmap(vma->vm_file, vma);
revert_creds_light(old_cred); revert_creds(old_cred);
if (ctx->accessed) if (ctx->accessed)
ctx->accessed(user_file); ctx->accessed(user_file);
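
The backing-file helpers above move from the override_creds_light()/revert_creds_light() pair to plain override_creds()/revert_creds(), the *_light variants having been folded into the plain names. A sketch of the surrounding pattern; demo_splice_as() is invented:

    #include <linux/cred.h>
    #include <linux/fs.h>
    #include <linux/splice.h>

    static ssize_t demo_splice_as(const struct cred *cred, struct file *in,
                                  loff_t *ppos, struct pipe_inode_info *pipe,
                                  size_t len, unsigned int flags)
    {
            const struct cred *old_cred;
            ssize_t ret;

            old_cred = override_creds(cred);        /* act as the backing file's opener */
            ret = vfs_splice_read(in, ppos, pipe, len, flags);
            revert_creds(old_cred);                 /* always restore */

            return ret;
    }
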

View File

@ -13,6 +13,7 @@
#include <linux/falloc.h> #include <linux/falloc.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <trace/events/fscache.h> #include <trace/events/fscache.h>
#include <trace/events/netfs.h>
#include "internal.h" #include "internal.h"
struct cachefiles_kiocb { struct cachefiles_kiocb {
@ -366,6 +367,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
if (term_func) if (term_func)
term_func(term_func_priv, -ENOBUFS, false); term_func(term_func_priv, -ENOBUFS, false);
trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite);
return -ENOBUFS; return -ENOBUFS;
} }
@ -695,6 +697,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
iov_iter_truncate(&subreq->io_iter, len); iov_iter_truncate(&subreq->io_iter, len);
} }
trace_netfs_sreq(subreq, netfs_sreq_trace_cache_prepare);
cachefiles_begin_secure(cache, &saved_cred); cachefiles_begin_secure(cache, &saved_cred);
ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres), ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
&start, &len, len, true); &start, &len, len, true);
@ -704,6 +707,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
return; return;
} }
trace_netfs_sreq(subreq, netfs_sreq_trace_cache_write);
cachefiles_write(&subreq->rreq->cache_resources, cachefiles_write(&subreq->rreq->cache_resources,
subreq->start, &subreq->io_iter, subreq->start, &subreq->io_iter,
netfs_write_subrequest_terminated, subreq); netfs_write_subrequest_terminated, subreq);

View File

@ -77,6 +77,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object)
trace_cachefiles_vfs_error(object, file_inode(file), ret, trace_cachefiles_vfs_error(object, file_inode(file), ret,
cachefiles_trace_setxattr_error); cachefiles_trace_setxattr_error);
trace_cachefiles_coherency(object, file_inode(file)->i_ino, trace_cachefiles_coherency(object, file_inode(file)->i_ino,
be64_to_cpup((__be64 *)buf->data),
buf->content, buf->content,
cachefiles_coherency_set_fail); cachefiles_coherency_set_fail);
if (ret != -ENOMEM) if (ret != -ENOMEM)
@ -85,6 +86,7 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object)
"Failed to set xattr with error %d", ret); "Failed to set xattr with error %d", ret);
} else { } else {
trace_cachefiles_coherency(object, file_inode(file)->i_ino, trace_cachefiles_coherency(object, file_inode(file)->i_ino,
be64_to_cpup((__be64 *)buf->data),
buf->content, buf->content,
cachefiles_coherency_set_ok); cachefiles_coherency_set_ok);
} }
@ -126,7 +128,10 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file
object, object,
"Failed to read aux with error %zd", xlen); "Failed to read aux with error %zd", xlen);
why = cachefiles_coherency_check_xattr; why = cachefiles_coherency_check_xattr;
} else if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) { goto out;
}
if (buf->type != CACHEFILES_COOKIE_TYPE_DATA) {
why = cachefiles_coherency_check_type; why = cachefiles_coherency_check_type;
} else if (memcmp(buf->data, p, len) != 0) { } else if (memcmp(buf->data, p, len) != 0) {
why = cachefiles_coherency_check_aux; why = cachefiles_coherency_check_aux;
@ -141,7 +146,9 @@ int cachefiles_check_auxdata(struct cachefiles_object *object, struct file *file
ret = 0; ret = 0;
} }
out:
trace_cachefiles_coherency(object, file_inode(file)->i_ino, trace_cachefiles_coherency(object, file_inode(file)->i_ino,
be64_to_cpup((__be64 *)buf->data),
buf->content, why); buf->content, why);
kfree(buf); kfree(buf);
return ret; return ret;

View File

@ -253,8 +253,9 @@ static void finish_netfs_read(struct ceph_osd_request *req)
subreq->transferred = err; subreq->transferred = err;
err = 0; err = 0;
} }
subreq->error = err;
trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress); trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
netfs_read_subreq_terminated(subreq, err, false); netfs_read_subreq_terminated(subreq);
iput(req->r_inode); iput(req->r_inode);
ceph_dec_osd_stopping_blocker(fsc->mdsc); ceph_dec_osd_stopping_blocker(fsc->mdsc);
} }
@ -314,7 +315,9 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
out: out:
netfs_read_subreq_terminated(subreq, err, false); subreq->error = err;
trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
netfs_read_subreq_terminated(subreq);
return true; return true;
} }
@ -426,8 +429,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
ceph_osdc_start_request(req->r_osdc, req); ceph_osdc_start_request(req->r_osdc, req);
out: out:
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
if (err) if (err) {
netfs_read_subreq_terminated(subreq, err, false); subreq->error = err;
netfs_read_subreq_terminated(subreq);
}
doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
} }
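
With the reworked netfs read API used above, a filesystem reports completion by stashing the outcome in the subrequest and then calling netfs_read_subreq_terminated() with no extra arguments. A condensed sketch; demo_read_done() is invented:

    #include <linux/netfs.h>

    static void demo_read_done(struct netfs_io_subrequest *subreq,
                               ssize_t bytes_or_err)
    {
            if (bytes_or_err >= 0) {
                    subreq->transferred += bytes_or_err;    /* record progress */
                    subreq->error = 0;
            } else {
                    subreq->error = bytes_or_err;           /* report failure */
            }
            netfs_read_subreq_terminated(subreq);
    }
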

View File

@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_ENCRYPTED(inode)) { if (IS_ENCRYPTED(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations; inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) { } else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_link = (char *)ei->i_data;
inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_op = &ext4_fast_symlink_inode_operations;
nd_terminate_link(ei->i_data, inode->i_size, nd_terminate_link(ei->i_data, inode->i_size,
sizeof(ei->i_data) - 1); sizeof(ei->i_data) - 1);
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else { } else {
inode->i_op = &ext4_symlink_inode_operations; inode->i_op = &ext4_symlink_inode_operations;
} }

View File

@ -3418,7 +3418,6 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_op = &ext4_symlink_inode_operations; inode->i_op = &ext4_symlink_inode_operations;
} else { } else {
inode->i_op = &ext4_fast_symlink_inode_operations; inode->i_op = &ext4_fast_symlink_inode_operations;
inode->i_link = (char *)&EXT4_I(inode)->i_data;
} }
} }
@ -3434,6 +3433,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
disk_link.len); disk_link.len);
inode->i_size = disk_link.len - 1; inode->i_size = disk_link.len - 1;
EXT4_I(inode)->i_disksize = inode->i_size; EXT4_I(inode)->i_disksize = inode->i_size;
if (!IS_ENCRYPTED(inode))
inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data,
inode->i_size);
} }
err = ext4_add_nondir(handle, dentry, &inode); err = ext4_add_nondir(handle, dentry, &inode);
if (handle) if (handle)
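
Both ext4 hunks above replace the bare inode->i_link assignment with inode_set_cached_link(), which records the link body and its length for the VFS. A sketch of how a fast (in-inode) symlink would be published with the new helper; demo_install_fast_symlink() and its storage argument are invented:

    #include <linux/fs.h>
    #include <linux/string.h>

    /* 'storage' must live as long as the inode does (e.g. space inside the
     * filesystem's inode info), which is what makes the link cacheable.
     */
    static void demo_install_fast_symlink(struct inode *inode, char *storage,
                                          const char *target)
    {
            size_t len = strlen(target);

            memcpy(storage, target, len);
            storage[len] = '\0';
            inode->i_size = len;
            inode_set_cached_link(inode, storage, len);
    }
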

View File

@ -187,17 +187,6 @@ static int get_path_from_fd(int fd, struct path *root)
return 0; return 0;
} }
enum handle_to_path_flags {
HANDLE_CHECK_PERMS = (1 << 0),
HANDLE_CHECK_SUBTREE = (1 << 1),
};
struct handle_to_path_ctx {
struct path root;
enum handle_to_path_flags flags;
unsigned int fh_flags;
};
static int vfs_dentry_acceptable(void *context, struct dentry *dentry) static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
{ {
struct handle_to_path_ctx *ctx = context; struct handle_to_path_ctx *ctx = context;
@ -261,50 +250,55 @@ static int do_handle_to_path(struct file_handle *handle, struct path *path,
{ {
int handle_dwords; int handle_dwords;
struct vfsmount *mnt = ctx->root.mnt; struct vfsmount *mnt = ctx->root.mnt;
struct dentry *dentry;
/* change the handle size to multiple of sizeof(u32) */ /* change the handle size to multiple of sizeof(u32) */
handle_dwords = handle->handle_bytes >> 2; handle_dwords = handle->handle_bytes >> 2;
path->dentry = exportfs_decode_fh_raw(mnt, dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle,
(struct fid *)handle->f_handle, handle_dwords, handle->handle_type,
handle_dwords, handle->handle_type, ctx->fh_flags, vfs_dentry_acceptable,
ctx->fh_flags, ctx);
vfs_dentry_acceptable, ctx); if (IS_ERR_OR_NULL(dentry)) {
if (IS_ERR_OR_NULL(path->dentry)) { if (dentry == ERR_PTR(-ENOMEM))
if (path->dentry == ERR_PTR(-ENOMEM))
return -ENOMEM; return -ENOMEM;
return -ESTALE; return -ESTALE;
} }
path->dentry = dentry;
path->mnt = mntget(mnt); path->mnt = mntget(mnt);
return 0; return 0;
} }
/* static inline int may_decode_fh(struct handle_to_path_ctx *ctx,
* Allow relaxed permissions of file handles if the caller has the unsigned int o_flags)
* ability to mount the filesystem or create a bind-mount of the
* provided @mountdirfd.
*
* In both cases the caller may be able to get an unobstructed way to
* the encoded file handle. If the caller is only able to create a
* bind-mount we need to verify that there are no locked mounts on top
* of it that could prevent us from getting to the encoded file.
*
* In principle, locked mounts can prevent the caller from mounting the
* filesystem but that only applies to procfs and sysfs neither of which
* support decoding file handles.
*/
static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
unsigned int o_flags)
{ {
struct path *root = &ctx->root; struct path *root = &ctx->root;
if (capable(CAP_DAC_READ_SEARCH))
return 0;
/* /*
* Restrict to O_DIRECTORY to provide a deterministic API that avoids a * Allow relaxed permissions of file handles if the caller has
* confusing api in the face of disconnected non-dir dentries. * the ability to mount the filesystem or create a bind-mount of
* the provided @mountdirfd.
*
* In both cases the caller may be able to get an unobstructed
* way to the encoded file handle. If the caller is only able to
* create a bind-mount we need to verify that there are no
* locked mounts on top of it that could prevent us from getting
* to the encoded file.
*
* In principle, locked mounts can prevent the caller from
* mounting the filesystem but that only applies to procfs and
* sysfs neither of which support decoding file handles.
*
* Restrict to O_DIRECTORY to provide a deterministic API that
* avoids a confusing api in the face of disconnected non-dir
* dentries.
* *
* There's only one dentry for each directory inode (VFS rule)... * There's only one dentry for each directory inode (VFS rule)...
*/ */
if (!(o_flags & O_DIRECTORY)) if (!(o_flags & O_DIRECTORY))
return false; return -EPERM;
if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
ctx->flags = HANDLE_CHECK_PERMS; ctx->flags = HANDLE_CHECK_PERMS;
@ -314,14 +308,14 @@ static inline bool may_decode_fh(struct handle_to_path_ctx *ctx,
!has_locked_children(real_mount(root->mnt), root->dentry)) !has_locked_children(real_mount(root->mnt), root->dentry))
ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE;
else else
return false; return -EPERM;
/* Are we able to override DAC permissions? */ /* Are we able to override DAC permissions? */
if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH))
return false; return -EPERM;
ctx->fh_flags = EXPORT_FH_DIR_ONLY; ctx->fh_flags = EXPORT_FH_DIR_ONLY;
return true; return 0;
} }
static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
@ -331,15 +325,19 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
struct file_handle f_handle; struct file_handle f_handle;
struct file_handle *handle = NULL; struct file_handle *handle = NULL;
struct handle_to_path_ctx ctx = {}; struct handle_to_path_ctx ctx = {};
const struct export_operations *eops;
retval = get_path_from_fd(mountdirfd, &ctx.root); retval = get_path_from_fd(mountdirfd, &ctx.root);
if (retval) if (retval)
goto out_err; goto out_err;
if (!capable(CAP_DAC_READ_SEARCH) && !may_decode_fh(&ctx, o_flags)) { eops = ctx.root.mnt->mnt_sb->s_export_op;
retval = -EPERM; if (eops && eops->permission)
retval = eops->permission(&ctx, o_flags);
else
retval = may_decode_fh(&ctx, o_flags);
if (retval)
goto out_path; goto out_path;
}
if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
retval = -EFAULT; retval = -EFAULT;
@ -398,29 +396,28 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
int open_flag) int open_flag)
{ {
long retval = 0; long retval = 0;
struct path path; struct path path __free(path_put) = {};
struct file *file; struct file *file;
int fd; const struct export_operations *eops;
retval = handle_to_path(mountdirfd, ufh, &path, open_flag); retval = handle_to_path(mountdirfd, ufh, &path, open_flag);
if (retval) if (retval)
return retval; return retval;
fd = get_unused_fd_flags(open_flag); CLASS(get_unused_fd, fd)(O_CLOEXEC);
if (fd < 0) { if (fd < 0)
path_put(&path);
return fd; return fd;
}
file = file_open_root(&path, "", open_flag, 0); eops = path.mnt->mnt_sb->s_export_op;
if (IS_ERR(file)) { if (eops->open)
put_unused_fd(fd); file = eops->open(&path, open_flag);
retval = PTR_ERR(file); else
} else { file = file_open_root(&path, "", open_flag, 0);
retval = fd; if (IS_ERR(file))
fd_install(fd, file); return PTR_ERR(file);
}
path_put(&path); fd_install(fd, file);
return retval; return take_fd(fd);
} }
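
do_handle_open() above is converted to scope-based cleanup: __free(path_put) releases the path automatically, and the get_unused_fd class returns the reserved descriptor unless it is claimed with take_fd(). A condensed sketch of the same pattern outside the handle code; demo_open_path() is invented and opens by name purely for illustration:

    #include <linux/cleanup.h>
    #include <linux/err.h>
    #include <linux/fcntl.h>
    #include <linux/file.h>
    #include <linux/fs.h>
    #include <linux/namei.h>
    #include <linux/path.h>

    static long demo_open_path(const char *name, int open_flag)
    {
            struct path path __free(path_put) = {}; /* auto path_put() on return */
            struct file *file;
            int err;

            err = kern_path(name, LOOKUP_FOLLOW, &path);
            if (err)
                    return err;

            CLASS(get_unused_fd, fd)(O_CLOEXEC);    /* auto put_unused_fd() unless taken */
            if (fd < 0)
                    return fd;

            file = file_open_root(&path, "", open_flag, 0);
            if (IS_ERR(file))
                    return PTR_ERR(file);

            fd_install(fd, file);
            return take_fd(fd);                     /* disarm the fd cleanup */
    }
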
/** /**

View File

@ -279,10 +279,6 @@ static int expand_files(struct files_struct *files, unsigned int nr)
if (nr < fdt->max_fds) if (nr < fdt->max_fds)
return 0; return 0;
/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;
if (unlikely(files->resize_in_progress)) { if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock); spin_unlock(&files->file_lock);
wait_event(files->resize_wait, !files->resize_in_progress); wait_event(files->resize_wait, !files->resize_in_progress);
@ -290,6 +286,10 @@ static int expand_files(struct files_struct *files, unsigned int nr)
goto repeat; goto repeat;
} }
/* Can we expand? */
if (unlikely(nr >= sysctl_nr_open))
return -EMFILE;
/* All good, so we try */ /* All good, so we try */
files->resize_in_progress = true; files->resize_in_progress = true;
error = expand_fdtable(files, nr); error = expand_fdtable(files, nr);
@ -1231,17 +1231,9 @@ __releases(&files->file_lock)
/* /*
* We need to detect attempts to do dup2() over allocated but still * We need to detect attempts to do dup2() over allocated but still
* not finished descriptor. NB: OpenBSD avoids that at the price of * not finished descriptor.
* extra work in their equivalent of fget() - they insert struct *
* file immediately after grabbing descriptor, mark it larval if * POSIX is silent on the issue, we return -EBUSY.
* more work (e.g. actual opening) is needed and make sure that
* fget() treats larval files as absent. Potentially interesting,
* but while extra work in fget() is trivial, locking implications
* and amount of surgery on open()-related paths in VFS are not.
* FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
* deadlocks in rather amusing ways, AFAICS. All of that is out of
* scope of POSIX or SUS, since neither considers shared descriptor
* tables and this condition does not arise without those.
*/ */
fdt = files_fdtable(files); fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds); fd = array_index_nospec(fd, fdt->max_fds);

@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = {
.data = &sysctl_nr_open, .data = &sysctl_nr_open,
.maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec_minmax, .proc_handler = proc_douintvec_minmax,
.extra1 = &sysctl_nr_open_min, .extra1 = &sysctl_nr_open_min,
.extra2 = &sysctl_nr_open_max, .extra2 = &sysctl_nr_open_max,
}, },
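For context on the fs.nr_open sysctl entry adjusted above (its handler becomes proc_douintvec_minmax, i.e. the value is parsed as unsigned): from userspace it is the ceiling for RLIMIT_NOFILE, and raising the hard limit past it is rejected. A small sketch, assuming procfs is mounted at /proc:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
    FILE *f = fopen("/proc/sys/fs/nr_open", "r");
    unsigned long nr_open = 0;
    struct rlimit rl;

    if (!f || fscanf(f, "%lu", &nr_open) != 1)
        return 1;
    fclose(f);

    getrlimit(RLIMIT_NOFILE, &rl);
    printf("fs.nr_open=%lu  RLIMIT_NOFILE soft=%llu hard=%llu\n",
           nr_open,
           (unsigned long long)rl.rlim_cur,
           (unsigned long long)rl.rlim_max);

    /* Trying to lift the hard limit above fs.nr_open fails (EPERM). */
    rl.rlim_max = nr_open + 1;
    rl.rlim_cur = rl.rlim_max;
    if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
        perror("setrlimit above fs.nr_open");
    return 0;
}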
@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work)
__fput(container_of(work, struct file, f_task_work)); __fput(container_of(work, struct file, f_task_work));
} }
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
/* /*
* If kernel thread really needs to have the final fput() it has done * If kernel thread really needs to have the final fput() it has done
* to complete, call this. The only user right now is the boot - we * to complete, call this. The only user right now is the boot - we
@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work)
void flush_delayed_fput(void) void flush_delayed_fput(void)
{ {
delayed_fput(NULL); delayed_fput(NULL);
flush_delayed_work(&delayed_fput_work);
} }
EXPORT_SYMBOL_GPL(flush_delayed_fput); EXPORT_SYMBOL_GPL(flush_delayed_fput);
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file) void fput(struct file *file)
{ {
if (file_ref_put(&file->f_ref)) { if (file_ref_put(&file->f_ref)) {

@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc)
if (log) { if (log) {
if (refcount_dec_and_test(&log->usage)) { if (refcount_dec_and_test(&log->usage)) {
fc->log.log = NULL; fc->log.log = NULL;
for (i = 0; i <= 7; i++) for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++)
if (log->need_free & (1 << i)) if (log->need_free & (1 << i))
kfree(log->buffer[i]); kfree(log->buffer[i]);
kfree(log); kfree(log);
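The loop-bound fix above replaces a hard-coded 8 with ARRAY_SIZE(log->buffer), so the iteration can never drift out of sync with the array declaration. For readers less familiar with the idiom, a tiny userspace equivalent (the kernel macro additionally rejects non-array arguments at compile time):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

int main(void)
{
    const char *buffer[8] = { "alpha", "beta", "gamma" };

    /* The bound follows the declaration automatically. */
    for (size_t i = 0; i < ARRAY_SIZE(buffer); i++)
        if (buffer[i])
            printf("slot %zu: %s\n", i, buffer[i]);
    return 0;
}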

@ -245,9 +245,17 @@ const struct inode_operations simple_dir_inode_operations = {
}; };
EXPORT_SYMBOL(simple_dir_inode_operations); EXPORT_SYMBOL(simple_dir_inode_operations);
/* 0 is '.', 1 is '..', so always start with offset 2 or more */ /* simple_offset_add() allocation range */
enum { enum {
DIR_OFFSET_MIN = 2, DIR_OFFSET_MIN = 3,
DIR_OFFSET_MAX = LONG_MAX - 1,
};
/* simple_offset_add() never assigns these to a dentry */
enum {
DIR_OFFSET_FIRST = 2, /* Find first real entry */
DIR_OFFSET_EOD = LONG_MAX, /* Marks EOD */
}; };
static void offset_set(struct dentry *dentry, long offset) static void offset_set(struct dentry *dentry, long offset)
@ -291,8 +299,11 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
return -EBUSY; return -EBUSY;
ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
LONG_MAX, &octx->next_offset, GFP_KERNEL); DIR_OFFSET_MAX, &octx->next_offset,
if (ret < 0) GFP_KERNEL);
if (unlikely(ret == -EBUSY))
return -ENOSPC;
if (unlikely(ret < 0))
return ret; return ret;
offset_set(dentry, offset); offset_set(dentry, offset);
@ -329,38 +340,6 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
offset_set(dentry, 0); offset_set(dentry, 0);
} }
/**
* simple_offset_empty - Check if a dentry can be unlinked
* @dentry: dentry to be tested
*
* Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
*/
int simple_offset_empty(struct dentry *dentry)
{
struct inode *inode = d_inode(dentry);
struct offset_ctx *octx;
struct dentry *child;
unsigned long index;
int ret = 1;
if (!inode || !S_ISDIR(inode->i_mode))
return ret;
index = DIR_OFFSET_MIN;
octx = inode->i_op->get_offset_ctx(inode);
mt_for_each(&octx->mt, child, index, LONG_MAX) {
spin_lock(&child->d_lock);
if (simple_positive(child)) {
spin_unlock(&child->d_lock);
ret = 0;
break;
}
spin_unlock(&child->d_lock);
}
return ret;
}
/** /**
* simple_offset_rename - handle directory offsets for rename * simple_offset_rename - handle directory offsets for rename
* @old_dir: parent directory of source entry * @old_dir: parent directory of source entry
@ -454,14 +433,6 @@ void simple_offset_destroy(struct offset_ctx *octx)
mtree_destroy(&octx->mt); mtree_destroy(&octx->mt);
} }
static int offset_dir_open(struct inode *inode, struct file *file)
{
struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
file->private_data = (void *)ctx->next_offset;
return 0;
}
/** /**
* offset_dir_llseek - Advance the read position of a directory descriptor * offset_dir_llseek - Advance the read position of a directory descriptor
* @file: an open directory whose position is to be updated * @file: an open directory whose position is to be updated
@ -475,9 +446,6 @@ static int offset_dir_open(struct inode *inode, struct file *file)
*/ */
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{ {
struct inode *inode = file->f_inode;
struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
switch (whence) { switch (whence) {
case SEEK_CUR: case SEEK_CUR:
offset += file->f_pos; offset += file->f_pos;
@ -490,25 +458,46 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL; return -EINVAL;
} }
/* In this case, ->private_data is protected by f_pos_lock */
if (!offset)
file->private_data = (void *)ctx->next_offset;
return vfs_setpos(file, offset, LONG_MAX); return vfs_setpos(file, offset, LONG_MAX);
} }
static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) /* Cf. find_next_child() */
static struct dentry *find_next_sibling_locked(struct dentry *parent,
struct dentry *dentry)
{ {
MA_STATE(mas, &octx->mt, offset, offset); struct dentry *found = NULL;
hlist_for_each_entry_from(dentry, d_sib) {
if (!simple_positive(dentry))
continue;
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(dentry))
found = dget_dlock(dentry);
spin_unlock(&dentry->d_lock);
if (likely(found))
break;
}
return found;
}
static noinline_for_stack struct dentry *
offset_dir_lookup(struct file *file, loff_t offset)
{
struct dentry *parent = file->f_path.dentry;
struct dentry *child, *found = NULL; struct dentry *child, *found = NULL;
struct inode *inode = d_inode(parent);
struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
MA_STATE(mas, &octx->mt, offset, offset);
rcu_read_lock(); rcu_read_lock();
child = mas_find(&mas, LONG_MAX); child = mas_find(&mas, DIR_OFFSET_MAX);
if (!child) if (!child)
goto out; goto out;
spin_lock(&child->d_lock);
if (simple_positive(child)) spin_lock(&parent->d_lock);
found = dget_dlock(child); found = find_next_sibling_locked(parent, child);
spin_unlock(&child->d_lock); spin_unlock(&parent->d_lock);
out: out:
rcu_read_unlock(); rcu_read_unlock();
return found; return found;
@ -517,35 +506,46 @@ static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{ {
struct inode *inode = d_inode(dentry); struct inode *inode = d_inode(dentry);
long offset = dentry2offset(dentry);
return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
inode->i_ino, fs_umode_to_dtype(inode->i_mode)); inode->i_ino, fs_umode_to_dtype(inode->i_mode));
} }
static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index) static void offset_iterate_dir(struct file *file, struct dir_context *ctx)
{ {
struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dir = file->f_path.dentry;
struct dentry *dentry; struct dentry *dentry;
if (ctx->pos == DIR_OFFSET_FIRST) {
spin_lock(&dir->d_lock);
dentry = find_next_sibling_locked(dir, d_first_child(dir));
spin_unlock(&dir->d_lock);
} else
dentry = offset_dir_lookup(file, ctx->pos);
if (!dentry)
goto out_eod;
while (true) { while (true) {
dentry = offset_find_next(octx, ctx->pos); struct dentry *next;
if (!dentry)
return;
if (dentry2offset(dentry) >= last_index) { ctx->pos = dentry2offset(dentry);
dput(dentry); if (!offset_dir_emit(ctx, dentry))
return; break;
}
if (!offset_dir_emit(ctx, dentry)) { spin_lock(&dir->d_lock);
dput(dentry); next = find_next_sibling_locked(dir, d_next_sibling(dentry));
return; spin_unlock(&dir->d_lock);
}
ctx->pos = dentry2offset(dentry) + 1;
dput(dentry); dput(dentry);
if (!next)
goto out_eod;
dentry = next;
} }
dput(dentry);
return;
out_eod:
ctx->pos = DIR_OFFSET_EOD;
} }
/** /**
@ -565,6 +565,8 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon
* *
* On return, @ctx->pos contains an offset that will read the next entry * On return, @ctx->pos contains an offset that will read the next entry
* in this directory when offset_readdir() is called again with @ctx. * in this directory when offset_readdir() is called again with @ctx.
* Caller places this value in the d_off field of the last entry in the
* user's buffer.
* *
* Return values: * Return values:
* %0 - Complete * %0 - Complete
@ -572,19 +574,17 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon
static int offset_readdir(struct file *file, struct dir_context *ctx) static int offset_readdir(struct file *file, struct dir_context *ctx)
{ {
struct dentry *dir = file->f_path.dentry; struct dentry *dir = file->f_path.dentry;
long last_index = (long)file->private_data;
lockdep_assert_held(&d_inode(dir)->i_rwsem); lockdep_assert_held(&d_inode(dir)->i_rwsem);
if (!dir_emit_dots(file, ctx)) if (!dir_emit_dots(file, ctx))
return 0; return 0;
if (ctx->pos != DIR_OFFSET_EOD)
offset_iterate_dir(d_inode(dir), ctx, last_index); offset_iterate_dir(file, ctx);
return 0; return 0;
} }
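The offset rework above changes which values tmpfs-style directories report through ctx->pos, and therefore through the d_off field userspace sees, but the contract stated in the comment is unchanged: the offset attached to one entry is where to seek in order to resume at the next entry. A hedged sketch using the raw getdents64(2) interface (struct layout as documented in the man page; /tmp is only an example, and running it on a tmpfs directory exercises these helpers specifically):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct linux_dirent64 {
    unsigned long long d_ino;
    long long          d_off;     /* position of the next entry */
    unsigned short     d_reclen;
    unsigned char      d_type;
    char               d_name[];
};

int main(void)
{
    char buf[4096];
    long long resume = 0;
    ssize_t n;
    int fd = open("/tmp", O_RDONLY | O_DIRECTORY);

    if (fd < 0)
        return 1;

    /* First batch: remember the d_off of the last entry we saw. */
    n = syscall(SYS_getdents64, fd, buf, sizeof(buf));
    for (ssize_t pos = 0; pos < n; ) {
        struct linux_dirent64 *d = (void *)(buf + pos);

        printf("%-24s d_off=%lld\n", d->d_name, d->d_off);
        resume = d->d_off;
        pos += d->d_reclen;
    }

    /* Seeking to that offset resumes the listing after that entry. */
    lseek(fd, resume, SEEK_SET);
    n = syscall(SYS_getdents64, fd, buf, sizeof(buf));
    printf("resumed read returned %zd bytes\n", n);
    close(fd);
    return 0;
}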
const struct file_operations simple_offset_dir_operations = { const struct file_operations simple_offset_dir_operations = {
.open = offset_dir_open,
.llseek = offset_dir_llseek, .llseek = offset_dir_llseek,
.iterate_shared = offset_readdir, .iterate_shared = offset_readdir,
.read = generic_read_dir, .read = generic_read_dir,
@ -673,6 +673,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = PAGE_SHIFT; s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = ctx->magic; s->s_magic = ctx->magic;
s->s_op = ctx->ops ?: &simple_super_operations; s->s_op = ctx->ops ?: &simple_super_operations;
s->s_export_op = ctx->eops;
s->s_xattr = ctx->xattr; s->s_xattr = ctx->xattr;
s->s_time_gran = 1; s->s_time_gran = 1;
root = new_inode(s); root = new_inode(s);

@ -8,15 +8,23 @@
struct mnt_namespace { struct mnt_namespace {
struct ns_common ns; struct ns_common ns;
struct mount * root; struct mount * root;
struct rb_root mounts; /* Protected by namespace_sem */ struct {
struct rb_root mounts; /* Protected by namespace_sem */
struct rb_node *mnt_last_node; /* last (rightmost) mount in the rbtree */
struct rb_node *mnt_first_node; /* first (leftmost) mount in the rbtree */
};
struct user_namespace *user_ns; struct user_namespace *user_ns;
struct ucounts *ucounts; struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */ u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll; union {
wait_queue_head_t poll;
struct rcu_head mnt_ns_rcu;
};
u64 event; u64 event;
unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts; unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */
refcount_t passive; /* number references not pinning @mounts */ refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout; } __randomize_layout;
@ -38,6 +46,7 @@ struct mount {
struct dentry *mnt_mountpoint; struct dentry *mnt_mountpoint;
struct vfsmount mnt; struct vfsmount mnt;
union { union {
struct rb_node mnt_node; /* node in the ns->mounts rbtree */
struct rcu_head mnt_rcu; struct rcu_head mnt_rcu;
struct llist_node mnt_llist; struct llist_node mnt_llist;
}; };
@ -51,10 +60,7 @@ struct mount {
struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
union { struct list_head mnt_list;
struct rb_node mnt_node; /* Under ns->mounts */
struct list_head mnt_list;
};
struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */ struct list_head mnt_slave_list;/* list of slave mounts */
@ -145,24 +151,28 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
return ns->seq == 0; return ns->seq == 0;
} }
static inline bool mnt_ns_attached(const struct mount *mnt)
{
return !RB_EMPTY_NODE(&mnt->mnt_node);
}
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{ {
WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); struct mnt_namespace *ns = mnt->mnt_ns;
mnt->mnt.mnt_flags &= ~MNT_ONRB; WARN_ON(!mnt_ns_attached(mnt));
rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); if (ns->mnt_last_node == &mnt->mnt_node)
ns->mnt_last_node = rb_prev(&mnt->mnt_node);
if (ns->mnt_first_node == &mnt->mnt_node)
ns->mnt_first_node = rb_next(&mnt->mnt_node);
rb_erase(&mnt->mnt_node, &ns->mounts);
RB_CLEAR_NODE(&mnt->mnt_node);
list_add_tail(&mnt->mnt_list, dt_list); list_add_tail(&mnt->mnt_list, dt_list);
} }
bool has_locked_children(struct mount *mnt, struct dentry *dentry); bool has_locked_children(struct mount *mnt, struct dentry *dentry);
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns) bool previous);
{
return __lookup_next_mnt_ns(mntns, false);
}
static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
{
return __lookup_next_mnt_ns(mntns, true);
}
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{ {
return container_of(ns, struct mnt_namespace, ns); return container_of(ns, struct mnt_namespace, ns);

@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
getname(newname), 0); getname(newname), 0);
} }
int readlink_copy(char __user *buffer, int buflen, const char *link) int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
{ {
int len = PTR_ERR(link); int copylen;
if (IS_ERR(link))
goto out;
len = strlen(link); copylen = linklen;
if (len > (unsigned) buflen) if (unlikely(copylen > (unsigned) buflen))
len = buflen; copylen = buflen;
if (copy_to_user(buffer, link, len)) if (copy_to_user(buffer, link, copylen))
len = -EFAULT; copylen = -EFAULT;
out: return copylen;
return len;
} }
/** /**
@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
const char *link; const char *link;
int res; int res;
if (inode->i_opflags & IOP_CACHED_LINK)
return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
if (unlikely(inode->i_op->readlink)) if (unlikely(inode->i_op->readlink))
return inode->i_op->readlink(dentry, buffer, buflen); return inode->i_op->readlink(dentry, buffer, buflen);
@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
if (IS_ERR(link)) if (IS_ERR(link))
return PTR_ERR(link); return PTR_ERR(link);
} }
res = readlink_copy(buffer, buflen, link); res = readlink_copy(buffer, buflen, link, strlen(link));
do_delayed_call(&done); do_delayed_call(&done);
return res; return res;
} }
@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link);
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{ {
const char *link;
int res;
DEFINE_DELAYED_CALL(done); DEFINE_DELAYED_CALL(done);
int res = readlink_copy(buffer, buflen, link = page_get_link(dentry, d_inode(dentry), &done);
page_get_link(dentry, d_inode(dentry), res = PTR_ERR(link);
&done)); if (!IS_ERR(link))
res = readlink_copy(buffer, buflen, link, strlen(link));
do_delayed_call(&done); do_delayed_call(&done);
return res; return res;
} }
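readlink_copy() above now takes an explicit length so callers with a cached symlink body (IOP_CACHED_LINK) can skip the strlen(); nothing changes for userspace, where readlink(2) keeps its long-standing quirks: the result is not NUL-terminated and is silently truncated to the supplied buffer. A quick sketch:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[256];
    ssize_t len = readlink("/proc/self/exe", buf, sizeof(buf));

    if (len < 0) {
        perror("readlink");
        return 1;
    }
    /* %.*s because the kernel never NUL-terminates the copy. */
    printf("/proc/self/exe -> %.*s (len=%zd)\n", (int)len, buf, len);
    return 0;
}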

@ -32,7 +32,6 @@
#include <linux/fs_context.h> #include <linux/fs_context.h>
#include <linux/shmem_fs.h> #include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h> #include <linux/mnt_idmapping.h>
#include <linux/nospec.h>
#include "pnode.h" #include "pnode.h"
#include "internal.h" #include "internal.h"
@ -66,12 +65,12 @@ static int __init set_mphash_entries(char *str)
__setup("mphash_entries=", set_mphash_entries); __setup("mphash_entries=", set_mphash_entries);
static u64 event; static u64 event;
static DEFINE_IDA(mnt_id_ida); static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida); static DEFINE_IDA(mnt_group_ida);
/* Don't allow confusion with old 32bit mount ID */ /* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31) #define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET); static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init;
@ -79,8 +78,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem); static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static DEFINE_RWLOCK(mnt_ns_tree_lock); static DEFINE_SEQLOCK(mnt_ns_tree_lock);
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
struct mount_kattr { struct mount_kattr {
unsigned int attr_set; unsigned int attr_set;
@ -106,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/ */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
{
u64 seq_b = ns->seq;
if (seq < seq_b)
return -1;
if (seq > seq_b)
return 1;
return 0;
}
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{ {
if (!node) if (!node)
@ -124,24 +114,53 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
} }
static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{ {
struct mnt_namespace *ns_a = node_to_mnt_ns(a); struct mnt_namespace *ns_a = node_to_mnt_ns(a);
struct mnt_namespace *ns_b = node_to_mnt_ns(b); struct mnt_namespace *ns_b = node_to_mnt_ns(b);
u64 seq_a = ns_a->seq; u64 seq_a = ns_a->seq;
u64 seq_b = ns_b->seq;
return mnt_ns_cmp(seq_a, ns_b) < 0; if (seq_a < seq_b)
return -1;
if (seq_a > seq_b)
return 1;
return 0;
}
static inline void mnt_ns_tree_write_lock(void)
{
write_seqlock(&mnt_ns_tree_lock);
}
static inline void mnt_ns_tree_write_unlock(void)
{
write_sequnlock(&mnt_ns_tree_lock);
} }
static void mnt_ns_tree_add(struct mnt_namespace *ns) static void mnt_ns_tree_add(struct mnt_namespace *ns)
{ {
guard(write_lock)(&mnt_ns_tree_lock); struct rb_node *node, *prev;
rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
mnt_ns_tree_write_lock();
node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
/*
* If there's no previous entry simply add it after the
* head and if there is add it after the previous entry.
*/
prev = rb_prev(&ns->mnt_ns_tree_node);
if (!prev)
list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
else
list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
mnt_ns_tree_write_unlock();
WARN_ON_ONCE(node);
} }
static void mnt_ns_release(struct mnt_namespace *ns) static void mnt_ns_release(struct mnt_namespace *ns)
{ {
lockdep_assert_not_held(&mnt_ns_tree_lock); lockdep_assert_not_held(&mnt_ns_tree_lock.lock);
/* keep alive for {list,stat}mount() */ /* keep alive for {list,stat}mount() */
if (refcount_dec_and_test(&ns->passive)) { if (refcount_dec_and_test(&ns->passive)) {
@ -151,41 +170,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
} }
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
}
static void mnt_ns_tree_remove(struct mnt_namespace *ns) static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{ {
/* remove from global mount namespace list */ /* remove from global mount namespace list */
if (!is_anon_ns(ns)) { if (!is_anon_ns(ns)) {
guard(write_lock)(&mnt_ns_tree_lock); mnt_ns_tree_write_lock();
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
list_bidir_del_rcu(&ns->mnt_ns_list);
mnt_ns_tree_write_unlock();
} }
mnt_ns_release(ns); call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
} }
/* static int mnt_ns_find(const void *key, const struct rb_node *node)
* Returns the mount namespace which either has the specified id, or has the
* next smallest id afer the specified one.
*/
static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
{ {
struct rb_node *node = mnt_ns_tree.rb_node; const u64 mnt_ns_id = *(u64 *)key;
struct mnt_namespace *ret = NULL; const struct mnt_namespace *ns = node_to_mnt_ns(node);
lockdep_assert_held(&mnt_ns_tree_lock); if (mnt_ns_id < ns->seq)
return -1;
while (node) { if (mnt_ns_id > ns->seq)
struct mnt_namespace *n = node_to_mnt_ns(node); return 1;
return 0;
if (mnt_ns_id <= n->seq) {
ret = node_to_mnt_ns(node);
if (mnt_ns_id == n->seq)
break;
node = node->rb_left;
} else {
node = node->rb_right;
}
}
return ret;
} }
/* /*
@ -195,18 +207,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
* namespace the @namespace_sem must first be acquired. If the namespace has * namespace the @namespace_sem must first be acquired. If the namespace has
* already shut down before acquiring @namespace_sem, {list,stat}mount() will * already shut down before acquiring @namespace_sem, {list,stat}mount() will
* see that the mount rbtree of the namespace is empty. * see that the mount rbtree of the namespace is empty.
*
* Note the lookup is lockless protected by a sequence counter. We only
* need to guard against false negatives as false positives aren't
* possible. So if we didn't find a mount namespace and the sequence
* counter has changed we need to retry. If the sequence counter is
* still the same we know the search actually failed.
*/ */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{ {
struct mnt_namespace *ns; struct mnt_namespace *ns;
struct rb_node *node;
unsigned int seq;
guard(read_lock)(&mnt_ns_tree_lock); guard(rcu)();
ns = mnt_ns_find_id_at(mnt_ns_id); do {
if (!ns || ns->seq != mnt_ns_id) seq = read_seqbegin(&mnt_ns_tree_lock);
return NULL; node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
if (node)
break;
} while (read_seqretry(&mnt_ns_tree_lock, seq));
refcount_inc(&ns->passive); if (!node)
return ns; return NULL;
/*
* The last reference count is put with RCU delay so we can
* unconditionally acquire a reference here.
*/
ns = node_to_mnt_ns(node);
refcount_inc(&ns->passive);
return ns;
} }
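The comment above spells out the lockless lookup discipline: false positives are impossible, so only a failed search has to be retried when the write-side sequence count moved underneath it. The sketch below is not the kernel's seqlock_t; it is a reduced userspace model of that retry shape built on C11 atomics, with deliberately simplified memory ordering and made-up names.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned int seq;   /* even: stable, odd: write in flight */
static _Atomic int table[16];      /* toy "tree": one slot per id */

static void writer_insert(int id)
{
    atomic_fetch_add_explicit(&seq, 1, memory_order_release);   /* -> odd */
    atomic_store_explicit(&table[id], 1, memory_order_relaxed);
    atomic_fetch_add_explicit(&seq, 1, memory_order_release);   /* -> even */
}

static bool reader_lookup(int id)
{
    unsigned int begin;
    bool found;

    do {
        /* read_seqbegin(): wait out an in-flight writer. */
        do {
            begin = atomic_load_explicit(&seq, memory_order_acquire);
        } while (begin & 1);

        found = atomic_load_explicit(&table[id], memory_order_relaxed);
        if (found)
            break;        /* hits never need a retry */

        /* read_seqretry(): retry the miss only if the writer moved. */
    } while (atomic_load_explicit(&seq, memory_order_acquire) != begin);

    return found;
}

int main(void)
{
    writer_insert(3);
    printf("id 3: %s, id 5: %s\n",
           reader_lookup(3) ? "found" : "missing",
           reader_lookup(5) ? "found" : "missing");
    return 0;
}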
static inline void lock_mount_hash(void) static inline void lock_mount_hash(void)
@ -236,18 +267,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)
static int mnt_alloc_id(struct mount *mnt) static int mnt_alloc_id(struct mount *mnt)
{ {
int res = ida_alloc(&mnt_id_ida, GFP_KERNEL); int res;
if (res < 0) xa_lock(&mnt_id_xa);
return res; res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
mnt->mnt_id = res; if (!res)
mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr); mnt->mnt_id_unique = ++mnt_id_ctr;
return 0; xa_unlock(&mnt_id_xa);
return res;
} }
static void mnt_free_id(struct mount *mnt) static void mnt_free_id(struct mount *mnt)
{ {
ida_free(&mnt_id_ida, mnt->mnt_id); xa_erase(&mnt_id_xa, mnt->mnt_id);
} }
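The allocator above hands out two identifiers per mount: a small, reusable one from the xarray and a 64-bit unique one starting at MNT_UNIQUE_ID_OFFSET. Userspace can read the unique ID with statx(2) and STATX_MNT_ID_UNIQUE; a sketch, defining the mask bit locally in case the libc headers predate it (0x4000 in current UAPI headers):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

#ifndef STATX_MNT_ID_UNIQUE
#define STATX_MNT_ID_UNIQUE 0x00004000U
#endif

int main(void)
{
    struct statx stx;

    if (statx(AT_FDCWD, "/", 0, STATX_MNT_ID_UNIQUE, &stx) != 0) {
        perror("statx");
        return 1;
    }
    if (stx.stx_mask & STATX_MNT_ID_UNIQUE)
        printf("unique mount id of /: %llu\n",
               (unsigned long long)stx.stx_mnt_id);
    else
        printf("kernel did not report a unique mount id\n");
    return 0;
}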
/* /*
@ -344,6 +376,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children); INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
RB_CLEAR_NODE(&mnt->mnt_node);
mnt->mnt.mnt_idmap = &nop_mnt_idmap; mnt->mnt.mnt_idmap = &nop_mnt_idmap;
} }
return mnt; return mnt;
@ -1123,19 +1156,27 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{ {
struct rb_node **link = &ns->mounts.rb_node; struct rb_node **link = &ns->mounts.rb_node;
struct rb_node *parent = NULL; struct rb_node *parent = NULL;
bool mnt_first_node = true, mnt_last_node = true;
WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); WARN_ON(mnt_ns_attached(mnt));
mnt->mnt_ns = ns; mnt->mnt_ns = ns;
while (*link) { while (*link) {
parent = *link; parent = *link;
if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
link = &parent->rb_left; link = &parent->rb_left;
else mnt_last_node = false;
} else {
link = &parent->rb_right; link = &parent->rb_right;
mnt_first_node = false;
}
} }
if (mnt_last_node)
ns->mnt_last_node = &mnt->mnt_node;
if (mnt_first_node)
ns->mnt_first_node = &mnt->mnt_node;
rb_link_node(&mnt->mnt_node, parent, link); rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts); rb_insert_color(&mnt->mnt_node, &ns->mounts);
mnt->mnt.mnt_flags |= MNT_ONRB;
} }
/* /*
@ -1305,7 +1346,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
} }
mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags = old->mnt.mnt_flags;
mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
atomic_inc(&sb->s_active); atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@ -1763,7 +1804,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
/* Gather the mounts to umount */ /* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) { for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT; p->mnt.mnt_flags |= MNT_UMOUNT;
if (p->mnt.mnt_flags & MNT_ONRB) if (mnt_ns_attached(p))
move_from_ns(p, &tmp_list); move_from_ns(p, &tmp_list);
else else
list_move(&p->mnt_list, &tmp_list); list_move(&p->mnt_list, &tmp_list);
@ -1912,16 +1953,14 @@ static int do_umount(struct mount *mnt, int flags)
event++; event++;
if (flags & MNT_DETACH) { if (flags & MNT_DETACH) {
if (mnt->mnt.mnt_flags & MNT_ONRB || if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE); umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0; retval = 0;
} else { } else {
shrink_submounts(mnt); shrink_submounts(mnt);
retval = -EBUSY; retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) { if (!propagate_mount_busy(mnt, 2)) {
if (mnt->mnt.mnt_flags & MNT_ONRB || if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0; retval = 0;
} }
@ -2071,30 +2110,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
return &mnt->ns; return &mnt->ns;
} }
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous) struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{ {
guard(read_lock)(&mnt_ns_tree_lock); guard(rcu)();
for (;;) { for (;;) {
struct rb_node *node; struct list_head *list;
if (previous) if (previous)
node = rb_prev(&mntns->mnt_ns_tree_node); list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
else else
node = rb_next(&mntns->mnt_ns_tree_node); list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
if (!node) if (list_is_head(list, &mnt_ns_list))
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
mntns = node_to_mnt_ns(node); mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
node = &mntns->mnt_ns_tree_node;
/*
* The last passive reference count is put with RCU
* delay so accessing the mount namespace is not just
* safe but all relevant members are still valid.
*/
if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
continue; continue;
/* /*
* Holding mnt_ns_tree_lock prevents the mount namespace from * We need an active reference count as we're persisting
* being freed but it may well be on it's deathbed. We want an * the mount namespace and it might already be on its
* active reference, not just a passive one here as we're * deathbed.
* persisting the mount namespace.
*/ */
if (!refcount_inc_not_zero(&mntns->ns.count)) if (!refcount_inc_not_zero(&mntns->ns.count))
continue; continue;
@ -3911,6 +3954,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1); refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT; new_ns->mounts = RB_ROOT;
INIT_LIST_HEAD(&new_ns->mnt_ns_list);
RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll); init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns); new_ns->user_ns = get_user_ns(user_ns);
@ -3990,7 +4034,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root) while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old); p = next_mnt(skip_mnt_tree(p), old);
} }
mnt_ns_tree_add(new_ns);
namespace_unlock(); namespace_unlock();
if (rootmnt) if (rootmnt)
@ -3998,6 +4041,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
if (pwdmnt) if (pwdmnt)
mntput(pwdmnt); mntput(pwdmnt);
mnt_ns_tree_add(new_ns);
return new_ns; return new_ns;
} }
@ -5044,6 +5088,10 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
if (sb->s_op->show_options) { if (sb->s_op->show_options) {
size_t start = seq->count; size_t start = seq->count;
err = security_sb_show_options(seq, sb);
if (err)
return err;
err = sb->s_op->show_options(seq, mnt->mnt_root); err = sb->s_op->show_options(seq, mnt->mnt_root);
if (err) if (err)
return err; return err;
@ -5531,9 +5579,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
if (!last_mnt_id) { if (!last_mnt_id) {
if (reverse) if (reverse)
first = node_to_mount(rb_last(&ns->mounts)); first = node_to_mount(ns->mnt_last_node);
else else
first = node_to_mount(rb_first(&ns->mounts)); first = node_to_mount(ns->mnt_first_node);
} else { } else {
if (reverse) if (reverse)
first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);

@ -13,8 +13,11 @@ netfs-y := \
read_collect.o \ read_collect.o \
read_pgpriv2.o \ read_pgpriv2.o \
read_retry.o \ read_retry.o \
read_single.o \
rolling_buffer.o \
write_collect.o \ write_collect.o \
write_issue.o write_issue.o \
write_retry.o
netfs-$(CONFIG_NETFS_STATS) += stats.o netfs-$(CONFIG_NETFS_STATS) += stats.o

@ -63,37 +63,6 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx)); return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
} }
/*
* Decant the list of folios to read into a rolling buffer.
*/
static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
struct folio_queue *folioq,
struct folio_batch *put_batch)
{
unsigned int order, nr;
size_t size = 0;
nr = __readahead_batch(rreq->ractl, (struct page **)folioq->vec.folios,
ARRAY_SIZE(folioq->vec.folios));
folioq->vec.nr = nr;
for (int i = 0; i < nr; i++) {
struct folio *folio = folioq_folio(folioq, i);
trace_netfs_folio(folio, netfs_folio_trace_read);
order = folio_order(folio);
folioq->orders[i] = order;
size += PAGE_SIZE << order;
if (!folio_batch_add(put_batch, folio))
folio_batch_release(put_batch);
}
for (int i = nr; i < folioq_nr_slots(folioq); i++)
folioq_clear(folioq, i);
return size;
}
/* /*
* netfs_prepare_read_iterator - Prepare the subreq iterator for I/O * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O
* @subreq: The subrequest to be set up * @subreq: The subrequest to be set up
@ -128,19 +97,12 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
folio_batch_init(&put_batch); folio_batch_init(&put_batch);
while (rreq->submitted < subreq->start + rsize) { while (rreq->submitted < subreq->start + rsize) {
struct folio_queue *tail = rreq->buffer_tail, *new; ssize_t added;
size_t added;
new = kmalloc(sizeof(*new), GFP_NOFS); added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl,
if (!new) &put_batch);
return -ENOMEM; if (added < 0)
netfs_stat(&netfs_n_folioq); return added;
folioq_init(new);
new->prev = tail;
tail->next = new;
rreq->buffer_tail = new;
added = netfs_load_buffer_from_ra(rreq, new, &put_batch);
rreq->iter.count += added;
rreq->submitted += added; rreq->submitted += added;
} }
folio_batch_release(&put_batch); folio_batch_release(&put_batch);
@ -148,7 +110,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
subreq->len = rsize; subreq->len = rsize;
if (unlikely(rreq->io_streams[0].sreq_max_segs)) { if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize,
rreq->io_streams[0].sreq_max_segs); rreq->io_streams[0].sreq_max_segs);
if (limit < rsize) { if (limit < rsize) {
@ -157,20 +119,10 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
} }
} }
subreq->io_iter = rreq->iter; subreq->io_iter = rreq->buffer.iter;
if (iov_iter_is_folioq(&subreq->io_iter)) {
if (subreq->io_iter.folioq_slot >= folioq_nr_slots(subreq->io_iter.folioq)) {
subreq->io_iter.folioq = subreq->io_iter.folioq->next;
subreq->io_iter.folioq_slot = 0;
}
subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq;
subreq->curr_folioq_slot = subreq->io_iter.folioq_slot;
subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
}
iov_iter_truncate(&subreq->io_iter, subreq->len); iov_iter_truncate(&subreq->io_iter, subreq->len);
iov_iter_advance(&rreq->iter, subreq->len); rolling_buffer_advance(&rreq->buffer, subreq->len);
return subreq->len; return subreq->len;
} }
@ -179,25 +131,14 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rr
loff_t i_size) loff_t i_size)
{ {
struct netfs_cache_resources *cres = &rreq->cache_resources; struct netfs_cache_resources *cres = &rreq->cache_resources;
enum netfs_io_source source;
if (!cres->ops) if (!cres->ops)
return NETFS_DOWNLOAD_FROM_SERVER; return NETFS_DOWNLOAD_FROM_SERVER;
return cres->ops->prepare_read(subreq, i_size); source = cres->ops->prepare_read(subreq, i_size);
} trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
return source;
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
bool was_async)
{
struct netfs_io_subrequest *subreq = priv;
if (transferred_or_error < 0) {
netfs_read_subreq_terminated(subreq, transferred_or_error, was_async);
return;
}
if (transferred_or_error > 0)
subreq->transferred += transferred_or_error;
netfs_read_subreq_terminated(subreq, 0, was_async);
} }
/* /*
@ -214,6 +155,47 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
netfs_cache_read_terminated, subreq); netfs_cache_read_terminated, subreq);
} }
static void netfs_issue_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
/* We add to the end of the list whilst the collector may be walking
* the list. The collector only goes nextwards and uses the lock to
* remove entries off of the front.
*/
spin_lock(&rreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
stream->front = subreq;
if (!stream->active) {
stream->collected_to = stream->front->start;
/* Store list pointers before active flag */
smp_store_release(&stream->active, true);
}
}
spin_unlock(&rreq->lock);
switch (subreq->source) {
case NETFS_DOWNLOAD_FROM_SERVER:
rreq->netfs_ops->issue_read(subreq);
break;
case NETFS_READ_FROM_CACHE:
netfs_read_cache_to_pagecache(rreq, subreq);
break;
default:
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
subreq->error = 0;
iov_iter_zero(subreq->len, &subreq->io_iter);
subreq->transferred = subreq->len;
netfs_read_subreq_terminated(subreq);
break;
}
}
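The comment above ("we add to the end of the list whilst the collector may be walking the list") describes a single-collector queue: issuers append subrequests at the tail under the request lock and the collector consumes them from the front. The sketch below is a deliberately reduced userspace model of that shape; unlike the kernel code it also takes the mutex for the forward walk, it busy-waits for brevity, and all names are invented.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct subreq {
    int index;
    struct subreq *next;
};

static struct subreq *head, *tail;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int all_queued;             /* mirrors NETFS_RREQ_ALL_QUEUED */

static void issue(int index)
{
    struct subreq *s = calloc(1, sizeof(*s));

    if (!s)
        return;
    s->index = index;
    pthread_mutex_lock(&lock);
    if (tail)
        tail->next = s;
    else
        head = s;
    tail = s;                      /* append only ever happens at the tail */
    pthread_mutex_unlock(&lock);
}

static void *collector(void *arg)
{
    (void)arg;
    for (;;) {
        struct subreq *s;
        int done;

        pthread_mutex_lock(&lock);
        s = head;                  /* remove only ever happens at the front */
        if (s) {
            head = s->next;
            if (!head)
                tail = NULL;
        }
        done = all_queued && !s;
        pthread_mutex_unlock(&lock);

        if (s) {
            printf("collected subrequest %d\n", s->index);
            free(s);
        } else if (done) {
            return NULL;
        }
    }
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, collector, NULL);
    for (int i = 0; i < 5; i++)
        issue(i);
    pthread_mutex_lock(&lock);
    all_queued = 1;
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    return 0;
}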
/* /*
* Perform a read to the pagecache from a series of sources of different types, * Perform a read to the pagecache from a series of sources of different types,
* slicing up the region to be read according to available cache blocks and * slicing up the region to be read according to available cache blocks and
@ -226,11 +208,9 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
ssize_t size = rreq->len; ssize_t size = rreq->len;
int ret = 0; int ret = 0;
atomic_inc(&rreq->nr_outstanding);
do { do {
struct netfs_io_subrequest *subreq; struct netfs_io_subrequest *subreq;
enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER; enum netfs_io_source source = NETFS_SOURCE_UNKNOWN;
ssize_t slice; ssize_t slice;
subreq = netfs_alloc_subrequest(rreq); subreq = netfs_alloc_subrequest(rreq);
@ -242,20 +222,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
subreq->start = start; subreq->start = start;
subreq->len = size; subreq->len = size;
atomic_inc(&rreq->nr_outstanding);
spin_lock_bh(&rreq->lock);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
subreq->prev_donated = rreq->prev_donated;
rreq->prev_donated = 0;
trace_netfs_sreq(subreq, netfs_sreq_trace_added);
spin_unlock_bh(&rreq->lock);
source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size); source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size);
subreq->source = source; subreq->source = source;
if (source == NETFS_DOWNLOAD_FROM_SERVER) { if (source == NETFS_DOWNLOAD_FROM_SERVER) {
unsigned long long zp = umin(ictx->zero_point, rreq->i_size); unsigned long long zp = umin(ictx->zero_point, rreq->i_size);
size_t len = subreq->len; size_t len = subreq->len;
if (unlikely(rreq->origin == NETFS_READ_SINGLE))
zp = rreq->i_size;
if (subreq->start >= zp) { if (subreq->start >= zp) {
subreq->source = source = NETFS_FILL_WITH_ZEROES; subreq->source = source = NETFS_FILL_WITH_ZEROES;
goto fill_with_zeroes; goto fill_with_zeroes;
@ -276,24 +250,13 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
if (rreq->netfs_ops->prepare_read) { if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq); ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) { if (ret < 0) {
atomic_dec(&rreq->nr_outstanding);
netfs_put_subrequest(subreq, false, netfs_put_subrequest(subreq, false,
netfs_sreq_trace_put_cancel); netfs_sreq_trace_put_cancel);
break; break;
} }
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
} }
goto issue;
slice = netfs_prepare_read_iterator(subreq);
if (slice < 0) {
atomic_dec(&rreq->nr_outstanding);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
ret = slice;
break;
}
rreq->netfs_ops->issue_read(subreq);
goto done;
} }
fill_with_zeroes: fill_with_zeroes:
@ -301,82 +264,46 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
subreq->source = NETFS_FILL_WITH_ZEROES; subreq->source = NETFS_FILL_WITH_ZEROES;
trace_netfs_sreq(subreq, netfs_sreq_trace_submit); trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
netfs_stat(&netfs_n_rh_zero); netfs_stat(&netfs_n_rh_zero);
slice = netfs_prepare_read_iterator(subreq); goto issue;
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_read_subreq_terminated(subreq, 0, false);
goto done;
} }
if (source == NETFS_READ_FROM_CACHE) { if (source == NETFS_READ_FROM_CACHE) {
trace_netfs_sreq(subreq, netfs_sreq_trace_submit); trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
slice = netfs_prepare_read_iterator(subreq); goto issue;
netfs_read_cache_to_pagecache(rreq, subreq);
goto done;
} }
pr_err("Unexpected read source %u\n", source); pr_err("Unexpected read source %u\n", source);
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
break; break;
done: issue:
slice = netfs_prepare_read_iterator(subreq);
if (slice < 0) {
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
ret = slice;
break;
}
size -= slice; size -= slice;
start += slice; start += slice;
if (size <= 0) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
}
netfs_issue_read(rreq, subreq);
cond_resched(); cond_resched();
} while (size > 0); } while (size > 0);
if (atomic_dec_and_test(&rreq->nr_outstanding)) if (unlikely(size > 0)) {
netfs_rreq_terminated(rreq, false); smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
netfs_wake_read_collector(rreq);
}
/* Defer error return as we may need to wait for outstanding I/O. */ /* Defer error return as we may need to wait for outstanding I/O. */
cmpxchg(&rreq->error, 0, ret); cmpxchg(&rreq->error, 0, ret);
} }
/*
* Wait for the read operation to complete, successfully or otherwise.
*/
static int netfs_wait_for_read(struct netfs_io_request *rreq)
{
int ret;
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
ret = rreq->error;
if (ret == 0 && rreq->submitted < rreq->len) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
return ret;
}
/*
* Set up the initial folioq of buffer folios in the rolling buffer and set the
* iterator to refer to it.
*/
static int netfs_prime_buffer(struct netfs_io_request *rreq)
{
struct folio_queue *folioq;
struct folio_batch put_batch;
size_t added;
folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
if (!folioq)
return -ENOMEM;
netfs_stat(&netfs_n_folioq);
folioq_init(folioq);
rreq->buffer = folioq;
rreq->buffer_tail = folioq;
rreq->submitted = rreq->start;
iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0);
folio_batch_init(&put_batch);
added = netfs_load_buffer_from_ra(rreq, folioq, &put_batch);
folio_batch_release(&put_batch);
rreq->iter.count += added;
rreq->submitted += added;
return 0;
}
/** /**
* netfs_readahead - Helper to manage a read request * netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request * @ractl: The description of the readahead request
@ -405,6 +332,8 @@ void netfs_readahead(struct readahead_control *ractl)
if (IS_ERR(rreq)) if (IS_ERR(rreq))
return; return;
__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags);
ret = netfs_begin_cache_read(rreq, ictx); ret = netfs_begin_cache_read(rreq, ictx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto cleanup_free; goto cleanup_free;
@ -416,7 +345,8 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl); netfs_rreq_expand(rreq, ractl);
rreq->ractl = ractl; rreq->ractl = ractl;
if (netfs_prime_buffer(rreq) < 0) rreq->submitted = rreq->start;
if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0)
goto cleanup_free; goto cleanup_free;
netfs_read_to_pagecache(rreq); netfs_read_to_pagecache(rreq);
@ -432,23 +362,18 @@ EXPORT_SYMBOL(netfs_readahead);
/* /*
* Create a rolling buffer with a single occupying folio. * Create a rolling buffer with a single occupying folio.
*/ */
static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio) static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct folio *folio,
unsigned int rollbuf_flags)
{ {
struct folio_queue *folioq; ssize_t added;
folioq = kmalloc(sizeof(*folioq), GFP_KERNEL); if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0)
if (!folioq)
return -ENOMEM; return -ENOMEM;
netfs_stat(&netfs_n_folioq); added = rolling_buffer_append(&rreq->buffer, folio, rollbuf_flags);
folioq_init(folioq); if (added < 0)
folioq_append(folioq, folio); return added;
BUG_ON(folioq_folio(folioq, 0) != folio); rreq->submitted = rreq->start + added;
BUG_ON(folioq_folio_order(folioq, 0) != folio_order(folio));
rreq->buffer = folioq;
rreq->buffer_tail = folioq;
rreq->submitted = rreq->start + rreq->len;
iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, rreq->len);
rreq->ractl = (struct readahead_control *)1UL; rreq->ractl = (struct readahead_control *)1UL;
return 0; return 0;
} }
@ -516,7 +441,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
} }
if (to < flen) if (to < flen)
bvec_set_folio(&bvec[i++], folio, flen - to, to); bvec_set_folio(&bvec[i++], folio, flen - to, to);
iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len); iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len);
rreq->submitted = rreq->start + flen; rreq->submitted = rreq->start + flen;
netfs_read_to_pagecache(rreq); netfs_read_to_pagecache(rreq);
@ -525,7 +450,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
folio_put(sink); folio_put(sink);
ret = netfs_wait_for_read(rreq); ret = netfs_wait_for_read(rreq);
if (ret == 0) { if (ret >= 0) {
flush_dcache_folio(folio); flush_dcache_folio(folio);
folio_mark_uptodate(folio); folio_mark_uptodate(folio);
} }
@ -584,7 +509,7 @@ int netfs_read_folio(struct file *file, struct folio *folio)
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage); trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
/* Set up the output buffer */ /* Set up the output buffer */
ret = netfs_create_singular_buffer(rreq, folio); ret = netfs_create_singular_buffer(rreq, folio, 0);
if (ret < 0) if (ret < 0)
goto discard; goto discard;
@ -741,7 +666,7 @@ int netfs_write_begin(struct netfs_inode *ctx,
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin); trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
/* Set up the output buffer */ /* Set up the output buffer */
ret = netfs_create_singular_buffer(rreq, folio); ret = netfs_create_singular_buffer(rreq, folio, 0);
if (ret < 0) if (ret < 0)
goto error_put; goto error_put;
@ -806,15 +731,14 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write); trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
/* Set up the output buffer */ /* Set up the output buffer */
ret = netfs_create_singular_buffer(rreq, folio); ret = netfs_create_singular_buffer(rreq, folio, NETFS_ROLLBUF_PAGECACHE_MARK);
if (ret < 0) if (ret < 0)
goto error_put; goto error_put;
folioq_mark2(rreq->buffer, 0);
netfs_read_to_pagecache(rreq); netfs_read_to_pagecache(rreq);
ret = netfs_wait_for_read(rreq); ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_return); netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return ret; return ret < 0 ? ret : 0;
error_put: error_put:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);

@ -25,7 +25,7 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq)
subreq->len = rsize; subreq->len = rsize;
if (unlikely(rreq->io_streams[0].sreq_max_segs)) { if (unlikely(rreq->io_streams[0].sreq_max_segs)) {
size_t limit = netfs_limit_iter(&rreq->iter, 0, rsize, size_t limit = netfs_limit_iter(&rreq->buffer.iter, 0, rsize,
rreq->io_streams[0].sreq_max_segs); rreq->io_streams[0].sreq_max_segs);
if (limit < rsize) { if (limit < rsize) {
@ -36,9 +36,9 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq)
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare); trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
subreq->io_iter = rreq->iter; subreq->io_iter = rreq->buffer.iter;
iov_iter_truncate(&subreq->io_iter, subreq->len); iov_iter_truncate(&subreq->io_iter, subreq->len);
iov_iter_advance(&rreq->iter, subreq->len); iov_iter_advance(&rreq->buffer.iter, subreq->len);
} }
/* /*
@ -47,12 +47,11 @@ static void netfs_prepare_dio_read_iterator(struct netfs_io_subrequest *subreq)
*/ */
static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
{ {
struct netfs_io_stream *stream = &rreq->io_streams[0];
unsigned long long start = rreq->start; unsigned long long start = rreq->start;
ssize_t size = rreq->len; ssize_t size = rreq->len;
int ret = 0; int ret = 0;
atomic_set(&rreq->nr_outstanding, 1);
do { do {
struct netfs_io_subrequest *subreq; struct netfs_io_subrequest *subreq;
ssize_t slice; ssize_t slice;
@ -67,19 +66,25 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
subreq->start = start; subreq->start = start;
subreq->len = size; subreq->len = size;
atomic_inc(&rreq->nr_outstanding); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
spin_lock_bh(&rreq->lock);
list_add_tail(&subreq->rreq_link, &rreq->subrequests); spin_lock(&rreq->lock);
subreq->prev_donated = rreq->prev_donated; list_add_tail(&subreq->rreq_link, &stream->subrequests);
rreq->prev_donated = 0; if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
stream->front = subreq;
if (!stream->active) {
stream->collected_to = stream->front->start;
/* Store list pointers before active flag */
smp_store_release(&stream->active, true);
}
}
trace_netfs_sreq(subreq, netfs_sreq_trace_added); trace_netfs_sreq(subreq, netfs_sreq_trace_added);
spin_unlock_bh(&rreq->lock); spin_unlock(&rreq->lock);
netfs_stat(&netfs_n_rh_download); netfs_stat(&netfs_n_rh_download);
if (rreq->netfs_ops->prepare_read) { if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq); ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) { if (ret < 0) {
atomic_dec(&rreq->nr_outstanding);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
break; break;
} }
@ -87,20 +92,34 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
netfs_prepare_dio_read_iterator(subreq); netfs_prepare_dio_read_iterator(subreq);
slice = subreq->len; slice = subreq->len;
rreq->netfs_ops->issue_read(subreq);
size -= slice; size -= slice;
start += slice; start += slice;
rreq->submitted += slice; rreq->submitted += slice;
if (size <= 0) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
}
rreq->netfs_ops->issue_read(subreq);
if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) {
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
wait_on_bit(&rreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
}
if (test_bit(NETFS_RREQ_FAILED, &rreq->flags))
break;
if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break; break;
cond_resched(); cond_resched();
} while (size > 0); } while (size > 0);
if (atomic_dec_and_test(&rreq->nr_outstanding)) if (unlikely(size > 0)) {
netfs_rreq_terminated(rreq, false); smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
netfs_wake_read_collector(rreq);
}
return ret; return ret;
} }
@ -133,21 +152,10 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
goto out; goto out;
} }
if (sync) { if (sync)
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip); ret = netfs_wait_for_read(rreq);
wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, else
TASK_UNINTERRUPTIBLE);
ret = rreq->error;
if (ret == 0 && rreq->submitted < rreq->len &&
rreq->origin != NETFS_DIO_READ) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
} else {
ret = -EIOCBQUEUED; ret = -EIOCBQUEUED;
}
out: out:
_leave(" = %d", ret); _leave(" = %d", ret);
return ret; return ret;
@ -199,15 +207,15 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
* the request. * the request.
*/ */
if (user_backed_iter(iter)) { if (user_backed_iter(iter)) {
ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0); ret = netfs_extract_user_iter(iter, rreq->len, &rreq->buffer.iter, 0);
if (ret < 0) if (ret < 0)
goto out; goto out;
rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec; rreq->direct_bv = (struct bio_vec *)rreq->buffer.iter.bvec;
rreq->direct_bv_count = ret; rreq->direct_bv_count = ret;
rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
rreq->len = iov_iter_count(&rreq->iter); rreq->len = iov_iter_count(&rreq->buffer.iter);
} else { } else {
rreq->iter = *iter; rreq->buffer.iter = *iter;
rreq->len = orig_count; rreq->len = orig_count;
rreq->direct_bv_unpin = false; rreq->direct_bv_unpin = false;
iov_iter_advance(iter, orig_count); iov_iter_advance(iter, orig_count);
@ -215,8 +223,10 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
// TODO: Set up bounce buffer if needed // TODO: Set up bounce buffer if needed
if (!sync) if (!sync) {
rreq->iocb = iocb; rreq->iocb = iocb;
__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags);
}
ret = netfs_unbuffered_read(rreq, sync); ret = netfs_unbuffered_read(rreq, sync);
if (ret < 0) if (ret < 0)
View File
@ -68,19 +68,19 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
* request. * request.
*/ */
if (async || user_backed_iter(iter)) { if (async || user_backed_iter(iter)) {
n = netfs_extract_user_iter(iter, len, &wreq->iter, 0); n = netfs_extract_user_iter(iter, len, &wreq->buffer.iter, 0);
if (n < 0) { if (n < 0) {
ret = n; ret = n;
goto out; goto out;
} }
wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec; wreq->direct_bv = (struct bio_vec *)wreq->buffer.iter.bvec;
wreq->direct_bv_count = n; wreq->direct_bv_count = n;
wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter); wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
} else { } else {
wreq->iter = *iter; wreq->buffer.iter = *iter;
} }
wreq->io_iter = wreq->iter; wreq->buffer.iter = wreq->buffer.iter;
} }
__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
@ -92,7 +92,7 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags); __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
if (async) if (async)
wreq->iocb = iocb; wreq->iocb = iocb;
wreq->len = iov_iter_count(&wreq->io_iter); wreq->len = iov_iter_count(&wreq->buffer.iter);
wreq->cleanup = netfs_cleanup_dio_write; wreq->cleanup = netfs_cleanup_dio_write;
ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len); ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
if (ret < 0) { if (ret < 0) {
View File
@ -23,6 +23,7 @@
/* /*
* buffered_read.c * buffered_read.c
*/ */
void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
int netfs_prefetch_for_write(struct file *file, struct folio *folio, int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len); size_t offset, size_t len);
@ -58,11 +59,8 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
/* /*
* misc.c * misc.c
*/ */
struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq); struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq,
int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio, enum netfs_folioq_trace trace);
bool needs_put);
struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq);
void netfs_clear_buffer(struct netfs_io_request *rreq);
void netfs_reset_iter(struct netfs_io_subrequest *subreq); void netfs_reset_iter(struct netfs_io_subrequest *subreq);
/* /*
@ -84,17 +82,25 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what); trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what);
} }
static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq,
enum netfs_sreq_ref_trace what)
{
trace_netfs_sreq_ref(subreq->rreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref), what);
}
/* /*
* read_collect.c * read_collect.c
*/ */
void netfs_read_termination_worker(struct work_struct *work); void netfs_read_collection_worker(struct work_struct *work);
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async); void netfs_wake_read_collector(struct netfs_io_request *rreq);
void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
ssize_t netfs_wait_for_read(struct netfs_io_request *rreq);
/* /*
* read_pgpriv2.c * read_pgpriv2.c
*/ */
void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_request *rreq,
struct netfs_io_request *rreq,
struct folio_queue *folioq, struct folio_queue *folioq,
int slot); int slot);
void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq); void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq);
@ -113,6 +119,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq);
extern atomic_t netfs_n_rh_dio_read; extern atomic_t netfs_n_rh_dio_read;
extern atomic_t netfs_n_rh_readahead; extern atomic_t netfs_n_rh_readahead;
extern atomic_t netfs_n_rh_read_folio; extern atomic_t netfs_n_rh_read_folio;
extern atomic_t netfs_n_rh_read_single;
extern atomic_t netfs_n_rh_rreq; extern atomic_t netfs_n_rh_rreq;
extern atomic_t netfs_n_rh_sreq; extern atomic_t netfs_n_rh_sreq;
extern atomic_t netfs_n_rh_download; extern atomic_t netfs_n_rh_download;
@ -181,9 +188,9 @@ void netfs_reissue_write(struct netfs_io_stream *stream,
struct iov_iter *source); struct iov_iter *source);
void netfs_issue_write(struct netfs_io_request *wreq, void netfs_issue_write(struct netfs_io_request *wreq,
struct netfs_io_stream *stream); struct netfs_io_stream *stream);
int netfs_advance_write(struct netfs_io_request *wreq, size_t netfs_advance_write(struct netfs_io_request *wreq,
struct netfs_io_stream *stream, struct netfs_io_stream *stream,
loff_t start, size_t len, bool to_eof); loff_t start, size_t len, bool to_eof);
struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len); struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
struct folio *folio, size_t copied, bool to_page_end, struct folio *folio, size_t copied, bool to_page_end,
@ -192,6 +199,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
struct folio *writethrough_cache); struct folio *writethrough_cache);
int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
/*
* write_retry.c
*/
void netfs_retry_writes(struct netfs_io_request *wreq);
/* /*
* Miscellaneous functions. * Miscellaneous functions.
*/ */
View File
@ -37,9 +37,11 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READAHEAD] = "RA", [NETFS_READAHEAD] = "RA",
[NETFS_READPAGE] = "RP", [NETFS_READPAGE] = "RP",
[NETFS_READ_GAPS] = "RG", [NETFS_READ_GAPS] = "RG",
[NETFS_READ_SINGLE] = "R1",
[NETFS_READ_FOR_WRITE] = "RW", [NETFS_READ_FOR_WRITE] = "RW",
[NETFS_DIO_READ] = "DR", [NETFS_DIO_READ] = "DR",
[NETFS_WRITEBACK] = "WB", [NETFS_WRITEBACK] = "WB",
[NETFS_WRITEBACK_SINGLE] = "W1",
[NETFS_WRITETHROUGH] = "WT", [NETFS_WRITETHROUGH] = "WT",
[NETFS_UNBUFFERED_WRITE] = "UW", [NETFS_UNBUFFERED_WRITE] = "UW",
[NETFS_DIO_WRITE] = "DW", [NETFS_DIO_WRITE] = "DW",
@ -69,7 +71,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
refcount_read(&rreq->ref), refcount_read(&rreq->ref),
rreq->flags, rreq->flags,
rreq->error, rreq->error,
atomic_read(&rreq->nr_outstanding), 0,
rreq->start, rreq->submitted, rreq->len); rreq->start, rreq->submitted, rreq->len);
seq_putc(m, '\n'); seq_putc(m, '\n');
return 0; return 0;
@ -116,7 +118,7 @@ static int __init netfs_init(void)
goto error_reqpool; goto error_reqpool;
netfs_subrequest_slab = kmem_cache_create("netfs_subrequest", netfs_subrequest_slab = kmem_cache_create("netfs_subrequest",
sizeof(struct netfs_io_subrequest), 0, sizeof(struct netfs_io_subrequest) + 16, 0,
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
NULL); NULL);
if (!netfs_subrequest_slab) if (!netfs_subrequest_slab)
View File
@ -8,113 +8,100 @@
#include <linux/swap.h> #include <linux/swap.h>
#include "internal.h" #include "internal.h"
/* /**
* Make sure there's space in the rolling queue. * netfs_alloc_folioq_buffer - Allocate buffer space into a folio queue
* @mapping: Address space to set on the folio (or NULL).
* @_buffer: Pointer to the folio queue to add to (may point to a NULL; updated).
* @_cur_size: Current size of the buffer (updated).
* @size: Target size of the buffer.
* @gfp: The allocation constraints.
*/ */
struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq) int netfs_alloc_folioq_buffer(struct address_space *mapping,
struct folio_queue **_buffer,
size_t *_cur_size, ssize_t size, gfp_t gfp)
{ {
struct folio_queue *tail = rreq->buffer_tail, *prev; struct folio_queue *tail = *_buffer, *p;
unsigned int prev_nr_slots = 0;
if (WARN_ON_ONCE(!rreq->buffer && tail) || size = round_up(size, PAGE_SIZE);
WARN_ON_ONCE(rreq->buffer && !tail)) if (*_cur_size >= size)
return ERR_PTR(-EIO); return 0;
prev = tail; if (tail)
if (prev) { while (tail->next)
if (!folioq_full(tail)) tail = tail->next;
return tail;
prev_nr_slots = folioq_nr_slots(tail);
}
tail = kmalloc(sizeof(*tail), GFP_NOFS); do {
if (!tail) struct folio *folio;
return ERR_PTR(-ENOMEM); int order = 0, slot;
netfs_stat(&netfs_n_folioq);
folioq_init(tail);
tail->prev = prev;
if (prev)
/* [!] NOTE: After we set prev->next, the consumer is entirely
* at liberty to delete prev.
*/
WRITE_ONCE(prev->next, tail);
rreq->buffer_tail = tail; if (!tail || folioq_full(tail)) {
if (!rreq->buffer) { p = netfs_folioq_alloc(0, GFP_NOFS, netfs_trace_folioq_alloc_buffer);
rreq->buffer = tail; if (!p)
iov_iter_folio_queue(&rreq->io_iter, ITER_SOURCE, tail, 0, 0, 0); return -ENOMEM;
} else { if (tail) {
/* Make sure we don't leave the master iterator pointing to a tail->next = p;
* block that might get immediately consumed. p->prev = tail;
*/ } else {
if (rreq->io_iter.folioq == prev && *_buffer = p;
rreq->io_iter.folioq_slot == prev_nr_slots) { }
rreq->io_iter.folioq = tail; tail = p;
rreq->io_iter.folioq_slot = 0;
} }
}
rreq->buffer_tail_slot = 0;
return tail;
}
/* if (size - *_cur_size > PAGE_SIZE)
* Append a folio to the rolling queue. order = umin(ilog2(size - *_cur_size) - PAGE_SHIFT,
*/ MAX_PAGECACHE_ORDER);
int netfs_buffer_append_folio(struct netfs_io_request *rreq, struct folio *folio,
bool needs_put)
{
struct folio_queue *tail;
unsigned int slot, order = folio_order(folio);
tail = netfs_buffer_make_space(rreq); folio = folio_alloc(gfp, order);
if (IS_ERR(tail)) if (!folio && order > 0)
return PTR_ERR(tail); folio = folio_alloc(gfp, 0);
if (!folio)
return -ENOMEM;
rreq->io_iter.count += PAGE_SIZE << order; folio->mapping = mapping;
folio->index = *_cur_size / PAGE_SIZE;
trace_netfs_folio(folio, netfs_folio_trace_alloc_buffer);
slot = folioq_append_mark(tail, folio);
*_cur_size += folioq_folio_size(tail, slot);
} while (*_cur_size < size);
slot = folioq_append(tail, folio);
/* Store the counter after setting the slot. */
smp_store_release(&rreq->buffer_tail_slot, slot);
return 0; return 0;
} }
EXPORT_SYMBOL(netfs_alloc_folioq_buffer);
/* /**
* Delete the head of a rolling queue. * netfs_free_folioq_buffer - Free a folio queue.
* @fq: The start of the folio queue to free
*
* Free up a chain of folio_queues and, if marked, the marked folios they point
* to.
*/ */
struct folio_queue *netfs_delete_buffer_head(struct netfs_io_request *wreq) void netfs_free_folioq_buffer(struct folio_queue *fq)
{ {
struct folio_queue *head = wreq->buffer, *next = head->next; struct folio_queue *next;
struct folio_batch fbatch;
if (next) folio_batch_init(&fbatch);
next->prev = NULL;
netfs_stat_d(&netfs_n_folioq);
kfree(head);
wreq->buffer = next;
return next;
}
/* for (; fq; fq = next) {
* Clear out a rolling queue. for (int slot = 0; slot < folioq_count(fq); slot++) {
*/ struct folio *folio = folioq_folio(fq, slot);
void netfs_clear_buffer(struct netfs_io_request *rreq) if (!folio ||
{ !folioq_is_marked(fq, slot))
struct folio_queue *p;
while ((p = rreq->buffer)) {
rreq->buffer = p->next;
for (int slot = 0; slot < folioq_count(p); slot++) {
struct folio *folio = folioq_folio(p, slot);
if (!folio)
continue; continue;
if (folioq_is_marked(p, slot)) {
trace_netfs_folio(folio, netfs_folio_trace_put); trace_netfs_folio(folio, netfs_folio_trace_put);
folio_put(folio); if (folio_batch_add(&fbatch, folio))
} folio_batch_release(&fbatch);
} }
netfs_stat_d(&netfs_n_folioq); netfs_stat_d(&netfs_n_folioq);
kfree(p); next = fq->next;
kfree(fq);
} }
folio_batch_release(&fbatch);
} }
EXPORT_SYMBOL(netfs_free_folioq_buffer);
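
As a usage illustration of the two exported helpers above, a hypothetical caller could grow an anonymous folio_queue buffer to a target size and release it again. The demo_* function is invented for illustration and the header placement is an assumption of this sketch, not something taken from the patch.

    #include <linux/netfs.h>    /* declarations assumed to live here */

    static int demo_make_scratch_buffer(void)
    {
            struct folio_queue *buffer = NULL;
            size_t cur_size = 0;
            int ret;

            /* Grow an anonymous buffer (no mapping) to at least 64KiB. */
            ret = netfs_alloc_folioq_buffer(NULL, &buffer, &cur_size,
                                            64 * 1024, GFP_KERNEL);
            if (ret < 0)
                    return ret;

            /* ... point an ITER_FOLIOQ iterator at it and do I/O ... */

            /* Frees the queue structures and the folios appended above. */
            netfs_free_folioq_buffer(buffer);
            return 0;
    }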
/* /*
* Reset the subrequest iterator to refer just to the region remaining to be * Reset the subrequest iterator to refer just to the region remaining to be
View File
@ -48,17 +48,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
spin_lock_init(&rreq->lock); spin_lock_init(&rreq->lock);
INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
INIT_LIST_HEAD(&rreq->subrequests); init_waitqueue_head(&rreq->waitq);
refcount_set(&rreq->ref, 1); refcount_set(&rreq->ref, 1);
if (origin == NETFS_READAHEAD || if (origin == NETFS_READAHEAD ||
origin == NETFS_READPAGE || origin == NETFS_READPAGE ||
origin == NETFS_READ_GAPS || origin == NETFS_READ_GAPS ||
origin == NETFS_READ_SINGLE ||
origin == NETFS_READ_FOR_WRITE || origin == NETFS_READ_FOR_WRITE ||
origin == NETFS_DIO_READ) origin == NETFS_DIO_READ) {
INIT_WORK(&rreq->work, netfs_read_termination_worker); INIT_WORK(&rreq->work, netfs_read_collection_worker);
else rreq->io_streams[0].avail = true;
} else {
INIT_WORK(&rreq->work, netfs_write_collection_worker); INIT_WORK(&rreq->work, netfs_write_collection_worker);
}
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
if (file && file->f_flags & O_NONBLOCK) if (file && file->f_flags & O_NONBLOCK)
@ -92,14 +95,6 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
struct netfs_io_stream *stream; struct netfs_io_stream *stream;
int s; int s;
while (!list_empty(&rreq->subrequests)) {
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, was_async,
netfs_sreq_trace_put_clear);
}
for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) { for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) {
stream = &rreq->io_streams[s]; stream = &rreq->io_streams[s];
while (!list_empty(&stream->subrequests)) { while (!list_empty(&stream->subrequests)) {
@ -143,7 +138,7 @@ static void netfs_free_request(struct work_struct *work)
} }
kvfree(rreq->direct_bv); kvfree(rreq->direct_bv);
} }
netfs_clear_buffer(rreq); rolling_buffer_clear(&rreq->buffer);
if (atomic_dec_and_test(&ictx->io_count)) if (atomic_dec_and_test(&ictx->io_count))
wake_up_var(&ictx->io_count); wake_up_var(&ictx->io_count);
View File
@ -14,6 +14,14 @@
#include <linux/task_io_accounting_ops.h> #include <linux/task_io_accounting_ops.h>
#include "internal.h" #include "internal.h"
/* Notes made in the collector */
#define HIT_PENDING 0x01 /* A front op was still pending */
#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */
#define BUFFERED 0x08 /* The pagecache needs cleaning up */
#define NEED_RETRY 0x10 /* A front op requests retrying */
#define COPY_TO_CACHE 0x40 /* Need to copy subrequest to cache */
#define ABANDON_SREQ 0x80 /* Need to abandon untransferred part of subrequest */
/* /*
* Clear the unread part of an I/O request. * Clear the unread part of an I/O request.
*/ */
@ -31,14 +39,18 @@ static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
* cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it * cache the folio, we set the group to NETFS_FOLIO_COPY_TO_CACHE, mark it
* dirty and let writeback handle it. * dirty and let writeback handle it.
*/ */
static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq, static void netfs_unlock_read_folio(struct netfs_io_request *rreq,
struct netfs_io_request *rreq,
struct folio_queue *folioq, struct folio_queue *folioq,
int slot) int slot)
{ {
struct netfs_folio *finfo; struct netfs_folio *finfo;
struct folio *folio = folioq_folio(folioq, slot); struct folio *folio = folioq_folio(folioq, slot);
if (unlikely(folio_pos(folio) < rreq->abandon_to)) {
trace_netfs_folio(folio, netfs_folio_trace_abandon);
goto just_unlock;
}
flush_dcache_folio(folio); flush_dcache_folio(folio);
folio_mark_uptodate(folio); folio_mark_uptodate(folio);
@ -53,7 +65,7 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
kfree(finfo); kfree(finfo);
} }
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags)) {
if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) { if (!WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache); trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE); folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
@ -64,10 +76,11 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
} }
} else { } else {
// TODO: Use of PG_private_2 is deprecated. // TODO: Use of PG_private_2 is deprecated.
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) if (test_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags))
netfs_pgpriv2_mark_copy_to_cache(subreq, rreq, folioq, slot); netfs_pgpriv2_mark_copy_to_cache(rreq, folioq, slot);
} }
just_unlock:
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
if (folio->index == rreq->no_unlock_folio && if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
@ -82,234 +95,243 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
} }
/* /*
* Unlock any folios that are now completely read. Returns true if the * Unlock any folios we've finished with.
* subrequest is removed from the list.
*/ */
static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was_async) static void netfs_read_unlock_folios(struct netfs_io_request *rreq,
unsigned int *notes)
{ {
struct netfs_io_subrequest *prev, *next; struct folio_queue *folioq = rreq->buffer.tail;
struct netfs_io_request *rreq = subreq->rreq; unsigned long long collected_to = rreq->collected_to;
struct folio_queue *folioq = subreq->curr_folioq; unsigned int slot = rreq->buffer.first_tail_slot;
size_t avail, prev_donated, next_donated, fsize, part, excess;
loff_t fpos, start;
loff_t fend;
int slot = subreq->curr_folioq_slot;
if (WARN(subreq->transferred > subreq->len, if (rreq->cleaned_to >= rreq->collected_to)
"Subreq overread: R%x[%x] %zu > %zu", return;
rreq->debug_id, subreq->debug_index,
subreq->transferred, subreq->len))
subreq->transferred = subreq->len;
next_folio: // TODO: Begin decryption
fsize = PAGE_SIZE << subreq->curr_folio_order;
fpos = round_down(subreq->start + subreq->consumed, fsize);
fend = fpos + fsize;
if (WARN_ON_ONCE(!folioq) || if (slot >= folioq_nr_slots(folioq)) {
WARN_ON_ONCE(!folioq_folio(folioq, slot)) || folioq = rolling_buffer_delete_spent(&rreq->buffer);
WARN_ON_ONCE(folioq_folio(folioq, slot)->index != fpos / PAGE_SIZE)) { if (!folioq) {
pr_err("R=%08x[%x] s=%llx-%llx ctl=%zx/%zx/%zx sl=%u\n", rreq->front_folio_order = 0;
rreq->debug_id, subreq->debug_index, return;
subreq->start, subreq->start + subreq->transferred - 1,
subreq->consumed, subreq->transferred, subreq->len,
slot);
if (folioq) {
struct folio *folio = folioq_folio(folioq, slot);
pr_err("folioq: orders=%02x%02x%02x%02x\n",
folioq->orders[0], folioq->orders[1],
folioq->orders[2], folioq->orders[3]);
if (folio)
pr_err("folio: %llx-%llx ix=%llx o=%u qo=%u\n",
fpos, fend - 1, folio_pos(folio), folio_order(folio),
folioq_folio_order(folioq, slot));
} }
slot = 0;
} }
donation_changed: for (;;) {
/* Try to consume the current folio if we've hit or passed the end of struct folio *folio;
* it. There's a possibility that this subreq doesn't start at the unsigned long long fpos, fend;
* beginning of the folio, in which case we need to donate to/from the unsigned int order;
* preceding subreq. size_t fsize;
*
* We also need to include any potential donation back from the
* following subreq.
*/
prev_donated = READ_ONCE(subreq->prev_donated);
next_donated = READ_ONCE(subreq->next_donated);
if (prev_donated || next_donated) {
spin_lock_bh(&rreq->lock);
prev_donated = subreq->prev_donated;
next_donated = subreq->next_donated;
subreq->start -= prev_donated;
subreq->len += prev_donated;
subreq->transferred += prev_donated;
prev_donated = subreq->prev_donated = 0;
if (subreq->transferred == subreq->len) {
subreq->len += next_donated;
subreq->transferred += next_donated;
next_donated = subreq->next_donated = 0;
}
trace_netfs_sreq(subreq, netfs_sreq_trace_add_donations);
spin_unlock_bh(&rreq->lock);
}
avail = subreq->transferred; if (*notes & COPY_TO_CACHE)
if (avail == subreq->len) set_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
avail += next_donated;
start = subreq->start;
if (subreq->consumed == 0) {
start -= prev_donated;
avail += prev_donated;
} else {
start += subreq->consumed;
avail -= subreq->consumed;
}
part = umin(avail, fsize);
trace_netfs_progress(subreq, start, avail, part); folio = folioq_folio(folioq, slot);
if (WARN_ONCE(!folio_test_locked(folio),
"R=%08x: folio %lx is not locked\n",
rreq->debug_id, folio->index))
trace_netfs_folio(folio, netfs_folio_trace_not_locked);
if (start + avail >= fend) { order = folioq_folio_order(folioq, slot);
if (fpos == start) { rreq->front_folio_order = order;
/* Flush, unlock and mark for caching any folio we've just read. */ fsize = PAGE_SIZE << order;
subreq->consumed = fend - subreq->start; fpos = folio_pos(folio);
netfs_unlock_read_folio(subreq, rreq, folioq, slot); fend = umin(fpos + fsize, rreq->i_size);
folioq_mark2(folioq, slot);
if (subreq->consumed >= subreq->len)
goto remove_subreq;
} else if (fpos < start) {
excess = fend - subreq->start;
spin_lock_bh(&rreq->lock); trace_netfs_collect_folio(rreq, folio, fend, collected_to);
/* If we complete first on a folio split with the
* preceding subreq, donate to that subreq - otherwise
* we get the responsibility.
*/
if (subreq->prev_donated != prev_donated) {
spin_unlock_bh(&rreq->lock);
goto donation_changed;
}
if (list_is_first(&subreq->rreq_link, &rreq->subrequests)) { /* Unlock any folio we've transferred all of. */
spin_unlock_bh(&rreq->lock); if (collected_to < fend)
pr_err("Can't donate prior to front\n"); break;
goto bad;
}
prev = list_prev_entry(subreq, rreq_link); netfs_unlock_read_folio(rreq, folioq, slot);
WRITE_ONCE(prev->next_donated, prev->next_donated + excess); WRITE_ONCE(rreq->cleaned_to, fpos + fsize);
subreq->start += excess; *notes |= MADE_PROGRESS;
subreq->len -= excess;
subreq->transferred -= excess;
trace_netfs_donate(rreq, subreq, prev, excess,
netfs_trace_donate_tail_to_prev);
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
if (subreq->consumed >= subreq->len) clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
goto remove_subreq_locked;
spin_unlock_bh(&rreq->lock);
} else {
pr_err("fpos > start\n");
goto bad;
}
/* Advance the rolling buffer to the next folio. */ /* Clean up the head folioq. If we clear an entire folioq, then
* we can get rid of it provided it's not also the tail folioq
* being filled by the issuer.
*/
folioq_clear(folioq, slot);
slot++; slot++;
if (slot >= folioq_nr_slots(folioq)) { if (slot >= folioq_nr_slots(folioq)) {
folioq = rolling_buffer_delete_spent(&rreq->buffer);
if (!folioq)
goto done;
slot = 0; slot = 0;
folioq = folioq->next; trace_netfs_folioq(folioq, netfs_trace_folioq_read_progress);
subreq->curr_folioq = folioq;
} }
subreq->curr_folioq_slot = slot;
if (folioq && folioq_folio(folioq, slot)) if (fpos + fsize >= collected_to)
subreq->curr_folio_order = folioq->orders[slot]; break;
if (!was_async)
cond_resched();
goto next_folio;
} }
/* Deal with partial progress. */ rreq->buffer.tail = folioq;
if (subreq->transferred < subreq->len) done:
return false; rreq->buffer.first_tail_slot = slot;
}
/* Donate the remaining downloaded data to one of the neighbouring /*
* subrequests. Note that we may race with them doing the same thing. * Collect and assess the results of various read subrequests. We may need to
* retry some of the results.
*
* Note that we have a sequence of subrequests, which may be drawing on
* different sources and may or may not be the same size or starting position
* and may not even correspond in boundary alignment.
*/
static void netfs_collect_read_results(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *front, *remove;
struct netfs_io_stream *stream = &rreq->io_streams[0];
unsigned int notes;
_enter("%llx-%llx", rreq->start, rreq->start + rreq->len);
trace_netfs_rreq(rreq, netfs_rreq_trace_collect);
trace_netfs_collect(rreq);
reassess:
if (rreq->origin == NETFS_READAHEAD ||
rreq->origin == NETFS_READPAGE ||
rreq->origin == NETFS_READ_FOR_WRITE)
notes = BUFFERED;
else
notes = 0;
/* Remove completed subrequests from the front of the stream and
* advance the completion point. We stop when we hit something that's
* in progress. The issuer thread may be adding stuff to the tail
* whilst we're doing this.
*/ */
spin_lock_bh(&rreq->lock); front = READ_ONCE(stream->front);
while (front) {
size_t transferred;
if (subreq->prev_donated != prev_donated || trace_netfs_collect_sreq(rreq, front);
subreq->next_donated != next_donated) { _debug("sreq [%x] %llx %zx/%zx",
spin_unlock_bh(&rreq->lock); front->debug_index, front->start, front->transferred, front->len);
cond_resched();
goto donation_changed; if (stream->collected_to < front->start) {
trace_netfs_collect_gap(rreq, stream, front->start, 'F');
stream->collected_to = front->start;
}
if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags))
notes |= HIT_PENDING;
smp_rmb(); /* Read counters after IN_PROGRESS flag. */
transferred = READ_ONCE(front->transferred);
/* If we can now collect the next folio, do so. We don't want
* to defer this as we have to decide whether we need to copy
* to the cache or not, and that may differ between adjacent
* subreqs.
*/
if (notes & BUFFERED) {
size_t fsize = PAGE_SIZE << rreq->front_folio_order;
/* Clear the tail of a short read. */
if (!(notes & HIT_PENDING) &&
front->error == 0 &&
transferred < front->len &&
(test_bit(NETFS_SREQ_HIT_EOF, &front->flags) ||
test_bit(NETFS_SREQ_CLEAR_TAIL, &front->flags))) {
netfs_clear_unread(front);
transferred = front->transferred = front->len;
trace_netfs_sreq(front, netfs_sreq_trace_clear);
}
stream->collected_to = front->start + transferred;
rreq->collected_to = stream->collected_to;
if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &front->flags))
notes |= COPY_TO_CACHE;
if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
rreq->abandon_to = front->start + front->len;
front->transferred = front->len;
transferred = front->len;
trace_netfs_rreq(rreq, netfs_rreq_trace_set_abandon);
}
if (front->start + transferred >= rreq->cleaned_to + fsize ||
test_bit(NETFS_SREQ_HIT_EOF, &front->flags))
netfs_read_unlock_folios(rreq, &notes);
} else {
stream->collected_to = front->start + transferred;
rreq->collected_to = stream->collected_to;
}
/* Stall if the front is still undergoing I/O. */
if (notes & HIT_PENDING)
break;
if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
if (!stream->failed) {
stream->error = front->error;
rreq->error = front->error;
set_bit(NETFS_RREQ_FAILED, &rreq->flags);
stream->failed = true;
}
notes |= MADE_PROGRESS | ABANDON_SREQ;
} else if (test_bit(NETFS_SREQ_NEED_RETRY, &front->flags)) {
stream->need_retry = true;
notes |= NEED_RETRY | MADE_PROGRESS;
break;
} else {
if (!stream->failed)
stream->transferred = stream->collected_to - rreq->start;
notes |= MADE_PROGRESS;
}
/* Remove if completely consumed. */
stream->source = front->source;
spin_lock(&rreq->lock);
remove = front;
trace_netfs_sreq(front, netfs_sreq_trace_discard);
list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
stream->front = front;
spin_unlock(&rreq->lock);
netfs_put_subrequest(remove, false,
notes & ABANDON_SREQ ?
netfs_sreq_trace_put_cancel :
netfs_sreq_trace_put_done);
} }
/* Deal with the trickiest case: that this subreq is in the middle of a trace_netfs_collect_stream(rreq, stream);
* folio, not touching either edge, but finishes first. In such a trace_netfs_collect_state(rreq, rreq->collected_to, notes);
* case, we donate to the previous subreq, if there is one, so that the
* donation is only handled when that completes - and remove this if (!(notes & BUFFERED))
* subreq from the list. rreq->cleaned_to = rreq->collected_to;
*
* If the previous subreq finished first, we will have acquired their if (notes & NEED_RETRY)
* donation and should be able to unlock folios and/or donate nextwards. goto need_retry;
if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) {
trace_netfs_rreq(rreq, netfs_rreq_trace_unpause);
clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_PAUSE);
}
if (notes & MADE_PROGRESS) {
//cond_resched();
goto reassess;
}
out:
_leave(" = %x", notes);
return;
need_retry:
/* Okay... We're going to have to retry parts of the stream. Note
* that any partially completed op will have had any wholly transferred
* folios removed from it.
*/ */
if (!subreq->consumed && _debug("retry");
!prev_donated && netfs_retry_reads(rreq);
!list_is_first(&subreq->rreq_link, &rreq->subrequests)) { goto out;
prev = list_prev_entry(subreq, rreq_link);
WRITE_ONCE(prev->next_donated, prev->next_donated + subreq->len);
subreq->start += subreq->len;
subreq->len = 0;
subreq->transferred = 0;
trace_netfs_donate(rreq, subreq, prev, subreq->len,
netfs_trace_donate_to_prev);
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_prev);
goto remove_subreq_locked;
}
/* If we can't donate down the chain, donate up the chain instead. */
excess = subreq->len - subreq->consumed + next_donated;
if (!subreq->consumed)
excess += prev_donated;
if (list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
rreq->prev_donated = excess;
trace_netfs_donate(rreq, subreq, NULL, excess,
netfs_trace_donate_to_deferred_next);
} else {
next = list_next_entry(subreq, rreq_link);
WRITE_ONCE(next->prev_donated, excess);
trace_netfs_donate(rreq, subreq, next, excess,
netfs_trace_donate_to_next);
}
trace_netfs_sreq(subreq, netfs_sreq_trace_donate_to_next);
subreq->len = subreq->consumed;
subreq->transferred = subreq->consumed;
goto remove_subreq_locked;
remove_subreq:
spin_lock_bh(&rreq->lock);
remove_subreq_locked:
subreq->consumed = subreq->len;
list_del(&subreq->rreq_link);
spin_unlock_bh(&rreq->lock);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_consumed);
return true;
bad:
/* Errr... prev and next both donated to us, but insufficient to finish
* the folio.
*/
printk("R=%08x[%x] s=%llx-%llx %zx/%zx/%zx\n",
rreq->debug_id, subreq->debug_index,
subreq->start, subreq->start + subreq->transferred - 1,
subreq->consumed, subreq->transferred, subreq->len);
printk("folio: %llx-%llx\n", fpos, fend - 1);
printk("donated: prev=%zx next=%zx\n", prev_donated, next_donated);
printk("s=%llx av=%zx part=%zx\n", start, avail, part);
BUG();
} }
/* /*
@ -318,12 +340,13 @@ static bool netfs_consume_read_data(struct netfs_io_subrequest *subreq, bool was
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{ {
struct netfs_io_subrequest *subreq; struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
unsigned int i; unsigned int i;
/* Collect unbuffered reads and direct reads, adding up the transfer /* Collect unbuffered reads and direct reads, adding up the transfer
* sizes until we find the first short or failed subrequest. * sizes until we find the first short or failed subrequest.
*/ */
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
rreq->transferred += subreq->transferred; rreq->transferred += subreq->transferred;
if (subreq->transferred < subreq->len || if (subreq->transferred < subreq->len ||
@ -356,25 +379,67 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
} }
/* /*
* Assess the state of a read request and decide what to do next. * Do processing after reading a monolithic single object.
*/
static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
if (!rreq->error && stream->source == NETFS_DOWNLOAD_FROM_SERVER &&
fscache_resources_valid(&rreq->cache_resources)) {
trace_netfs_rreq(rreq, netfs_rreq_trace_dirty);
netfs_single_mark_inode_dirty(rreq->inode);
}
if (rreq->iocb) {
rreq->iocb->ki_pos += rreq->transferred;
if (rreq->iocb->ki_complete)
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
}
/*
* Perform the collection of subrequests and folios.
* *
* Note that we're in normal kernel thread context at this point, possibly * Note that we're in normal kernel thread context at this point, possibly
* running on a workqueue. * running on a workqueue.
*/ */
static void netfs_rreq_assess(struct netfs_io_request *rreq) static void netfs_read_collection(struct netfs_io_request *rreq)
{ {
trace_netfs_rreq(rreq, netfs_rreq_trace_assess); struct netfs_io_stream *stream = &rreq->io_streams[0];
netfs_collect_read_results(rreq);
/* We're done when the app thread has finished posting subreqs and the
* queue is empty.
*/
if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags))
return;
smp_rmb(); /* Read ALL_QUEUED before subreq lists. */
if (!list_empty(&stream->subrequests))
return;
/* Okay, declare that all I/O is complete. */
rreq->transferred = stream->transferred;
trace_netfs_rreq(rreq, netfs_rreq_trace_complete);
//netfs_rreq_is_still_valid(rreq); //netfs_rreq_is_still_valid(rreq);
if (test_and_clear_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags)) { switch (rreq->origin) {
netfs_retry_reads(rreq); case NETFS_DIO_READ:
return; case NETFS_READ_GAPS:
}
if (rreq->origin == NETFS_DIO_READ ||
rreq->origin == NETFS_READ_GAPS)
netfs_rreq_assess_dio(rreq); netfs_rreq_assess_dio(rreq);
break;
case NETFS_READ_SINGLE:
netfs_rreq_assess_single(rreq);
break;
default:
break;
}
task_io_account_read(rreq->transferred); task_io_account_read(rreq->transferred);
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
@ -388,57 +453,62 @@ static void netfs_rreq_assess(struct netfs_io_request *rreq)
netfs_pgpriv2_write_to_the_cache(rreq); netfs_pgpriv2_write_to_the_cache(rreq);
} }
void netfs_read_termination_worker(struct work_struct *work) void netfs_read_collection_worker(struct work_struct *work)
{ {
struct netfs_io_request *rreq = struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
container_of(work, struct netfs_io_request, work);
netfs_see_request(rreq, netfs_rreq_trace_see_work); netfs_see_request(rreq, netfs_rreq_trace_see_work);
netfs_rreq_assess(rreq); if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
netfs_put_request(rreq, false, netfs_rreq_trace_put_work_complete); netfs_read_collection(rreq);
netfs_put_request(rreq, false, netfs_rreq_trace_put_work);
} }
/* /*
* Handle the completion of all outstanding I/O operations on a read request. * Wake the collection work item.
* We inherit a ref from the caller.
*/ */
void netfs_rreq_terminated(struct netfs_io_request *rreq, bool was_async) void netfs_wake_read_collector(struct netfs_io_request *rreq)
{ {
if (!was_async) if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
return netfs_rreq_assess(rreq); if (!work_pending(&rreq->work)) {
if (!work_pending(&rreq->work)) { netfs_get_request(rreq, netfs_rreq_trace_get_work);
netfs_get_request(rreq, netfs_rreq_trace_get_work); if (!queue_work(system_unbound_wq, &rreq->work))
if (!queue_work(system_unbound_wq, &rreq->work)) netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_work_nq); }
} else {
trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
wake_up(&rreq->waitq);
} }
} }
/** /**
* netfs_read_subreq_progress - Note progress of a read operation. * netfs_read_subreq_progress - Note progress of a read operation.
* @subreq: The read request that has terminated. * @subreq: The read request that has terminated.
* @was_async: True if we're in an asynchronous context.
* *
* This tells the read side of netfs lib that a contributory I/O operation has * This tells the read side of netfs lib that a contributory I/O operation has
* made some progress and that it may be possible to unlock some folios. * made some progress and that it may be possible to unlock some folios.
* *
* Before calling, the filesystem should update subreq->transferred to track * Before calling, the filesystem should update subreq->transferred to track
* the amount of data copied into the output buffer. * the amount of data copied into the output buffer.
*
* If @was_async is true, the caller might be running in softirq or interrupt
* context and we can't sleep.
*/ */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq)
bool was_async)
{ {
struct netfs_io_request *rreq = subreq->rreq; struct netfs_io_request *rreq = subreq->rreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
size_t fsize = PAGE_SIZE << rreq->front_folio_order;
trace_netfs_sreq(subreq, netfs_sreq_trace_progress); trace_netfs_sreq(subreq, netfs_sreq_trace_progress);
if (subreq->transferred > subreq->consumed && /* If we are at the head of the queue, wake up the collector,
* getting a ref to it if we were the ones to do so.
*/
if (subreq->start + subreq->transferred > rreq->cleaned_to + fsize &&
(rreq->origin == NETFS_READAHEAD || (rreq->origin == NETFS_READAHEAD ||
rreq->origin == NETFS_READPAGE || rreq->origin == NETFS_READPAGE ||
rreq->origin == NETFS_READ_FOR_WRITE)) { rreq->origin == NETFS_READ_FOR_WRITE) &&
netfs_consume_read_data(subreq, was_async); list_is_first(&subreq->rreq_link, &stream->subrequests)
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags); ) {
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
netfs_wake_read_collector(rreq);
} }
} }
EXPORT_SYMBOL(netfs_read_subreq_progress); EXPORT_SYMBOL(netfs_read_subreq_progress);
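
For illustration, a filesystem's receive path might report incremental progress as in the hypothetical helper below; the demo_* name and the byte accounting around it are assumptions of this sketch, not part of the patch.

    static void demo_fs_data_received(struct netfs_io_subrequest *subreq,
                                      size_t copied)
    {
            /* Account the bytes just landed in the output buffer... */
            subreq->transferred += copied;

            /* ...and let netfslib unlock any folios that are now complete. */
            netfs_read_subreq_progress(subreq);
    }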
@ -446,27 +516,23 @@ EXPORT_SYMBOL(netfs_read_subreq_progress);
/** /**
* netfs_read_subreq_terminated - Note the termination of an I/O operation. * netfs_read_subreq_terminated - Note the termination of an I/O operation.
* @subreq: The I/O request that has terminated. * @subreq: The I/O request that has terminated.
* @error: Error code indicating type of completion.
* @was_async: The termination was asynchronous
* *
* This tells the read helper that a contributory I/O operation has terminated, * This tells the read helper that a contributory I/O operation has terminated,
* one way or another, and that it should integrate the results. * one way or another, and that it should integrate the results.
* *
* The caller indicates the outcome of the operation through @error, supplying * The caller indicates the outcome of the operation through @subreq->error,
* 0 to indicate a successful or retryable transfer (if NETFS_SREQ_NEED_RETRY * supplying 0 to indicate a successful or retryable transfer (if
* is set) or a negative error code. The helper will look after reissuing I/O * NETFS_SREQ_NEED_RETRY is set) or a negative error code. The helper will
* operations as appropriate and writing downloaded data to the cache. * look after reissuing I/O operations as appropriate and writing downloaded
* data to the cache.
* *
* Before calling, the filesystem should update subreq->transferred to track * Before calling, the filesystem should update subreq->transferred to track
* the amount of data copied into the output buffer. * the amount of data copied into the output buffer.
*
* If @was_async is true, the caller might be running in softirq or interrupt
* context and we can't sleep.
*/ */
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq, void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
int error, bool was_async)
{ {
struct netfs_io_request *rreq = subreq->rreq; struct netfs_io_request *rreq = subreq->rreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
switch (subreq->source) { switch (subreq->source) {
case NETFS_READ_FROM_CACHE: case NETFS_READ_FROM_CACHE:
@ -479,68 +545,114 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
break; break;
} }
if (rreq->origin != NETFS_DIO_READ) {
/* Collect buffered reads.
*
* If the read completed validly short, then we can clear the
* tail before going on to unlock the folios.
*/
if (error == 0 && subreq->transferred < subreq->len &&
(test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags) ||
test_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags))) {
netfs_clear_unread(subreq);
subreq->transferred = subreq->len;
trace_netfs_sreq(subreq, netfs_sreq_trace_clear);
}
if (subreq->transferred > subreq->consumed &&
(rreq->origin == NETFS_READAHEAD ||
rreq->origin == NETFS_READPAGE ||
rreq->origin == NETFS_READ_FOR_WRITE)) {
netfs_consume_read_data(subreq, was_async);
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
}
rreq->transferred += subreq->transferred;
}
/* Deal with retry requests, short reads and errors. If we retry /* Deal with retry requests, short reads and errors. If we retry
* but don't make progress, we abandon the attempt. * but don't make progress, we abandon the attempt.
*/ */
if (!error && subreq->transferred < subreq->len) { if (!subreq->error && subreq->transferred < subreq->len) {
if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) { if (test_bit(NETFS_SREQ_HIT_EOF, &subreq->flags)) {
trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof); trace_netfs_sreq(subreq, netfs_sreq_trace_hit_eof);
} else if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
trace_netfs_sreq(subreq, netfs_sreq_trace_need_retry);
} else if (test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)) {
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
trace_netfs_sreq(subreq, netfs_sreq_trace_partial_read);
} else { } else {
__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
subreq->error = -ENODATA;
trace_netfs_sreq(subreq, netfs_sreq_trace_short); trace_netfs_sreq(subreq, netfs_sreq_trace_short);
if (subreq->transferred > subreq->consumed) {
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
} else if (!__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
set_bit(NETFS_RREQ_NEED_RETRY, &rreq->flags);
} else {
__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
error = -ENODATA;
}
} }
} }
subreq->error = error; if (unlikely(subreq->error < 0)) {
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); trace_netfs_failure(rreq, subreq, subreq->error, netfs_fail_read);
if (unlikely(error < 0)) {
trace_netfs_failure(rreq, subreq, error, netfs_fail_read);
if (subreq->source == NETFS_READ_FROM_CACHE) { if (subreq->source == NETFS_READ_FROM_CACHE) {
netfs_stat(&netfs_n_rh_read_failed); netfs_stat(&netfs_n_rh_read_failed);
__set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
} else { } else {
netfs_stat(&netfs_n_rh_download_failed); netfs_stat(&netfs_n_rh_download_failed);
set_bit(NETFS_RREQ_FAILED, &rreq->flags); __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
rreq->error = subreq->error; }
trace_netfs_rreq(rreq, netfs_rreq_trace_set_pause);
set_bit(NETFS_RREQ_PAUSE, &rreq->flags);
}
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
/* If we are at the head of the queue, wake up the collector. */
if (list_is_first(&subreq->rreq_link, &stream->subrequests))
netfs_wake_read_collector(rreq);
netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
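
A hypothetical completion path for a filesystem's ->issue_read() might then finish the subrequest as sketched below, filling in subreq->error and subreq->transferred before calling the helper, per the kernel-doc above. The demo_* name and the error-versus-count convention of the callback are illustrative assumptions only.

    static void demo_fs_read_done(struct netfs_io_subrequest *subreq,
                                  ssize_t transferred_or_error)
    {
            if (transferred_or_error < 0) {
                    subreq->error = transferred_or_error;
            } else {
                    subreq->error = 0;
                    subreq->transferred += transferred_or_error;
                    if (transferred_or_error > 0)
                            __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
            }
            netfs_read_subreq_terminated(subreq);
    }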
/*
* Handle termination of a read from the cache.
*/
void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async)
{
struct netfs_io_subrequest *subreq = priv;
if (transferred_or_error > 0) {
subreq->error = 0;
if (transferred_or_error > 0) {
subreq->transferred += transferred_or_error;
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
}
} else {
subreq->error = transferred_or_error;
}
netfs_read_subreq_terminated(subreq);
}
/*
* Wait for the read operation to complete, successfully or otherwise.
*/
ssize_t netfs_wait_for_read(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
DEFINE_WAIT(myself);
ssize_t ret;
for (;;) {
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
subreq = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
if (subreq &&
(!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags)))
netfs_read_collection(rreq);
if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
break;
schedule();
trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
}
finish_wait(&rreq->waitq, &myself);
ret = rreq->error;
if (ret == 0) {
ret = rreq->transferred;
switch (rreq->origin) {
case NETFS_DIO_READ:
case NETFS_READ_SINGLE:
ret = rreq->transferred;
break;
default:
if (rreq->submitted < rreq->len) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
break;
} }
} }
if (atomic_dec_and_test(&rreq->nr_outstanding)) return ret;
netfs_rreq_terminated(rreq, was_async);
netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
} }
EXPORT_SYMBOL(netfs_read_subreq_terminated);
View File
@ -18,8 +18,7 @@
* third mark in the folio queue is used to indicate that this folio needs * third mark in the folio queue is used to indicate that this folio needs
* writing. * writing.
*/ */
void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq, void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_request *rreq,
struct netfs_io_request *rreq,
struct folio_queue *folioq, struct folio_queue *folioq,
int slot) int slot)
{ {
@ -34,8 +33,9 @@ void netfs_pgpriv2_mark_copy_to_cache(struct netfs_io_subrequest *subreq,
* [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an * [DEPRECATED] Cancel PG_private_2 on all marked folios in the event of an
* unrecoverable error. * unrecoverable error.
*/ */
static void netfs_pgpriv2_cancel(struct folio_queue *folioq) static void netfs_pgpriv2_cancel(struct rolling_buffer *buffer)
{ {
struct folio_queue *folioq = buffer->tail;
struct folio *folio; struct folio *folio;
int slot; int slot;
@ -94,7 +94,7 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio
trace_netfs_folio(folio, netfs_folio_trace_store_copy); trace_netfs_folio(folio, netfs_folio_trace_store_copy);
/* Attach the folio to the rolling buffer. */ /* Attach the folio to the rolling buffer. */
if (netfs_buffer_append_folio(wreq, folio, false) < 0) if (rolling_buffer_append(&wreq->buffer, folio, 0) < 0)
return -ENOMEM; return -ENOMEM;
cache->submit_extendable_to = fsize; cache->submit_extendable_to = fsize;
@ -109,7 +109,7 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio
do { do {
ssize_t part; ssize_t part;
wreq->io_iter.iov_offset = cache->submit_off; wreq->buffer.iter.iov_offset = cache->submit_off;
atomic64_set(&wreq->issued_to, fpos + cache->submit_off); atomic64_set(&wreq->issued_to, fpos + cache->submit_off);
cache->submit_extendable_to = fsize - cache->submit_off; cache->submit_extendable_to = fsize - cache->submit_off;
@ -122,8 +122,8 @@ static int netfs_pgpriv2_copy_folio(struct netfs_io_request *wreq, struct folio
cache->submit_len -= part; cache->submit_len -= part;
} while (cache->submit_len > 0); } while (cache->submit_len > 0);
wreq->io_iter.iov_offset = 0; wreq->buffer.iter.iov_offset = 0;
iov_iter_advance(&wreq->io_iter, fsize); rolling_buffer_advance(&wreq->buffer, fsize);
atomic64_set(&wreq->issued_to, fpos + fsize); atomic64_set(&wreq->issued_to, fpos + fsize);
if (flen < fsize) if (flen < fsize)
@ -151,7 +151,7 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
goto couldnt_start; goto couldnt_start;
/* Need the first folio to be able to set up the op. */ /* Need the first folio to be able to set up the op. */
for (folioq = rreq->buffer; folioq; folioq = folioq->next) { for (folioq = rreq->buffer.tail; folioq; folioq = folioq->next) {
if (folioq->marks3) { if (folioq->marks3) {
slot = __ffs(folioq->marks3); slot = __ffs(folioq->marks3);
break; break;
@ -194,7 +194,7 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
netfs_put_request(wreq, false, netfs_rreq_trace_put_return); netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
_leave(" = %d", error); _leave(" = %d", error);
couldnt_start: couldnt_start:
netfs_pgpriv2_cancel(rreq->buffer); netfs_pgpriv2_cancel(&rreq->buffer);
} }
/* /*
@ -203,13 +203,13 @@ void netfs_pgpriv2_write_to_the_cache(struct netfs_io_request *rreq)
*/ */
bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq) bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq)
{ {
struct folio_queue *folioq = wreq->buffer; struct folio_queue *folioq = wreq->buffer.tail;
unsigned long long collected_to = wreq->collected_to; unsigned long long collected_to = wreq->collected_to;
unsigned int slot = wreq->buffer_head_slot; unsigned int slot = wreq->buffer.first_tail_slot;
bool made_progress = false; bool made_progress = false;
if (slot >= folioq_nr_slots(folioq)) { if (slot >= folioq_nr_slots(folioq)) {
folioq = netfs_delete_buffer_head(wreq); folioq = rolling_buffer_delete_spent(&wreq->buffer);
slot = 0; slot = 0;
} }
@ -248,9 +248,9 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq)
folioq_clear(folioq, slot); folioq_clear(folioq, slot);
slot++; slot++;
if (slot >= folioq_nr_slots(folioq)) { if (slot >= folioq_nr_slots(folioq)) {
if (READ_ONCE(wreq->buffer_tail) == folioq) folioq = rolling_buffer_delete_spent(&wreq->buffer);
break; if (!folioq)
folioq = netfs_delete_buffer_head(wreq); goto done;
slot = 0; slot = 0;
} }
@ -258,7 +258,8 @@ bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *wreq)
break; break;
} }
wreq->buffer = folioq; wreq->buffer.tail = folioq;
wreq->buffer_head_slot = slot; done:
wreq->buffer.first_tail_slot = slot;
return made_progress; return made_progress;
} }
View File
@ -12,15 +12,8 @@
static void netfs_reissue_read(struct netfs_io_request *rreq, static void netfs_reissue_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq) struct netfs_io_subrequest *subreq)
{ {
struct iov_iter *io_iter = &subreq->io_iter; __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
if (iov_iter_is_folioq(io_iter)) {
subreq->curr_folioq = (struct folio_queue *)io_iter->folioq;
subreq->curr_folioq_slot = io_iter->folioq_slot;
subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
}
atomic_inc(&rreq->nr_outstanding);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
subreq->rreq->netfs_ops->issue_read(subreq); subreq->rreq->netfs_ops->issue_read(subreq);
@ -33,13 +26,12 @@ static void netfs_reissue_read(struct netfs_io_request *rreq,
static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
{ {
struct netfs_io_subrequest *subreq; struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream0 = &rreq->io_streams[0]; struct netfs_io_stream *stream = &rreq->io_streams[0];
LIST_HEAD(sublist); struct list_head *next;
LIST_HEAD(queue);
_enter("R=%x", rreq->debug_id); _enter("R=%x", rreq->debug_id);
if (list_empty(&rreq->subrequests)) if (list_empty(&stream->subrequests))
return; return;
if (rreq->netfs_ops->retry_request) if (rreq->netfs_ops->retry_request)
@ -52,7 +44,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
!test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) {
struct netfs_io_subrequest *subreq; struct netfs_io_subrequest *subreq;
list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break; break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
@ -73,48 +65,44 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
* populating with smaller subrequests. In the event that the subreq * populating with smaller subrequests. In the event that the subreq
* we just launched finishes before we insert the next subreq, it'll * we just launched finishes before we insert the next subreq, it'll
* fill in rreq->prev_donated instead. * fill in rreq->prev_donated instead.
*
* Note: Alternatively, we could split the tail subrequest right before * Note: Alternatively, we could split the tail subrequest right before
* we reissue it and fix up the donations under lock. * we reissue it and fix up the donations under lock.
*/ */
list_splice_init(&rreq->subrequests, &queue); next = stream->subrequests.next;
do { do {
struct netfs_io_subrequest *from; struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
struct iov_iter source; struct iov_iter source;
unsigned long long start, len; unsigned long long start, len;
size_t part, deferred_next_donated = 0; size_t part;
bool boundary = false; bool boundary = false;
/* Go through the subreqs and find the next span of contiguous /* Go through the subreqs and find the next span of contiguous
* buffer that we then rejig (cifs, for example, needs the * buffer that we then rejig (cifs, for example, needs the
* rsize renegotiating) and reissue. * rsize renegotiating) and reissue.
*/ */
from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link); from = list_entry(next, struct netfs_io_subrequest, rreq_link);
list_move_tail(&from->rreq_link, &sublist); to = from;
start = from->start + from->transferred; start = from->start + from->transferred;
len = from->len - from->transferred; len = from->len - from->transferred;
_debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx", _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx",
rreq->debug_id, from->debug_index, rreq->debug_id, from->debug_index,
from->start, from->consumed, from->transferred, from->len); from->start, from->transferred, from->len);
if (test_bit(NETFS_SREQ_FAILED, &from->flags) || if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
!test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
goto abandon; goto abandon;
deferred_next_donated = from->next_donated; list_for_each_continue(next, &stream->subrequests) {
while ((subreq = list_first_entry_or_null( subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
&queue, struct netfs_io_subrequest, rreq_link))) { if (subreq->start + subreq->transferred != start + len ||
if (subreq->start != start + len || test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
subreq->transferred > 0 ||
!test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
break; break;
list_move_tail(&subreq->rreq_link, &sublist); to = subreq;
len += subreq->len; len += to->len;
deferred_next_donated = subreq->next_donated;
if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags))
break;
} }
_debug(" - range: %llx-%llx %llx", start, start + len - 1, len); _debug(" - range: %llx-%llx %llx", start, start + len - 1, len);
@ -127,36 +115,28 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
source.count = len; source.count = len;
/* Work through the sublist. */ /* Work through the sublist. */
while ((subreq = list_first_entry_or_null( subreq = from;
&sublist, struct netfs_io_subrequest, rreq_link))) { list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
list_del(&subreq->rreq_link); if (!len)
break;
subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start - subreq->transferred; subreq->start = start - subreq->transferred;
subreq->len = len + subreq->transferred; subreq->len = len + subreq->transferred;
stream0->sreq_max_len = subreq->len;
__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags); __set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
spin_lock_bh(&rreq->lock);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
subreq->prev_donated += rreq->prev_donated;
rreq->prev_donated = 0;
trace_netfs_sreq(subreq, netfs_sreq_trace_retry); trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
spin_unlock_bh(&rreq->lock);
BUG_ON(!len);
/* Renegotiate max_len (rsize) */ /* Renegotiate max_len (rsize) */
stream->sreq_max_len = subreq->len;
if (rreq->netfs_ops->prepare_read(subreq) < 0) { if (rreq->netfs_ops->prepare_read(subreq) < 0) {
trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);
__set_bit(NETFS_SREQ_FAILED, &subreq->flags); __set_bit(NETFS_SREQ_FAILED, &subreq->flags);
goto abandon;
} }
part = umin(len, stream0->sreq_max_len); part = umin(len, stream->sreq_max_len);
if (unlikely(rreq->io_streams[0].sreq_max_segs)) if (unlikely(stream->sreq_max_segs))
part = netfs_limit_iter(&source, 0, part, stream0->sreq_max_segs); part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs);
subreq->len = subreq->transferred + part; subreq->len = subreq->transferred + part;
subreq->io_iter = source; subreq->io_iter = source;
iov_iter_truncate(&subreq->io_iter, part); iov_iter_truncate(&subreq->io_iter, part);
@ -166,58 +146,106 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
if (!len) { if (!len) {
if (boundary) if (boundary)
__set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); __set_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
subreq->next_donated = deferred_next_donated;
} else { } else {
__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags); __clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
subreq->next_donated = 0;
} }
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_read(rreq, subreq); netfs_reissue_read(rreq, subreq);
if (!len) if (subreq == to)
break; break;
/* If we ran out of subrequests, allocate another. */
if (list_empty(&sublist)) {
subreq = netfs_alloc_subrequest(rreq);
if (!subreq)
goto abandon;
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start;
/* We get two refs, but need just one. */
netfs_put_subrequest(subreq, false, netfs_sreq_trace_new);
trace_netfs_sreq(subreq, netfs_sreq_trace_split);
list_add_tail(&subreq->rreq_link, &sublist);
}
} }
/* If we managed to use fewer subreqs, we can discard the /* If we managed to use fewer subreqs, we can discard the
* excess. * excess; if we used the same number, then we're done.
*/ */
while ((subreq = list_first_entry_or_null( if (!len) {
&sublist, struct netfs_io_subrequest, rreq_link))) { if (subreq == to)
trace_netfs_sreq(subreq, netfs_sreq_trace_discard); continue;
list_del(&subreq->rreq_link); list_for_each_entry_safe_from(subreq, tmp,
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); &stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
if (subreq == to)
break;
}
continue;
} }
} while (!list_empty(&queue)); /* We ran out of subrequests, so we need to allocate some more
* and insert them after.
*/
do {
subreq = netfs_alloc_subrequest(rreq);
if (!subreq) {
subreq = to;
goto abandon_after;
}
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start;
subreq->len = len;
subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
subreq->stream_nr = stream->stream_nr;
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
stream->sreq_max_len = umin(len, rreq->rsize);
stream->sreq_max_segs = 0;
if (unlikely(stream->sreq_max_segs))
part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs);
netfs_stat(&netfs_n_rh_download);
if (rreq->netfs_ops->prepare_read(subreq) < 0) {
trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed);
__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
goto abandon;
}
part = umin(len, stream->sreq_max_len);
subreq->len = subreq->transferred + part;
subreq->io_iter = source;
iov_iter_truncate(&subreq->io_iter, part);
iov_iter_advance(&source, part);
len -= part;
start += part;
if (!len && boundary) {
__set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
boundary = false;
}
netfs_reissue_read(rreq, subreq);
} while (len);
} while (!list_is_head(next, &stream->subrequests));
return; return;
/* If we hit ENOMEM, fail all remaining subrequests */ /* If we hit an error, fail all remaining incomplete subrequests */
abandon_after:
if (list_is_last(&subreq->rreq_link, &stream->subrequests))
return;
subreq = list_next_entry(subreq, rreq_link);
abandon: abandon:
list_splice_init(&sublist, &queue); list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
list_for_each_entry(subreq, &queue, rreq_link) { if (!subreq->error &&
if (!subreq->error) !test_bit(NETFS_SREQ_FAILED, &subreq->flags) &&
subreq->error = -ENOMEM; !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
__clear_bit(NETFS_SREQ_FAILED, &subreq->flags); continue;
subreq->error = -ENOMEM;
__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__clear_bit(NETFS_SREQ_RETRYING, &subreq->flags); __clear_bit(NETFS_SREQ_RETRYING, &subreq->flags);
} }
spin_lock_bh(&rreq->lock);
list_splice_tail_init(&queue, &rreq->subrequests);
spin_unlock_bh(&rreq->lock);
} }
/* /*
@ -225,14 +253,19 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
*/ */
void netfs_retry_reads(struct netfs_io_request *rreq) void netfs_retry_reads(struct netfs_io_request *rreq)
{ {
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
}
trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
atomic_inc(&rreq->nr_outstanding);
netfs_retry_read_subrequests(rreq); netfs_retry_read_subrequests(rreq);
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_terminated(rreq, false);
} }
/* /*
@ -243,7 +276,7 @@ void netfs_unlock_abandoned_read_pages(struct netfs_io_request *rreq)
{ {
struct folio_queue *p; struct folio_queue *p;
for (p = rreq->buffer; p; p = p->next) { for (p = rreq->buffer.tail; p; p = p->next) {
for (int slot = 0; slot < folioq_count(p); slot++) { for (int slot = 0; slot < folioq_count(p); slot++) {
struct folio *folio = folioq_folio(p, slot); struct folio *folio = folioq_folio(p, slot);

fs/netfs/read_single.c (new file, 195 lines)

@ -0,0 +1,195 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Single, monolithic object support (e.g. AFS directory).
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/sched/mm.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/netfs.h>
#include "internal.h"
/**
* netfs_single_mark_inode_dirty - Mark a single, monolithic object inode dirty
* @inode: The inode to mark
*
* Mark an inode that contains a single, monolithic object as dirty so that its
* writepages op will get called. If set, the SINGLE_NO_UPLOAD flag indicates
* that the object will only be written to the cache and not uploaded (e.g. AFS
* directory contents).
*/
void netfs_single_mark_inode_dirty(struct inode *inode)
{
struct netfs_inode *ictx = netfs_inode(inode);
bool cache_only = test_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &ictx->flags);
bool caching = fscache_cookie_enabled(netfs_i_cookie(netfs_inode(inode)));
if (cache_only && !caching)
return;
mark_inode_dirty(inode);
if (caching && !(inode->i_state & I_PINNING_NETFS_WB)) {
bool need_use = false;
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_PINNING_NETFS_WB)) {
inode->i_state |= I_PINNING_NETFS_WB;
need_use = true;
}
spin_unlock(&inode->i_lock);
if (need_use)
fscache_use_cookie(netfs_i_cookie(ictx), true);
}
}
EXPORT_SYMBOL(netfs_single_mark_inode_dirty);
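As a rough, hedged sketch of how a filesystem might use this helper: the cache-only flag is set once at context-initialisation time and the inode is marked dirty after each local edit. The edit helper and both function names below are hypothetical; only the flag and the exported helper come from the code above.

/* Cache-only object (e.g. a locally built directory blob): dirty data
 * should only go to fscache, never be uploaded to the server.
 */
static void example_init_cache_only_object(struct netfs_inode *ictx)
{
        set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &ictx->flags);
}

/* After editing the in-memory copy of the object, mark the inode dirty so
 * that ->writepages() gets called and pushes the whole blob back out.
 */
static void example_edit_single_object(struct inode *inode)
{
        example_apply_edit(inode);              /* hypothetical local update */
        netfs_single_mark_inode_dirty(inode);
}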
static int netfs_single_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
{
return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
}
static void netfs_single_cache_prepare_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
if (!cres->ops) {
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
return;
}
subreq->source = cres->ops->prepare_read(subreq, rreq->i_size);
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
}
static void netfs_single_read_cache(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
_enter("R=%08x[%x]", rreq->debug_id, subreq->debug_index);
netfs_stat(&netfs_n_rh_read);
cres->ops->read(cres, subreq->start, &subreq->io_iter, NETFS_READ_HOLE_FAIL,
netfs_cache_read_terminated, subreq);
}
/*
* Perform a read to a buffer from the cache or the server. Only a single
* subreq is permitted as the object must be fetched in a single transaction.
*/
static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
struct netfs_io_subrequest *subreq;
int ret = 0;
subreq = netfs_alloc_subrequest(rreq);
if (!subreq)
return -ENOMEM;
subreq->source = NETFS_SOURCE_UNKNOWN;
subreq->start = 0;
subreq->len = rreq->len;
subreq->io_iter = rreq->buffer.iter;
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
spin_lock(&rreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests);
trace_netfs_sreq(subreq, netfs_sreq_trace_added);
stream->front = subreq;
/* Store list pointers before active flag */
smp_store_release(&stream->active, true);
spin_unlock(&rreq->lock);
netfs_single_cache_prepare_read(rreq, subreq);
switch (subreq->source) {
case NETFS_DOWNLOAD_FROM_SERVER:
netfs_stat(&netfs_n_rh_download);
if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0)
goto cancel;
}
rreq->netfs_ops->issue_read(subreq);
rreq->submitted += subreq->len;
break;
case NETFS_READ_FROM_CACHE:
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
netfs_single_read_cache(rreq, subreq);
rreq->submitted += subreq->len;
ret = 0;
break;
default:
pr_warn("Unexpected single-read source %u\n", subreq->source);
WARN_ON_ONCE(true);
ret = -EIO;
break;
}
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
return ret;
cancel:
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
return ret;
}
/**
* netfs_read_single - Synchronously read a single blob of pages.
* @inode: The inode to read from.
* @file: The file we're using to read or NULL.
* @iter: The buffer we're reading into.
*
* Fulfil a read request for a single monolithic object by drawing data from
* the cache if possible, or the netfs if not. The buffer may be larger than
* the file content; the unused space beyond the EOF will be zero-filled. The content
* will be read with a single I/O request (though this may be retried).
*
* The calling netfs must initialise a netfs context contiguous to the vfs
* inode before calling this.
*
* This is usable whether or not caching is enabled. If caching is enabled,
* the data will be stored as a single object into the cache.
*/
ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter)
{
struct netfs_io_request *rreq;
struct netfs_inode *ictx = netfs_inode(inode);
ssize_t ret;
rreq = netfs_alloc_request(inode->i_mapping, file, 0, iov_iter_count(iter),
NETFS_READ_SINGLE);
if (IS_ERR(rreq))
return PTR_ERR(rreq);
ret = netfs_single_begin_cache_read(rreq, ictx);
if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
goto cleanup_free;
netfs_stat(&netfs_n_rh_read_single);
trace_netfs_read(rreq, 0, rreq->len, netfs_read_trace_read_single);
rreq->buffer.iter = *iter;
netfs_single_dispatch_read(rreq);
ret = netfs_wait_for_read(rreq);
netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
return ret;
cleanup_free:
netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
return ret;
}
EXPORT_SYMBOL(netfs_read_single);
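For orientation, a caller might pull an entire object into a kernel buffer along these lines. This is only a sketch: the function name is made up, and the choice of a kvec-backed iterator is an assumption; an AFS-style caller would more likely pass an ITER_FOLIOQ iterator describing its own buffer.

static ssize_t example_load_object(struct inode *inode, void *buf, size_t size)
{
        struct kvec kv = { .iov_base = buf, .iov_len = size };
        struct iov_iter iter;

        iov_iter_kvec(&iter, ITER_DEST, &kv, 1, size);

        /* Returns the amount of data drawn from the cache or the server,
         * or a negative error code.
         */
        return netfs_read_single(inode, NULL, &iter);
}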

fs/netfs/rolling_buffer.c (new file, 225 lines)

@ -0,0 +1,225 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Rolling buffer helpers
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/bitops.h>
#include <linux/pagemap.h>
#include <linux/rolling_buffer.h>
#include <linux/slab.h>
#include "internal.h"
static atomic_t debug_ids;
/**
* netfs_folioq_alloc - Allocate a folio_queue struct
* @rreq_id: Associated debugging ID for tracing purposes
* @gfp: Allocation constraints
* @trace: Trace tag to indicate the purpose of the allocation
*
* Allocate, initialise and account the folio_queue struct and log a trace line
* to mark the allocation.
*/
struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp,
unsigned int /*enum netfs_folioq_trace*/ trace)
{
struct folio_queue *fq;
fq = kmalloc(sizeof(*fq), gfp);
if (fq) {
netfs_stat(&netfs_n_folioq);
folioq_init(fq, rreq_id);
fq->debug_id = atomic_inc_return(&debug_ids);
trace_netfs_folioq(fq, trace);
}
return fq;
}
EXPORT_SYMBOL(netfs_folioq_alloc);
/**
* netfs_folioq_free - Free a folio_queue struct
* @folioq: The object to free
* @trace: Trace tag to indicate which free
*
* Free and unaccount the folio_queue struct.
*/
void netfs_folioq_free(struct folio_queue *folioq,
unsigned int /*enum netfs_trace_folioq*/ trace)
{
trace_netfs_folioq(folioq, trace);
netfs_stat_d(&netfs_n_folioq);
kfree(folioq);
}
EXPORT_SYMBOL(netfs_folioq_free);
/*
* Initialise a rolling buffer. We allocate an empty folio queue struct so
* that the pointers can be independently driven by the producer and the
* consumer.
*/
int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id,
unsigned int direction)
{
struct folio_queue *fq;
fq = netfs_folioq_alloc(rreq_id, GFP_NOFS, netfs_trace_folioq_rollbuf_init);
if (!fq)
return -ENOMEM;
roll->head = fq;
roll->tail = fq;
iov_iter_folio_queue(&roll->iter, direction, fq, 0, 0, 0);
return 0;
}
/*
* Add another folio_queue to a rolling buffer if there's no space left.
*/
int rolling_buffer_make_space(struct rolling_buffer *roll)
{
struct folio_queue *fq, *head = roll->head;
if (!folioq_full(head))
return 0;
fq = netfs_folioq_alloc(head->rreq_id, GFP_NOFS, netfs_trace_folioq_make_space);
if (!fq)
return -ENOMEM;
fq->prev = head;
roll->head = fq;
if (folioq_full(head)) {
/* Make sure we don't leave the master iterator pointing to a
* block that might get immediately consumed.
*/
if (roll->iter.folioq == head &&
roll->iter.folioq_slot == folioq_nr_slots(head)) {
roll->iter.folioq = fq;
roll->iter.folioq_slot = 0;
}
}
/* Make sure the initialisation is stored before the next pointer.
*
* [!] NOTE: After we set head->next, the consumer is at liberty to
* immediately delete the old head.
*/
smp_store_release(&head->next, fq);
return 0;
}
/*
* Decant the list of folios to read into a rolling buffer.
*/
ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll,
struct readahead_control *ractl,
struct folio_batch *put_batch)
{
struct folio_queue *fq;
struct page **vec;
int nr, ix, to;
ssize_t size = 0;
if (rolling_buffer_make_space(roll) < 0)
return -ENOMEM;
fq = roll->head;
vec = (struct page **)fq->vec.folios;
nr = __readahead_batch(ractl, vec + folio_batch_count(&fq->vec),
folio_batch_space(&fq->vec));
ix = fq->vec.nr;
to = ix + nr;
fq->vec.nr = to;
for (; ix < to; ix++) {
struct folio *folio = folioq_folio(fq, ix);
unsigned int order = folio_order(folio);
fq->orders[ix] = order;
size += PAGE_SIZE << order;
trace_netfs_folio(folio, netfs_folio_trace_read);
if (!folio_batch_add(put_batch, folio))
folio_batch_release(put_batch);
}
WRITE_ONCE(roll->iter.count, roll->iter.count + size);
/* Store the counter after setting the slot. */
smp_store_release(&roll->next_head_slot, to);
for (; ix < folioq_nr_slots(fq); ix++)
folioq_clear(fq, ix);
return size;
}
/*
* Append a folio to the rolling buffer.
*/
ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio,
unsigned int flags)
{
ssize_t size = folio_size(folio);
int slot;
if (rolling_buffer_make_space(roll) < 0)
return -ENOMEM;
slot = folioq_append(roll->head, folio);
if (flags & ROLLBUF_MARK_1)
folioq_mark(roll->head, slot);
if (flags & ROLLBUF_MARK_2)
folioq_mark2(roll->head, slot);
WRITE_ONCE(roll->iter.count, roll->iter.count + size);
/* Store the counter after setting the slot. */
smp_store_release(&roll->next_head_slot, slot);
return size;
}
/*
* Delete a spent buffer from a rolling queue and return the next in line. To
* keep the producer and consumer pointers independent, the last remaining
* buffer is never handed back; NULL is returned instead.
*/
struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll)
{
struct folio_queue *spent = roll->tail, *next = READ_ONCE(spent->next);
if (!next)
return NULL;
next->prev = NULL;
netfs_folioq_free(spent, netfs_trace_folioq_delete);
roll->tail = next;
return next;
}
/*
* Clear out a rolling queue. Folios that have mark 1 set are put.
*/
void rolling_buffer_clear(struct rolling_buffer *roll)
{
struct folio_batch fbatch;
struct folio_queue *p;
folio_batch_init(&fbatch);
while ((p = roll->tail)) {
roll->tail = p->next;
for (int slot = 0; slot < folioq_count(p); slot++) {
struct folio *folio = folioq_folio(p, slot);
if (!folio)
continue;
if (folioq_is_marked(p, slot)) {
trace_netfs_folio(folio, netfs_folio_trace_put);
if (!folio_batch_add(&fbatch, folio))
folio_batch_release(&fbatch);
}
}
netfs_folioq_free(p, netfs_trace_folioq_clear);
}
folio_batch_release(&fbatch);
}
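To make the lifecycle above concrete, here is a hedged sketch of the producer side: initialise the buffer, append folios, hand the iterator to the I/O path, then tear everything down. The request ID, the origin of the folios and the use of ITER_SOURCE are assumptions for the example; the helpers are the ones defined in this file.

static int example_fill_rolling_buffer(struct rolling_buffer *roll,
                                       struct folio **folios, int nr)
{
        ssize_t ret;
        int err, i;

        err = rolling_buffer_init(roll, /* rreq_id */ 1, ITER_SOURCE);
        if (err < 0)
                return err;

        for (i = 0; i < nr; i++) {
                /* ROLLBUF_MARK_1 means rolling_buffer_clear() will put the
                 * folio, so the buffer takes over the caller's reference.
                 */
                ret = rolling_buffer_append(roll, folios[i], ROLLBUF_MARK_1);
                if (ret < 0) {
                        rolling_buffer_clear(roll);
                        return ret;
                }
        }

        /* roll->iter now spans the appended folios and can be handed to the
         * I/O path; rolling_buffer_clear() releases the queue afterwards.
         */
        rolling_buffer_clear(roll);
        return 0;
}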


@ -12,6 +12,7 @@
atomic_t netfs_n_rh_dio_read; atomic_t netfs_n_rh_dio_read;
atomic_t netfs_n_rh_readahead; atomic_t netfs_n_rh_readahead;
atomic_t netfs_n_rh_read_folio; atomic_t netfs_n_rh_read_folio;
atomic_t netfs_n_rh_read_single;
atomic_t netfs_n_rh_rreq; atomic_t netfs_n_rh_rreq;
atomic_t netfs_n_rh_sreq; atomic_t netfs_n_rh_sreq;
atomic_t netfs_n_rh_download; atomic_t netfs_n_rh_download;
@ -46,10 +47,11 @@ atomic_t netfs_n_folioq;
int netfs_stats_show(struct seq_file *m, void *v) int netfs_stats_show(struct seq_file *m, void *v)
{ {
seq_printf(m, "Reads : DR=%u RA=%u RF=%u WB=%u WBZ=%u\n", seq_printf(m, "Reads : DR=%u RA=%u RF=%u RS=%u WB=%u WBZ=%u\n",
atomic_read(&netfs_n_rh_dio_read), atomic_read(&netfs_n_rh_dio_read),
atomic_read(&netfs_n_rh_readahead), atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_read_folio), atomic_read(&netfs_n_rh_read_folio),
atomic_read(&netfs_n_rh_read_single),
atomic_read(&netfs_n_rh_write_begin), atomic_read(&netfs_n_rh_write_begin),
atomic_read(&netfs_n_rh_write_zskip)); atomic_read(&netfs_n_rh_write_zskip));
seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n", seq_printf(m, "Writes : BW=%u WT=%u DW=%u WP=%u 2C=%u\n",


@ -17,10 +17,38 @@
#define HIT_PENDING 0x01 /* A front op was still pending */ #define HIT_PENDING 0x01 /* A front op was still pending */
#define NEED_REASSESS 0x02 /* Need to loop round and reassess */ #define NEED_REASSESS 0x02 /* Need to loop round and reassess */
#define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */ #define MADE_PROGRESS 0x04 /* Made progress cleaning up a stream or the folio set */
#define BUFFERED 0x08 /* The pagecache needs cleaning up */ #define NEED_UNLOCK 0x08 /* The pagecache needs unlocking */
#define NEED_RETRY 0x10 /* A front op requests retrying */ #define NEED_RETRY 0x10 /* A front op requests retrying */
#define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */ #define SAW_FAILURE 0x20 /* One stream or hit a permanent failure */
static void netfs_dump_request(const struct netfs_io_request *rreq)
{
pr_err("Request R=%08x r=%d fl=%lx or=%x e=%ld\n",
rreq->debug_id, refcount_read(&rreq->ref), rreq->flags,
rreq->origin, rreq->error);
pr_err(" st=%llx tsl=%zx/%llx/%llx\n",
rreq->start, rreq->transferred, rreq->submitted, rreq->len);
pr_err(" cci=%llx/%llx/%llx\n",
rreq->cleaned_to, rreq->collected_to, atomic64_read(&rreq->issued_to));
pr_err(" iw=%pSR\n", rreq->netfs_ops->issue_write);
for (int i = 0; i < NR_IO_STREAMS; i++) {
const struct netfs_io_subrequest *sreq;
const struct netfs_io_stream *s = &rreq->io_streams[i];
pr_err(" str[%x] s=%x e=%d acnf=%u,%u,%u,%u\n",
s->stream_nr, s->source, s->error,
s->avail, s->active, s->need_retry, s->failed);
pr_err(" str[%x] ct=%llx t=%zx\n",
s->stream_nr, s->collected_to, s->transferred);
list_for_each_entry(sreq, &s->subrequests, rreq_link) {
pr_err(" sreq[%x:%x] sc=%u s=%llx t=%zx/%zx r=%d f=%lx\n",
sreq->stream_nr, sreq->debug_index, sreq->source,
sreq->start, sreq->transferred, sreq->len,
refcount_read(&sreq->ref), sreq->flags);
}
}
}
/* /*
* Successful completion of write of a folio to the server and/or cache. Note * Successful completion of write of a folio to the server and/or cache. Note
* that we are not allowed to lock the folio here on pain of deadlocking with * that we are not allowed to lock the folio here on pain of deadlocking with
@ -83,9 +111,15 @@ int netfs_folio_written_back(struct folio *folio)
static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq, static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
unsigned int *notes) unsigned int *notes)
{ {
struct folio_queue *folioq = wreq->buffer; struct folio_queue *folioq = wreq->buffer.tail;
unsigned long long collected_to = wreq->collected_to; unsigned long long collected_to = wreq->collected_to;
unsigned int slot = wreq->buffer_head_slot; unsigned int slot = wreq->buffer.first_tail_slot;
if (WARN_ON_ONCE(!folioq)) {
pr_err("[!] Writeback unlock found empty rolling buffer!\n");
netfs_dump_request(wreq);
return;
}
if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) { if (wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) {
if (netfs_pgpriv2_unlock_copied_folios(wreq)) if (netfs_pgpriv2_unlock_copied_folios(wreq))
@ -94,7 +128,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
} }
if (slot >= folioq_nr_slots(folioq)) { if (slot >= folioq_nr_slots(folioq)) {
folioq = netfs_delete_buffer_head(wreq); folioq = rolling_buffer_delete_spent(&wreq->buffer);
if (!folioq)
return;
slot = 0; slot = 0;
} }
@ -134,9 +170,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
folioq_clear(folioq, slot); folioq_clear(folioq, slot);
slot++; slot++;
if (slot >= folioq_nr_slots(folioq)) { if (slot >= folioq_nr_slots(folioq)) {
if (READ_ONCE(wreq->buffer_tail) == folioq) folioq = rolling_buffer_delete_spent(&wreq->buffer);
break; if (!folioq)
folioq = netfs_delete_buffer_head(wreq); goto done;
slot = 0; slot = 0;
} }
@ -144,223 +180,9 @@ static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
break; break;
} }
wreq->buffer = folioq; wreq->buffer.tail = folioq;
wreq->buffer_head_slot = slot; done:
} wreq->buffer.first_tail_slot = slot;
/*
* Perform retries on the streams that need it.
*/
static void netfs_retry_write_stream(struct netfs_io_request *wreq,
struct netfs_io_stream *stream)
{
struct list_head *next;
_enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
if (list_empty(&stream->subrequests))
return;
if (stream->source == NETFS_UPLOAD_TO_SERVER &&
wreq->netfs_ops->retry_request)
wreq->netfs_ops->retry_request(wreq, stream);
if (unlikely(stream->failed))
return;
/* If there's no renegotiation to do, just resend each failed subreq. */
if (!stream->prepare_write) {
struct netfs_io_subrequest *subreq;
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
struct iov_iter source = subreq->io_iter;
iov_iter_revert(&source, subreq->len - source.count);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq, &source);
}
}
return;
}
next = stream->subrequests.next;
do {
struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
struct iov_iter source;
unsigned long long start, len;
size_t part;
bool boundary = false;
/* Go through the stream and find the next span of contiguous
* data that we then rejig (cifs, for example, needs the wsize
* renegotiating) and reissue.
*/
from = list_entry(next, struct netfs_io_subrequest, rreq_link);
to = from;
start = from->start + from->transferred;
len = from->len - from->transferred;
if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
!test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
return;
list_for_each_continue(next, &stream->subrequests) {
subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
if (subreq->start + subreq->transferred != start + len ||
test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
!test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
break;
to = subreq;
len += to->len;
}
/* Determine the set of buffers we're going to use. Each
* subreq gets a subset of a single overall contiguous buffer.
*/
netfs_reset_iter(from);
source = from->io_iter;
source.count = len;
/* Work through the sublist. */
subreq = from;
list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
if (!len)
break;
/* Renegotiate max_len (wsize) */
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
stream->prepare_write(subreq);
part = min(len, stream->sreq_max_len);
subreq->len = part;
subreq->start = start;
subreq->transferred = 0;
len -= part;
start += part;
if (len && subreq == to &&
__test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))
boundary = true;
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq, &source);
if (subreq == to)
break;
}
/* If we managed to use fewer subreqs, we can discard the
* excess; if we used the same number, then we're done.
*/
if (!len) {
if (subreq == to)
continue;
list_for_each_entry_safe_from(subreq, tmp,
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
if (subreq == to)
break;
}
continue;
}
/* We ran out of subrequests, so we need to allocate some more
* and insert them after.
*/
do {
subreq = netfs_alloc_subrequest(wreq);
subreq->source = to->source;
subreq->start = start;
subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
subreq->stream_nr = to->stream_nr;
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
stream->sreq_max_len = len;
stream->sreq_max_segs = INT_MAX;
switch (stream->source) {
case NETFS_UPLOAD_TO_SERVER:
netfs_stat(&netfs_n_wh_upload);
stream->sreq_max_len = umin(len, wreq->wsize);
break;
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write);
break;
default:
WARN_ON_ONCE(1);
}
stream->prepare_write(subreq);
part = umin(len, stream->sreq_max_len);
subreq->len = subreq->transferred + part;
len -= part;
start += part;
if (!len && boundary) {
__set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
boundary = false;
}
netfs_reissue_write(stream, subreq, &source);
if (!len)
break;
} while (len);
} while (!list_is_head(next, &stream->subrequests));
}
/*
* Perform retries on the streams that need it. If we're doing content
* encryption and the server copy changed due to a third-party write, we may
* need to do an RMW cycle and also rewrite the data to the cache.
*/
static void netfs_retry_writes(struct netfs_io_request *wreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream;
int s;
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (!stream->active)
continue;
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
}
}
// TODO: Enc: Fetch changed partial pages
// TODO: Enc: Reencrypt content if needed.
// TODO: Enc: Wind back transferred point.
// TODO: Enc: Mark cache pages for retry.
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->need_retry) {
stream->need_retry = false;
netfs_retry_write_stream(wreq, stream);
}
}
} }
/* /*
@ -391,7 +213,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
if (wreq->origin == NETFS_WRITEBACK || if (wreq->origin == NETFS_WRITEBACK ||
wreq->origin == NETFS_WRITETHROUGH || wreq->origin == NETFS_WRITETHROUGH ||
wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE) wreq->origin == NETFS_PGPRIV2_COPY_TO_CACHE)
notes = BUFFERED; notes = NEED_UNLOCK;
else else
notes = 0; notes = 0;
@ -450,14 +272,14 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
cancel: cancel:
/* Remove if completely consumed. */ /* Remove if completely consumed. */
spin_lock_bh(&wreq->lock); spin_lock(&wreq->lock);
remove = front; remove = front;
list_del_init(&front->rreq_link); list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests, front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link); struct netfs_io_subrequest, rreq_link);
stream->front = front; stream->front = front;
spin_unlock_bh(&wreq->lock); spin_unlock(&wreq->lock);
netfs_put_subrequest(remove, false, netfs_put_subrequest(remove, false,
notes & SAW_FAILURE ? notes & SAW_FAILURE ?
netfs_sreq_trace_put_cancel : netfs_sreq_trace_put_cancel :
@ -488,7 +310,7 @@ static void netfs_collect_write_results(struct netfs_io_request *wreq)
trace_netfs_collect_state(wreq, wreq->collected_to, notes); trace_netfs_collect_state(wreq, wreq->collected_to, notes);
/* Unlock any folios that we have now finished with. */ /* Unlock any folios that we have now finished with. */
if (notes & BUFFERED) { if (notes & NEED_UNLOCK) {
if (wreq->cleaned_to < wreq->collected_to) if (wreq->cleaned_to < wreq->collected_to)
netfs_writeback_unlock_folios(wreq, &notes); netfs_writeback_unlock_folios(wreq, &notes);
} else { } else {


@ -94,9 +94,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
{ {
struct netfs_io_request *wreq; struct netfs_io_request *wreq;
struct netfs_inode *ictx; struct netfs_inode *ictx;
bool is_buffered = (origin == NETFS_WRITEBACK || bool is_cacheable = (origin == NETFS_WRITEBACK ||
origin == NETFS_WRITETHROUGH || origin == NETFS_WRITEBACK_SINGLE ||
origin == NETFS_PGPRIV2_COPY_TO_CACHE); origin == NETFS_WRITETHROUGH ||
origin == NETFS_PGPRIV2_COPY_TO_CACHE);
wreq = netfs_alloc_request(mapping, file, start, 0, origin); wreq = netfs_alloc_request(mapping, file, start, 0, origin);
if (IS_ERR(wreq)) if (IS_ERR(wreq))
@ -105,8 +106,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
_enter("R=%x", wreq->debug_id); _enter("R=%x", wreq->debug_id);
ictx = netfs_inode(wreq->inode); ictx = netfs_inode(wreq->inode);
if (is_buffered && netfs_is_cache_enabled(ictx)) if (is_cacheable && netfs_is_cache_enabled(ictx))
fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx)); fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
if (rolling_buffer_init(&wreq->buffer, wreq->debug_id, ITER_SOURCE) < 0)
goto nomem;
wreq->cleaned_to = wreq->start; wreq->cleaned_to = wreq->start;
@ -129,6 +132,10 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
} }
return wreq; return wreq;
nomem:
wreq->error = -ENOMEM;
netfs_put_request(wreq, false, netfs_rreq_trace_put_failed);
return ERR_PTR(-ENOMEM);
} }
/** /**
@ -153,16 +160,15 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
loff_t start) loff_t start)
{ {
struct netfs_io_subrequest *subreq; struct netfs_io_subrequest *subreq;
struct iov_iter *wreq_iter = &wreq->io_iter; struct iov_iter *wreq_iter = &wreq->buffer.iter;
/* Make sure we don't point the iterator at a used-up folio_queue /* Make sure we don't point the iterator at a used-up folio_queue
* struct being used as a placeholder to prevent the queue from * struct being used as a placeholder to prevent the queue from
* collapsing. In such a case, extend the queue. * collapsing. In such a case, extend the queue.
*/ */
if (iov_iter_is_folioq(wreq_iter) && if (iov_iter_is_folioq(wreq_iter) &&
wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq)) { wreq_iter->folioq_slot >= folioq_nr_slots(wreq_iter->folioq))
netfs_buffer_make_space(wreq); rolling_buffer_make_space(&wreq->buffer);
}
subreq = netfs_alloc_subrequest(wreq); subreq = netfs_alloc_subrequest(wreq);
subreq->source = stream->source; subreq->source = stream->source;
@ -198,7 +204,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
* the list. The collector only goes nextwards and uses the lock to * the list. The collector only goes nextwards and uses the lock to
* remove entries off of the front. * remove entries off of the front.
*/ */
spin_lock_bh(&wreq->lock); spin_lock(&wreq->lock);
list_add_tail(&subreq->rreq_link, &stream->subrequests); list_add_tail(&subreq->rreq_link, &stream->subrequests);
if (list_is_first(&subreq->rreq_link, &stream->subrequests)) { if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
stream->front = subreq; stream->front = subreq;
@ -209,7 +215,7 @@ static void netfs_prepare_write(struct netfs_io_request *wreq,
} }
} }
spin_unlock_bh(&wreq->lock); spin_unlock(&wreq->lock);
stream->construct = subreq; stream->construct = subreq;
} }
@ -266,9 +272,9 @@ void netfs_issue_write(struct netfs_io_request *wreq,
* we can avoid overrunning the credits obtained (cifs) and try to parallelise * we can avoid overrunning the credits obtained (cifs) and try to parallelise
* content-crypto preparation with network writes. * content-crypto preparation with network writes.
*/ */
int netfs_advance_write(struct netfs_io_request *wreq, size_t netfs_advance_write(struct netfs_io_request *wreq,
struct netfs_io_stream *stream, struct netfs_io_stream *stream,
loff_t start, size_t len, bool to_eof) loff_t start, size_t len, bool to_eof)
{ {
struct netfs_io_subrequest *subreq = stream->construct; struct netfs_io_subrequest *subreq = stream->construct;
size_t part; size_t part;
@ -325,6 +331,9 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
_enter(""); _enter("");
if (rolling_buffer_make_space(&wreq->buffer) < 0)
return -ENOMEM;
/* netfs_perform_write() may shift i_size around the page or from out /* netfs_perform_write() may shift i_size around the page or from out
* of the page to beyond it, but cannot move i_size into or through the * of the page to beyond it, but cannot move i_size into or through the
* page since we have it locked. * page since we have it locked.
@ -429,7 +438,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
} }
/* Attach the folio to the rolling buffer. */ /* Attach the folio to the rolling buffer. */
netfs_buffer_append_folio(wreq, folio, false); rolling_buffer_append(&wreq->buffer, folio, 0);
/* Move the submission point forward to allow for write-streaming data /* Move the submission point forward to allow for write-streaming data
* not starting at the front of the page. We don't do write-streaming * not starting at the front of the page. We don't do write-streaming
@ -442,7 +451,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
stream = &wreq->io_streams[s]; stream = &wreq->io_streams[s];
stream->submit_off = foff; stream->submit_off = foff;
stream->submit_len = flen; stream->submit_len = flen;
if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) || if (!stream->avail ||
(stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
(stream->source == NETFS_UPLOAD_TO_SERVER && (stream->source == NETFS_UPLOAD_TO_SERVER &&
fgroup == NETFS_FOLIO_COPY_TO_CACHE)) { fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
stream->submit_off = UINT_MAX; stream->submit_off = UINT_MAX;
@ -476,7 +486,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
/* Advance the iterator(s). */ /* Advance the iterator(s). */
if (stream->submit_off > iter_off) { if (stream->submit_off > iter_off) {
iov_iter_advance(&wreq->io_iter, stream->submit_off - iter_off); rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off);
iter_off = stream->submit_off; iter_off = stream->submit_off;
} }
@ -494,7 +504,7 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
} }
if (fsize > iter_off) if (fsize > iter_off)
iov_iter_advance(&wreq->io_iter, fsize - iter_off); rolling_buffer_advance(&wreq->buffer, fsize - iter_off);
atomic64_set(&wreq->issued_to, fpos + fsize); atomic64_set(&wreq->issued_to, fpos + fsize);
if (!debug) if (!debug)
@ -633,7 +643,7 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
struct folio **writethrough_cache) struct folio **writethrough_cache)
{ {
_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u", _enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end); wreq->debug_id, wreq->buffer.iter.count, wreq->wsize, copied, to_page_end);
if (!*writethrough_cache) { if (!*writethrough_cache) {
if (folio_test_dirty(folio)) if (folio_test_dirty(folio))
@ -708,7 +718,7 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
part = netfs_advance_write(wreq, upload, start, len, false); part = netfs_advance_write(wreq, upload, start, len, false);
start += part; start += part;
len -= part; len -= part;
iov_iter_advance(&wreq->io_iter, part); rolling_buffer_advance(&wreq->buffer, part);
if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE); wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
@ -721,3 +731,194 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
_leave(" = %d", error); _leave(" = %d", error);
return error; return error;
} }
/*
* Write some of a pending folio data back to the server and/or the cache.
*/
static int netfs_write_folio_single(struct netfs_io_request *wreq,
struct folio *folio)
{
struct netfs_io_stream *upload = &wreq->io_streams[0];
struct netfs_io_stream *cache = &wreq->io_streams[1];
struct netfs_io_stream *stream;
size_t iter_off = 0;
size_t fsize = folio_size(folio), flen;
loff_t fpos = folio_pos(folio);
bool to_eof = false;
bool no_debug = false;
_enter("");
flen = folio_size(folio);
if (flen > wreq->i_size - fpos) {
flen = wreq->i_size - fpos;
folio_zero_segment(folio, flen, fsize);
to_eof = true;
} else if (flen == wreq->i_size - fpos) {
to_eof = true;
}
_debug("folio %zx/%zx", flen, fsize);
if (!upload->avail && !cache->avail) {
trace_netfs_folio(folio, netfs_folio_trace_cancel_store);
return 0;
}
if (!upload->construct)
trace_netfs_folio(folio, netfs_folio_trace_store);
else
trace_netfs_folio(folio, netfs_folio_trace_store_plus);
/* Attach the folio to the rolling buffer. */
folio_get(folio);
rolling_buffer_append(&wreq->buffer, folio, NETFS_ROLLBUF_PUT_MARK);
/* Move the submission point forward to allow for write-streaming data
* not starting at the front of the page. We don't do write-streaming
* with the cache as the cache requires DIO alignment.
*
* Also skip uploading for data that's been read and just needs copying
* to the cache.
*/
for (int s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
stream->submit_off = 0;
stream->submit_len = flen;
if (!stream->avail) {
stream->submit_off = UINT_MAX;
stream->submit_len = 0;
}
}
/* Attach the folio to one or more subrequests. For a big folio, we
* could end up with thousands of subrequests if the wsize is small -
* but we might need to wait during the creation of subrequests for
* network resources (e.g. SMB credits).
*/
for (;;) {
ssize_t part;
size_t lowest_off = ULONG_MAX;
int choose_s = -1;
/* Always add to the lowest-submitted stream first. */
for (int s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->submit_len > 0 &&
stream->submit_off < lowest_off) {
lowest_off = stream->submit_off;
choose_s = s;
}
}
if (choose_s < 0)
break;
stream = &wreq->io_streams[choose_s];
/* Advance the iterator(s). */
if (stream->submit_off > iter_off) {
rolling_buffer_advance(&wreq->buffer, stream->submit_off - iter_off);
iter_off = stream->submit_off;
}
atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
stream->submit_extendable_to = fsize - stream->submit_off;
part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
stream->submit_len, to_eof);
stream->submit_off += part;
if (part > stream->submit_len)
stream->submit_len = 0;
else
stream->submit_len -= part;
if (part > 0)
no_debug = true;
}
wreq->buffer.iter.iov_offset = 0;
if (fsize > iter_off)
rolling_buffer_advance(&wreq->buffer, fsize - iter_off);
atomic64_set(&wreq->issued_to, fpos + fsize);
if (!no_debug)
kdebug("R=%x: No submit", wreq->debug_id);
_leave(" = 0");
return 0;
}
/**
* netfs_writeback_single - Write back a monolithic payload
* @mapping: The mapping to write from
* @wbc: Hints from the VM
* @iter: Data to write, must be ITER_FOLIOQ.
*
* Write a monolithic, non-pagecache object back to the server and/or
* the cache.
*/
int netfs_writeback_single(struct address_space *mapping,
struct writeback_control *wbc,
struct iov_iter *iter)
{
struct netfs_io_request *wreq;
struct netfs_inode *ictx = netfs_inode(mapping->host);
struct folio_queue *fq;
size_t size = iov_iter_count(iter);
int ret = 0;
if (WARN_ON_ONCE(!iov_iter_is_folioq(iter)))
return -EIO;
if (!mutex_trylock(&ictx->wb_lock)) {
if (wbc->sync_mode == WB_SYNC_NONE) {
netfs_stat(&netfs_n_wb_lock_skip);
return 0;
}
netfs_stat(&netfs_n_wb_lock_wait);
mutex_lock(&ictx->wb_lock);
}
wreq = netfs_create_write_req(mapping, NULL, 0, NETFS_WRITEBACK_SINGLE);
if (IS_ERR(wreq)) {
ret = PTR_ERR(wreq);
goto couldnt_start;
}
trace_netfs_write(wreq, netfs_write_trace_writeback);
netfs_stat(&netfs_n_wh_writepages);
if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
wreq->netfs_ops->begin_writeback(wreq);
for (fq = (struct folio_queue *)iter->folioq; fq; fq = fq->next) {
for (int slot = 0; slot < folioq_count(fq); slot++) {
struct folio *folio = folioq_folio(fq, slot);
size_t part = umin(folioq_folio_size(fq, slot), size);
_debug("wbiter %lx %llx", folio->index, atomic64_read(&wreq->issued_to));
ret = netfs_write_folio_single(wreq, folio);
if (ret < 0)
goto stop;
size -= part;
if (size <= 0)
goto stop;
}
}
stop:
for (int s = 0; s < NR_IO_STREAMS; s++)
netfs_issue_write(wreq, &wreq->io_streams[s]);
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
mutex_unlock(&ictx->wb_lock);
netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
_leave(" = %d", ret);
return ret;
couldnt_start:
mutex_unlock(&ictx->wb_lock);
_leave(" = %d", ret);
return ret;
}
EXPORT_SYMBOL(netfs_writeback_single);
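A filesystem's ->writepages() for such an object might look roughly like the fragment below. The example_inode structure, the EXAMPLE_I() accessor and its content_fq/content_len fields are hypothetical; the iterator construction follows the ITER_FOLIOQ requirement stated in the kernel-doc above.

static int example_single_writepages(struct address_space *mapping,
                                     struct writeback_control *wbc)
{
        struct example_inode *xi = EXAMPLE_I(mapping->host);    /* hypothetical */
        struct iov_iter iter;

        /* Describe the monolithic object with an ITER_FOLIOQ iterator, as
         * netfs_writeback_single() requires.
         */
        iov_iter_folio_queue(&iter, ITER_SOURCE, xi->content_fq, 0, 0,
                             xi->content_len);

        return netfs_writeback_single(mapping, wbc, &iter);
}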

fs/netfs/write_retry.c (new file, 233 lines)

@ -0,0 +1,233 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Network filesystem write retrying.
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include "internal.h"
/*
* Perform retries on the streams that need it.
*/
static void netfs_retry_write_stream(struct netfs_io_request *wreq,
struct netfs_io_stream *stream)
{
struct list_head *next;
_enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
if (list_empty(&stream->subrequests))
return;
if (stream->source == NETFS_UPLOAD_TO_SERVER &&
wreq->netfs_ops->retry_request)
wreq->netfs_ops->retry_request(wreq, stream);
if (unlikely(stream->failed))
return;
/* If there's no renegotiation to do, just resend each failed subreq. */
if (!stream->prepare_write) {
struct netfs_io_subrequest *subreq;
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
struct iov_iter source = subreq->io_iter;
iov_iter_revert(&source, subreq->len - source.count);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq, &source);
}
}
return;
}
next = stream->subrequests.next;
do {
struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
struct iov_iter source;
unsigned long long start, len;
size_t part;
bool boundary = false;
/* Go through the stream and find the next span of contiguous
* data that we then rejig (cifs, for example, needs the wsize
* renegotiating) and reissue.
*/
from = list_entry(next, struct netfs_io_subrequest, rreq_link);
to = from;
start = from->start + from->transferred;
len = from->len - from->transferred;
if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
!test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
return;
list_for_each_continue(next, &stream->subrequests) {
subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
if (subreq->start + subreq->transferred != start + len ||
test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
!test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
break;
to = subreq;
len += to->len;
}
/* Determine the set of buffers we're going to use. Each
* subreq gets a subset of a single overall contiguous buffer.
*/
netfs_reset_iter(from);
source = from->io_iter;
source.count = len;
/* Work through the sublist. */
subreq = from;
list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
if (!len)
break;
subreq->start = start;
subreq->len = len;
__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
/* Renegotiate max_len (wsize) */
stream->sreq_max_len = len;
stream->prepare_write(subreq);
part = umin(len, stream->sreq_max_len);
if (unlikely(stream->sreq_max_segs))
part = netfs_limit_iter(&source, 0, part, stream->sreq_max_segs);
subreq->len = part;
subreq->transferred = 0;
len -= part;
start += part;
if (len && subreq == to &&
__test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))
boundary = true;
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq, &source);
if (subreq == to)
break;
}
/* If we managed to use fewer subreqs, we can discard the
* excess; if we used the same number, then we're done.
*/
if (!len) {
if (subreq == to)
continue;
list_for_each_entry_safe_from(subreq, tmp,
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
if (subreq == to)
break;
}
continue;
}
/* We ran out of subrequests, so we need to allocate some more
* and insert them after.
*/
do {
subreq = netfs_alloc_subrequest(wreq);
subreq->source = to->source;
subreq->start = start;
subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
subreq->stream_nr = to->stream_nr;
__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
stream->sreq_max_len = len;
stream->sreq_max_segs = INT_MAX;
switch (stream->source) {
case NETFS_UPLOAD_TO_SERVER:
netfs_stat(&netfs_n_wh_upload);
stream->sreq_max_len = umin(len, wreq->wsize);
break;
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write);
break;
default:
WARN_ON_ONCE(1);
}
stream->prepare_write(subreq);
part = umin(len, stream->sreq_max_len);
subreq->len = subreq->transferred + part;
len -= part;
start += part;
if (!len && boundary) {
__set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
boundary = false;
}
netfs_reissue_write(stream, subreq, &source);
if (!len)
break;
} while (len);
} while (!list_is_head(next, &stream->subrequests));
}
/*
* Perform retries on the streams that need it. If we're doing content
* encryption and the server copy changed due to a third-party write, we may
* need to do an RMW cycle and also rewrite the data to the cache.
*/
void netfs_retry_writes(struct netfs_io_request *wreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream;
int s;
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (!stream->active)
continue;
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
}
}
// TODO: Enc: Fetch changed partial pages
// TODO: Enc: Reencrypt content if needed.
// TODO: Enc: Wind back transferred point.
// TODO: Enc: Mark cache pages for retry.
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
if (stream->need_retry) {
stream->need_retry = false;
netfs_retry_write_stream(wreq, stream);
}
}
}


@ -307,8 +307,10 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
&nfs_async_read_completion_ops); &nfs_async_read_completion_ops);
netfs = nfs_netfs_alloc(sreq); netfs = nfs_netfs_alloc(sreq);
if (!netfs) if (!netfs) {
return netfs_read_subreq_terminated(sreq, -ENOMEM, false); sreq->error = -ENOMEM;
return netfs_read_subreq_terminated(sreq);
}
pgio.pg_netfs = netfs; /* used in completion */ pgio.pg_netfs = netfs; /* used in completion */


@ -74,7 +74,8 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
*/ */
netfs->sreq->transferred = min_t(s64, netfs->sreq->len, netfs->sreq->transferred = min_t(s64, netfs->sreq->len,
atomic64_read(&netfs->transferred)); atomic64_read(&netfs->transferred));
netfs_read_subreq_terminated(netfs->sreq, netfs->error, false); netfs->sreq->error = netfs->error;
netfs_read_subreq_terminated(netfs->sreq);
kfree(netfs); kfree(netfs);
} }
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi) static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
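The two NFS hunks above switch to the newer termination idiom, which can be summarised by this hedged fragment (the function name is made up; only the field assignments and the single-argument netfs_read_subreq_terminated() call reflect the change):

static void example_complete_read(struct netfs_io_subrequest *subreq,
                                  size_t transferred, int error)
{
        /* The byte count and error are now stashed in the subrequest
         * itself before termination is signalled; the old error and
         * was_async arguments are gone.
         */
        subreq->transferred = transferred;
        subreq->error = error;
        netfs_read_subreq_terminated(subreq);
}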


@ -27,7 +27,7 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp)
int flags = nfsexp_flags(cred, exp); int flags = nfsexp_flags(cred, exp);
/* discard any old override before preparing the new set */ /* discard any old override before preparing the new set */
revert_creds(get_cred(current_real_cred())); put_cred(revert_creds(get_cred(current_real_cred())));
new = prepare_creds(); new = prepare_creds();
if (!new) if (!new)
return -ENOMEM; return -ENOMEM;
@ -80,7 +80,6 @@ int nfsd_setuser(struct svc_cred *cred, struct svc_export *exp)
new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted); new->cap_permitted);
put_cred(override_creds(new)); put_cred(override_creds(new));
put_cred(new);
return 0; return 0;
oom: oom:


@ -1248,7 +1248,7 @@ nfsd_file_acquire_local(struct net *net, struct svc_cred *cred,
beres = nfsd_file_do_acquire(NULL, net, cred, client, beres = nfsd_file_do_acquire(NULL, net, cred, client,
fhp, may_flags, NULL, pnf, true); fhp, may_flags, NULL, pnf, true);
revert_creds(save_cred); put_cred(revert_creds(save_cred));
return beres; return beres;
} }


@ -82,14 +82,13 @@ nfs4_save_creds(const struct cred **original_creds)
new->fsuid = GLOBAL_ROOT_UID; new->fsuid = GLOBAL_ROOT_UID;
new->fsgid = GLOBAL_ROOT_GID; new->fsgid = GLOBAL_ROOT_GID;
*original_creds = override_creds(new); *original_creds = override_creds(new);
put_cred(new);
return 0; return 0;
} }
static void static void
nfs4_reset_creds(const struct cred *original) nfs4_reset_creds(const struct cred *original)
{ {
revert_creds(original); put_cred(revert_creds(original));
} }
static void static void


@ -222,7 +222,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
cap_raise_nfsd_set(new->cap_effective, cap_raise_nfsd_set(new->cap_effective,
new->cap_permitted); new->cap_permitted);
put_cred(override_creds(new)); put_cred(override_creds(new));
put_cred(new);
} else { } else {
error = nfsd_setuser_and_check_port(rqstp, cred, exp); error = nfsd_setuser_and_check_port(rqstp, cred, exp);
if (error) if (error)


@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
if (usize < MNT_NS_INFO_SIZE_VER0) if (usize < MNT_NS_INFO_SIZE_VER0)
return -EINVAL; return -EINVAL;
if (previous) mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
else
mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
if (IS_ERR(mnt_ns)) if (IS_ERR(mnt_ns))
return PTR_ERR(mnt_ns); return PTR_ERR(mnt_ns);


@ -413,7 +413,6 @@ static bool access_need_override_creds(int flags)
static const struct cred *access_override_creds(void) static const struct cred *access_override_creds(void)
{ {
const struct cred *old_cred;
struct cred *override_cred; struct cred *override_cred;
override_cred = prepare_creds(); override_cred = prepare_creds();
@ -458,13 +457,7 @@ static const struct cred *access_override_creds(void)
* freeing. * freeing.
*/ */
override_cred->non_rcu = 1; override_cred->non_rcu = 1;
return override_creds(override_cred);
old_cred = override_creds(override_cred);
/* override_cred() gets its own ref */
put_cred(override_cred);
return old_cred;
} }
static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
@ -534,7 +527,7 @@ static long do_faccessat(int dfd, const char __user *filename, int mode, int fla
} }
out: out:
if (old_cred) if (old_cred)
revert_creds(old_cred); put_cred(revert_creds(old_cred));
return res; return res;
} }
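The credential hunks in this and the surrounding files all follow the same updated idiom, sketched here on the assumption (taken from these hunks) that override_creds() no longer takes its own reference and that revert_creds() returns the displaced override cred for the caller to drop. The privileged helper is hypothetical.

static int example_do_as_root(void)
{
        const struct cred *old_cred;
        struct cred *new_cred;
        int ret;

        new_cred = prepare_creds();
        if (!new_cred)
                return -ENOMEM;
        new_cred->fsuid = GLOBAL_ROOT_UID;      /* illustrative tweak */

        /* override_creds() no longer takes its own reference; the single
         * reference from prepare_creds() keeps @new_cred alive.
         */
        old_cred = override_creds(new_cred);

        ret = example_privileged_op();          /* hypothetical */

        /* revert_creds() hands back the override cred so that its last
         * reference can be dropped here.
         */
        put_cred(revert_creds(old_cred));
        return ret;
}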


@ -575,12 +575,12 @@ static const struct cred *ovl_setup_cred_for_create(struct dentry *dentry,
} }
/* /*
* Caller is going to match this with revert_creds_light() and drop * Caller is going to match this with revert_creds() and drop
* reference on the returned creds. * reference on the returned creds.
* We must be called with creator creds already, otherwise we risk * We must be called with creator creds already, otherwise we risk
* leaking creds. * leaking creds.
*/ */
old_cred = override_creds_light(override_cred); old_cred = override_creds(override_cred);
WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb)); WARN_ON_ONCE(old_cred != ovl_creds(dentry->d_sb));
return override_cred; return override_cred;


@ -65,12 +65,12 @@ const struct cred *ovl_override_creds(struct super_block *sb)
{ {
struct ovl_fs *ofs = OVL_FS(sb); struct ovl_fs *ofs = OVL_FS(sb);
return override_creds_light(ofs->creator_cred); return override_creds(ofs->creator_cred);
} }
void ovl_revert_creds(const struct cred *old_cred) void ovl_revert_creds(const struct cred *old_cred)
{ {
revert_creds_light(old_cred); revert_creds(old_cred);
} }
/* /*


@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0 // SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h> #include <linux/anon_inodes.h>
#include <linux/exportfs.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
@ -23,6 +24,97 @@
#include "internal.h" #include "internal.h"
#include "mount.h" #include "mount.h"
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
static inline unsigned long pidfs_ino(u64 ino)
{
return lower_32_bits(ino);
}
/* On 32 bit the generation number are the upper 32 bits. */
static inline u32 pidfs_gen(u64 ino)
{
return upper_32_bits(ino);
}
#else
/* On 64 bit simply return ino. */
static inline unsigned long pidfs_ino(u64 ino)
{
return ino;
}
/* On 64 bit the generation number is 0. */
static inline u32 pidfs_gen(u64 ino)
{
return 0;
}
#endif
static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
{
struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
u64 pid_ino_a = pid_a->ino;
u64 pid_ino_b = pid_b->ino;
if (pid_ino_a < pid_ino_b)
return -1;
if (pid_ino_a > pid_ino_b)
return 1;
return 0;
}
void pidfs_add_pid(struct pid *pid)
{
static u64 pidfs_ino_nr = 2;
/*
* On 64 bit nothing special happens. The 64bit number assigned
* to struct pid is the inode number.
*
* On 32 bit the 64 bit number assigned to struct pid is split
* into two 32 bit numbers. The lower 32 bits are used as the
* inode number and the upper 32 bits are used as the inode
* generation number.
*
* On 32 bit pidfs_ino() will return the lower 32 bits. When
* pidfs_ino() returns zero a wrap around happened. When a
* wraparound happens the 64 bit number will be incremented by 2
* so inode numbering starts at 2 again.
*
* On 64 bit comparing two pidfds is as simple as comparing
* inode numbers.
*
* When a wraparound happens on 32 bit multiple pidfds with the
* same inode number are likely to exist (This isn't a problem
* since before pidfs pidfds used the anonymous inode meaning
* all pidfds had the same inode number.). Userspace can
* reconstruct the 64 bit identifier by retrieving both the
* inode number and the inode generation number to compare or
* use file handles.
*/
if (pidfs_ino(pidfs_ino_nr) == 0)
pidfs_ino_nr += 2;
pid->ino = pidfs_ino_nr;
pid->stashed = NULL;
pidfs_ino_nr++;
write_seqcount_begin(&pidmap_lock_seq);
rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
write_seqcount_end(&pidmap_lock_seq);
}
void pidfs_remove_pid(struct pid *pid)
{
write_seqcount_begin(&pidmap_lock_seq);
rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
write_seqcount_end(&pidmap_lock_seq);
}
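
For illustration only: the numbering comment above says userspace can rebuild the full 64-bit identifier from the inode number plus the inode generation. A minimal userspace sketch of that reconstruction, assuming a pidfd obtained elsewhere (e.g. via pidfd_open()); pidfd_unique_id() is a hypothetical helper, and the FS_IOC_GETVERSION handling it relies on is the one added to pidfd_ioctl() later in this patch.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/stat.h>
    #include <linux/fs.h>		/* FS_IOC_GETVERSION */

    /* Hypothetical helper: rebuild the 64-bit pidfs identifier from a pidfd. */
    static int pidfd_unique_id(int pidfd, uint64_t *id)
    {
    	struct stat st;
    	uint32_t gen = 0;	/* the pidfs handler writes a __u32 */

    	if (fstat(pidfd, &st) < 0)
    		return -1;
    	/* On 64-bit kernels the generation is always 0, so this adds nothing. */
    	if (ioctl(pidfd, FS_IOC_GETVERSION, &gen) < 0)
    		return -1;
    	*id = ((uint64_t)gen << 32) | (uint64_t)st.st_ino;
    	return 0;
    }
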
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
/** /**
* pidfd_show_fdinfo - print information about a pidfd * pidfd_show_fdinfo - print information about a pidfd
@ -190,6 +282,27 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
return 0; return 0;
} }
static bool pidfs_ioctl_valid(unsigned int cmd)
{
switch (cmd) {
case FS_IOC_GETVERSION:
case PIDFD_GET_CGROUP_NAMESPACE:
case PIDFD_GET_INFO:
case PIDFD_GET_IPC_NAMESPACE:
case PIDFD_GET_MNT_NAMESPACE:
case PIDFD_GET_NET_NAMESPACE:
case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
case PIDFD_GET_TIME_NAMESPACE:
case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
case PIDFD_GET_UTS_NAMESPACE:
case PIDFD_GET_USER_NAMESPACE:
case PIDFD_GET_PID_NAMESPACE:
return true;
}
return false;
}
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{ {
struct task_struct *task __free(put_task) = NULL; struct task_struct *task __free(put_task) = NULL;
@ -198,6 +311,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct ns_common *ns_common = NULL; struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns; struct pid_namespace *pid_ns;
if (!pidfs_ioctl_valid(cmd))
return -ENOIOCTLCMD;
if (cmd == FS_IOC_GETVERSION) {
if (!arg)
return -EINVAL;
__u32 __user *argp = (__u32 __user *)arg;
return put_user(file_inode(file)->i_generation, argp);
}
task = get_pid_task(pid, PIDTYPE_PID); task = get_pid_task(pid, PIDTYPE_PID);
if (!task) if (!task)
return -ESRCH; return -ESRCH;
@ -318,40 +442,6 @@ struct pid *pidfd_pid(const struct file *file)
static struct vfsmount *pidfs_mnt __ro_after_init; static struct vfsmount *pidfs_mnt __ro_after_init;
#if BITS_PER_LONG == 32
/*
* Provide a fallback mechanism for 32-bit systems so processes remain
* reliably comparable by inode number even on those systems.
*/
static DEFINE_IDA(pidfd_inum_ida);
static int pidfs_inum(struct pid *pid, unsigned long *ino)
{
int ret;
ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
UINT_MAX, GFP_ATOMIC);
if (ret < 0)
return -ENOSPC;
*ino = ret;
return 0;
}
static inline void pidfs_free_inum(unsigned long ino)
{
if (ino > 0)
ida_free(&pidfd_inum_ida, ino);
}
#else
static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
{
*ino = pid->ino;
return 0;
}
#define pidfs_free_inum(ino) ((void)(ino))
#endif
/* /*
* The vfs falls back to simple_setattr() if i_op->setattr() isn't * The vfs falls back to simple_setattr() if i_op->setattr() isn't
* implemented. Let's reject it completely until we have a clean * implemented. Let's reject it completely until we have a clean
@ -403,7 +493,6 @@ static void pidfs_evict_inode(struct inode *inode)
clear_inode(inode); clear_inode(inode);
put_pid(pid); put_pid(pid);
pidfs_free_inum(inode->i_ino);
} }
static const struct super_operations pidfs_sops = { static const struct super_operations pidfs_sops = {
@ -427,19 +516,143 @@ static const struct dentry_operations pidfs_dentry_operations = {
.d_prune = stashed_dentry_prune, .d_prune = stashed_dentry_prune,
}; };
static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
struct inode *parent)
{
const struct pid *pid = inode->i_private;
if (*max_len < 2) {
*max_len = 2;
return FILEID_INVALID;
}
*max_len = 2;
*(u64 *)fh = pid->ino;
return FILEID_KERNFS;
}
static int pidfs_ino_find(const void *key, const struct rb_node *node)
{
const u64 pid_ino = *(u64 *)key;
const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
if (pid_ino < pid->ino)
return -1;
if (pid_ino > pid->ino)
return 1;
return 0;
}
/* Find a struct pid based on the inode number. */
static struct pid *pidfs_ino_get_pid(u64 ino)
{
struct pid *pid;
struct rb_node *node;
unsigned int seq;
guard(rcu)();
do {
seq = read_seqcount_begin(&pidmap_lock_seq);
node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
if (node)
break;
} while (read_seqcount_retry(&pidmap_lock_seq, seq));
if (!node)
return NULL;
pid = rb_entry(node, struct pid, pidfs_node);
/* Within our pid namespace hierarchy? */
if (pid_vnr(pid) == 0)
return NULL;
return get_pid(pid);
}
static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len,
int fh_type)
{
int ret;
u64 pid_ino;
struct path path;
struct pid *pid;
if (fh_len < 2)
return NULL;
switch (fh_type) {
case FILEID_KERNFS:
pid_ino = *(u64 *)fid;
break;
default:
return NULL;
}
pid = pidfs_ino_get_pid(pid_ino);
if (!pid)
return NULL;
ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path);
if (ret < 0)
return ERR_PTR(ret);
mntput(path.mnt);
return path.dentry;
}
/*
* Make sure that we reject any nonsensical flags that users pass via
* open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
* PIDFD_NONBLOCK as O_NONBLOCK.
*/
#define VALID_FILE_HANDLE_OPEN_FLAGS \
(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
static int pidfs_export_permission(struct handle_to_path_ctx *ctx,
unsigned int oflags)
{
if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE))
return -EINVAL;
/*
* pidfs_ino_get_pid() will verify that the struct pid is part
* of the caller's pid namespace hierarchy. No further
* permission checks are needed.
*/
return 0;
}
static struct file *pidfs_export_open(struct path *path, unsigned int oflags)
{
/*
* Clear O_LARGEFILE as open_by_handle_at() forces it and raise
* O_RDWR as pidfds always are.
*/
oflags &= ~O_LARGEFILE;
return dentry_open(path, oflags | O_RDWR, current_cred());
}
static const struct export_operations pidfs_export_operations = {
.encode_fh = pidfs_encode_fh,
.fh_to_dentry = pidfs_fh_to_dentry,
.open = pidfs_export_open,
.permission = pidfs_export_permission,
};
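
For illustration, a rough userspace sketch of how these export hooks are meant to be exercised. It assumes name_to_handle_at() with AT_EMPTY_PATH works on a pidfd and that any open pidfd can serve as the mount reference passed to open_by_handle_at(); MY_HANDLE_SZ and both helpers are hypothetical names, not part of this patch.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>

    #define MY_HANDLE_SZ	128	/* assumption: ample for pidfs' 8-byte handles */

    /* Encode a pidfd into a file handle that can outlive the pidfd itself. */
    static struct file_handle *pidfd_to_handle(int pidfd)
    {
    	struct file_handle *fh;
    	int mount_id;

    	fh = malloc(sizeof(*fh) + MY_HANDLE_SZ);
    	if (!fh)
    		return NULL;
    	fh->handle_bytes = MY_HANDLE_SZ;
    	if (name_to_handle_at(pidfd, "", fh, &mount_id, AT_EMPTY_PATH) < 0) {
    		free(fh);
    		return NULL;
    	}
    	return fh;
    }

    /* Reopen the handle; pidfs_export_open() forces O_RDWR on the result. */
    static int handle_to_pidfd(int any_pidfd, struct file_handle *fh)
    {
    	return open_by_handle_at(any_pidfd, fh, O_RDONLY | O_CLOEXEC);
    }

If the target has been reaped and its struct pid removed from the rbtree above, pidfs_ino_get_pid() finds nothing and the reopen should fail (likely with ESTALE).
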
static int pidfs_init_inode(struct inode *inode, void *data) static int pidfs_init_inode(struct inode *inode, void *data)
{ {
const struct pid *pid = data;
inode->i_private = data; inode->i_private = data;
inode->i_flags |= S_PRIVATE; inode->i_flags |= S_PRIVATE;
inode->i_mode |= S_IRWXU; inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations; inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations; inode->i_fop = &pidfs_file_operations;
/* inode->i_ino = pidfs_ino(pid->ino);
* Inode numbering for pidfs start at RESERVED_PIDS + 1. This inode->i_generation = pidfs_gen(pid->ino);
* avoids collisions with the root inode which is 1 for pseudo return 0;
* filesystems.
*/
return pidfs_inum(data, &inode->i_ino);
} }
static void pidfs_put_data(void *data) static void pidfs_put_data(void *data)
@ -462,6 +675,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
return -ENOMEM; return -ENOMEM;
ctx->ops = &pidfs_sops; ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations; ctx->dops = &pidfs_dentry_operations;
fc->s_fs_info = (void *)&pidfs_stashed_ops; fc->s_fs_info = (void *)&pidfs_stashed_ops;
return 0; return 0;

View File

@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list)
continue; continue;
} else if (child->mnt.mnt_flags & MNT_UMOUNT) { } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
/* /*
* We have come accross an partially unmounted * We have come across a partially unmounted
* mount in list that has not been visited yet. * mount in a list that has not been visited
* Remember it has been visited and continue * yet. Remember it has been visited and
* about our merry way. * continue about our merry way.
*/ */
list_add_tail(&child->mnt_umounting, &visited); list_add_tail(&child->mnt_umounting, &visited);
continue; continue;

View File

@ -65,7 +65,11 @@ static inline void kc_unxlate_dev_mem_ptr(phys_addr_t phys, void *virt)
#endif #endif
static LIST_HEAD(kclist_head); static LIST_HEAD(kclist_head);
static DECLARE_RWSEM(kclist_lock); static int kcore_nphdr;
static size_t kcore_phdrs_len;
static size_t kcore_notes_len;
static size_t kcore_data_offset;
DEFINE_STATIC_PERCPU_RWSEM(kclist_lock);
static int kcore_need_update = 1; static int kcore_need_update = 1;
/* /*
@ -101,33 +105,32 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size,
list_add_tail(&new->list, &kclist_head); list_add_tail(&new->list, &kclist_head);
} }
static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, static void update_kcore_size(void)
size_t *data_offset)
{ {
size_t try, size; size_t try, size;
struct kcore_list *m; struct kcore_list *m;
*nphdr = 1; /* PT_NOTE */ kcore_nphdr = 1; /* PT_NOTE */
size = 0; size = 0;
list_for_each_entry(m, &kclist_head, list) { list_for_each_entry(m, &kclist_head, list) {
try = kc_vaddr_to_offset((size_t)m->addr + m->size); try = kc_vaddr_to_offset((size_t)m->addr + m->size);
if (try > size) if (try > size)
size = try; size = try;
*nphdr = *nphdr + 1; kcore_nphdr++;
} }
*phdrs_len = *nphdr * sizeof(struct elf_phdr); kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr);
*notes_len = (4 * sizeof(struct elf_note) + kcore_notes_len = (4 * sizeof(struct elf_note) +
3 * ALIGN(sizeof(CORE_STR), 4) + 3 * ALIGN(sizeof(CORE_STR), 4) +
VMCOREINFO_NOTE_NAME_BYTES + VMCOREINFO_NOTE_NAME_BYTES +
ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prstatus), 4) +
ALIGN(sizeof(struct elf_prpsinfo), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) +
ALIGN(arch_task_struct_size, 4) + ALIGN(arch_task_struct_size, 4) +
ALIGN(vmcoreinfo_size, 4)); ALIGN(vmcoreinfo_size, 4));
*data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + kcore_data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + kcore_phdrs_len +
*notes_len); kcore_notes_len);
return *data_offset + size; proc_root_kcore->size = kcore_data_offset + size;
} }
#ifdef CONFIG_HIGHMEM #ifdef CONFIG_HIGHMEM
@ -270,12 +273,10 @@ static int kcore_update_ram(void)
{ {
LIST_HEAD(list); LIST_HEAD(list);
LIST_HEAD(garbage); LIST_HEAD(garbage);
int nphdr;
size_t phdrs_len, notes_len, data_offset;
struct kcore_list *tmp, *pos; struct kcore_list *tmp, *pos;
int ret = 0; int ret = 0;
down_write(&kclist_lock); percpu_down_write(&kclist_lock);
if (!xchg(&kcore_need_update, 0)) if (!xchg(&kcore_need_update, 0))
goto out; goto out;
@ -293,11 +294,10 @@ static int kcore_update_ram(void)
} }
list_splice_tail(&list, &kclist_head); list_splice_tail(&list, &kclist_head);
proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, &notes_len, update_kcore_size();
&data_offset);
out: out:
up_write(&kclist_lock); percpu_up_write(&kclist_lock);
list_for_each_entry_safe(pos, tmp, &garbage, list) { list_for_each_entry_safe(pos, tmp, &garbage, list) {
list_del(&pos->list); list_del(&pos->list);
kfree(pos); kfree(pos);
@ -326,27 +326,24 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp; struct file *file = iocb->ki_filp;
char *buf = file->private_data; char *buf = file->private_data;
loff_t *fpos = &iocb->ki_pos; loff_t *fpos = &iocb->ki_pos;
size_t phdrs_offset, notes_offset, data_offset; size_t phdrs_offset, notes_offset;
size_t page_offline_frozen = 1; size_t page_offline_frozen = 1;
size_t phdrs_len, notes_len;
struct kcore_list *m; struct kcore_list *m;
size_t tsz; size_t tsz;
int nphdr;
unsigned long start; unsigned long start;
size_t buflen = iov_iter_count(iter); size_t buflen = iov_iter_count(iter);
size_t orig_buflen = buflen; size_t orig_buflen = buflen;
int ret = 0; int ret = 0;
down_read(&kclist_lock); percpu_down_read(&kclist_lock);
/* /*
* Don't race against drivers that set PageOffline() and expect no * Don't race against drivers that set PageOffline() and expect no
* further page access. * further page access.
*/ */
page_offline_freeze(); page_offline_freeze();
get_kcore_size(&nphdr, &phdrs_len, &notes_len, &data_offset);
phdrs_offset = sizeof(struct elfhdr); phdrs_offset = sizeof(struct elfhdr);
notes_offset = phdrs_offset + phdrs_len; notes_offset = phdrs_offset + kcore_phdrs_len;
/* ELF file header. */ /* ELF file header. */
if (buflen && *fpos < sizeof(struct elfhdr)) { if (buflen && *fpos < sizeof(struct elfhdr)) {
@ -368,7 +365,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
.e_flags = ELF_CORE_EFLAGS, .e_flags = ELF_CORE_EFLAGS,
.e_ehsize = sizeof(struct elfhdr), .e_ehsize = sizeof(struct elfhdr),
.e_phentsize = sizeof(struct elf_phdr), .e_phentsize = sizeof(struct elf_phdr),
.e_phnum = nphdr, .e_phnum = kcore_nphdr,
}; };
tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
@ -382,10 +379,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
} }
/* ELF program headers. */ /* ELF program headers. */
if (buflen && *fpos < phdrs_offset + phdrs_len) { if (buflen && *fpos < phdrs_offset + kcore_phdrs_len) {
struct elf_phdr *phdrs, *phdr; struct elf_phdr *phdrs, *phdr;
phdrs = kzalloc(phdrs_len, GFP_KERNEL); phdrs = kzalloc(kcore_phdrs_len, GFP_KERNEL);
if (!phdrs) { if (!phdrs) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
@ -393,13 +390,14 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
phdrs[0].p_type = PT_NOTE; phdrs[0].p_type = PT_NOTE;
phdrs[0].p_offset = notes_offset; phdrs[0].p_offset = notes_offset;
phdrs[0].p_filesz = notes_len; phdrs[0].p_filesz = kcore_notes_len;
phdr = &phdrs[1]; phdr = &phdrs[1];
list_for_each_entry(m, &kclist_head, list) { list_for_each_entry(m, &kclist_head, list) {
phdr->p_type = PT_LOAD; phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R | PF_W | PF_X; phdr->p_flags = PF_R | PF_W | PF_X;
phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; phdr->p_offset = kc_vaddr_to_offset(m->addr)
+ kcore_data_offset;
phdr->p_vaddr = (size_t)m->addr; phdr->p_vaddr = (size_t)m->addr;
if (m->type == KCORE_RAM) if (m->type == KCORE_RAM)
phdr->p_paddr = __pa(m->addr); phdr->p_paddr = __pa(m->addr);
@ -412,7 +410,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
phdr++; phdr++;
} }
tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); tsz = min_t(size_t, buflen,
phdrs_offset + kcore_phdrs_len - *fpos);
if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz, if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
iter) != tsz) { iter) != tsz) {
kfree(phdrs); kfree(phdrs);
@ -426,7 +425,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
} }
/* ELF note segment. */ /* ELF note segment. */
if (buflen && *fpos < notes_offset + notes_len) { if (buflen && *fpos < notes_offset + kcore_notes_len) {
struct elf_prstatus prstatus = {}; struct elf_prstatus prstatus = {};
struct elf_prpsinfo prpsinfo = { struct elf_prpsinfo prpsinfo = {
.pr_sname = 'R', .pr_sname = 'R',
@ -438,7 +437,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
strscpy(prpsinfo.pr_psargs, saved_command_line, strscpy(prpsinfo.pr_psargs, saved_command_line,
sizeof(prpsinfo.pr_psargs)); sizeof(prpsinfo.pr_psargs));
notes = kzalloc(notes_len, GFP_KERNEL); notes = kzalloc(kcore_notes_len, GFP_KERNEL);
if (!notes) { if (!notes) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;
@ -459,9 +458,10 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
*/ */
append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0,
vmcoreinfo_data, vmcoreinfo_data,
min(vmcoreinfo_size, notes_len - i)); min(vmcoreinfo_size, kcore_notes_len - i));
tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); tsz = min_t(size_t, buflen,
notes_offset + kcore_notes_len - *fpos);
if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) { if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
kfree(notes); kfree(notes);
ret = -EFAULT; ret = -EFAULT;
@ -477,7 +477,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
* Check to see if our file offset matches with any of * Check to see if our file offset matches with any of
* the addresses in the elf_phdr on our list. * the addresses in the elf_phdr on our list.
*/ */
start = kc_offset_to_vaddr(*fpos - data_offset); start = kc_offset_to_vaddr(*fpos - kcore_data_offset);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen; tsz = buflen;
@ -626,7 +626,7 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
out: out:
page_offline_thaw(); page_offline_thaw();
up_read(&kclist_lock); percpu_up_read(&kclist_lock);
if (ret) if (ret)
return ret; return ret;
return orig_buflen - buflen; return orig_buflen - buflen;
@ -663,6 +663,7 @@ static int release_kcore(struct inode *inode, struct file *file)
} }
static const struct proc_ops kcore_proc_ops = { static const struct proc_ops kcore_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_read_iter = read_kcore_iter, .proc_read_iter = read_kcore_iter,
.proc_open = open_kcore, .proc_open = open_kcore,
.proc_release = release_kcore, .proc_release = release_kcore,

View File

@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) { if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
res = ns_get_name(name, sizeof(name), task, ns_ops); res = ns_get_name(name, sizeof(name), task, ns_ops);
if (res >= 0) if (res >= 0)
res = readlink_copy(buffer, buflen, name); res = readlink_copy(buffer, buflen, name, strlen(name));
} }
put_task_struct(task); put_task_struct(task);
return res; return res;

View File

@ -1258,14 +1258,6 @@ CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
return rc; return rc;
} }
static void cifs_readv_worker(struct work_struct *work)
{
struct cifs_io_subrequest *rdata =
container_of(work, struct cifs_io_subrequest, subreq.work);
netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false);
}
static void static void
cifs_readv_callback(struct mid_q_entry *mid) cifs_readv_callback(struct mid_q_entry *mid)
{ {
@ -1328,11 +1320,13 @@ cifs_readv_callback(struct mid_q_entry *mid)
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0; rdata->result = 0;
} }
if (rdata->got_bytes)
__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);
} }
rdata->credits.value = 0; rdata->credits.value = 0;
rdata->subreq.error = rdata->result;
rdata->subreq.transferred += rdata->got_bytes; rdata->subreq.transferred += rdata->got_bytes;
INIT_WORK(&rdata->subreq.work, cifs_readv_worker);
queue_work(cifsiod_wq, &rdata->subreq.work); queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid); release_mid(mid);
add_credits(server, &credits, 0); add_credits(server, &credits, 0);

View File

@ -227,7 +227,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq)
return; return;
failed: failed:
netfs_read_subreq_terminated(subreq, rc, false); subreq->error = rc;
netfs_read_subreq_terminated(subreq);
} }
/* /*

View File

@ -4388,7 +4388,7 @@ static struct folio_queue *cifs_alloc_folioq_buffer(ssize_t size)
p = kmalloc(sizeof(*p), GFP_NOFS); p = kmalloc(sizeof(*p), GFP_NOFS);
if (!p) if (!p)
goto nomem; goto nomem;
folioq_init(p); folioq_init(p, 0);
if (tail) { if (tail) {
tail->next = p; tail->next = p;
p->prev = tail; p->prev = tail;

View File

@ -4500,14 +4500,6 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
return rc; return rc;
} }
static void smb2_readv_worker(struct work_struct *work)
{
struct cifs_io_subrequest *rdata =
container_of(work, struct cifs_io_subrequest, subreq.work);
netfs_read_subreq_terminated(&rdata->subreq, rdata->result, false);
}
static void static void
smb2_readv_callback(struct mid_q_entry *mid) smb2_readv_callback(struct mid_q_entry *mid)
{ {
@ -4615,15 +4607,17 @@ smb2_readv_callback(struct mid_q_entry *mid)
__set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags); __set_bit(NETFS_SREQ_HIT_EOF, &rdata->subreq.flags);
rdata->result = 0; rdata->result = 0;
} }
if (rdata->got_bytes)
__set_bit(NETFS_SREQ_MADE_PROGRESS, &rdata->subreq.flags);
} }
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value, trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, rdata->credits.value,
server->credits, server->in_flight, server->credits, server->in_flight,
0, cifs_trace_rw_credits_read_response_clear); 0, cifs_trace_rw_credits_read_response_clear);
rdata->credits.value = 0; rdata->credits.value = 0;
rdata->subreq.error = rdata->result;
rdata->subreq.transferred += rdata->got_bytes; rdata->subreq.transferred += rdata->got_bytes;
trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress);
INIT_WORK(&rdata->subreq.work, smb2_readv_worker); netfs_read_subreq_terminated(&rdata->subreq);
queue_work(cifsiod_wq, &rdata->subreq.work);
release_mid(mid); release_mid(mid);
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
server->credits, server->in_flight, server->credits, server->in_flight,

View File

@ -781,10 +781,6 @@ int __ksmbd_override_fsids(struct ksmbd_work *work,
WARN_ON(work->saved_cred); WARN_ON(work->saved_cred);
work->saved_cred = override_creds(cred); work->saved_cred = override_creds(cred);
if (!work->saved_cred) {
abort_creds(cred);
return -EINVAL;
}
return 0; return 0;
} }
@ -796,13 +792,11 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
void ksmbd_revert_fsids(struct ksmbd_work *work) void ksmbd_revert_fsids(struct ksmbd_work *work)
{ {
const struct cred *cred; const struct cred *cred;
WARN_ON(!work->saved_cred); WARN_ON(!work->saved_cred);
cred = current_cred(); cred = revert_creds(work->saved_cred);
revert_creds(work->saved_cred);
put_cred(cred);
work->saved_cred = NULL; work->saved_cred = NULL;
put_cred(cred);
} }
__le32 smb_map_generic_desired_access(__le32 daccess) __le32 smb_map_generic_desired_access(__le32 daccess)

View File

@ -155,8 +155,6 @@ extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *); extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *); extern void abort_creds(struct cred *);
extern const struct cred *override_creds(const struct cred *);
extern void revert_creds(const struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *); extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int set_security_override(struct cred *, u32); extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_security_override_from_ctx(struct cred *, const char *);
@ -172,12 +170,7 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred)
cred->cap_inheritable)); cred->cap_inheritable));
} }
/* static inline const struct cred *override_creds(const struct cred *override_cred)
* Override creds without bumping reference count. Caller must ensure
* reference remains valid or has taken reference. Almost always not the
* interface you want. Use override_creds()/revert_creds() instead.
*/
static inline const struct cred *override_creds_light(const struct cred *override_cred)
{ {
const struct cred *old = current->cred; const struct cred *old = current->cred;
@ -185,35 +178,12 @@ static inline const struct cred *override_creds_light(const struct cred *overrid
return old; return old;
} }
static inline void revert_creds_light(const struct cred *revert_cred) static inline const struct cred *revert_creds(const struct cred *revert_cred)
{ {
const struct cred *override_cred = current->cred;
rcu_assign_pointer(current->cred, revert_cred); rcu_assign_pointer(current->cred, revert_cred);
} return override_cred;
/**
* get_new_cred_many - Get references on a new set of credentials
* @cred: The new credentials to reference
* @nr: Number of references to acquire
*
* Get references on the specified set of new credentials. The caller must
* release all acquired references.
*/
static inline struct cred *get_new_cred_many(struct cred *cred, int nr)
{
atomic_long_add(nr, &cred->usage);
return cred;
}
/**
* get_new_cred - Get a reference on a new set of credentials
* @cred: The new credentials to reference
*
* Get a reference on the specified set of new credentials. The caller must
* release the reference.
*/
static inline struct cred *get_new_cred(struct cred *cred)
{
return get_new_cred_many(cred, 1);
} }
/** /**
@ -236,7 +206,8 @@ static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
if (!cred) if (!cred)
return cred; return cred;
nonconst_cred->non_rcu = 0; nonconst_cred->non_rcu = 0;
return get_new_cred_many(nonconst_cred, nr); atomic_long_add(nr, &nonconst_cred->usage);
return cred;
} }
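
Since override_creds() and revert_creds() no longer touch reference counts, callers keep their own reference on the override credentials and drop it once after reverting, exactly as the fs/open.c and ksmbd hunks above do. A minimal in-kernel sketch of the new convention; the function name and the elided work are hypothetical.

    #include <linux/cred.h>
    #include <linux/errno.h>

    /* Sketch of the non-refcounting override/revert pairing. */
    static int my_do_privileged_work(void)
    {
    	const struct cred *old_cred;
    	struct cred *new_cred;
    	int ret = 0;

    	new_cred = prepare_creds();		/* we own this reference */
    	if (!new_cred)
    		return -ENOMEM;

    	old_cred = override_creds(new_cred);	/* takes no extra reference */
    	/* ... do the privileged work ... */
    	revert_creds(old_cred);

    	put_cred(new_cred);			/* drop the prepare_creds() reference */
    	return ret;
    }

The last two calls can be collapsed into put_cred(revert_creds(old_cred)), which is the form the do_faccessat() hunk above uses.
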
/* /*

View File

@ -3,6 +3,7 @@
#define LINUX_EXPORTFS_H 1 #define LINUX_EXPORTFS_H 1
#include <linux/types.h> #include <linux/types.h>
#include <linux/path.h>
struct dentry; struct dentry;
struct iattr; struct iattr;
@ -156,6 +157,17 @@ struct fid {
}; };
}; };
enum handle_to_path_flags {
HANDLE_CHECK_PERMS = (1 << 0),
HANDLE_CHECK_SUBTREE = (1 << 1),
};
struct handle_to_path_ctx {
struct path root;
enum handle_to_path_flags flags;
unsigned int fh_flags;
};
#define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */ #define EXPORT_FH_CONNECTABLE 0x1 /* Encode file handle with parent */
#define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */ #define EXPORT_FH_FID 0x2 /* File handle may be non-decodeable */
#define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */ #define EXPORT_FH_DIR_ONLY 0x4 /* Only decode file handle for a directory */
@ -225,6 +237,12 @@ struct fid {
* is also a directory. In the event that it cannot be found, or storage * is also a directory. In the event that it cannot be found, or storage
* space cannot be allocated, a %ERR_PTR should be returned. * space cannot be allocated, a %ERR_PTR should be returned.
* *
* permission:
* Allow filesystems to specify a custom permission function.
*
* open:
* Allow filesystems to specify a custom open function.
*
* commit_metadata: * commit_metadata:
* @commit_metadata should commit metadata changes to stable storage. * @commit_metadata should commit metadata changes to stable storage.
* *
@ -251,6 +269,8 @@ struct export_operations {
bool write, u32 *device_generation); bool write, u32 *device_generation);
int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
int nr_iomaps, struct iattr *iattr); int nr_iomaps, struct iattr *iattr);
int (*permission)(struct handle_to_path_ctx *ctx, unsigned int oflags);
struct file * (*open)(struct path *path, unsigned int oflags);
#define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */ #define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */
#define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */ #define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */
#define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */ #define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */

View File

@ -5,12 +5,18 @@
#include <uapi/linux/fiemap.h> #include <uapi/linux/fiemap.h>
#include <linux/fs.h> #include <linux/fs.h>
/**
* struct fiemap_extent_info - fiemap request to a filesystem
* @fi_flags: Flags as passed from user
* @fi_extents_mapped: Number of mapped extents
* @fi_extents_max: Size of fiemap_extent array
* @fi_extents_start: Start of fiemap_extent array
*/
struct fiemap_extent_info { struct fiemap_extent_info {
unsigned int fi_flags; /* Flags as passed from user */ unsigned int fi_flags;
unsigned int fi_extents_mapped; /* Number of mapped extents */ unsigned int fi_extents_mapped;
unsigned int fi_extents_max; /* Size of fiemap_extent array */ unsigned int fi_extents_max;
struct fiemap_extent __user *fi_extents_start; /* Start of struct fiemap_extent __user *fi_extents_start;
fiemap_extent array */
}; };
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
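
As a kernel-side illustration of how this structure is consumed, a minimal, hypothetical ->fiemap implementation for a filesystem whose files occupy a single contiguous on-disk extent; myfs_fiemap() and the physical-address lookup are assumptions, while fiemap_prep() and fiemap_fill_next_extent() are the real helpers.

    #include <linux/fiemap.h>
    #include <linux/fs.h>

    /* Hypothetical filesystem: every file is one contiguous extent. */
    static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
    		       u64 start, u64 len)
    {
    	u64 phys = 0;	/* assumption: looked up from fs-specific metadata */
    	int ret;

    	/* Validate/normalise the request and reject unsupported flags. */
    	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
    	if (ret)
    		return ret;

    	/* Report the single extent; fi_extents_mapped is bumped internally. */
    	ret = fiemap_fill_next_extent(fieinfo, 0, phys, i_size_read(inode),
    				      FIEMAP_EXTENT_LAST);
    	return ret < 0 ? ret : 0;
    }
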

View File

@ -37,16 +37,20 @@ struct folio_queue {
#if PAGEVEC_SIZE > BITS_PER_LONG #if PAGEVEC_SIZE > BITS_PER_LONG
#error marks is not big enough #error marks is not big enough
#endif #endif
unsigned int rreq_id;
unsigned int debug_id;
}; };
/** /**
* folioq_init - Initialise a folio queue segment * folioq_init - Initialise a folio queue segment
* @folioq: The segment to initialise * @folioq: The segment to initialise
* @rreq_id: The request identifier to use in tracelines.
* *
* Initialise a folio queue segment. Note that the folio pointers are * Initialise a folio queue segment and set an identifier to be used in traces.
* left uninitialised. *
* Note that the folio pointers are left uninitialised.
*/ */
static inline void folioq_init(struct folio_queue *folioq) static inline void folioq_init(struct folio_queue *folioq, unsigned int rreq_id)
{ {
folio_batch_init(&folioq->vec); folio_batch_init(&folioq->vec);
folioq->next = NULL; folioq->next = NULL;
@ -54,6 +58,8 @@ static inline void folioq_init(struct folio_queue *folioq)
folioq->marks = 0; folioq->marks = 0;
folioq->marks2 = 0; folioq->marks2 = 0;
folioq->marks3 = 0; folioq->marks3 = 0;
folioq->rreq_id = rreq_id;
folioq->debug_id = 0;
} }
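
A small sketch of allocating and chaining a segment under the new signature, loosely mirroring the cifs_alloc_folioq_buffer() hunk earlier in this commit (which passes 0 as the ID); my_new_folioq_segment() is an illustrative name.

    #include <linux/folio_queue.h>
    #include <linux/slab.h>

    static struct folio_queue *my_new_folioq_segment(struct folio_queue *tail,
    						 unsigned int rreq_id)
    {
    	struct folio_queue *fq;

    	fq = kmalloc(sizeof(*fq), GFP_KERNEL);
    	if (!fq)
    		return NULL;

    	folioq_init(fq, rreq_id);	/* folio slots stay uninitialised */
    	if (tail) {
    		tail->next = fq;
    		fq->prev = tail;
    	}
    	return fq;
    }
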
/** /**

View File

@ -659,6 +659,7 @@ is_uncached_acl(struct posix_acl *acl)
#define IOP_XATTR 0x0008 #define IOP_XATTR 0x0008
#define IOP_DEFAULT_READLINK 0x0010 #define IOP_DEFAULT_READLINK 0x0010
#define IOP_MGTIME 0x0020 #define IOP_MGTIME 0x0020
#define IOP_CACHED_LINK 0x0040
/* /*
* Keep mostly read-only and often accessed (especially for * Keep mostly read-only and often accessed (especially for
@ -756,7 +757,10 @@ struct inode {
}; };
struct file_lock_context *i_flctx; struct file_lock_context *i_flctx;
struct address_space i_data; struct address_space i_data;
struct list_head i_devices; union {
struct list_head i_devices;
int i_linklen;
};
union { union {
struct pipe_inode_info *i_pipe; struct pipe_inode_info *i_pipe;
struct cdev *i_cdev; struct cdev *i_cdev;
@ -782,6 +786,13 @@ struct inode {
void *i_private; /* fs or device private pointer */ void *i_private; /* fs or device private pointer */
} __randomize_layout; } __randomize_layout;
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
inode->i_link = link;
inode->i_linklen = linklen;
inode->i_opflags |= IOP_CACHED_LINK;
}
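
A hypothetical pseudo-filesystem helper showing the intended use: stash the target string once so the VFS can serve readlink and link traversal from it without calling back into the filesystem. myfs_make_symlink() is an illustrative name, and the caller is assumed to free the string when the inode is evicted.

    #include <linux/fs.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static int myfs_make_symlink(struct inode *inode, const char *target)
    {
    	int len = strlen(target);
    	char *link = kmemdup(target, len + 1, GFP_KERNEL);	/* keep the NUL */

    	if (!link)
    		return -ENOMEM;
    	inode->i_op = &simple_symlink_inode_operations;
    	inode_set_cached_link(inode, link, len);
    	return 0;
    }
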
/* /*
* Get bit address from inode->i_state to use with wait_var_event() * Get bit address from inode->i_state to use with wait_var_event()
* infrastructre. * infrastructre.
@ -3409,7 +3420,7 @@ extern const struct file_operations generic_ro_fops;
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
extern int readlink_copy(char __user *, int, const char *); extern int readlink_copy(char __user *, int, const char *, int);
extern int page_readlink(struct dentry *, char __user *, int); extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link(struct dentry *, struct inode *, extern const char *page_get_link(struct dentry *, struct inode *,
struct delayed_call *); struct delayed_call *);
@ -3526,7 +3537,6 @@ struct offset_ctx {
void simple_offset_init(struct offset_ctx *octx); void simple_offset_init(struct offset_ctx *octx);
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
int simple_offset_empty(struct dentry *dentry);
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry); struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir, int simple_offset_rename_exchange(struct inode *old_dir,

View File

@ -50,7 +50,7 @@ struct path;
#define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )
#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)
#define MNT_INTERNAL 0x4000 #define MNT_INTERNAL 0x4000
@ -64,7 +64,6 @@ struct path;
#define MNT_SYNC_UMOUNT 0x2000000 #define MNT_SYNC_UMOUNT 0x2000000
#define MNT_MARKED 0x4000000 #define MNT_MARKED 0x4000000
#define MNT_UMOUNT 0x8000000 #define MNT_UMOUNT 0x8000000
#define MNT_ONRB 0x10000000
struct vfsmount { struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */ struct dentry *mnt_root; /* root of the mounted tree */
@ -76,7 +75,7 @@ struct vfsmount {
static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt) static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
{ {
/* Pairs with smp_store_release() in do_idmap_mount(). */ /* Pairs with smp_store_release() in do_idmap_mount(). */
return smp_load_acquire(&mnt->mnt_idmap); return READ_ONCE(mnt->mnt_idmap);
} }
extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write(struct vfsmount *mnt);

View File

@ -18,9 +18,11 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/rolling_buffer.h>
enum netfs_sreq_ref_trace; enum netfs_sreq_ref_trace;
typedef struct mempool_s mempool_t; typedef struct mempool_s mempool_t;
struct folio_queue;
/** /**
* folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED] * folio_start_private_2 - Start an fscache write on a folio. [DEPRECATED]
@ -71,6 +73,7 @@ struct netfs_inode {
#define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */ #define NETFS_ICTX_UNBUFFERED 1 /* I/O should not use the pagecache */
#define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */ #define NETFS_ICTX_WRITETHROUGH 2 /* Write-through caching */
#define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */ #define NETFS_ICTX_MODIFIED_ATTR 3 /* Indicate change in mtime/ctime */
#define NETFS_ICTX_SINGLE_NO_UPLOAD 4 /* Monolithic payload, cache but no upload */
}; };
/* /*
@ -178,23 +181,17 @@ struct netfs_io_subrequest {
unsigned long long start; /* Where to start the I/O */ unsigned long long start; /* Where to start the I/O */
size_t len; /* Size of the I/O */ size_t len; /* Size of the I/O */
size_t transferred; /* Amount of data transferred */ size_t transferred; /* Amount of data transferred */
size_t consumed; /* Amount of read data consumed */
size_t prev_donated; /* Amount of data donated from previous subreq */
size_t next_donated; /* Amount of data donated from next subreq */
refcount_t ref; refcount_t ref;
short error; /* 0 or error that occurred */ short error; /* 0 or error that occurred */
unsigned short debug_index; /* Index in list (for debugging output) */ unsigned short debug_index; /* Index in list (for debugging output) */
unsigned int nr_segs; /* Number of segs in io_iter */ unsigned int nr_segs; /* Number of segs in io_iter */
enum netfs_io_source source; /* Where to read from/write to */ enum netfs_io_source source; /* Where to read from/write to */
unsigned char stream_nr; /* I/O stream this belongs to */ unsigned char stream_nr; /* I/O stream this belongs to */
unsigned char curr_folioq_slot; /* Folio currently being read */
unsigned char curr_folio_order; /* Order of folio */
struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */
unsigned long flags; unsigned long flags;
#define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */ #define NETFS_SREQ_COPY_TO_CACHE 0 /* Set if should copy the data to the cache */
#define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */ #define NETFS_SREQ_CLEAR_TAIL 1 /* Set if the rest of the read should be cleared */
#define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */ #define NETFS_SREQ_SEEK_DATA_READ 3 /* Set if ->read() should SEEK_DATA first */
#define NETFS_SREQ_NO_PROGRESS 4 /* Set if we didn't manage to read any data */ #define NETFS_SREQ_MADE_PROGRESS 4 /* Set if we managed to read more data */
#define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */ #define NETFS_SREQ_ONDEMAND 5 /* Set if it's from on-demand read mode */
#define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */ #define NETFS_SREQ_BOUNDARY 6 /* Set if ends on hard boundary (eg. ceph object) */
#define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */ #define NETFS_SREQ_HIT_EOF 7 /* Set if short due to EOF */
@ -208,9 +205,11 @@ enum netfs_io_origin {
NETFS_READAHEAD, /* This read was triggered by readahead */ NETFS_READAHEAD, /* This read was triggered by readahead */
NETFS_READPAGE, /* This read is a synchronous read */ NETFS_READPAGE, /* This read is a synchronous read */
NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */
NETFS_READ_SINGLE, /* This read should be treated as a single object */
NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ NETFS_READ_FOR_WRITE, /* This read is to prepare a write */
NETFS_DIO_READ, /* This is a direct I/O read */ NETFS_DIO_READ, /* This is a direct I/O read */
NETFS_WRITEBACK, /* This write was triggered by writepages */ NETFS_WRITEBACK, /* This write was triggered by writepages */
NETFS_WRITEBACK_SINGLE, /* This monolithic write was triggered by writepages */
NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */ NETFS_WRITETHROUGH, /* This write was made by netfs_perform_write() */
NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */ NETFS_UNBUFFERED_WRITE, /* This is an unbuffered write */
NETFS_DIO_WRITE, /* This is a direct I/O write */ NETFS_DIO_WRITE, /* This is a direct I/O write */
@ -233,14 +232,13 @@ struct netfs_io_request {
struct netfs_cache_resources cache_resources; struct netfs_cache_resources cache_resources;
struct readahead_control *ractl; /* Readahead descriptor */ struct readahead_control *ractl; /* Readahead descriptor */
struct list_head proc_link; /* Link in netfs_iorequests */ struct list_head proc_link; /* Link in netfs_iorequests */
struct list_head subrequests; /* Contributory I/O operations */
struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */ struct netfs_io_stream io_streams[2]; /* Streams of parallel I/O operations */
#define NR_IO_STREAMS 2 //wreq->nr_io_streams #define NR_IO_STREAMS 2 //wreq->nr_io_streams
struct netfs_group *group; /* Writeback group being written back */ struct netfs_group *group; /* Writeback group being written back */
struct folio_queue *buffer; /* Head of I/O buffer */ struct rolling_buffer buffer; /* Unencrypted buffer */
struct folio_queue *buffer_tail; /* Tail of I/O buffer */ #define NETFS_ROLLBUF_PUT_MARK ROLLBUF_MARK_1
struct iov_iter iter; /* Unencrypted-side iterator */ #define NETFS_ROLLBUF_PAGECACHE_MARK ROLLBUF_MARK_2
struct iov_iter io_iter; /* I/O (Encrypted-side) iterator */ wait_queue_head_t waitq; /* Processor waiter */
void *netfs_priv; /* Private data for the netfs */ void *netfs_priv; /* Private data for the netfs */
void *netfs_priv2; /* Private data for the netfs */ void *netfs_priv2; /* Private data for the netfs */
struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */ struct bio_vec *direct_bv; /* DIO buffer list (when handling iovec-iter) */
@ -251,29 +249,29 @@ struct netfs_io_request {
atomic_t subreq_counter; /* Next subreq->debug_index */ atomic_t subreq_counter; /* Next subreq->debug_index */
unsigned int nr_group_rel; /* Number of refs to release on ->group */ unsigned int nr_group_rel; /* Number of refs to release on ->group */
spinlock_t lock; /* Lock for queuing subreqs */ spinlock_t lock; /* Lock for queuing subreqs */
atomic_t nr_outstanding; /* Number of ops in progress */
unsigned long long submitted; /* Amount submitted for I/O so far */ unsigned long long submitted; /* Amount submitted for I/O so far */
unsigned long long len; /* Length of the request */ unsigned long long len; /* Length of the request */
size_t transferred; /* Amount to be indicated as transferred */ size_t transferred; /* Amount to be indicated as transferred */
long error; /* 0 or error that occurred */ long error; /* 0 or error that occurred */
enum netfs_io_origin origin; /* Origin of the request */ enum netfs_io_origin origin; /* Origin of the request */
bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */ bool direct_bv_unpin; /* T if direct_bv[] must be unpinned */
u8 buffer_head_slot; /* First slot in ->buffer */
u8 buffer_tail_slot; /* Next slot in ->buffer_tail */
unsigned long long i_size; /* Size of the file */ unsigned long long i_size; /* Size of the file */
unsigned long long start; /* Start position */ unsigned long long start; /* Start position */
atomic64_t issued_to; /* Write issuer folio cursor */ atomic64_t issued_to; /* Write issuer folio cursor */
unsigned long long collected_to; /* Point we've collected to */ unsigned long long collected_to; /* Point we've collected to */
unsigned long long cleaned_to; /* Position we've cleaned folios to */ unsigned long long cleaned_to; /* Position we've cleaned folios to */
unsigned long long abandon_to; /* Position to abandon folios to */
pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ pgoff_t no_unlock_folio; /* Don't unlock this folio after read */
size_t prev_donated; /* Fallback for subreq->prev_donated */ unsigned char front_folio_order; /* Order (size) of front folio */
refcount_t ref; refcount_t ref;
unsigned long flags; unsigned long flags;
#define NETFS_RREQ_OFFLOAD_COLLECTION 0 /* Offload collection to workqueue */
#define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */ #define NETFS_RREQ_COPY_TO_CACHE 1 /* Need to write to the cache */
#define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ #define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */
#define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */
#define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_FAILED 4 /* The request failed */
#define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */
#define NETFS_RREQ_FOLIO_COPY_TO_CACHE 6 /* Copy current folio to cache from read */
#define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */ #define NETFS_RREQ_UPLOAD_TO_SERVER 8 /* Need to write to the server */
#define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */ #define NETFS_RREQ_NONBLOCK 9 /* Don't block if possible (O_NONBLOCK) */
#define NETFS_RREQ_BLOCKED 10 /* We blocked */ #define NETFS_RREQ_BLOCKED 10 /* We blocked */
@ -410,6 +408,13 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
struct netfs_group *netfs_group); struct netfs_group *netfs_group);
ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from); ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from);
/* Single, monolithic object read/write API. */
void netfs_single_mark_inode_dirty(struct inode *inode);
ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_iter *iter);
int netfs_writeback_single(struct address_space *mapping,
struct writeback_control *wbc,
struct iov_iter *iter);
/* Address operations API */ /* Address operations API */
struct readahead_control; struct readahead_control;
void netfs_readahead(struct readahead_control *); void netfs_readahead(struct readahead_control *);
@ -429,10 +434,8 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp);
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group); vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
/* (Sub)request management API. */ /* (Sub)request management API. */
void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq, void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq);
bool was_async); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq);
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq,
int error, bool was_async);
void netfs_get_subrequest(struct netfs_io_subrequest *subreq, void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
enum netfs_sreq_ref_trace what); enum netfs_sreq_ref_trace what);
void netfs_put_subrequest(struct netfs_io_subrequest *subreq, void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
@ -454,6 +457,18 @@ void netfs_end_io_write(struct inode *inode);
int netfs_start_io_direct(struct inode *inode); int netfs_start_io_direct(struct inode *inode);
void netfs_end_io_direct(struct inode *inode); void netfs_end_io_direct(struct inode *inode);
/* Miscellaneous APIs. */
struct folio_queue *netfs_folioq_alloc(unsigned int rreq_id, gfp_t gfp,
unsigned int /*enum netfs_folioq_trace*/ trace);
void netfs_folioq_free(struct folio_queue *folioq,
unsigned int /*enum netfs_trace_folioq*/ trace);
/* Buffer wrangling helpers API. */
int netfs_alloc_folioq_buffer(struct address_space *mapping,
struct folio_queue **_buffer,
size_t *_cur_size, ssize_t size, gfp_t gfp);
void netfs_free_folioq_buffer(struct folio_queue *fq);
/** /**
* netfs_inode - Get the netfs inode context from the inode * netfs_inode - Get the netfs inode context from the inode
* @inode: The inode to query * @inode: The inode to query

View File

@ -59,6 +59,7 @@ struct pid
spinlock_t lock; spinlock_t lock;
struct dentry *stashed; struct dentry *stashed;
u64 ino; u64 ino;
struct rb_node pidfs_node;
/* lists of tasks that use this pid */ /* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head tasks[PIDTYPE_MAX];
struct hlist_head inodes; struct hlist_head inodes;
@ -68,6 +69,7 @@ struct pid
struct upid numbers[]; struct upid numbers[];
}; };
extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid; extern struct pid init_struct_pid;
struct file; struct file;
@ -106,9 +108,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new, extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type); enum pid_type);
extern int pid_max;
extern int pid_max_min, pid_max_max;
/* /*
* look up a PID in the hash table. Must be called with the tasklist_lock * look up a PID in the hash table. Must be called with the tasklist_lock
* or rcu_read_lock() held. * or rcu_read_lock() held.

View File

@ -30,6 +30,7 @@ struct pid_namespace {
struct task_struct *child_reaper; struct task_struct *child_reaper;
struct kmem_cache *pid_cachep; struct kmem_cache *pid_cachep;
unsigned int level; unsigned int level;
int pid_max;
struct pid_namespace *parent; struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT #ifdef CONFIG_BSD_PROCESS_ACCT
struct fs_pin *bacct; struct fs_pin *bacct;
@ -38,9 +39,14 @@ struct pid_namespace {
struct ucounts *ucounts; struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */ int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns; struct ns_common ns;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) struct work_struct work;
#ifdef CONFIG_SYSCTL
struct ctl_table_set set;
struct ctl_table_header *sysctls;
#if defined(CONFIG_MEMFD_CREATE)
int memfd_noexec_scope; int memfd_noexec_scope;
#endif #endif
#endif
} __randomize_layout; } __randomize_layout;
extern struct pid_namespace init_pid_ns; extern struct pid_namespace init_pid_ns;
@ -117,6 +123,8 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void); void pidhash_init(void);
void pid_idr_init(void); void pid_idr_init(void);
int register_pidns_sysctls(struct pid_namespace *pidns);
void unregister_pidns_sysctls(struct pid_namespace *pidns);
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk) static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{ {

View File

@ -4,5 +4,7 @@
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
void __init pidfs_init(void); void __init pidfs_init(void);
void pidfs_add_pid(struct pid *pid);
void pidfs_remove_pid(struct pid *pid);
#endif /* _LINUX_PID_FS_H */ #endif /* _LINUX_PID_FS_H */

View File

@ -5,6 +5,7 @@
struct pseudo_fs_context { struct pseudo_fs_context {
const struct super_operations *ops; const struct super_operations *ops;
const struct export_operations *eops;
const struct xattr_handler * const *xattr; const struct xattr_handler * const *xattr;
const struct dentry_operations *dops; const struct dentry_operations *dops;
unsigned long magic; unsigned long magic;

View File

@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
* way, we must not access it directly * way, we must not access it directly
*/ */
#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
/*
* Return the ->prev pointer of a list_head in an rcu safe way. Don't
* access it directly.
*
* Any list traversed with list_bidir_prev_rcu() must never use
* list_del_rcu(). Doing so will poison the ->prev pointer that
* list_bidir_prev_rcu() relies on, which will result in segfaults.
* To prevent these segfaults, use list_bidir_del_rcu() instead
* of list_del_rcu().
*/
#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))
/** /**
* list_tail_rcu - returns the prev pointer of the head of the list * list_tail_rcu - returns the prev pointer of the head of the list
@ -158,6 +169,39 @@ static inline void list_del_rcu(struct list_head *entry)
entry->prev = LIST_POISON2; entry->prev = LIST_POISON2;
} }
/**
* list_bidir_del_rcu - deletes entry from list without re-initialization
* @entry: the element to delete from the list.
*
* In contrast to list_del_rcu() doesn't poison the prev pointer thus
* allowing backwards traversal via list_bidir_prev_rcu().
*
* Note: list_empty() on entry does not return true after this because
* the entry is in a special undefined state that permits RCU-based
* lockfree reverse traversal. In particular this means that we can not
* poison the forward and backwards pointers that may still be used for
* walking the list.
*
* The caller must take whatever precautions are necessary (such as
* holding appropriate locks) to avoid racing with another list-mutation
* primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
* this same list. However, it is perfectly legal to run concurrently
* with the _rcu list-traversal primitives, such as
* list_for_each_entry_rcu().
*
* Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
* the same list.
*
* Note that the caller is not permitted to immediately free
* the newly deleted entry. Instead, either synchronize_rcu()
* or call_rcu() must be used to defer freeing until an RCU
* grace period has elapsed.
*/
static inline void list_bidir_del_rcu(struct list_head *entry)
{
__list_del_entry(entry);
}
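
A short sketch of the pairing the comments above describe: the writer deletes with list_bidir_del_rcu() under a lock and defers freeing, while a reader walks backwards with list_bidir_prev_rcu() inside an RCU read-side section. struct my_node, the lock, and both functions are hypothetical.

    #include <linux/errno.h>
    #include <linux/rculist.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>

    struct my_node {
    	struct list_head list;
    	struct rcu_head rcu;
    	int value;
    };

    static void my_remove(spinlock_t *lock, struct my_node *node)
    {
    	spin_lock(lock);
    	list_bidir_del_rcu(&node->list);	/* ->prev stays walkable */
    	spin_unlock(lock);
    	kfree_rcu(node, rcu);			/* free only after a grace period */
    }

    static int my_peek_last_value(struct list_head *head, int *value)
    {
    	struct list_head *prev;
    	int ret = -ENOENT;

    	rcu_read_lock();
    	prev = rcu_dereference(list_bidir_prev_rcu(head));
    	if (prev != head) {
    		*value = list_entry(prev, struct my_node, list)->value;
    		ret = 0;
    	}
    	rcu_read_unlock();
    	return ret;
    }
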
/** /**
* hlist_del_init_rcu - deletes entry from hash list with re-initialization * hlist_del_init_rcu - deletes entry from hash list with re-initialization
* @n: the element to delete from the hash list. * @n: the element to delete from the hash list.

View File

@ -0,0 +1,61 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Rolling buffer of folios
*
* Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#ifndef _ROLLING_BUFFER_H
#define _ROLLING_BUFFER_H
#include <linux/folio_queue.h>
#include <linux/uio.h>
/*
* Rolling buffer. Whilst the buffer is live and in use, folios and folio
* queue segments can be added to one end by one thread and removed from the
* other end by another thread. The buffer isn't allowed to be empty; it must
* always have at least one folio_queue in it so that neither side has to
* modify both queue pointers.
*
* The iterator in the buffer is extended as buffers are inserted. It can be
* snapshotted to use a segment of the buffer.
*/
struct rolling_buffer {
struct folio_queue *head; /* Producer's insertion point */
struct folio_queue *tail; /* Consumer's removal point */
struct iov_iter iter; /* Iterator tracking what's left in the buffer */
u8 next_head_slot; /* Next slot in ->head */
u8 first_tail_slot; /* First slot in ->tail */
};
/*
* Snapshot of a rolling buffer.
*/
struct rolling_buffer_snapshot {
struct folio_queue *curr_folioq; /* Queue segment in which current folio resides */
unsigned char curr_slot; /* Folio currently being read */
unsigned char curr_order; /* Order of folio */
};
/* Marks to store per-folio in the internal folio_queue structs. */
#define ROLLBUF_MARK_1 BIT(0)
#define ROLLBUF_MARK_2 BIT(1)
int rolling_buffer_init(struct rolling_buffer *roll, unsigned int rreq_id,
unsigned int direction);
int rolling_buffer_make_space(struct rolling_buffer *roll);
ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll,
struct readahead_control *ractl,
struct folio_batch *put_batch);
ssize_t rolling_buffer_append(struct rolling_buffer *roll, struct folio *folio,
unsigned int flags);
struct folio_queue *rolling_buffer_delete_spent(struct rolling_buffer *roll);
void rolling_buffer_clear(struct rolling_buffer *roll);
static inline void rolling_buffer_advance(struct rolling_buffer *roll, size_t amount)
{
iov_iter_advance(&roll->iter, amount);
}
#endif /* _ROLLING_BUFFER_H */
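
A rough sketch of how the producer and consumer halves of this API appear to fit together, based only on the declarations above; the folio, the consumed amount, the request ID and my_roll_one_folio() are placeholders, and segment reclaim via rolling_buffer_delete_spent() is left out.

    #include <linux/rolling_buffer.h>

    static int my_roll_one_folio(struct rolling_buffer *roll, struct folio *folio,
    			     size_t consumed)
    {
    	ssize_t added;
    	int ret;

    	/* Assumption: ITER_DEST because I/O will be written into the buffer. */
    	ret = rolling_buffer_init(roll, /* rreq_id */ 1, ITER_DEST);
    	if (ret < 0)
    		return ret;

    	/* Producer end: appending extends roll->iter by the folio's size. */
    	added = rolling_buffer_append(roll, folio, 0);
    	if (added < 0) {
    		rolling_buffer_clear(roll);
    		return added;
    	}

    	/* Consumer end: mark part of the buffered data as used up. */
    	rolling_buffer_advance(roll, consumed);

    	/* Assumption: clear() releases whatever is still queued. */
    	rolling_buffer_clear(roll);
    	return 0;
    }
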

View File

@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex)
({ \ ({ \
unsigned __seq; \ unsigned __seq; \
\ \
while ((__seq = seqprop_sequence(s)) & 1) \ while (unlikely((__seq = seqprop_sequence(s)) & 1)) \
cpu_relax(); \ cpu_relax(); \
\ \
kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \

View File

@ -118,6 +118,8 @@ enum yfs_cm_operation {
*/ */
#define afs_call_traces \ #define afs_call_traces \
EM(afs_call_trace_alloc, "ALLOC") \ EM(afs_call_trace_alloc, "ALLOC") \
EM(afs_call_trace_async_abort, "ASYAB") \
EM(afs_call_trace_async_kill, "ASYKL") \
EM(afs_call_trace_free, "FREE ") \ EM(afs_call_trace_free, "FREE ") \
EM(afs_call_trace_get, "GET ") \ EM(afs_call_trace_get, "GET ") \
EM(afs_call_trace_put, "PUT ") \ EM(afs_call_trace_put, "PUT ") \
@ -323,6 +325,44 @@ enum yfs_cm_operation {
EM(yfs_CB_TellMeAboutYourself, "YFSCB.TellMeAboutYourself") \ EM(yfs_CB_TellMeAboutYourself, "YFSCB.TellMeAboutYourself") \
E_(yfs_CB_CallBack, "YFSCB.CallBack") E_(yfs_CB_CallBack, "YFSCB.CallBack")
#define afs_cb_promise_traces \
EM(afs_cb_promise_clear_cb_break, "CLEAR cb-break") \
EM(afs_cb_promise_clear_rmdir, "CLEAR rmdir") \
EM(afs_cb_promise_clear_rotate_server, "CLEAR rot-srv") \
EM(afs_cb_promise_clear_server_change, "CLEAR srv-chg") \
EM(afs_cb_promise_clear_vol_init_cb, "CLEAR vol-init-cb") \
EM(afs_cb_promise_set_apply_cb, "SET apply-cb") \
EM(afs_cb_promise_set_new_inode, "SET new-inode") \
E_(afs_cb_promise_set_new_symlink, "SET new-symlink")
#define afs_vnode_invalid_traces \
EM(afs_vnode_invalid_trace_cb_ro_snapshot, "cb-ro-snapshot") \
EM(afs_vnode_invalid_trace_cb_scrub, "cb-scrub") \
EM(afs_vnode_invalid_trace_cb_v_break, "cb-v-break") \
EM(afs_vnode_invalid_trace_expired, "expired") \
EM(afs_vnode_invalid_trace_no_cb_promise, "no-cb-promise") \
EM(afs_vnode_invalid_trace_vol_expired, "vol-expired") \
EM(afs_vnode_invalid_trace_zap_data, "zap-data") \
E_(afs_vnode_valid_trace, "valid")
#define afs_dir_invalid_traces \
EM(afs_dir_invalid_edit_add_bad_size, "edit-add-bad-size") \
EM(afs_dir_invalid_edit_add_no_slots, "edit-add-no-slots") \
EM(afs_dir_invalid_edit_add_too_many_blocks, "edit-add-too-many-blocks") \
EM(afs_dir_invalid_edit_get_block, "edit-get-block") \
EM(afs_dir_invalid_edit_mkdir, "edit-mkdir") \
EM(afs_dir_invalid_edit_rem_bad_size, "edit-rem-bad-size") \
EM(afs_dir_invalid_edit_rem_wrong_name, "edit-rem-wrong_name") \
EM(afs_dir_invalid_edit_upd_bad_size, "edit-upd-bad-size") \
EM(afs_dir_invalid_edit_upd_no_dd, "edit-upd-no-dotdot") \
EM(afs_dir_invalid_dv_mismatch, "dv-mismatch") \
EM(afs_dir_invalid_inval_folio, "inv-folio") \
EM(afs_dir_invalid_iter_stale, "iter-stale") \
EM(afs_dir_invalid_reclaimed_folio, "reclaimed-folio") \
EM(afs_dir_invalid_release_folio, "rel-folio") \
EM(afs_dir_invalid_remote, "remote") \
E_(afs_dir_invalid_subdir_removed, "subdir-removed")
#define afs_edit_dir_ops \ #define afs_edit_dir_ops \
EM(afs_edit_dir_create, "create") \ EM(afs_edit_dir_create, "create") \
EM(afs_edit_dir_create_error, "c_fail") \ EM(afs_edit_dir_create_error, "c_fail") \
@ -332,6 +372,7 @@ enum yfs_cm_operation {
EM(afs_edit_dir_delete_error, "d_err ") \ EM(afs_edit_dir_delete_error, "d_err ") \
EM(afs_edit_dir_delete_inval, "d_invl") \ EM(afs_edit_dir_delete_inval, "d_invl") \
EM(afs_edit_dir_delete_noent, "d_nent") \ EM(afs_edit_dir_delete_noent, "d_nent") \
EM(afs_edit_dir_mkdir, "mk_ent") \
EM(afs_edit_dir_update_dd, "u_ddot") \ EM(afs_edit_dir_update_dd, "u_ddot") \
EM(afs_edit_dir_update_error, "u_fail") \ EM(afs_edit_dir_update_error, "u_fail") \
EM(afs_edit_dir_update_inval, "u_invl") \ EM(afs_edit_dir_update_inval, "u_invl") \
@ -385,6 +426,7 @@ enum yfs_cm_operation {
EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \ EM(afs_file_error_dir_over_end, "DIR_ENT_OVER_END") \
EM(afs_file_error_dir_small, "DIR_SMALL") \ EM(afs_file_error_dir_small, "DIR_SMALL") \
EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \ EM(afs_file_error_dir_unmarked_ext, "DIR_UNMARKED_EXT") \
EM(afs_file_error_symlink_big, "SYM_BIG") \
EM(afs_file_error_mntpt, "MNTPT_READ_FAILED") \ EM(afs_file_error_mntpt, "MNTPT_READ_FAILED") \
E_(afs_file_error_writeback_fail, "WRITEBACK_FAILED") E_(afs_file_error_writeback_fail, "WRITEBACK_FAILED")
@ -487,7 +529,9 @@ enum yfs_cm_operation {
enum afs_alist_trace { afs_alist_traces } __mode(byte); enum afs_alist_trace { afs_alist_traces } __mode(byte);
enum afs_call_trace { afs_call_traces } __mode(byte); enum afs_call_trace { afs_call_traces } __mode(byte);
enum afs_cb_break_reason { afs_cb_break_reasons } __mode(byte); enum afs_cb_break_reason { afs_cb_break_reasons } __mode(byte);
enum afs_cb_promise_trace { afs_cb_promise_traces } __mode(byte);
enum afs_cell_trace { afs_cell_traces } __mode(byte); enum afs_cell_trace { afs_cell_traces } __mode(byte);
enum afs_dir_invalid_trace { afs_dir_invalid_traces} __mode(byte);
enum afs_edit_dir_op { afs_edit_dir_ops } __mode(byte); enum afs_edit_dir_op { afs_edit_dir_ops } __mode(byte);
enum afs_edit_dir_reason { afs_edit_dir_reasons } __mode(byte); enum afs_edit_dir_reason { afs_edit_dir_reasons } __mode(byte);
enum afs_eproto_cause { afs_eproto_causes } __mode(byte); enum afs_eproto_cause { afs_eproto_causes } __mode(byte);
@ -498,6 +542,7 @@ enum afs_flock_operation { afs_flock_operations } __mode(byte);
enum afs_io_error { afs_io_errors } __mode(byte); enum afs_io_error { afs_io_errors } __mode(byte);
enum afs_rotate_trace { afs_rotate_traces } __mode(byte); enum afs_rotate_trace { afs_rotate_traces } __mode(byte);
enum afs_server_trace { afs_server_traces } __mode(byte); enum afs_server_trace { afs_server_traces } __mode(byte);
enum afs_vnode_invalid_trace { afs_vnode_invalid_traces} __mode(byte);
enum afs_volume_trace { afs_volume_traces } __mode(byte); enum afs_volume_trace { afs_volume_traces } __mode(byte);
#endif /* end __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY */ #endif /* end __AFS_GENERATE_TRACE_ENUMS_ONCE_ONLY */
@ -513,8 +558,10 @@ enum afs_volume_trace { afs_volume_traces } __mode(byte);
afs_alist_traces; afs_alist_traces;
afs_call_traces; afs_call_traces;
afs_cb_break_reasons; afs_cb_break_reasons;
afs_cb_promise_traces;
afs_cell_traces; afs_cell_traces;
afs_cm_operations; afs_cm_operations;
afs_dir_invalid_traces;
afs_edit_dir_ops; afs_edit_dir_ops;
afs_edit_dir_reasons; afs_edit_dir_reasons;
afs_eproto_causes; afs_eproto_causes;
@ -526,6 +573,7 @@ afs_fs_operations;
afs_io_errors; afs_io_errors;
afs_rotate_traces; afs_rotate_traces;
afs_server_traces; afs_server_traces;
afs_vnode_invalid_traces;
afs_vl_operations; afs_vl_operations;
yfs_cm_operations; yfs_cm_operations;
@ -670,7 +718,7 @@ TRACE_EVENT(afs_make_fs_call,
} }
), ),
TP_printk("c=%08x %06llx:%06llx:%06x %s", TP_printk("c=%08x V=%llx i=%llx:%x %s",
__entry->call, __entry->call,
__entry->fid.vid, __entry->fid.vid,
__entry->fid.vnode, __entry->fid.vnode,
@ -704,7 +752,7 @@ TRACE_EVENT(afs_make_fs_calli,
} }
), ),
TP_printk("c=%08x %06llx:%06llx:%06x %s i=%u", TP_printk("c=%08x V=%llx i=%llx:%x %s i=%u",
__entry->call, __entry->call,
__entry->fid.vid, __entry->fid.vid,
__entry->fid.vnode, __entry->fid.vnode,
@ -741,7 +789,7 @@ TRACE_EVENT(afs_make_fs_call1,
__entry->name[__len] = 0; __entry->name[__len] = 0;
), ),
TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\"", TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\"",
__entry->call, __entry->call,
__entry->fid.vid, __entry->fid.vid,
__entry->fid.vnode, __entry->fid.vnode,
@ -782,7 +830,7 @@ TRACE_EVENT(afs_make_fs_call2,
__entry->name2[__len2] = 0; __entry->name2[__len2] = 0;
), ),
TP_printk("c=%08x %06llx:%06llx:%06x %s \"%s\" \"%s\"", TP_printk("c=%08x V=%llx i=%llx:%x %s \"%s\" \"%s\"",
__entry->call, __entry->call,
__entry->fid.vid, __entry->fid.vid,
__entry->fid.vnode, __entry->fid.vnode,
@ -887,9 +935,9 @@ TRACE_EVENT(afs_sent_data,
); );
TRACE_EVENT(afs_dir_check_failed, TRACE_EVENT(afs_dir_check_failed,
TP_PROTO(struct afs_vnode *vnode, loff_t off, loff_t i_size), TP_PROTO(struct afs_vnode *vnode, loff_t off),
TP_ARGS(vnode, off, i_size), TP_ARGS(vnode, off),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(struct afs_vnode *, vnode) __field(struct afs_vnode *, vnode)
@ -900,7 +948,7 @@ TRACE_EVENT(afs_dir_check_failed,
TP_fast_assign( TP_fast_assign(
__entry->vnode = vnode; __entry->vnode = vnode;
__entry->off = off; __entry->off = off;
__entry->i_size = i_size; __entry->i_size = i_size_read(&vnode->netfs.inode);
), ),
TP_printk("vn=%p %llx/%llx", TP_printk("vn=%p %llx/%llx",
@ -1002,7 +1050,7 @@ TRACE_EVENT(afs_edit_dir,
__entry->name[__len] = 0; __entry->name[__len] = 0;
), ),
TP_printk("d=%x:%x %s %s %u[%u] f=%x:%x \"%s\"", TP_printk("di=%x:%x %s %s %u[%u] fi=%x:%x \"%s\"",
__entry->vnode, __entry->unique, __entry->vnode, __entry->unique,
__print_symbolic(__entry->why, afs_edit_dir_reasons), __print_symbolic(__entry->why, afs_edit_dir_reasons),
__print_symbolic(__entry->op, afs_edit_dir_ops), __print_symbolic(__entry->op, afs_edit_dir_ops),
@ -1011,6 +1059,122 @@ TRACE_EVENT(afs_edit_dir,
__entry->name) __entry->name)
); );
TRACE_EVENT(afs_dir_invalid,
TP_PROTO(const struct afs_vnode *dvnode, enum afs_dir_invalid_trace trace),
TP_ARGS(dvnode, trace),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(enum afs_dir_invalid_trace, trace)
),
TP_fast_assign(
__entry->vnode = dvnode->fid.vnode;
__entry->unique = dvnode->fid.unique;
__entry->trace = trace;
),
TP_printk("di=%x:%x %s",
__entry->vnode, __entry->unique,
__print_symbolic(__entry->trace, afs_dir_invalid_traces))
);
TRACE_EVENT(afs_cb_promise,
TP_PROTO(const struct afs_vnode *vnode, enum afs_cb_promise_trace trace),
TP_ARGS(vnode, trace),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(enum afs_cb_promise_trace, trace)
),
TP_fast_assign(
__entry->vnode = vnode->fid.vnode;
__entry->unique = vnode->fid.unique;
__entry->trace = trace;
),
TP_printk("di=%x:%x %s",
__entry->vnode, __entry->unique,
__print_symbolic(__entry->trace, afs_cb_promise_traces))
);
TRACE_EVENT(afs_vnode_invalid,
TP_PROTO(const struct afs_vnode *vnode, enum afs_vnode_invalid_trace trace),
TP_ARGS(vnode, trace),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(enum afs_vnode_invalid_trace, trace)
),
TP_fast_assign(
__entry->vnode = vnode->fid.vnode;
__entry->unique = vnode->fid.unique;
__entry->trace = trace;
),
TP_printk("di=%x:%x %s",
__entry->vnode, __entry->unique,
__print_symbolic(__entry->trace, afs_vnode_invalid_traces))
);
TRACE_EVENT(afs_set_dv,
TP_PROTO(const struct afs_vnode *dvnode, u64 new_dv),
TP_ARGS(dvnode, new_dv),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(u64, old_dv)
__field(u64, new_dv)
),
TP_fast_assign(
__entry->vnode = dvnode->fid.vnode;
__entry->unique = dvnode->fid.unique;
__entry->old_dv = dvnode->status.data_version;
__entry->new_dv = new_dv;
),
TP_printk("di=%x:%x dv=%llx -> dv=%llx",
__entry->vnode, __entry->unique,
__entry->old_dv, __entry->new_dv)
);
TRACE_EVENT(afs_dv_mismatch,
TP_PROTO(const struct afs_vnode *dvnode, u64 before_dv, int delta, u64 new_dv),
TP_ARGS(dvnode, before_dv, delta, new_dv),
TP_STRUCT__entry(
__field(unsigned int, vnode)
__field(unsigned int, unique)
__field(int, delta)
__field(u64, before_dv)
__field(u64, new_dv)
),
TP_fast_assign(
__entry->vnode = dvnode->fid.vnode;
__entry->unique = dvnode->fid.unique;
__entry->delta = delta;
__entry->before_dv = before_dv;
__entry->new_dv = new_dv;
),
TP_printk("di=%x:%x xdv=%llx+%d dv=%llx",
__entry->vnode, __entry->unique,
__entry->before_dv, __entry->delta, __entry->new_dv)
);
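Each TRACE_EVENT() above generates a trace_<name>() helper; call sites in fs/afs would look roughly like the sketch below, where the surrounding function and variables are placeholders rather than code from this patch (and the afs trace header is assumed to be included).

static void example_after_dir_edit(struct afs_vnode *dvnode,
				   u64 before_dv, u64 new_dv)
{
	/* Illustrative call sites only; the real ones live in fs/afs. */
	trace_afs_set_dv(dvnode, new_dv);
	if (new_dv != before_dv + 1) {
		trace_afs_dv_mismatch(dvnode, before_dv, 1, new_dv);
		trace_afs_dir_invalid(dvnode, afs_dir_invalid_dv_mismatch);
	}
}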
TRACE_EVENT(afs_protocol_error, TRACE_EVENT(afs_protocol_error,
TP_PROTO(struct afs_call *call, enum afs_eproto_cause cause), TP_PROTO(struct afs_call *call, enum afs_eproto_cause cause),
@ -1611,6 +1775,36 @@ TRACE_EVENT(afs_make_call,
__entry->fid.unique) __entry->fid.unique)
); );
TRACE_EVENT(afs_read_recv,
TP_PROTO(const struct afs_operation *op, const struct afs_call *call),
TP_ARGS(op, call),
TP_STRUCT__entry(
__field(unsigned int, rreq)
__field(unsigned int, sreq)
__field(unsigned int, op)
__field(unsigned int, op_flags)
__field(unsigned int, call)
__field(enum afs_call_state, call_state)
),
TP_fast_assign(
__entry->op = op->debug_id;
__entry->sreq = op->fetch.subreq->debug_index;
__entry->rreq = op->fetch.subreq->rreq->debug_id;
__entry->op_flags = op->flags;
__entry->call = call->debug_id;
__entry->call_state = call->state;
),
TP_printk("R=%08x[%x] OP=%08x c=%08x cs=%x of=%x",
__entry->rreq, __entry->sreq,
__entry->op,
__entry->call, __entry->call_state,
__entry->op_flags)
);
#endif /* _TRACE_AFS_H */ #endif /* _TRACE_AFS_H */
/* This part must be outside protection */ /* This part must be outside protection */

View File

@ -380,10 +380,11 @@ TRACE_EVENT(cachefiles_rename,
TRACE_EVENT(cachefiles_coherency, TRACE_EVENT(cachefiles_coherency,
TP_PROTO(struct cachefiles_object *obj, TP_PROTO(struct cachefiles_object *obj,
ino_t ino, ino_t ino,
u64 disk_aux,
enum cachefiles_content content, enum cachefiles_content content,
enum cachefiles_coherency_trace why), enum cachefiles_coherency_trace why),
TP_ARGS(obj, ino, content, why), TP_ARGS(obj, ino, disk_aux, content, why),
/* Note that obj may be NULL */ /* Note that obj may be NULL */
TP_STRUCT__entry( TP_STRUCT__entry(
@ -391,6 +392,8 @@ TRACE_EVENT(cachefiles_coherency,
__field(enum cachefiles_coherency_trace, why ) __field(enum cachefiles_coherency_trace, why )
__field(enum cachefiles_content, content ) __field(enum cachefiles_content, content )
__field(u64, ino ) __field(u64, ino )
__field(u64, aux )
__field(u64, disk_aux)
), ),
TP_fast_assign( TP_fast_assign(
@ -398,13 +401,17 @@ TRACE_EVENT(cachefiles_coherency,
__entry->why = why; __entry->why = why;
__entry->content = content; __entry->content = content;
__entry->ino = ino; __entry->ino = ino;
__entry->aux = be64_to_cpup((__be64 *)obj->cookie->inline_aux);
__entry->disk_aux = disk_aux;
), ),
TP_printk("o=%08x %s B=%llx c=%u", TP_printk("o=%08x %s B=%llx c=%u aux=%llx dsk=%llx",
__entry->obj, __entry->obj,
__print_symbolic(__entry->why, cachefiles_coherency_traces), __print_symbolic(__entry->why, cachefiles_coherency_traces),
__entry->ino, __entry->ino,
__entry->content) __entry->content,
__entry->aux,
__entry->disk_aux)
); );
TRACE_EVENT(cachefiles_vol_coherency, TRACE_EVENT(cachefiles_vol_coherency,

View File

@ -21,6 +21,7 @@
EM(netfs_read_trace_readahead, "READAHEAD") \ EM(netfs_read_trace_readahead, "READAHEAD") \
EM(netfs_read_trace_readpage, "READPAGE ") \ EM(netfs_read_trace_readpage, "READPAGE ") \
EM(netfs_read_trace_read_gaps, "READ-GAPS") \ EM(netfs_read_trace_read_gaps, "READ-GAPS") \
EM(netfs_read_trace_read_single, "READ-SNGL") \
EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \
E_(netfs_read_trace_write_begin, "WRITEBEGN") E_(netfs_read_trace_write_begin, "WRITEBEGN")
@ -35,9 +36,11 @@
EM(NETFS_READAHEAD, "RA") \ EM(NETFS_READAHEAD, "RA") \
EM(NETFS_READPAGE, "RP") \ EM(NETFS_READPAGE, "RP") \
EM(NETFS_READ_GAPS, "RG") \ EM(NETFS_READ_GAPS, "RG") \
EM(NETFS_READ_SINGLE, "R1") \
EM(NETFS_READ_FOR_WRITE, "RW") \ EM(NETFS_READ_FOR_WRITE, "RW") \
EM(NETFS_DIO_READ, "DR") \ EM(NETFS_DIO_READ, "DR") \
EM(NETFS_WRITEBACK, "WB") \ EM(NETFS_WRITEBACK, "WB") \
EM(NETFS_WRITEBACK_SINGLE, "W1") \
EM(NETFS_WRITETHROUGH, "WT") \ EM(NETFS_WRITETHROUGH, "WT") \
EM(NETFS_UNBUFFERED_WRITE, "UW") \ EM(NETFS_UNBUFFERED_WRITE, "UW") \
EM(NETFS_DIO_WRITE, "DW") \ EM(NETFS_DIO_WRITE, "DW") \
@ -47,17 +50,23 @@
EM(netfs_rreq_trace_assess, "ASSESS ") \ EM(netfs_rreq_trace_assess, "ASSESS ") \
EM(netfs_rreq_trace_copy, "COPY ") \ EM(netfs_rreq_trace_copy, "COPY ") \
EM(netfs_rreq_trace_collect, "COLLECT") \ EM(netfs_rreq_trace_collect, "COLLECT") \
EM(netfs_rreq_trace_complete, "COMPLET") \
EM(netfs_rreq_trace_dirty, "DIRTY ") \
EM(netfs_rreq_trace_done, "DONE ") \ EM(netfs_rreq_trace_done, "DONE ") \
EM(netfs_rreq_trace_free, "FREE ") \ EM(netfs_rreq_trace_free, "FREE ") \
EM(netfs_rreq_trace_redirty, "REDIRTY") \ EM(netfs_rreq_trace_redirty, "REDIRTY") \
EM(netfs_rreq_trace_resubmit, "RESUBMT") \ EM(netfs_rreq_trace_resubmit, "RESUBMT") \
EM(netfs_rreq_trace_set_abandon, "S-ABNDN") \
EM(netfs_rreq_trace_set_pause, "PAUSE ") \ EM(netfs_rreq_trace_set_pause, "PAUSE ") \
EM(netfs_rreq_trace_unlock, "UNLOCK ") \ EM(netfs_rreq_trace_unlock, "UNLOCK ") \
EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \ EM(netfs_rreq_trace_unlock_pgpriv2, "UNLCK-2") \
EM(netfs_rreq_trace_unmark, "UNMARK ") \ EM(netfs_rreq_trace_unmark, "UNMARK ") \
EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \ EM(netfs_rreq_trace_wait_ip, "WAIT-IP") \
EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \ EM(netfs_rreq_trace_wait_pause, "WT-PAUS") \
EM(netfs_rreq_trace_wait_queue, "WAIT-Q ") \
EM(netfs_rreq_trace_wake_ip, "WAKE-IP") \ EM(netfs_rreq_trace_wake_ip, "WAKE-IP") \
EM(netfs_rreq_trace_wake_queue, "WAKE-Q ") \
EM(netfs_rreq_trace_woke_queue, "WOKE-Q ") \
EM(netfs_rreq_trace_unpause, "UNPAUSE") \ EM(netfs_rreq_trace_unpause, "UNPAUSE") \
E_(netfs_rreq_trace_write_done, "WR-DONE") E_(netfs_rreq_trace_write_done, "WR-DONE")
@ -74,6 +83,9 @@
#define netfs_sreq_traces \ #define netfs_sreq_traces \
EM(netfs_sreq_trace_add_donations, "+DON ") \ EM(netfs_sreq_trace_add_donations, "+DON ") \
EM(netfs_sreq_trace_added, "ADD ") \ EM(netfs_sreq_trace_added, "ADD ") \
EM(netfs_sreq_trace_cache_nowrite, "CA-NW") \
EM(netfs_sreq_trace_cache_prepare, "CA-PR") \
EM(netfs_sreq_trace_cache_write, "CA-WR") \
EM(netfs_sreq_trace_clear, "CLEAR") \ EM(netfs_sreq_trace_clear, "CLEAR") \
EM(netfs_sreq_trace_discard, "DSCRD") \ EM(netfs_sreq_trace_discard, "DSCRD") \
EM(netfs_sreq_trace_donate_to_prev, "DON-P") \ EM(netfs_sreq_trace_donate_to_prev, "DON-P") \
@ -84,6 +96,8 @@
EM(netfs_sreq_trace_hit_eof, "EOF ") \ EM(netfs_sreq_trace_hit_eof, "EOF ") \
EM(netfs_sreq_trace_io_progress, "IO ") \ EM(netfs_sreq_trace_io_progress, "IO ") \
EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_limited, "LIMIT") \
EM(netfs_sreq_trace_partial_read, "PARTR") \
EM(netfs_sreq_trace_need_retry, "NRTRY") \
EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prepare, "PREP ") \
EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \
EM(netfs_sreq_trace_progress, "PRGRS") \ EM(netfs_sreq_trace_progress, "PRGRS") \
@ -152,6 +166,7 @@
EM(netfs_streaming_filled_page, "mod-streamw-f") \ EM(netfs_streaming_filled_page, "mod-streamw-f") \
EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \ EM(netfs_streaming_cont_filled_page, "mod-streamw-f+") \
EM(netfs_folio_trace_abandon, "abandon") \ EM(netfs_folio_trace_abandon, "abandon") \
EM(netfs_folio_trace_alloc_buffer, "alloc-buf") \
EM(netfs_folio_trace_cancel_copy, "cancel-copy") \ EM(netfs_folio_trace_cancel_copy, "cancel-copy") \
EM(netfs_folio_trace_cancel_store, "cancel-store") \ EM(netfs_folio_trace_cancel_store, "cancel-store") \
EM(netfs_folio_trace_clear, "clear") \ EM(netfs_folio_trace_clear, "clear") \
@ -168,6 +183,7 @@
EM(netfs_folio_trace_mkwrite, "mkwrite") \ EM(netfs_folio_trace_mkwrite, "mkwrite") \
EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \ EM(netfs_folio_trace_mkwrite_plus, "mkwrite+") \
EM(netfs_folio_trace_not_under_wback, "!wback") \ EM(netfs_folio_trace_not_under_wback, "!wback") \
EM(netfs_folio_trace_not_locked, "!locked") \
EM(netfs_folio_trace_put, "put") \ EM(netfs_folio_trace_put, "put") \
EM(netfs_folio_trace_read, "read") \ EM(netfs_folio_trace_read, "read") \
EM(netfs_folio_trace_read_done, "read-done") \ EM(netfs_folio_trace_read_done, "read-done") \
@ -191,6 +207,14 @@
EM(netfs_trace_donate_to_next, "to-next") \ EM(netfs_trace_donate_to_next, "to-next") \
E_(netfs_trace_donate_to_deferred_next, "defer-next") E_(netfs_trace_donate_to_deferred_next, "defer-next")
#define netfs_folioq_traces \
EM(netfs_trace_folioq_alloc_buffer, "alloc-buf") \
EM(netfs_trace_folioq_clear, "clear") \
EM(netfs_trace_folioq_delete, "delete") \
EM(netfs_trace_folioq_make_space, "make-space") \
EM(netfs_trace_folioq_rollbuf_init, "roll-init") \
E_(netfs_trace_folioq_read_progress, "r-progress")
#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
@ -209,6 +233,7 @@ enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte);
enum netfs_folio_trace { netfs_folio_traces } __mode(byte); enum netfs_folio_trace { netfs_folio_traces } __mode(byte);
enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte); enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte);
enum netfs_donate_trace { netfs_donate_traces } __mode(byte); enum netfs_donate_trace { netfs_donate_traces } __mode(byte);
enum netfs_folioq_trace { netfs_folioq_traces } __mode(byte);
#endif #endif
@ -232,6 +257,7 @@ netfs_sreq_ref_traces;
netfs_folio_traces; netfs_folio_traces;
netfs_collect_contig_traces; netfs_collect_contig_traces;
netfs_donate_traces; netfs_donate_traces;
netfs_folioq_traces;
/* /*
* Now redefine the EM() and E_() macros to map the enums to the strings that * Now redefine the EM() and E_() macros to map the enums to the strings that
@ -317,6 +343,7 @@ TRACE_EVENT(netfs_sreq,
__field(unsigned short, flags ) __field(unsigned short, flags )
__field(enum netfs_io_source, source ) __field(enum netfs_io_source, source )
__field(enum netfs_sreq_trace, what ) __field(enum netfs_sreq_trace, what )
__field(u8, slot )
__field(size_t, len ) __field(size_t, len )
__field(size_t, transferred ) __field(size_t, transferred )
__field(loff_t, start ) __field(loff_t, start )
@ -332,15 +359,16 @@ TRACE_EVENT(netfs_sreq,
__entry->len = sreq->len; __entry->len = sreq->len;
__entry->transferred = sreq->transferred; __entry->transferred = sreq->transferred;
__entry->start = sreq->start; __entry->start = sreq->start;
__entry->slot = sreq->io_iter.folioq_slot;
), ),
TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx e=%d", TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx s=%u e=%d",
__entry->rreq, __entry->index, __entry->rreq, __entry->index,
__print_symbolic(__entry->source, netfs_sreq_sources), __print_symbolic(__entry->source, netfs_sreq_sources),
__print_symbolic(__entry->what, netfs_sreq_traces), __print_symbolic(__entry->what, netfs_sreq_traces),
__entry->flags, __entry->flags,
__entry->start, __entry->transferred, __entry->len, __entry->start, __entry->transferred, __entry->len,
__entry->error) __entry->slot, __entry->error)
); );
TRACE_EVENT(netfs_failure, TRACE_EVENT(netfs_failure,
@ -680,69 +708,27 @@ TRACE_EVENT(netfs_collect_stream,
__entry->collected_to, __entry->front) __entry->collected_to, __entry->front)
); );
TRACE_EVENT(netfs_progress, TRACE_EVENT(netfs_folioq,
TP_PROTO(const struct netfs_io_subrequest *subreq, TP_PROTO(const struct folio_queue *fq,
unsigned long long start, size_t avail, size_t part), enum netfs_folioq_trace trace),
TP_ARGS(subreq, start, avail, part), TP_ARGS(fq, trace),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(unsigned int, rreq) __field(unsigned int, rreq)
__field(unsigned int, subreq) __field(unsigned int, id)
__field(unsigned int, consumed) __field(enum netfs_folioq_trace, trace)
__field(unsigned int, transferred)
__field(unsigned long long, f_start)
__field(unsigned int, f_avail)
__field(unsigned int, f_part)
__field(unsigned char, slot)
), ),
TP_fast_assign( TP_fast_assign(
__entry->rreq = subreq->rreq->debug_id; __entry->rreq = fq ? fq->rreq_id : 0;
__entry->subreq = subreq->debug_index; __entry->id = fq ? fq->debug_id : 0;
__entry->consumed = subreq->consumed;
__entry->transferred = subreq->transferred;
__entry->f_start = start;
__entry->f_avail = avail;
__entry->f_part = part;
__entry->slot = subreq->curr_folioq_slot;
),
TP_printk("R=%08x[%02x] s=%llx ct=%x/%x pa=%x/%x sl=%x",
__entry->rreq, __entry->subreq, __entry->f_start,
__entry->consumed, __entry->transferred,
__entry->f_part, __entry->f_avail, __entry->slot)
);
TRACE_EVENT(netfs_donate,
TP_PROTO(const struct netfs_io_request *rreq,
const struct netfs_io_subrequest *from,
const struct netfs_io_subrequest *to,
size_t amount,
enum netfs_donate_trace trace),
TP_ARGS(rreq, from, to, amount, trace),
TP_STRUCT__entry(
__field(unsigned int, rreq)
__field(unsigned int, from)
__field(unsigned int, to)
__field(unsigned int, amount)
__field(enum netfs_donate_trace, trace)
),
TP_fast_assign(
__entry->rreq = rreq->debug_id;
__entry->from = from->debug_index;
__entry->to = to ? to->debug_index : -1;
__entry->amount = amount;
__entry->trace = trace; __entry->trace = trace;
), ),
TP_printk("R=%08x[%02x] -> [%02x] %s am=%x", TP_printk("R=%08x fq=%x %s",
__entry->rreq, __entry->from, __entry->to, __entry->rreq, __entry->id,
__print_symbolic(__entry->trace, netfs_donate_traces), __print_symbolic(__entry->trace, netfs_folioq_traces))
__entry->amount)
); );
#undef EM #undef EM

View File

@ -14,37 +14,56 @@
#include <linux/types.h> #include <linux/types.h>
/**
* struct fiemap_extent - description of one fiemap extent
* @fe_logical: byte offset of the extent in the file
* @fe_physical: byte offset of extent on disk
* @fe_length: length in bytes for this extent
* @fe_flags: FIEMAP_EXTENT_* flags for this extent
*/
struct fiemap_extent { struct fiemap_extent {
__u64 fe_logical; /* logical offset in bytes for the start of __u64 fe_logical;
* the extent from the beginning of the file */ __u64 fe_physical;
__u64 fe_physical; /* physical offset in bytes for the start __u64 fe_length;
* of the extent from the beginning of the disk */ /* private: */
__u64 fe_length; /* length in bytes for this extent */
__u64 fe_reserved64[2]; __u64 fe_reserved64[2];
__u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ /* public: */
__u32 fe_flags;
/* private: */
__u32 fe_reserved[3]; __u32 fe_reserved[3];
}; };
/**
* struct fiemap - file extent mappings
* @fm_start: byte offset (inclusive) at which to start mapping (in)
* @fm_length: logical length of mapping which userspace wants (in)
* @fm_flags: FIEMAP_FLAG_* flags for request (in/out)
* @fm_mapped_extents: number of extents that were mapped (out)
* @fm_extent_count: size of fm_extents array (in)
* @fm_extents: array of mapped extents (out)
*/
struct fiemap { struct fiemap {
__u64 fm_start; /* logical offset (inclusive) at __u64 fm_start;
* which to start mapping (in) */ __u64 fm_length;
__u64 fm_length; /* logical length of mapping which __u32 fm_flags;
* userspace wants (in) */ __u32 fm_mapped_extents;
__u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ __u32 fm_extent_count;
__u32 fm_mapped_extents;/* number of extents that were mapped (out) */ /* private: */
__u32 fm_extent_count; /* size of fm_extents array (in) */
__u32 fm_reserved; __u32 fm_reserved;
struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */ /* public: */
struct fiemap_extent fm_extents[];
}; };
#define FIEMAP_MAX_OFFSET (~0ULL) #define FIEMAP_MAX_OFFSET (~0ULL)
/* flags used in fm_flags: */
#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
#define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */ #define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */
#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
/* flags used in fe_flags: */
#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ #define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */
#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ #define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */
#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. #define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending.
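The kernel-doc conversion above does not change the ABI; for reference, a minimal userspace caller of the FIEMAP ioctl might look like this (error handling trimmed, extent budget chosen arbitrarily):

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FS_IOC_FIEMAP */
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	unsigned int i, n = 32;	/* arbitrary extent budget for this sketch */
	struct fiemap *fm;
	int fd;

	if (argc < 2)
		return 1;
	fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
	fd = open(argv[1], O_RDONLY);

	fm->fm_start	    = 0;
	fm->fm_length	    = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags	    = FIEMAP_FLAG_SYNC;
	fm->fm_extent_count = n;

	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
		for (i = 0; i < fm->fm_mapped_extents; i++)
			printf("log=%llu phys=%llu len=%llu flags=%x\n",
			       (unsigned long long)fm->fm_extents[i].fe_logical,
			       (unsigned long long)fm->fm_extents[i].fe_physical,
			       (unsigned long long)fm->fm_extents[i].fe_length,
			       fm->fm_extents[i].fe_flags);
	free(fm);
	return 0;
}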

View File

@ -476,56 +476,6 @@ void abort_creds(struct cred *new)
} }
EXPORT_SYMBOL(abort_creds); EXPORT_SYMBOL(abort_creds);
/**
* override_creds - Override the current process's subjective credentials
* @new: The credentials to be assigned
*
* Install a set of temporary override subjective credentials on the current
* process, returning the old set for later reversion.
*/
const struct cred *override_creds(const struct cred *new)
{
const struct cred *old;
kdebug("override_creds(%p{%ld})", new,
atomic_long_read(&new->usage));
/*
* NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
*
* That means that we do not clear the 'non_rcu' flag, since
* we are only installing the cred into the thread-synchronous
* '->cred' pointer, not the '->real_cred' pointer that is
* visible to other threads under RCU.
*/
get_new_cred((struct cred *)new);
old = override_creds_light(new);
kdebug("override_creds() = %p{%ld}", old,
atomic_long_read(&old->usage));
return old;
}
EXPORT_SYMBOL(override_creds);
/**
* revert_creds - Revert a temporary subjective credentials override
* @old: The credentials to be restored
*
* Revert a temporary set of override subjective credentials to an old set,
* discarding the override set.
*/
void revert_creds(const struct cred *old)
{
const struct cred *override = current->cred;
kdebug("revert_creds(%p{%ld})", old,
atomic_long_read(&old->usage));
revert_creds_light(old);
put_cred(override);
}
EXPORT_SYMBOL(revert_creds);
/** /**
* cred_fscmp - Compare two credentials with respect to filesystem access. * cred_fscmp - Compare two credentials with respect to filesystem access.
* @a: The first credential * @a: The first credential

View File

@ -89,6 +89,7 @@ find $cpio_dir -type f -print0 |
# Create archive and try to normalize metadata for reproducibility. # Create archive and try to normalize metadata for reproducibility.
tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \ tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
--exclude=".__afs*" --exclude=".nfs*" \
--owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \ --owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
-I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null

View File

@ -43,6 +43,7 @@
#include <linux/sched/task.h> #include <linux/sched/task.h>
#include <linux/idr.h> #include <linux/idr.h>
#include <linux/pidfs.h> #include <linux/pidfs.h>
#include <linux/seqlock.h>
#include <net/sock.h> #include <net/sock.h>
#include <uapi/linux/pidfd.h> #include <uapi/linux/pidfd.h>
@ -60,15 +61,8 @@ struct pid init_struct_pid = {
}, } }, }
}; };
int pid_max = PID_MAX_DEFAULT; static int pid_max_min = RESERVED_PIDS + 1;
static int pid_max_max = PID_MAX_LIMIT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
/*
* Pseudo filesystems start inode numbering after one. We use Reserved
* PIDs as a natural offset.
*/
static u64 pidfs_ino = RESERVED_PIDS;
/* /*
* PID-map pages start out as NULL, they get allocated upon * PID-map pages start out as NULL, they get allocated upon
@ -87,6 +81,7 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS #ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations, .ns.ops = &pidns_operations,
#endif #endif
.pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif #endif
@ -108,6 +103,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
*/ */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
void put_pid(struct pid *pid) void put_pid(struct pid *pid)
{ {
@ -158,6 +154,7 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr); idr_remove(&ns->idr, upid->nr);
} }
pidfs_remove_pid(pid);
spin_unlock_irqrestore(&pidmap_lock, flags); spin_unlock_irqrestore(&pidmap_lock, flags);
call_rcu(&pid->rcu, delayed_put_pid); call_rcu(&pid->rcu, delayed_put_pid);
@ -193,6 +190,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) { for (i = ns->level; i >= 0; i--) {
int tid = 0; int tid = 0;
int pid_max = READ_ONCE(tmp->pid_max);
if (set_tid_size) { if (set_tid_size) {
tid = set_tid[ns->level - i]; tid = set_tid[ns->level - i];
@ -273,22 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->inodes); INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level; upid = pid->numbers + ns->level;
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock); spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING)) if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock; goto out_unlock;
pid->stashed = NULL; pidfs_add_pid(pid);
pid->ino = ++pidfs_ino;
for ( ; upid >= pid->numbers; --upid) { for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */ /* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr); idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++; upid->ns->pid_allocated++;
} }
spin_unlock_irq(&pidmap_lock); spin_unlock_irq(&pidmap_lock);
idr_preload_end();
return pid; return pid;
out_unlock: out_unlock:
spin_unlock_irq(&pidmap_lock); spin_unlock_irq(&pidmap_lock);
idr_preload_end();
put_pid_ns(ns); put_pid_ns(ns);
out_free: out_free:
@ -644,17 +644,118 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd; return fd;
} }
#ifdef CONFIG_SYSCTL
static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
{
return &task_active_pid_ns(current)->set;
}
static int set_is_seen(struct ctl_table_set *set)
{
return &task_active_pid_ns(current)->set == set;
}
static int pid_table_root_permissions(struct ctl_table_header *head,
const struct ctl_table *table)
{
struct pid_namespace *pidns =
container_of(head->set, struct pid_namespace, set);
int mode = table->mode;
if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
mode = (mode & S_IRWXU) >> 6;
else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
mode = (mode & S_IRWXG) >> 3;
else
mode = mode & S_IROTH;
return (mode << 6) | (mode << 3) | mode;
}
static void pid_table_root_set_ownership(struct ctl_table_header *head,
kuid_t *uid, kgid_t *gid)
{
struct pid_namespace *pidns =
container_of(head->set, struct pid_namespace, set);
kuid_t ns_root_uid;
kgid_t ns_root_gid;
ns_root_uid = make_kuid(pidns->user_ns, 0);
if (uid_valid(ns_root_uid))
*uid = ns_root_uid;
ns_root_gid = make_kgid(pidns->user_ns, 0);
if (gid_valid(ns_root_gid))
*gid = ns_root_gid;
}
static struct ctl_table_root pid_table_root = {
.lookup = pid_table_root_lookup,
.permissions = pid_table_root_permissions,
.set_ownership = pid_table_root_set_ownership,
};
static struct ctl_table pid_table[] = {
{
.procname = "pid_max",
.data = &init_pid_ns.pid_max,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
};
#endif
int register_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
struct ctl_table *tbl;
setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
if (!tbl)
return -ENOMEM;
tbl->data = &pidns->pid_max;
pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
ARRAY_SIZE(pid_table));
if (!pidns->sysctls) {
kfree(tbl);
retire_sysctl_set(&pidns->set);
return -ENOMEM;
}
#endif
return 0;
}
void unregister_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
const struct ctl_table *tbl;
tbl = pidns->sysctls->ctl_table_arg;
unregister_sysctl_table(pidns->sysctls);
retire_sysctl_set(&pidns->set);
kfree(tbl);
#endif
}
void __init pid_idr_init(void) void __init pid_idr_init(void)
{ {
/* Verify no one has done anything silly: */ /* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */ /* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max, init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus())); PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min, pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus()); PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
idr_init(&init_pid_ns.idr); idr_init(&init_pid_ns.idr);
@ -665,6 +766,16 @@ void __init pid_idr_init(void)
NULL); NULL);
} }
static __init int pid_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
/* "kernel" directory will have already been initialized. */
BUG_ON(register_pidns_sysctls(&init_pid_ns));
#endif
return 0;
}
subsys_initcall(pid_namespace_sysctl_init);
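With the sysctl table above registered per pid namespace, kernel.pid_max is scoped to the caller's namespace; a trivial userspace probe, for illustration only, could be:

#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/proc/sys/kernel/pid_max", "r");

	/* Run this in different pid namespaces to compare the values. */
	if (f && fgets(buf, sizeof(buf), f))
		printf("pid_max in this pid namespace: %s", buf);
	if (f)
		fclose(f);
	return 0;
}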
static struct file *__pidfd_fget(struct task_struct *task, int fd) static struct file *__pidfd_fget(struct task_struct *task, int fd)
{ {
struct file *file; struct file *file;

View File

@ -70,6 +70,8 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
} }
static void destroy_pid_namespace_work(struct work_struct *work);
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns) struct pid_namespace *parent_pid_ns)
{ {
@ -105,17 +107,27 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr; goto out_free_idr;
ns->ns.ops = &pidns_operations; ns->ns.ops = &pidns_operations;
ns->pid_max = parent_pid_ns->pid_max;
err = register_pidns_sysctls(ns);
if (err)
goto out_free_inum;
refcount_set(&ns->ns.count, 1); refcount_set(&ns->ns.count, 1);
ns->level = level; ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns); ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns); ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts; ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING; ns->pid_allocated = PIDNS_ADDING;
INIT_WORK(&ns->work, destroy_pid_namespace_work);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif #endif
return ns; return ns;
out_free_inum:
ns_free_inum(&ns->ns);
out_free_idr: out_free_idr:
idr_destroy(&ns->idr); idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns); kmem_cache_free(pid_ns_cachep, ns);
@ -137,12 +149,28 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns) static void destroy_pid_namespace(struct pid_namespace *ns)
{ {
unregister_pidns_sysctls(ns);
ns_free_inum(&ns->ns); ns_free_inum(&ns->ns);
idr_destroy(&ns->idr); idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns); call_rcu(&ns->rcu, delayed_free_pidns);
} }
static void destroy_pid_namespace_work(struct work_struct *work)
{
struct pid_namespace *ns =
container_of(work, struct pid_namespace, work);
do {
struct pid_namespace *parent;
parent = ns->parent;
destroy_pid_namespace(ns);
ns = parent;
} while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
}
struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns) struct user_namespace *user_ns, struct pid_namespace *old_ns)
{ {
@ -155,15 +183,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns) void put_pid_ns(struct pid_namespace *ns)
{ {
struct pid_namespace *parent; if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
schedule_work(&ns->work);
while (ns != &init_pid_ns) {
parent = ns->parent;
if (!refcount_dec_and_test(&ns->ns.count))
break;
destroy_pid_namespace(ns);
ns = parent;
}
} }
EXPORT_SYMBOL_GPL(put_pid_ns); EXPORT_SYMBOL_GPL(put_pid_ns);
@ -274,6 +295,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
next = idr_get_cursor(&pid_ns->idr) - 1; next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next; tmp.data = &next;
tmp.extra2 = &pid_ns->pid_max;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (!ret && write) if (!ret && write)
idr_set_cursor(&pid_ns->idr, next + 1); idr_set_cursor(&pid_ns->idr, next + 1);
@ -281,7 +303,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
return ret; return ret;
} }
extern int pid_max;
static struct ctl_table pid_ns_ctl_table[] = { static struct ctl_table pid_ns_ctl_table[] = {
{ {
.procname = "ns_last_pid", .procname = "ns_last_pid",
@ -289,7 +310,7 @@ static struct ctl_table pid_ns_ctl_table[] = {
.mode = 0666, /* permissions are checked in the handler */ .mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler, .proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO,
.extra2 = &pid_max, .extra2 = &init_pid_ns.pid_max,
}, },
}; };
#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_CHECKPOINT_RESTORE */

View File

@ -1803,15 +1803,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec, .proc_handler = proc_dointvec,
}, },
#endif #endif
{
.procname = "pid_max",
.data = &pid_max,
.maxlen = sizeof (int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &pid_max_min,
.extra2 = &pid_max_max,
},
{ {
.procname = "panic_on_oops", .procname = "panic_on_oops",
.data = &panic_on_oops, .data = &panic_on_oops,

View File

@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i; int i;
/* According to linux/thread.h, pids can be no bigger than 30 bits */ /* According to linux/thread.h, pids can be no bigger than 30 bits */
WARN_ON_ONCE(pid_max > (1 << 30)); WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list) if (!pid_list)

View File

@ -717,8 +717,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */ /* PID filtering */
extern int pid_max;
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid); pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,

Some files were not shown because too many files have changed in this diff.