Merge branch 'fs-next' of linux-next
commit 4f2961328d
@@ -183,4 +183,4 @@ even better as a code comment.
 A good code comment is wonderful, but even better is the comment that didn't
 need to exist because the code was so straightforward as to be obvious;
 organized into small clean and tidy modules, with clear and descriptive names
-for functions and variable, where every line of code has a clear purpose.
+for functions and variables, where every line of code has a clear purpose.
@@ -12,21 +12,10 @@ returns a list of extents.
 Request Basics
 --------------
 
-A fiemap request is encoded within struct fiemap::
-
-  struct fiemap {
-        __u64 fm_start;          /* logical offset (inclusive) at
-                                  * which to start mapping (in) */
-        __u64 fm_length;         /* logical length of mapping which
-                                  * userspace cares about (in) */
-        __u32 fm_flags;          /* FIEMAP_FLAG_* flags for request (in/out) */
-        __u32 fm_mapped_extents; /* number of extents that were
-                                  * mapped (out) */
-        __u32 fm_extent_count;   /* size of fm_extents array (in) */
-        __u32 fm_reserved;
-        struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
-  };
+A fiemap request is encoded within struct fiemap:
+
+.. kernel-doc:: include/uapi/linux/fiemap.h
+   :identifiers: fiemap
 
 fm_start, and fm_length specify the logical range within the file
 which the process would like mappings for. Extents returned mirror
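A fiemap request is issued with the FS_IOC_FIEMAP ioctl. As a minimal
userspace sketch (illustrative only, not part of this patch; the 32-extent
buffer size is an arbitrary assumption)::

  /* Print the extents of the file named in argv[1] (illustrative). */
  #include <stdio.h>
  #include <stdlib.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/fs.h>
  #include <linux/fiemap.h>

  int main(int argc, char **argv)
  {
          unsigned int max_extents = 32, i;
          struct fiemap *fm;
          int fd;

          if (argc < 2)
                  return 1;
          fd = open(argv[1], O_RDONLY);
          if (fd < 0)
                  return 1;

          /* The extent array is allocated immediately after the header. */
          fm = calloc(1, sizeof(*fm) + max_extents * sizeof(struct fiemap_extent));
          fm->fm_start = 0;
          fm->fm_length = FIEMAP_MAX_OFFSET;    /* map the whole file */
          fm->fm_extent_count = max_extents;

          if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
                  return 1;

          for (i = 0; i < fm->fm_mapped_extents; i++)
                  printf("logical %llu physical %llu length %llu flags %#x\n",
                         (unsigned long long)fm->fm_extents[i].fe_logical,
                         (unsigned long long)fm->fm_extents[i].fe_physical,
                         (unsigned long long)fm->fm_extents[i].fe_length,
                         fm->fm_extents[i].fe_flags);
          free(fm);
          close(fd);
          return 0;
  }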
@@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR
 If this flag is set, the extents returned will describe the inodes
 extended attribute lookup tree, instead of its data tree.
 
+FIEMAP_FLAG_CACHE
+  This flag requests caching of the extents.
+
 Extent Mapping
 --------------
@@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST
 flag set (see the next section on extent flags).
 
 Each extent is described by a single fiemap_extent structure as
-returned in fm_extents::
-
-  struct fiemap_extent {
-        __u64 fe_logical;  /* logical offset in bytes for the start of
-                            * the extent */
-        __u64 fe_physical; /* physical offset in bytes for the start
-                            * of the extent */
-        __u64 fe_length;   /* length in bytes for the extent */
-        __u64 fe_reserved64[2];
-        __u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
-        __u32 fe_reserved[3];
-  };
+returned in fm_extents:
+
+.. kernel-doc:: include/uapi/linux/fiemap.h
+   :identifiers: fiemap_extent
 
 All offsets and lengths are in bytes and mirror those on disk. It is valid
 for an extents logical offset to start before the request or its logical
@@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED
 userspace would be highly inefficient, the kernel will try to merge most
 adjacent blocks into 'extents'.
 
+FIEMAP_EXTENT_SHARED
+  This flag is set to request that space be shared with other files.
+
 VFS -> File System Implementation
 ---------------------------------
@@ -191,14 +176,10 @@ each discovered extent::
 		    u64 len);
 
 ->fiemap is passed struct fiemap_extent_info which describes the
-fiemap request::
-
-  struct fiemap_extent_info {
-        unsigned int fi_flags;          /* Flags as passed from user */
-        unsigned int fi_extents_mapped; /* Number of mapped extents */
-        unsigned int fi_extents_max;    /* Size of fiemap_extent array */
-        struct fiemap_extent *fi_extents_start; /* Start of fiemap_extent array */
-  };
+fiemap request:
+
+.. kernel-doc:: include/linux/fiemap.h
+   :identifiers: fiemap_extent_info
 
 It is intended that the file system should not need to access any of this
 structure directly. Filesystem handlers should be tolerant to signals and return
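For orientation (not part of this patch): a filesystem's ->fiemap handler
normally reports each extent through the fiemap_fill_next_extent() helper
rather than touching fiemap_extent_info directly. A rough sketch, in which
struct my_extent and my_next_extent() are hypothetical stand-ins for a
per-filesystem extent lookup::

  /* Hypothetical helpers standing in for the filesystem's extent lookup. */
  struct my_extent { u64 logical, physical, length; bool last; };
  extern int my_next_extent(struct inode *inode, u64 start, u64 len,
                            struct my_extent *ext);

  static int myfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                         u64 start, u64 len)
  {
          struct my_extent ext;
          int ret;

          /* Validate the request; no optional flags supported in this sketch. */
          ret = fiemap_prep(inode, fieinfo, start, &len, 0);
          if (ret)
                  return ret;

          while (my_next_extent(inode, start, len, &ext) > 0) {
                  ret = fiemap_fill_next_extent(fieinfo, ext.logical,
                                                ext.physical, ext.length,
                                                ext.last ? FIEMAP_EXTENT_LAST : 0);
                  if (ret)        /* 1 means no more extents will fit */
                          break;
          }
          return ret < 0 ? ret : 0;
  }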
@@ -218,64 +218,30 @@ NFS Client and Server Interlock
 ===============================
 
 LOCALIO provides the nfs_uuid_t object and associated interfaces to
-allow proper network namespace (net-ns) and NFSD object refcounting:
+allow proper network namespace (net-ns) and NFSD object refcounting.
 
 We don't want to keep a long-term counted reference on each NFSD's
 net-ns in the client because that prevents a server container from
 completely shutting down.
 
 So we avoid taking a reference at all and rely on the per-cpu
 reference to the server (detailed below) being sufficient to keep
 the net-ns active. This involves allowing the NFSD's net-ns exit
 code to iterate all active clients and clear their ->net pointers
 (which are needed to find the per-cpu-refcount for the nfsd_serv).
 
-Details:
-
- - Embed nfs_uuid_t in nfs_client. nfs_uuid_t provides a list_head
-   that can be used to find the client. It does add the 16-byte
-   uuid_t to nfs_client so it is bigger than needed (given that
-   uuid_t is only used during the initial NFS client and server
-   LOCALIO handshake to determine if they are local to each other).
-   If that is really a problem we can find a fix.
-
- - When the nfs server confirms that the uuid_t is local, it moves
-   the nfs_uuid_t onto a per-net-ns list in NFSD's nfsd_net.
-
- - When each server's net-ns is shutting down - in a "pre_exit"
-   handler, all these nfs_uuid_t have their ->net cleared. There is
-   an rcu_synchronize() call between pre_exit() handlers and exit()
-   handlers so any caller that sees nfs_uuid_t ->net as not NULL can
-   safely manage the per-cpu-refcount for nfsd_serv.
-
- - The client's nfs_uuid_t is passed to nfsd_open_local_fh() so it
-   can safely dereference ->net in a private rcu_read_lock() section
-   to allow safe access to the associated nfsd_net and nfsd_serv.
-
-So LOCALIO required the introduction and use of NFSD's percpu_ref to
-interlock nfsd_destroy_serv() and nfsd_open_local_fh(), to ensure each
-nn->nfsd_serv is not destroyed while in use by nfsd_open_local_fh(), and
+LOCALIO required the introduction and use of NFSD's percpu nfsd_net_ref
+to interlock nfsd_shutdown_net() and nfsd_open_local_fh(), to ensure
+each net-ns is not destroyed while in use by nfsd_open_local_fh(), and
 warrants a more detailed explanation:
 
-   nfsd_open_local_fh() uses nfsd_serv_try_get() before opening its
+   nfsd_open_local_fh() uses nfsd_net_try_get() before opening its
    nfsd_file handle and then the caller (NFS client) must drop the
-   reference for the nfsd_file and associated nn->nfsd_serv using
-   nfs_file_put_local() once it has completed its IO.
+   reference for the nfsd_file and associated net-ns using
+   nfsd_file_put_local() once it has completed its IO.
 
 This interlock working relies heavily on nfsd_open_local_fh() being
 afforded the ability to safely deal with the possibility that the
 NFSD's net-ns (and nfsd_net by association) may have been destroyed
-by nfsd_destroy_serv() via nfsd_shutdown_net() -- which is only
-possible given the nfs_uuid_t ->net pointer managemenet detailed
-above.
+by nfsd_destroy_serv() via nfsd_shutdown_net().
 
-All told, this elaborate interlock of the NFS client and server has been
-verified to fix an easy to hit crash that would occur if an NFSD
-instance running in a container, with a LOCALIO client mounted, is
-shutdown. Upon restart of the container and associated NFSD the client
-would go on to crash due to NULL pointer dereference that occurred due
-to the LOCALIO client's attempting to nfsd_open_local_fh(), using
-nn->nfsd_serv, without having a proper reference on nn->nfsd_serv.
+This interlock of the NFS client and server has been verified to fix an
+easy to hit crash that would occur if an NFSD instance running in a
+container, with a LOCALIO client mounted, is shutdown. Upon restart of
+the container and associated NFSD, the client would go on to crash due
+to NULL pointer dereference that occurred due to the LOCALIO client's
+attempting to nfsd_open_local_fh() without having a proper reference on
+NFSD's net-ns.
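The try-get/put pairing described above follows the usual percpu_ref
pattern. A generic sketch (illustrative only; localio_net_ref and the helper
names are made up, and the ref is assumed to have been set up elsewhere with
percpu_ref_init())::

  #include <linux/percpu-refcount.h>

  static struct percpu_ref localio_net_ref;   /* stands in for the nfsd_net ref */

  static bool localio_try_get(void)
  {
          /* Fails once the server side has begun killing the ref at shutdown. */
          return percpu_ref_tryget_live(&localio_net_ref);
  }

  static void localio_put(void)
  {
          percpu_ref_put(&localio_net_ref);
  }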
 NFS Client issues IO instead of Server
 ======================================
@@ -306,10 +272,24 @@ is issuing IO to the underlying local filesystem that it is sharing with
 the NFS server. See: fs/nfs/localio.c:nfs_local_doio() and
 fs/nfs/localio.c:nfs_local_commit().
 
+With normal NFS that makes use of RPC to issue IO to the server, if an
+application uses O_DIRECT the NFS client will bypass the pagecache but
+the NFS server will not. The NFS server's use of buffered IO affords
+applications to be less precise with their alignment when issuing IO to
+the NFS client. But if all applications properly align their IO, LOCALIO
+can be configured to use end-to-end O_DIRECT semantics from the NFS
+client to the underlying local filesystem, that it is sharing with
+the NFS server, by setting the 'localio_O_DIRECT_semantics' nfs module
+parameter to Y, e.g.:
+
+  echo Y > /sys/module/nfs/parameters/localio_O_DIRECT_semantics
+
+Once enabled, it will cause LOCALIO to use end-to-end O_DIRECT semantics
+(but again, this may cause IO to fail if applications do not properly
+align their IO).
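To illustrate the alignment requirement (an illustrative userspace sketch,
not from the patch; 4096 is an assumed logical block size)::

  /* Aligned O_DIRECT read: buffer address, size and offset all aligned. */
  #define _GNU_SOURCE
  #include <stdlib.h>
  #include <fcntl.h>
  #include <unistd.h>

  int read_direct(const char *path)
  {
          void *buf;
          ssize_t n;
          int fd = open(path, O_RDONLY | O_DIRECT);

          if (fd < 0)
                  return -1;
          if (posix_memalign(&buf, 4096, 4096)) { /* aligned buffer */
                  close(fd);
                  return -1;
          }
          n = pread(fd, buf, 4096, 0);            /* aligned size and offset */
          free(buf);
          close(fd);
          return n < 0 ? -1 : 0;
  }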
 Security
 ========
 
-Localio is only supported when UNIX-style authentication (AUTH_UNIX, aka
+LOCALIO is only supported when UNIX-style authentication (AUTH_UNIX, aka
 AUTH_SYS) is used.
 
 Care is taken to ensure the same NFS security mechanisms are used
@@ -324,6 +304,24 @@ client is afforded this same level of access (albeit in terms of the NFS
 protocol via SUNRPC). No other namespaces (user, mount, etc) have been
 altered or purposely extended from the server to the client.
 
+Module Parameters
+=================
+
+/sys/module/nfs/parameters/localio_enabled (bool)
+  controls if LOCALIO is enabled, defaults to Y. If client and server are
+  local but 'localio_enabled' is set to N then LOCALIO will not be used.
+
+/sys/module/nfs/parameters/localio_O_DIRECT_semantics (bool)
+  controls if O_DIRECT extends down to the underlying filesystem, defaults
+  to N. Application IO must be logical blocksize aligned, otherwise
+  O_DIRECT will fail.
+
+/sys/module/nfsv3/parameters/nfs3_localio_probe_throttle (uint)
+  controls if NFSv3 read and write IOs will trigger (re)enabling of
+  LOCALIO every N (nfs3_localio_probe_throttle) IOs, defaults to 0
+  (disabled). Must be power-of-2, admin keeps all the pieces if they
+  misconfigure (too low a value or non-power-of-2).
+
 Testing
 =======
@@ -527,11 +527,6 @@ There are some functions to help manage credentials:
   This gets a reference on a live set of credentials, returning a pointer to
   that set of credentials.
 
- - ``struct cred *get_new_cred(struct cred *cred);``
-
-   This gets a reference on a set of credentials that is under construction
-   and is thus still mutable, returning a pointer to that set of credentials.
-
 Open File Credentials
 =====================
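For context, the surviving get_cred() helper pairs with put_cred(); a minimal
illustrative sketch (not from this patch)::

  #include <linux/cred.h>
  #include <linux/printk.h>

  static void inspect_task_creds(void)
  {
          /* Take a reference on the current task's live credentials. */
          const struct cred *cred = get_cred(current_cred());

          pr_info("running as uid %u\n",
                  from_kuid(&init_user_ns, cred->fsuid));
          put_cred(cred); /* drop the reference when done */
  }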
Documentation/sunrpc/xdr/nfs4_1.x: new file (186 lines)
@@ -0,0 +1,186 @@
/*
 * Copyright (c) 2010 IETF Trust and the persons identified
 * as the document authors. All rights reserved.
 *
 * The document authors are identified in RFC 3530 and
 * RFC 5661.
 *
 * Redistribution and use in source and binary forms, with
 * or without modification, are permitted provided that the
 * following conditions are met:
 *
 * - Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 *
 * - Neither the name of Internet Society, IETF or IETF
 *   Trust, nor the names of specific contributors, may be
 *   used to endorse or promote products derived from this
 *   software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS
 *   AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 *   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *   FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 *   EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 *   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 *   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *   NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 *   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 *   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 *   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 *   IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 *   ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

pragma header nfs4;

/*
 * Basic typedefs for RFC 1832 data type definitions
 */
typedef hyper           int64_t;
typedef unsigned int    uint32_t;

/*
 * Basic data types
 */
typedef uint32_t        bitmap4<>;

/*
 * Timeval
 */
struct nfstime4 {
        int64_t         seconds;
        uint32_t        nseconds;
};


/*
 * The following content was extracted from draft-ietf-nfsv4-delstid
 */

typedef bool            fattr4_offline;


const FATTR4_OFFLINE = 83;


struct open_arguments4 {
        bitmap4         oa_share_access;
        bitmap4         oa_share_deny;
        bitmap4         oa_share_access_want;
        bitmap4         oa_open_claim;
        bitmap4         oa_create_mode;
};


enum open_args_share_access4 {
        OPEN_ARGS_SHARE_ACCESS_READ  = 1,
        OPEN_ARGS_SHARE_ACCESS_WRITE = 2,
        OPEN_ARGS_SHARE_ACCESS_BOTH  = 3
};


enum open_args_share_deny4 {
        OPEN_ARGS_SHARE_DENY_NONE  = 0,
        OPEN_ARGS_SHARE_DENY_READ  = 1,
        OPEN_ARGS_SHARE_DENY_WRITE = 2,
        OPEN_ARGS_SHARE_DENY_BOTH  = 3
};


enum open_args_share_access_want4 {
        OPEN_ARGS_SHARE_ACCESS_WANT_ANY_DELEG = 3,
        OPEN_ARGS_SHARE_ACCESS_WANT_NO_DELEG  = 4,
        OPEN_ARGS_SHARE_ACCESS_WANT_CANCEL    = 5,
        OPEN_ARGS_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL
                                              = 17,
        OPEN_ARGS_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED
                                              = 18,
        OPEN_ARGS_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS = 20,
        OPEN_ARGS_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION = 21
};


enum open_args_open_claim4 {
        OPEN_ARGS_OPEN_CLAIM_NULL          = 0,
        OPEN_ARGS_OPEN_CLAIM_PREVIOUS      = 1,
        OPEN_ARGS_OPEN_CLAIM_DELEGATE_CUR  = 2,
        OPEN_ARGS_OPEN_CLAIM_DELEGATE_PREV = 3,
        OPEN_ARGS_OPEN_CLAIM_FH            = 4,
        OPEN_ARGS_OPEN_CLAIM_DELEG_CUR_FH  = 5,
        OPEN_ARGS_OPEN_CLAIM_DELEG_PREV_FH = 6
};


enum open_args_createmode4 {
        OPEN_ARGS_CREATEMODE_UNCHECKED4    = 0,
        OPEN_ARGS_CREATE_MODE_GUARDED      = 1,
        OPEN_ARGS_CREATEMODE_EXCLUSIVE4    = 2,
        OPEN_ARGS_CREATE_MODE_EXCLUSIVE4_1 = 3
};


typedef open_arguments4 fattr4_open_arguments;
pragma public           fattr4_open_arguments;


%/*
% * Determine what OPEN supports.
% */
const FATTR4_OPEN_ARGUMENTS = 86;


const OPEN4_RESULT_NO_OPEN_STATEID = 0x00000010;


/*
 * attributes for the delegation times being
 * cached and served by the "client"
 */
typedef nfstime4        fattr4_time_deleg_access;
typedef nfstime4        fattr4_time_deleg_modify;
pragma public           fattr4_time_deleg_access;
pragma public           fattr4_time_deleg_modify;


%/*
% * New RECOMMENDED Attribute for
% * delegation caching of times
% */
const FATTR4_TIME_DELEG_ACCESS = 84;
const FATTR4_TIME_DELEG_MODIFY = 85;


/* new flags for share_access field of OPEN4args */
const OPEN4_SHARE_ACCESS_WANT_DELEG_MASK        = 0xFF00;
const OPEN4_SHARE_ACCESS_WANT_NO_PREFERENCE     = 0x0000;
const OPEN4_SHARE_ACCESS_WANT_READ_DELEG        = 0x0100;
const OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG       = 0x0200;
const OPEN4_SHARE_ACCESS_WANT_ANY_DELEG         = 0x0300;
const OPEN4_SHARE_ACCESS_WANT_NO_DELEG          = 0x0400;
const OPEN4_SHARE_ACCESS_WANT_CANCEL            = 0x0500;

const OPEN4_SHARE_ACCESS_WANT_SIGNAL_DELEG_WHEN_RESRC_AVAIL = 0x10000;
const OPEN4_SHARE_ACCESS_WANT_PUSH_DELEG_WHEN_UNCONTENDED   = 0x20000;
const OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS              = 0x100000;
const OPEN4_SHARE_ACCESS_WANT_OPEN_XOR_DELEGATION           = 0x200000;

enum open_delegation_type4 {
        OPEN_DELEGATE_NONE              = 0,
        OPEN_DELEGATE_READ              = 1,
        OPEN_DELEGATE_WRITE             = 2,
        OPEN_DELEGATE_NONE_EXT          = 3, /* new to v4.1 */
        OPEN_DELEGATE_READ_ATTRS_DELEG  = 4,
        OPEN_DELEGATE_WRITE_ATTRS_DELEG = 5
};
@@ -12400,6 +12400,13 @@ F: Documentation/kbuild/kconfig*
 F:	scripts/Kconfig.include
 F:	scripts/kconfig/
 
+KCORE
+M:	Omar Sandoval <osandov@osandov.com>
+L:	linux-debuggers@vger.kernel.org
+S:	Maintained
+F:	fs/proc/kcore.c
+F:	include/linux/kcore.h
+
 KCOV
 R:	Dmitry Vyukov <dvyukov@google.com>
 R:	Andrey Konovalov <andreyknvl@gmail.com>
@@ -249,7 +249,7 @@ static struct file *open_file_as_root(const char *filename, int flags, umode_t m
 	fp = file_open_root(&root, filename, flags, mode);
 	path_put(&root);
 
-	revert_creds(old_cred);
+	put_cred(revert_creds(old_cred));
 
 	return fp;
 }
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config VBOXGUEST
 	tristate "Virtual Box Guest integration support"
-	depends on X86 && PCI && INPUT
+	depends on (ARM64 || X86) && PCI && INPUT
 	help
 	  This is a driver for the Virtual Box Guest PCI device used in
 	  Virtual Box virtual machines. Enabling this driver will add
@@ -81,13 +81,13 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
 		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 	if (pos + total >= i_size_read(rreq->inode))
 		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
 
-	if (!err) {
+	if (!err && total) {
 		subreq->transferred += total;
 		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 	}
 
-	netfs_read_subreq_terminated(subreq, err, false);
+	subreq->error = err;
+	netfs_read_subreq_terminated(subreq);
 }
 
 /**
@@ -11,6 +11,7 @@ kafs-y := \
 	cmservice.o \
 	dir.o \
 	dir_edit.o \
+	dir_search.o \
 	dir_silly.o \
 	dynroot.o \
 	file.o \
@@ -41,7 +41,7 @@ static void afs_volume_init_callback(struct afs_volume *volume)
 
 	list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
 		if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
-			atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
+			afs_clear_cb_promise(vnode, afs_cb_promise_clear_vol_init_cb);
 			queue_work(system_unbound_wq, &vnode->cb_work);
 		}
 	}
@@ -79,7 +79,7 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
 	_enter("");
 
 	clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
-	if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) {
+	if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_cb_break)) {
 		vnode->cb_break++;
 		vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
 		afs_clear_permits(vnode);
@@ -146,18 +146,20 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cell->name = kmalloc(namelen + 1, GFP_KERNEL);
+	cell->name = kmalloc(1 + namelen + 1, GFP_KERNEL);
 	if (!cell->name) {
 		kfree(cell);
 		return ERR_PTR(-ENOMEM);
 	}
 
-	cell->net = net;
+	cell->name[0] = '.';
+	cell->name++;
 	cell->name_len = namelen;
 	for (i = 0; i < namelen; i++)
 		cell->name[i] = tolower(name[i]);
 	cell->name[i] = 0;
 
+	cell->net = net;
 	refcount_set(&cell->ref, 1);
 	atomic_set(&cell->active, 0);
 	INIT_WORK(&cell->manager, afs_manage_cell_work);
@@ -211,7 +213,7 @@ parse_failed:
 	if (ret == -EINVAL)
 		printk(KERN_ERR "kAFS: bad VL server IP address\n");
 error:
-	kfree(cell->name);
+	kfree(cell->name - 1);
 	kfree(cell);
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
@@ -365,6 +367,14 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
 		len = cp - rootcell;
 	}
 
+	if (len == 0 || !rootcell[0] || rootcell[0] == '.' || rootcell[len - 1] == '.')
+		return -EINVAL;
+	if (memchr(rootcell, '/', len))
+		return -EINVAL;
+	cp = strstr(rootcell, "..");
+	if (cp && cp < rootcell + len)
+		return -EINVAL;
+
 	/* allocate a cell record for the root cell */
 	new_root = afs_lookup_cell(net, rootcell, len, vllist, false);
 	if (IS_ERR(new_root)) {
@@ -502,7 +512,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
 	afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
 	afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
 	key_put(cell->anonymous_key);
-	kfree(cell->name);
+	kfree(cell->name - 1);
 	kfree(cell);
 
 	afs_dec_cells_outstanding(net);
@@ -710,7 +720,8 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
 	afs_proc_cell_remove(cell);
 
 	mutex_lock(&net->proc_cells_lock);
-	hlist_del_rcu(&cell->proc_link);
+	if (!hlist_unhashed(&cell->proc_link))
+		hlist_del_rcu(&cell->proc_link);
 	afs_dynroot_rmdir(net, cell);
 	mutex_unlock(&net->proc_cells_lock);
fs/afs/dir.c: 841 lines changed (diff suppressed because it is too large)
@@ -10,6 +10,7 @@
 #include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/iversion.h>
+#include <linux/folio_queue.h>
 #include "internal.h"
 #include "xdr_fs.h"
@@ -105,23 +106,57 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
 }
 
 /*
- * Get a new directory folio.
+ * Get a specific block, extending the directory storage to cover it as needed.
 */
-static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
+static union afs_xdr_dir_block *afs_dir_get_block(struct afs_dir_iter *iter, size_t block)
 {
-	struct address_space *mapping = vnode->netfs.inode.i_mapping;
+	struct folio_queue *fq;
+	struct afs_vnode *dvnode = iter->dvnode;
 	struct folio *folio;
+	size_t blpos = block * AFS_DIR_BLOCK_SIZE;
+	size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos;
+	int ret;
 
-	folio = __filemap_get_folio(mapping, index,
-				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
-				    mapping->gfp_mask);
-	if (IS_ERR(folio)) {
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
-		return NULL;
+	if (dvnode->directory_size < blend) {
+		size_t cur_size = dvnode->directory_size;
+
+		ret = netfs_alloc_folioq_buffer(
+			NULL, &dvnode->directory, &cur_size, blend,
+			mapping_gfp_mask(dvnode->netfs.inode.i_mapping));
+		dvnode->directory_size = cur_size;
+		if (ret < 0)
+			goto fail;
 	}
-	if (!folio_test_private(folio))
-		folio_attach_private(folio, (void *)1);
-	return folio;
+
+	fq = iter->fq;
+	if (!fq)
+		fq = dvnode->directory;
+
+	/* Search the folio queue for the folio containing the block... */
+	for (; fq; fq = fq->next) {
+		for (int s = iter->fq_slot; s < folioq_count(fq); s++) {
+			size_t fsize = folioq_folio_size(fq, s);
+
+			if (blend <= fpos + fsize) {
+				/* ... and then return the mapped block. */
+				folio = folioq_folio(fq, s);
+				if (WARN_ON_ONCE(folio_pos(folio) != fpos))
+					goto fail;
+				iter->fq = fq;
+				iter->fq_slot = s;
+				iter->fpos = fpos;
+				return kmap_local_folio(folio, blpos - fpos);
+			}
+			fpos += fsize;
+		}
+		iter->fq_slot = 0;
+	}
+
+fail:
+	iter->fq = NULL;
+	iter->fq_slot = 0;
+	afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block);
+	return NULL;
 }
 
 /*
@@ -209,9 +244,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 {
 	union afs_xdr_dir_block *meta, *block;
 	union afs_xdr_dirent *de;
-	struct folio *folio0, *folio;
-	unsigned int need_slots, nr_blocks, b;
-	pgoff_t index;
+	struct afs_dir_iter iter = { .dvnode = vnode };
+	unsigned int nr_blocks, b, entry;
 	loff_t i_size;
 	int slot;
 
@@ -220,20 +254,17 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 	i_size = i_size_read(&vnode->netfs.inode);
 	if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
 	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_bad_size);
 		return;
 	}
 
-	folio0 = afs_dir_get_folio(vnode, 0);
-	if (!folio0) {
-		_leave(" [fgp]");
+	meta = afs_dir_get_block(&iter, 0);
+	if (!meta)
 		return;
-	}
 
 	/* Work out how many slots we're going to need. */
-	need_slots = afs_dir_calc_slots(name->len);
+	iter.nr_slots = afs_dir_calc_slots(name->len);
 
-	meta = kmap_local_folio(folio0, 0);
 	if (i_size == 0)
 		goto new_directory;
 	nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
@@ -245,22 +276,21 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 		/* If the directory extended into a new folio, then we need to
 		 * tack a new folio on the end.
 		 */
-		index = b / AFS_DIR_BLOCKS_PER_PAGE;
 		if (nr_blocks >= AFS_DIR_MAX_BLOCKS)
-			goto error;
-		if (index >= folio_nr_pages(folio0)) {
-			folio = afs_dir_get_folio(vnode, index);
-			if (!folio)
-				goto error;
-		} else {
-			folio = folio0;
-		}
+			goto error_too_many_blocks;
 
-		block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
+		/* Lower dir blocks have a counter in the header we can check. */
+		if (b < AFS_DIR_BLOCKS_WITH_CTR &&
+		    meta->meta.alloc_ctrs[b] < iter.nr_slots)
+			continue;
+
+		block = afs_dir_get_block(&iter, b);
+		if (!block)
+			goto error;
 
 		/* Abandon the edit if we got a callback break. */
 		if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-			goto invalidated;
+			goto already_invalidated;
 
 		_debug("block %u: %2u %3u %u",
 		       b,
@@ -275,31 +305,23 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 			afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE);
 		}
 
-		/* Only lower dir blocks have a counter in the header. */
-		if (b >= AFS_DIR_BLOCKS_WITH_CTR ||
-		    meta->meta.alloc_ctrs[b] >= need_slots) {
-			/* We need to try and find one or more consecutive
-			 * slots to hold the entry.
-			 */
-			slot = afs_find_contig_bits(block, need_slots);
-			if (slot >= 0) {
-				_debug("slot %u", slot);
-				goto found_space;
-			}
+		/* We need to try and find one or more consecutive slots to
+		 * hold the entry.
+		 */
+		slot = afs_find_contig_bits(block, iter.nr_slots);
+		if (slot >= 0) {
+			_debug("slot %u", slot);
+			goto found_space;
 		}
 
 		kunmap_local(block);
-		if (folio != folio0) {
-			folio_unlock(folio);
-			folio_put(folio);
-		}
 	}
 
 	/* There are no spare slots of sufficient size, yet the operation
 	 * succeeded.  Download the directory again.
 	 */
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_nospc, 0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+	afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_no_slots);
	goto out_unmap;
 
 new_directory:
@@ -307,8 +329,7 @@ new_directory:
 	i_size = AFS_DIR_BLOCK_SIZE;
 	afs_set_i_size(vnode, i_size);
 	slot = AFS_DIR_RESV_BLOCKS0;
-	folio = folio0;
-	block = kmap_local_folio(folio, 0);
+	block = afs_dir_get_block(&iter, 0);
 	nr_blocks = 1;
 	b = 0;
 
@@ -326,41 +347,39 @@ found_space:
 	de->u.name[name->len] = 0;
 
 	/* Adjust the bitmap. */
-	afs_set_contig_bits(block, slot, need_slots);
-	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
+	afs_set_contig_bits(block, slot, iter.nr_slots);
 
 	/* Adjust the allocation counter. */
 	if (b < AFS_DIR_BLOCKS_WITH_CTR)
-		meta->meta.alloc_ctrs[b] -= need_slots;
+		meta->meta.alloc_ctrs[b] -= iter.nr_slots;
+
+	/* Adjust the hash chain. */
+	entry = b * AFS_DIR_SLOTS_PER_BLOCK + slot;
+	iter.bucket = afs_dir_hash_name(name);
+	de->u.hash_next = meta->meta.hashtable[iter.bucket];
+	meta->meta.hashtable[iter.bucket] = htons(entry);
+	kunmap_local(block);
 
 	inode_inc_iversion_raw(&vnode->netfs.inode);
 	afs_stat_v(vnode, n_dir_cr);
 	_debug("Insert %s in %u[%u]", name->name, b, slot);
 
+	netfs_single_mark_inode_dirty(&vnode->netfs.inode);
+
 out_unmap:
 	kunmap_local(meta);
-	folio_unlock(folio0);
-	folio_put(folio0);
 	_leave("");
 	return;
 
-invalidated:
+already_invalidated:
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
 	goto out_unmap;
 
+error_too_many_blocks:
+	afs_invalidate_dir(vnode, afs_dir_invalid_edit_add_too_many_blocks);
 error:
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_create_error, 0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	goto out_unmap;
 }
 
@@ -374,13 +393,14 @@ error:
 void afs_edit_dir_remove(struct afs_vnode *vnode,
 			 struct qstr *name, enum afs_edit_dir_reason why)
 {
-	union afs_xdr_dir_block *meta, *block;
-	union afs_xdr_dirent *de;
-	struct folio *folio0, *folio;
-	unsigned int need_slots, nr_blocks, b;
-	pgoff_t index;
+	union afs_xdr_dir_block *meta, *block, *pblock;
+	union afs_xdr_dirent *de, *pde;
+	struct afs_dir_iter iter = { .dvnode = vnode };
+	struct afs_fid fid;
+	unsigned int b, slot, entry;
 	loff_t i_size;
-	int slot;
+	__be16 next;
+	int found;
 
 	_enter(",,{%d,%s},", name->len, name->name);
 
@@ -388,81 +408,95 @@ void afs_edit_dir_remove(struct afs_vnode *vnode,
 	if (i_size < AFS_DIR_BLOCK_SIZE ||
 	    i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS ||
 	    (i_size & (AFS_DIR_BLOCK_SIZE - 1))) {
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
-		return;
-	}
-	nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
-
-	folio0 = afs_dir_get_folio(vnode, 0);
-	if (!folio0) {
-		_leave(" [fgp]");
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_bad_size);
 		return;
 	}
 
-	/* Work out how many slots we're going to discard. */
-	need_slots = afs_dir_calc_slots(name->len);
+	if (!afs_dir_init_iter(&iter, name))
+		return;
 
-	meta = kmap_local_folio(folio0, 0);
+	meta = afs_dir_find_block(&iter, 0);
+	if (!meta)
+		return;
 
-	/* Find a block that has sufficient slots available.  Each folio
-	 * contains two or more directory blocks.
-	 */
-	for (b = 0; b < nr_blocks; b++) {
-		index = b / AFS_DIR_BLOCKS_PER_PAGE;
-		if (index >= folio_nr_pages(folio0)) {
-			folio = afs_dir_get_folio(vnode, index);
-			if (!folio)
-				goto error;
-		} else {
-			folio = folio0;
-		}
-
-		block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
-
-		/* Abandon the edit if we got a callback break. */
-		if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-			goto invalidated;
-
-		if (b > AFS_DIR_BLOCKS_WITH_CTR ||
-		    meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) {
-			slot = afs_dir_scan_block(block, name, b);
-			if (slot >= 0)
-				goto found_dirent;
-		}
-
-		kunmap_local(block);
-		if (folio != folio0) {
-			folio_unlock(folio);
-			folio_put(folio);
-		}
+	/* Find the entry in the blob. */
+	found = afs_dir_search_bucket(&iter, name, &fid);
+	if (found < 0) {
+		/* Didn't find the dirent to clobber.  Re-download. */
+		trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent,
+				   0, 0, 0, 0, name->name);
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_rem_wrong_name);
+		goto out_unmap;
 	}
 
-	/* Didn't find the dirent to clobber.  Download the directory again. */
-	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_noent,
-			   0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
-	goto out_unmap;
+	entry = found;
+	b    = entry / AFS_DIR_SLOTS_PER_BLOCK;
+	slot = entry % AFS_DIR_SLOTS_PER_BLOCK;
 
-found_dirent:
+	block = afs_dir_find_block(&iter, b);
+	if (!block)
+		goto error;
+	if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+		goto already_invalidated;
 
 	/* Check and clear the entry. */
 	de = &block->dirents[slot];
 	if (de->u.valid != 1)
 		goto error_unmap;
 
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete, b, slot,
 			   ntohl(de->u.vnode), ntohl(de->u.unique),
 			   name->name);
 
-	memset(de, 0, sizeof(*de) * need_slots);
-
 	/* Adjust the bitmap. */
-	afs_clear_contig_bits(block, slot, need_slots);
-	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
+	afs_clear_contig_bits(block, slot, iter.nr_slots);
 
 	/* Adjust the allocation counter. */
 	if (b < AFS_DIR_BLOCKS_WITH_CTR)
-		meta->meta.alloc_ctrs[b] += need_slots;
+		meta->meta.alloc_ctrs[b] += iter.nr_slots;
+
+	/* Clear the constituent entries. */
+	next = de->u.hash_next;
+	memset(de, 0, sizeof(*de) * iter.nr_slots);
+	kunmap_local(block);
+
+	/* Adjust the hash chain: if iter->prev_entry is 0, the hashtable head
+	 * index is previous; otherwise it's slot number of the previous entry.
+	 */
+	if (!iter.prev_entry) {
+		__be16 prev_next = meta->meta.hashtable[iter.bucket];
+
+		if (unlikely(prev_next != htons(entry))) {
+			pr_warn("%llx:%llx:%x: not head of chain b=%x p=%x,%x e=%x %*s",
+				vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
+				iter.bucket, iter.prev_entry, prev_next, entry,
+				name->len, name->name);
+			goto error;
+		}
+		meta->meta.hashtable[iter.bucket] = next;
+	} else {
+		unsigned int pb = iter.prev_entry / AFS_DIR_SLOTS_PER_BLOCK;
+		unsigned int ps = iter.prev_entry % AFS_DIR_SLOTS_PER_BLOCK;
+		__be16 prev_next;
+
+		pblock = afs_dir_find_block(&iter, pb);
+		if (!pblock)
+			goto error;
+		pde = &pblock->dirents[ps];
+		prev_next = pde->u.hash_next;
+		if (prev_next != htons(entry)) {
+			kunmap_local(pblock);
+			pr_warn("%llx:%llx:%x: not prev in chain b=%x p=%x,%x e=%x %*s",
+				vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique,
+				iter.bucket, iter.prev_entry, prev_next, entry,
+				name->len, name->name);
+			goto error;
+		}
+		pde->u.hash_next = next;
+		kunmap_local(pblock);
+	}
+
+	netfs_single_mark_inode_dirty(&vnode->netfs.inode);
 
 	inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
 	afs_stat_v(vnode, n_dir_rm);
@@ -470,26 +504,20 @@ found_dirent:
 
 out_unmap:
 	kunmap_local(meta);
-	folio_unlock(folio0);
-	folio_put(folio0);
 	_leave("");
 	return;
 
-invalidated:
+already_invalidated:
+	kunmap_local(block);
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval,
 			   0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
-	kunmap_local(block);
-	if (folio != folio0) {
-		folio_unlock(folio);
-		folio_put(folio);
-	}
 	goto out_unmap;
 
 error_unmap:
 	kunmap_local(block);
 error:
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_error,
 			   0, 0, 0, 0, name->name);
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	goto out_unmap;
 }
 
@@ -502,9 +530,8 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
 {
 	union afs_xdr_dir_block *block;
 	union afs_xdr_dirent *de;
-	struct folio *folio;
+	struct afs_dir_iter iter = { .dvnode = vnode };
 	unsigned int nr_blocks, b;
-	pgoff_t index;
 	loff_t i_size;
 	int slot;
 
@@ -512,39 +539,35 @@ void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_d
 
 	i_size = i_size_read(&vnode->netfs.inode);
 	if (i_size < AFS_DIR_BLOCK_SIZE) {
-		clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+		afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_bad_size);
 		return;
 	}
+
 	nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
 
 	/* Find a block that has sufficient slots available.  Each folio
 	 * contains two or more directory blocks.
 	 */
 	for (b = 0; b < nr_blocks; b++) {
-		index = b / AFS_DIR_BLOCKS_PER_PAGE;
-		folio = afs_dir_get_folio(vnode, index);
-		if (!folio)
+		block = afs_dir_get_block(&iter, b);
+		if (!block)
 			goto error;
 
-		block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
-
 		/* Abandon the edit if we got a callback break. */
 		if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
-			goto invalidated;
+			goto already_invalidated;
 
 		slot = afs_dir_scan_block(block, &dotdot_name, b);
 		if (slot >= 0)
 			goto found_dirent;
 
 		kunmap_local(block);
-		folio_unlock(folio);
-		folio_put(folio);
 	}
 
 	/* Didn't find the dirent to clobber.  Download the directory again. */
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
 			   0, 0, 0, 0, "..");
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+	afs_invalidate_dir(vnode, afs_dir_invalid_edit_upd_no_dd);
 	goto out;
 
 found_dirent:
@@ -556,26 +579,70 @@ found_dirent:
 			   ntohl(de->u.vnode), ntohl(de->u.unique), "..");
 
 	kunmap_local(block);
-	folio_unlock(folio);
-	folio_put(folio);
+	netfs_single_mark_inode_dirty(&vnode->netfs.inode);
 	inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
 
 out:
 	_leave("");
 	return;
 
-invalidated:
+already_invalidated:
 	kunmap_local(block);
-	folio_unlock(folio);
-	folio_put(folio);
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
 			   0, 0, 0, 0, "..");
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	goto out;
 
 error:
 	trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
 			   0, 0, 0, 0, "..");
-	clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
 	goto out;
 }
 
+/*
+ * Initialise a new directory.  We need to fill in the "." and ".." entries.
+ */
+void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_dvnode)
+{
+	union afs_xdr_dir_block *meta;
+	struct afs_dir_iter iter = { .dvnode = dvnode };
+	union afs_xdr_dirent *de;
+	unsigned int slot = AFS_DIR_RESV_BLOCKS0;
+	loff_t i_size;
+
+	i_size = i_size_read(&dvnode->netfs.inode);
+	if (i_size != AFS_DIR_BLOCK_SIZE) {
+		afs_invalidate_dir(dvnode, afs_dir_invalid_edit_add_bad_size);
+		return;
+	}
+
+	meta = afs_dir_get_block(&iter, 0);
+	if (!meta)
+		return;
+
+	afs_edit_init_block(meta, meta, 0);
+
+	de = &meta->dirents[slot];
+	de->u.valid = 1;
+	de->u.vnode = htonl(dvnode->fid.vnode);
+	de->u.unique = htonl(dvnode->fid.unique);
+	memcpy(de->u.name, ".", 2);
+	trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot,
+			   dvnode->fid.vnode, dvnode->fid.unique, ".");
+	slot++;
+
+	de = &meta->dirents[slot];
+	de->u.valid = 1;
+	de->u.vnode = htonl(parent_dvnode->fid.vnode);
+	de->u.unique = htonl(parent_dvnode->fid.unique);
+	memcpy(de->u.name, "..", 3);
+	trace_afs_edit_dir(dvnode, afs_edit_dir_for_mkdir, afs_edit_dir_mkdir, 0, slot,
+			   parent_dvnode->fid.vnode, parent_dvnode->fid.unique, "..");
+
+	afs_set_contig_bits(meta, AFS_DIR_RESV_BLOCKS0, 2);
+	meta->meta.alloc_ctrs[0] -= 2;
+	kunmap_local(meta);
+
+	netfs_single_mark_inode_dirty(&dvnode->netfs.inode);
+	set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
+	set_bit(AFS_VNODE_DIR_READ, &dvnode->flags);
+}

fs/afs/dir_search.c: new file (227 lines)
@@ -0,0 +1,227 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* Search a directory's hash table.
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * https://tools.ietf.org/html/draft-keiser-afs3-directory-object-00
 */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/iversion.h>
#include "internal.h"
#include "afs_fs.h"
#include "xdr_fs.h"

/*
 * Calculate the name hash.
 */
unsigned int afs_dir_hash_name(const struct qstr *name)
{
	const unsigned char *p = name->name;
	unsigned int hash = 0, i;
	int bucket;

	for (i = 0; i < name->len; i++)
		hash = (hash * 173) + p[i];
	bucket = hash & (AFS_DIR_HASHTBL_SIZE - 1);
	if (hash > INT_MAX) {
		bucket = AFS_DIR_HASHTBL_SIZE - bucket;
		bucket &= (AFS_DIR_HASHTBL_SIZE - 1);
	}
	return bucket;
}

/*
 * Reset a directory iterator.
 */
static bool afs_dir_reset_iter(struct afs_dir_iter *iter)
{
	unsigned long long i_size = i_size_read(&iter->dvnode->netfs.inode);
	unsigned int nblocks;

	/* Work out the maximum number of steps we can take. */
	nblocks = umin(i_size / AFS_DIR_BLOCK_SIZE, AFS_DIR_MAX_BLOCKS);
	if (!nblocks)
		return false;
	iter->loop_check = nblocks * (AFS_DIR_SLOTS_PER_BLOCK - AFS_DIR_RESV_BLOCKS);
	iter->prev_entry = 0; /* Hash head is previous */
	return true;
}

/*
 * Initialise a directory iterator for looking up a name.
 */
bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name)
{
	iter->nr_slots = afs_dir_calc_slots(name->len);
	iter->bucket = afs_dir_hash_name(name);
	return afs_dir_reset_iter(iter);
}

/*
 * Get a specific block.
 */
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block)
{
	struct folio_queue *fq = iter->fq;
	struct afs_vnode *dvnode = iter->dvnode;
	struct folio *folio;
	size_t blpos = block * AFS_DIR_BLOCK_SIZE;
	size_t blend = (block + 1) * AFS_DIR_BLOCK_SIZE, fpos = iter->fpos;
	int slot = iter->fq_slot;

	_enter("%zx,%d", block, slot);

	if (iter->block) {
		kunmap_local(iter->block);
		iter->block = NULL;
	}

	if (dvnode->directory_size < blend)
		goto fail;

	if (!fq || blpos < fpos) {
		fq = dvnode->directory;
		slot = 0;
		fpos = 0;
	}

	/* Search the folio queue for the folio containing the block... */
	for (; fq; fq = fq->next) {
		for (; slot < folioq_count(fq); slot++) {
			size_t fsize = folioq_folio_size(fq, slot);

			if (blend <= fpos + fsize) {
				/* ... and then return the mapped block. */
				folio = folioq_folio(fq, slot);
				if (WARN_ON_ONCE(folio_pos(folio) != fpos))
					goto fail;
				iter->fq = fq;
				iter->fq_slot = slot;
				iter->fpos = fpos;
				iter->block = kmap_local_folio(folio, blpos - fpos);
				return iter->block;
			}
			fpos += fsize;
		}
		slot = 0;
	}

fail:
	iter->fq = NULL;
	iter->fq_slot = 0;
	afs_invalidate_dir(dvnode, afs_dir_invalid_edit_get_block);
	return NULL;
}

/*
 * Search through a directory bucket.
 */
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
			  struct afs_fid *_fid)
{
	const union afs_xdr_dir_block *meta;
	unsigned int entry;
	int ret = -ESTALE;

	meta = afs_dir_find_block(iter, 0);
	if (!meta)
		return -ESTALE;

	entry = ntohs(meta->meta.hashtable[iter->bucket & (AFS_DIR_HASHTBL_SIZE - 1)]);
	_enter("%x,%x", iter->bucket, entry);

	while (entry) {
		const union afs_xdr_dir_block *block;
		const union afs_xdr_dirent *dire;
		unsigned int blnum = entry / AFS_DIR_SLOTS_PER_BLOCK;
		unsigned int slot  = entry % AFS_DIR_SLOTS_PER_BLOCK;
		unsigned int resv  = (blnum == 0 ? AFS_DIR_RESV_BLOCKS0 : AFS_DIR_RESV_BLOCKS);

		_debug("search %x", entry);

		if (slot < resv) {
			kdebug("slot out of range h=%x rs=%2x sl=%2x-%2x",
			       iter->bucket, resv, slot, slot + iter->nr_slots - 1);
			goto bad;
		}

		block = afs_dir_find_block(iter, blnum);
		if (!block)
			goto bad;
		dire = &block->dirents[slot];

		if (slot + iter->nr_slots <= AFS_DIR_SLOTS_PER_BLOCK &&
		    memcmp(dire->u.name, name->name, name->len) == 0 &&
		    dire->u.name[name->len] == '\0') {
			_fid->vnode  = ntohl(dire->u.vnode);
			_fid->unique = ntohl(dire->u.unique);
			ret = entry;
			goto found;
		}

		iter->prev_entry = entry;
		entry = ntohs(dire->u.hash_next);
		if (!--iter->loop_check) {
			kdebug("dir chain loop h=%x", iter->bucket);
			goto bad;
		}
	}

	ret = -ENOENT;
found:
	if (iter->block) {
		kunmap_local(iter->block);
		iter->block = NULL;
	}

bad:
	if (ret == -ESTALE)
		afs_invalidate_dir(iter->dvnode, afs_dir_invalid_iter_stale);
	_leave(" = %d", ret);
	return ret;
}

/*
 * Search the appropriate hash chain in the contents of an AFS directory.
 */
int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
		   struct afs_fid *_fid, afs_dataversion_t *_dir_version)
{
	struct afs_dir_iter iter = { .dvnode = dvnode, };
	int ret, retry_limit = 3;

	_enter("{%lu},,,", dvnode->netfs.inode.i_ino);

	if (!afs_dir_init_iter(&iter, name))
		return -ENOENT;
	do {
		if (--retry_limit < 0) {
			pr_warn("afs_read_dir(): Too many retries\n");
			ret = -ESTALE;
			break;
		}
		ret = afs_read_dir(dvnode, NULL);
		if (ret < 0) {
			if (ret != -ESTALE)
				break;
			if (test_bit(AFS_VNODE_DELETED, &dvnode->flags)) {
				ret = -ESTALE;
				break;
			}
			continue;
		}
		*_dir_version = inode_peek_iversion_raw(&dvnode->netfs.inode);

		ret = afs_dir_search_bucket(&iter, name, _fid);
		up_read(&dvnode->validate_lock);
		if (ret == -ESTALE)
			afs_dir_reset_iter(&iter);
	} while (ret == -ESTALE);

	_leave(" = %d", ret);
	return ret;
}
229
fs/afs/dynroot.c
229
fs/afs/dynroot.c
@ -185,50 +185,6 @@ out:
|
||||
return ret == -ENOENT ? NULL : ERR_PTR(ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* Look up @cell in a dynroot directory. This is a substitution for the
|
||||
* local cell name for the net namespace.
|
||||
*/
|
||||
static struct dentry *afs_lookup_atcell(struct dentry *dentry)
|
||||
{
|
||||
struct afs_cell *cell;
|
||||
struct afs_net *net = afs_d2net(dentry);
|
||||
struct dentry *ret;
|
||||
char *name;
|
||||
int len;
|
||||
|
||||
if (!net->ws_cell)
|
||||
return ERR_PTR(-ENOENT);
|
||||
|
||||
ret = ERR_PTR(-ENOMEM);
|
||||
name = kmalloc(AFS_MAXCELLNAME + 1, GFP_KERNEL);
|
||||
if (!name)
|
||||
goto out_p;
|
||||
|
||||
down_read(&net->cells_lock);
|
||||
cell = net->ws_cell;
|
||||
if (cell) {
|
||||
len = cell->name_len;
|
||||
memcpy(name, cell->name, len + 1);
|
||||
}
|
||||
up_read(&net->cells_lock);
|
||||
|
||||
ret = ERR_PTR(-ENOENT);
|
||||
if (!cell)
|
||||
goto out_n;
|
||||
|
||||
ret = lookup_one_len(name, dentry->d_parent, len);
|
||||
|
||||
/* We don't want to d_add() the @cell dentry here as we don't want to
|
||||
* the cached dentry to hide changes to the local cell name.
|
||||
*/
|
||||
|
||||
out_n:
|
||||
kfree(name);
|
||||
out_p:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Look up an entry in a dynroot directory.
|
||||
*/
|
||||
@ -247,10 +203,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
|
||||
return ERR_PTR(-ENAMETOOLONG);
|
||||
}
|
||||
|
||||
if (dentry->d_name.len == 5 &&
|
||||
memcmp(dentry->d_name.name, "@cell", 5) == 0)
|
||||
return afs_lookup_atcell(dentry);
|
||||
|
||||
return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry);
|
||||
}
|
||||
|
||||
@ -271,7 +223,8 @@ const struct dentry_operations afs_dynroot_dentry_operations = {
|
||||
int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
|
||||
{
|
||||
struct super_block *sb = net->dynroot_sb;
|
||||
struct dentry *root, *subdir;
|
||||
struct dentry *root, *subdir, *dsubdir;
|
||||
char *dotname = cell->name - 1;
|
||||
int ret;
|
||||
|
||||
if (!sb || atomic_read(&sb->s_active) == 0)
|
||||
@ -286,34 +239,31 @@ int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* Note that we're retaining an extra ref on the dentry */
|
||||
dsubdir = lookup_one_len(dotname, root, cell->name_len + 1);
|
||||
if (IS_ERR(dsubdir)) {
|
||||
ret = PTR_ERR(dsubdir);
|
||||
dput(subdir);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* Note that we're retaining extra refs on the dentries. */
|
||||
subdir->d_fsdata = (void *)1UL;
|
||||
dsubdir->d_fsdata = (void *)1UL;
|
||||
ret = 0;
|
||||
unlock:
|
||||
inode_unlock(root->d_inode);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove a manually added cell mount directory.
|
||||
* - The caller must hold net->proc_cells_lock
|
||||
*/
|
||||
void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
|
||||
static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len)
|
||||
{
|
||||
struct super_block *sb = net->dynroot_sb;
|
||||
struct dentry *root, *subdir;
|
||||
|
||||
if (!sb || atomic_read(&sb->s_active) == 0)
|
||||
return;
|
||||
|
||||
root = sb->s_root;
|
||||
inode_lock(root->d_inode);
|
||||
struct dentry *subdir;
|
||||
|
||||
/* Don't want to trigger a lookup call, which will re-add the cell */
|
||||
subdir = try_lookup_one_len(cell->name, root, cell->name_len);
|
||||
subdir = try_lookup_one_len(name, root, name_len);
|
||||
if (IS_ERR_OR_NULL(subdir)) {
|
||||
_debug("lookup %ld", PTR_ERR(subdir));
|
||||
goto no_dentry;
|
||||
return;
|
||||
}
|
||||
|
||||
_debug("rmdir %pd %u", subdir, d_count(subdir));
|
||||
@ -324,11 +274,152 @@ void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
|
||||
dput(subdir);
|
||||
}
|
||||
dput(subdir);
|
||||
no_dentry:
|
||||
inode_unlock(root->d_inode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove a manually added cell mount directory.
|
||||
* - The caller must hold net->proc_cells_lock
|
||||
*/
|
||||
void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
|
||||
{
|
||||
struct super_block *sb = net->dynroot_sb;
|
||||
char *dotname = cell->name - 1;
|
||||
|
||||
if (!sb || atomic_read(&sb->s_active) == 0)
|
||||
return;
|
||||
|
||||
inode_lock(sb->s_root->d_inode);
|
||||
afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len);
|
||||
afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1);
|
||||
inode_unlock(sb->s_root->d_inode);
|
||||
_leave("");
|
||||
}

static void afs_atcell_delayed_put_cell(void *arg)
{
struct afs_cell *cell = arg;

afs_put_cell(cell, afs_cell_trace_put_atcell);
}

/*
* Read @cell or .@cell symlinks.
*/
static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *done)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_cell *cell;
struct afs_net *net = afs_i2net(inode);
const char *name;
bool dotted = vnode->fid.vnode == 3;

if (!net->ws_cell)
return ERR_PTR(-ENOENT);

down_read(&net->cells_lock);

cell = net->ws_cell;
if (dotted)
name = cell->name - 1;
else
name = cell->name;
afs_get_cell(cell, afs_cell_trace_get_atcell);
set_delayed_call(done, afs_atcell_delayed_put_cell, cell);

up_read(&net->cells_lock);
return name;
}

static const struct inode_operations afs_atcell_inode_operations = {
.get_link = afs_atcell_get_link,
};

/*
* Look up @cell or .@cell in a dynroot directory. This is a substitute for
* the local cell name of the net namespace.
*/
static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name)
{
struct afs_vnode *vnode;
struct afs_fid fid = { .vnode = 2, .unique = 1, };
struct dentry *dentry;
struct inode *inode;

if (name[0] == '.')
fid.vnode = 3;

dentry = d_alloc_name(root, name);
if (!dentry)
return ERR_PTR(-ENOMEM);

inode = iget5_locked(dentry->d_sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
if (!inode) {
dput(dentry);
return ERR_PTR(-ENOMEM);
}

vnode = AFS_FS_I(inode);

/* there shouldn't be an existing inode */
if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) {
iput(inode);
dput(dentry);
return ERR_PTR(-EIO);
}

netfs_inode_init(&vnode->netfs, NULL, false);
simple_inode_init_ts(inode);
set_nlink(inode, 1);
inode->i_size = 0;
inode->i_mode = S_IFLNK | 0555;
inode->i_op = &afs_atcell_inode_operations;
inode->i_uid = GLOBAL_ROOT_UID;
inode->i_gid = GLOBAL_ROOT_GID;
inode->i_blocks = 0;
inode->i_generation = 0;
inode->i_flags |= S_NOATIME;

unlock_new_inode(inode);
d_splice_alias(inode, dentry);
return dentry;
}
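
The helper above leans on iget5_locked() to create the pseudo-inode exactly once: the inode comes back with I_NEW set, is initialised, then unlocked for other lookups. A rough userspace analogue of that lookup-or-create-locked idiom (none of these names are kernel APIs):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct pseudo_inode {
	struct pseudo_inode *next;
	unsigned long key;           /* plays the role of the AFS vnode ID */
	bool is_new;                 /* plays the role of I_NEW */
	pthread_mutex_t init_lock;   /* held while the object is being set up */
};

static struct pseudo_inode *cache;
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

/* Find an object by key, or insert one that is returned "locked" so that
 * exactly one caller performs the initialisation. The key comparison and
 * assignment stand in for the test/set callbacks of iget5_locked(). */
static struct pseudo_inode *iget_locked_like(unsigned long key)
{
	struct pseudo_inode *i;

	pthread_mutex_lock(&cache_lock);
	for (i = cache; i; i = i->next) {
		if (i->key == key) {
			pthread_mutex_unlock(&cache_lock);
			/* Wait for the creator's unlock_new_inode() analogue. */
			pthread_mutex_lock(&i->init_lock);
			pthread_mutex_unlock(&i->init_lock);
			return i;
		}
	}

	i = calloc(1, sizeof(*i));
	if (i) {
		i->key = key;
		i->is_new = true;
		pthread_mutex_init(&i->init_lock, NULL);
		pthread_mutex_lock(&i->init_lock);  /* created locked */
		i->next = cache;
		cache = i;
	}
	pthread_mutex_unlock(&cache_lock);
	return i;
}

A creator that sees ->is_new fills the object in and then releases ->init_lock, mirroring the unlock_new_inode() call above.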

/*
* Create @cell and .@cell symlinks.
*/
static int afs_dynroot_symlink(struct afs_net *net)
{
struct super_block *sb = net->dynroot_sb;
struct dentry *root, *symlink, *dsymlink;
int ret;

/* Let the ->lookup op do the creation */
root = sb->s_root;
inode_lock(root->d_inode);
symlink = afs_dynroot_create_symlink(root, "@cell");
if (IS_ERR(symlink)) {
ret = PTR_ERR(symlink);
goto unlock;
}

dsymlink = afs_dynroot_create_symlink(root, ".@cell");
if (IS_ERR(dsymlink)) {
ret = PTR_ERR(dsymlink);
dput(symlink);
goto unlock;
}

/* Note that we're retaining extra refs on the dentries. */
symlink->d_fsdata = (void *)1UL;
dsymlink->d_fsdata = (void *)1UL;
ret = 0;
unlock:
inode_unlock(root->d_inode);
return ret;
}

/*
* Populate a newly created dynamic root with cell names.
*/
@@ -341,6 +432,10 @@ int afs_dynroot_populate(struct super_block *sb)
mutex_lock(&net->proc_cells_lock);

net->dynroot_sb = sb;
ret = afs_dynroot_symlink(net);
if (ret < 0)
goto error;

hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
ret = afs_dynroot_mkdir(net, cell);
if (ret < 0)

fs/afs/file.c (258 lines changed)
@@ -20,7 +20,6 @@
#include "internal.h"

static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_symlink_read_folio(struct file *file, struct folio *folio);

static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -61,13 +60,6 @@ const struct address_space_operations afs_file_aops = {
.writepages = afs_writepages,
};

const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
.release_folio = netfs_release_folio,
.invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
};

static const struct vm_operations_struct afs_vm_ops = {
.open = afs_vm_open,
.close = afs_vm_close,
@@ -208,49 +200,12 @@ int afs_release(struct inode *inode, struct file *file)
return ret;
}

/*
* Allocate a new read record.
*/
struct afs_read *afs_alloc_read(gfp_t gfp)
{
struct afs_read *req;

req = kzalloc(sizeof(struct afs_read), gfp);
if (req)
refcount_set(&req->usage, 1);

return req;
}

/*
* Dispose of a ref to a read record.
*/
void afs_put_read(struct afs_read *req)
{
if (refcount_dec_and_test(&req->usage)) {
if (req->cleanup)
req->cleanup(req);
key_put(req->key);
kfree(req);
}
}

static void afs_fetch_data_notify(struct afs_operation *op)
{
struct afs_read *req = op->fetch.req;
struct netfs_io_subrequest *subreq = req->subreq;
int error = afs_op_error(op);
struct netfs_io_subrequest *subreq = op->fetch.subreq;

req->error = error;
if (subreq) {
subreq->rreq->i_size = req->file_size;
if (req->pos + req->actual_len >= req->file_size)
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
netfs_read_subreq_terminated(subreq, error, false);
req->subreq = NULL;
} else if (req->done) {
req->done(req);
}
subreq->error = afs_op_error(op);
netfs_read_subreq_terminated(subreq);
}

static void afs_fetch_data_success(struct afs_operation *op)
@@ -260,7 +215,7 @@ static void afs_fetch_data_success(struct afs_operation *op)
_enter("op=%08x", op->debug_id);
afs_vnode_commit_status(op, &op->file[0]);
afs_stat_v(vnode, n_fetches);
atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes);
atomic_long_add(op->fetch.subreq->transferred, &op->net->n_fetch_bytes);
afs_fetch_data_notify(op);
}

@@ -270,107 +225,188 @@ static void afs_fetch_data_aborted(struct afs_operation *op)
afs_fetch_data_notify(op);
}

static void afs_fetch_data_put(struct afs_operation *op)
{
op->fetch.req->error = afs_op_error(op);
afs_put_read(op->fetch.req);
}

static const struct afs_operation_ops afs_fetch_data_operation = {
const struct afs_operation_ops afs_fetch_data_operation = {
.issue_afs_rpc = afs_fs_fetch_data,
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
.aborted = afs_fetch_data_aborted,
.failed = afs_fetch_data_notify,
.put = afs_fetch_data_put,
};

static void afs_issue_read_call(struct afs_operation *op)
{
op->call_responded = false;
op->call_error = 0;
op->call_abort_code = 0;
if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags))
yfs_fs_fetch_data(op);
else
afs_fs_fetch_data(op);
}

static void afs_end_read(struct afs_operation *op)
{
if (op->call_responded && op->server)
set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);

if (!afs_op_error(op))
afs_fetch_data_success(op);
else if (op->cumul_error.aborted)
afs_fetch_data_aborted(op);
else
afs_fetch_data_notify(op);

afs_end_vnode_operation(op);
afs_put_operation(op);
}

/*
* Perform I/O processing on an asynchronous call. The work item carries a ref
* to the call struct that we either need to release or to pass on.
*/
static void afs_read_receive(struct afs_call *call)
{
struct afs_operation *op = call->op;
enum afs_call_state state;

_enter("");

state = READ_ONCE(call->state);
if (state == AFS_CALL_COMPLETE)
return;
trace_afs_read_recv(op, call);

while (state < AFS_CALL_COMPLETE && READ_ONCE(call->need_attention)) {
WRITE_ONCE(call->need_attention, false);
afs_deliver_to_call(call);
state = READ_ONCE(call->state);
}

if (state < AFS_CALL_COMPLETE) {
netfs_read_subreq_progress(op->fetch.subreq);
if (rxrpc_kernel_check_life(call->net->socket, call->rxcall))
return;
/* rxrpc terminated the call. */
afs_set_call_complete(call, call->error, call->abort_code);
}

op->call_abort_code = call->abort_code;
op->call_error = call->error;
op->call_responded = call->responded;
op->call = NULL;
call->op = NULL;
afs_put_call(call);

/* If the call failed, then we need to crank the server rotation
* handle and try the next.
*/
if (afs_select_fileserver(op)) {
afs_issue_read_call(op);
return;
}

afs_end_read(op);
}

void afs_fetch_data_async_rx(struct work_struct *work)
{
struct afs_call *call = container_of(work, struct afs_call, async_work);

afs_read_receive(call);
afs_put_call(call);
}

void afs_fetch_data_immediate_cancel(struct afs_call *call)
{
if (call->async) {
afs_get_call(call, afs_call_trace_wake);
if (!queue_work(afs_async_calls, &call->async_work))
afs_deferred_put_call(call);
flush_work(&call->async_work);
}
}
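
The async receive path above re-drives the call state machine and, on failure, "cranks the server rotation handle" to try the next fileserver. A compact, self-contained C model of that retry loop's shape, with stand-in names and a faked failure pattern:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in operation state; not the kernel's struct afs_operation. */
struct read_op {
	int server;          /* index of the server being tried */
	int nr_servers;
	bool failed;
};

static void issue_read_call(struct read_op *op)
{
	/* In kAFS this would send FS.FetchData / YFS.FetchData. */
	printf("issue to server %d\n", op->server);
	op->failed = (op->server < 2);   /* pretend the first two fail */
}

static bool select_next_fileserver(struct read_op *op)
{
	return ++op->server < op->nr_servers;
}

int main(void)
{
	struct read_op op = { .server = 0, .nr_servers = 3 };

	/* Mirrors the shape of afs_read_receive(): reissue while rotation
	 * offers another candidate, then finish the request either way. */
	do
		issue_read_call(&op);
	while (op.failed && select_next_fileserver(&op));

	printf("done: %s\n", op.failed ? "all servers failed" : "success");
	return 0;
}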

/*
* Fetch file data from the volume.
*/
int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
static void afs_issue_read(struct netfs_io_subrequest *subreq)
{
struct afs_operation *op;
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct key *key = subreq->rreq->netfs_priv;

_enter("%s{%llx:%llu.%u},%x,,,",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
key_serial(req->key));
key_serial(key));

op = afs_alloc_operation(req->key, vnode->volume);
op = afs_alloc_operation(key, vnode->volume);
if (IS_ERR(op)) {
if (req->subreq)
netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false);
return PTR_ERR(op);
subreq->error = PTR_ERR(op);
netfs_read_subreq_terminated(subreq);
return;
}

afs_op_set_vnode(op, 0, vnode);

op->fetch.req = afs_get_read(req);
op->fetch.subreq = subreq;
op->ops = &afs_fetch_data_operation;
return afs_do_sync_operation(op);
}

static void afs_read_worker(struct work_struct *work)
{
struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
struct afs_read *fsreq;

fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
return netfs_read_subreq_terminated(subreq, -ENOMEM, false);

fsreq->subreq = subreq;
fsreq->pos = subreq->start + subreq->transferred;
fsreq->len = subreq->len - subreq->transferred;
fsreq->key = key_get(subreq->rreq->netfs_priv);
fsreq->vnode = vnode;
fsreq->iter = &subreq->io_iter;

trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
}

static void afs_issue_read(struct netfs_io_subrequest *subreq)
{
INIT_WORK(&subreq->work, afs_read_worker);
queue_work(system_long_wq, &subreq->work);
}
if (subreq->rreq->origin == NETFS_READAHEAD ||
subreq->rreq->iocb) {
op->flags |= AFS_OPERATION_ASYNC;

static int afs_symlink_read_folio(struct file *file, struct folio *folio)
{
struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host);
struct afs_read *fsreq;
int ret;
if (!afs_begin_vnode_operation(op)) {
subreq->error = afs_put_operation(op);
netfs_read_subreq_terminated(subreq);
return;
}

fsreq = afs_alloc_read(GFP_NOFS);
if (!fsreq)
return -ENOMEM;
if (!afs_select_fileserver(op)) {
afs_end_read(op);
return;
}

fsreq->pos = folio_pos(folio);
fsreq->len = folio_size(folio);
fsreq->vnode = vnode;
fsreq->iter = &fsreq->def_iter;
iov_iter_xarray(&fsreq->def_iter, ITER_DEST, &folio->mapping->i_pages,
fsreq->pos, fsreq->len);

ret = afs_fetch_data(fsreq->vnode, fsreq);
if (ret == 0)
folio_mark_uptodate(folio);
folio_unlock(folio);
return ret;
afs_issue_read_call(op);
} else {
afs_do_sync_operation(op);
}
}

static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
struct afs_vnode *vnode = AFS_FS_I(rreq->inode);

if (file)
rreq->netfs_priv = key_get(afs_file_key(file));
rreq->rsize = 256 * 1024;
rreq->wsize = 256 * 1024 * 1024;

switch (rreq->origin) {
case NETFS_READ_SINGLE:
if (!file) {
struct key *key = afs_request_key(vnode->volume->cell);

if (IS_ERR(key))
return PTR_ERR(key);
rreq->netfs_priv = key;
}
break;
case NETFS_WRITEBACK:
case NETFS_WRITETHROUGH:
case NETFS_UNBUFFERED_WRITE:
case NETFS_DIO_WRITE:
if (S_ISREG(rreq->inode->i_mode))
rreq->io_streams[0].avail = true;
break;
case NETFS_WRITEBACK_SINGLE:
default:
break;
}
return 0;
}

@@ -49,6 +49,105 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
return op;
}

struct afs_io_locker {
struct list_head link;
struct task_struct *task;
unsigned long have_lock;
};

/*
* Unlock the I/O lock on a vnode.
*/
static void afs_unlock_for_io(struct afs_vnode *vnode)
{
struct afs_io_locker *locker;

spin_lock(&vnode->lock);
locker = list_first_entry_or_null(&vnode->io_lock_waiters,
struct afs_io_locker, link);
if (locker) {
list_del(&locker->link);
smp_store_release(&locker->have_lock, 1); /* The unlock barrier. */
smp_mb__after_atomic(); /* Store have_lock before task state */
wake_up_process(locker->task);
} else {
clear_bit(AFS_VNODE_IO_LOCK, &vnode->flags);
}
spin_unlock(&vnode->lock);
}

/*
* Lock the I/O lock on a vnode uninterruptibly. We can't use an ordinary
* mutex as lockdep will complain if we unlock it in the wrong thread.
*/
static void afs_lock_for_io(struct afs_vnode *vnode)
{
struct afs_io_locker myself = { .task = current, };

spin_lock(&vnode->lock);

if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) {
spin_unlock(&vnode->lock);
return;
}

list_add_tail(&myself.link, &vnode->io_lock_waiters);
spin_unlock(&vnode->lock);

for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (smp_load_acquire(&myself.have_lock)) /* The lock barrier */
break;
schedule();
}
__set_current_state(TASK_RUNNING);
}

/*
* Lock the I/O lock on a vnode interruptibly. We can't use an ordinary mutex
* as lockdep will complain if we unlock it in the wrong thread.
*/
static int afs_lock_for_io_interruptible(struct afs_vnode *vnode)
{
struct afs_io_locker myself = { .task = current, };
int ret = 0;

spin_lock(&vnode->lock);

if (!test_and_set_bit(AFS_VNODE_IO_LOCK, &vnode->flags)) {
spin_unlock(&vnode->lock);
return 0;
}

list_add_tail(&myself.link, &vnode->io_lock_waiters);
spin_unlock(&vnode->lock);

for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
if (smp_load_acquire(&myself.have_lock) || /* The lock barrier */
signal_pending(current))
break;
schedule();
}
__set_current_state(TASK_RUNNING);

/* If we got a signal, try to transfer the lock onto the next
* waiter.
*/
if (unlikely(signal_pending(current))) {
spin_lock(&vnode->lock);
if (myself.have_lock) {
spin_unlock(&vnode->lock);
afs_unlock_for_io(vnode);
} else {
list_del(&myself.link);
spin_unlock(&vnode->lock);
}
ret = -ERESTARTSYS;
}
return ret;
}
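
The key property of the lock above is direct handoff: the unlocker passes ownership to the first queued waiter rather than letting all waiters race, which is also why it can be unlocked from a different thread than the one that locked it. A minimal userspace sketch of the same FIFO handoff pattern using pthreads (all names invented; the kernel version uses a spinlock, task wakeups and memory barriers instead):

#include <pthread.h>
#include <stdbool.h>

struct waiter {
	struct waiter *next;
	bool have_lock;           /* set by the unlocker: the handoff */
	pthread_cond_t cond;
};

struct io_lock {
	pthread_mutex_t guard;    /* stands in for vnode->lock */
	bool held;                /* stands in for AFS_VNODE_IO_LOCK */
	struct waiter *head, *tail;
};

static void io_lock_acquire(struct io_lock *l)
{
	struct waiter me = { .next = NULL, .have_lock = false };

	pthread_mutex_lock(&l->guard);
	if (!l->held) {           /* fast path: the lock was free */
		l->held = true;
		pthread_mutex_unlock(&l->guard);
		return;
	}

	/* Slow path: queue in FIFO order and sleep until handed the lock. */
	pthread_cond_init(&me.cond, NULL);
	if (l->tail)
		l->tail->next = &me;
	else
		l->head = &me;
	l->tail = &me;
	while (!me.have_lock)
		pthread_cond_wait(&me.cond, &l->guard);
	pthread_mutex_unlock(&l->guard);
	pthread_cond_destroy(&me.cond);
}

static void io_lock_release(struct io_lock *l)
{
	struct waiter *w;

	pthread_mutex_lock(&l->guard);
	w = l->head;
	if (w) {
		l->head = w->next;
		if (!l->head)
			l->tail = NULL;
		w->have_lock = true;        /* ownership transfers here */
		pthread_cond_signal(&w->cond);
	} else {
		l->held = false;            /* nobody waiting: drop it */
	}
	pthread_mutex_unlock(&l->guard);
}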

/*
* Lock the vnode(s) being operated upon.
*/
@@ -60,7 +159,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
_enter("");

if (op->flags & AFS_OPERATION_UNINTR) {
mutex_lock(&vnode->io_lock);
afs_lock_for_io(vnode);
op->flags |= AFS_OPERATION_LOCK_0;
_leave(" = t [1]");
return true;
@@ -72,7 +171,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
if (vnode2 > vnode)
swap(vnode, vnode2);

if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
if (afs_lock_for_io_interruptible(vnode) < 0) {
afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
_leave(" = f [I 0]");
@@ -81,10 +180,10 @@ static bool afs_get_io_locks(struct afs_operation *op)
op->flags |= AFS_OPERATION_LOCK_0;

if (vnode2) {
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
if (afs_lock_for_io_interruptible(vnode2) < 0) {
afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
mutex_unlock(&vnode->io_lock);
afs_unlock_for_io(vnode);
op->flags &= ~AFS_OPERATION_LOCK_0;
_leave(" = f [I 1]");
return false;
@@ -104,9 +203,9 @@ static void afs_drop_io_locks(struct afs_operation *op)
_enter("");

if (op->flags & AFS_OPERATION_LOCK_1)
mutex_unlock(&vnode2->io_lock);
afs_unlock_for_io(vnode2);
if (op->flags & AFS_OPERATION_LOCK_0)
mutex_unlock(&vnode->io_lock);
afs_unlock_for_io(vnode);
}

static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *vp,
@@ -157,7 +256,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op)
/*
* Tidy up a filesystem cursor and unlock the vnode.
*/
static void afs_end_vnode_operation(struct afs_operation *op)
void afs_end_vnode_operation(struct afs_operation *op)
{
_enter("");

@@ -301,19 +301,19 @@ void afs_fs_fetch_status(struct afs_operation *op)
static int afs_deliver_fs_fetch_data(struct afs_call *call)
{
struct afs_operation *op = call->op;
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
size_t count_before;
int ret;

_enter("{%u,%zu,%zu/%llu}",
call->unmarshall, call->iov_len, iov_iter_count(call->iter),
req->actual_len);
call->remaining);

switch (call->unmarshall) {
case 0:
req->actual_len = 0;
call->remaining = 0;
call->unmarshall++;
if (call->operation_ID == FSFETCHDATA64) {
afs_extract_to_tmp64(call);
@@ -323,8 +323,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
}
fallthrough;

/* Extract the returned data length into
* ->actual_len. This may indicate more or less data than was
/* Extract the returned data length into ->remaining.
* This may indicate more or less data than was
* requested will be returned.
*/
case 1:
@@ -333,42 +333,40 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
if (ret < 0)
return ret;

req->actual_len = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len);
call->remaining = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", call->remaining);

if (req->actual_len == 0)
if (call->remaining == 0)
goto no_more_data;

call->iter = req->iter;
call->iov_len = min(req->actual_len, req->len);
call->iter = &subreq->io_iter;
call->iov_len = umin(call->remaining, subreq->len - subreq->transferred);
call->unmarshall++;
fallthrough;

/* extract the returned data */
case 2:
count_before = call->iov_len;
_debug("extract data %zu/%llu", count_before, req->actual_len);
_debug("extract data %zu/%llu", count_before, call->remaining);

ret = afs_extract_data(call, true);
if (req->subreq) {
req->subreq->transferred += count_before - call->iov_len;
netfs_read_subreq_progress(req->subreq, false);
}
subreq->transferred += count_before - call->iov_len;
call->remaining -= count_before - call->iov_len;
if (ret < 0)
return ret;

call->iter = &call->def_iter;
if (req->actual_len <= req->len)
if (call->remaining)
goto no_more_data;

/* Discard any excess data the server gave us */
afs_extract_discard(call, req->actual_len - req->len);
afs_extract_discard(call, call->remaining);
call->unmarshall = 3;
fallthrough;

case 3:
_debug("extract discard %zu/%llu",
iov_iter_count(call->iter), req->actual_len - req->len);
iov_iter_count(call->iter), call->remaining);

ret = afs_extract_data(call, true);
if (ret < 0)
@@ -390,8 +388,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
xdr_decode_AFSCallBack(&bp, call, &vp->scb);
xdr_decode_AFSVolSync(&bp, &op->volsync);

req->data_version = vp->scb.status.data_version;
req->file_size = vp->scb.status.size;
if (subreq->start + subreq->transferred >= vp->scb.status.size)
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);

call->unmarshall++;
fallthrough;
@@ -410,14 +408,18 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
static const struct afs_call_type afs_RXFSFetchData = {
.name = "FS.FetchData",
.op = afs_FS_FetchData,
.async_rx = afs_fetch_data_async_rx,
.deliver = afs_deliver_fs_fetch_data,
.immediate_cancel = afs_fetch_data_immediate_cancel,
.destructor = afs_flat_call_destructor,
};

static const struct afs_call_type afs_RXFSFetchData64 = {
.name = "FS.FetchData64",
.op = afs_FS_FetchData64,
.async_rx = afs_fetch_data_async_rx,
.deliver = afs_deliver_fs_fetch_data,
.immediate_cancel = afs_fetch_data_immediate_cancel,
.destructor = afs_flat_call_destructor,
};
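
The deliver routine above is a resumable parser: the phase number lives in call->unmarshall, so when the network buffer runs dry mid-phase the function returns and later resumes exactly where it stopped, with fallthrough chaining phases that complete in one delivery. A small self-contained C sketch of that switch-with-saved-state shape (toy wire format, invented names):

#include <stddef.h>

/* Toy message: an 8-byte big-endian length, then the payload. */
struct parser {
	int state;                 /* persists between deliveries */
	size_t need;               /* bytes outstanding in this phase */
	unsigned char len_buf[8];
	unsigned long long remaining;
};

/* Consume up to *len bytes; returns 1 if more input is needed, 0 when
 * the whole message has been parsed. */
static int deliver(struct parser *p, const unsigned char **data, size_t *len)
{
	switch (p->state) {
	case 0:
		p->need = sizeof(p->len_buf);
		p->state = 1;
		/* fall through */
	case 1:	/* extract the length field, possibly across deliveries */
		while (p->need && *len) {
			p->len_buf[sizeof(p->len_buf) - p->need] = *(*data)++;
			(*len)--, p->need--;
		}
		if (p->need)
			return 1;       /* come back when more arrives */
		p->remaining = 0;
		for (size_t i = 0; i < sizeof(p->len_buf); i++)
			p->remaining = (p->remaining << 8) | p->len_buf[i];
		p->state = 2;
		/* fall through */
	case 2:	/* swallow the payload */
		while (p->remaining && *len) {
			(*data)++, (*len)--, p->remaining--;
		}
		if (p->remaining)
			return 1;
		p->state = 3;
		return 0;
	default:
		return 0;               /* already complete */
	}
}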

@@ -426,8 +428,8 @@ static const struct afs_call_type afs_RXFSFetchData64 = {
*/
static void afs_fs_fetch_data64(struct afs_operation *op)
{
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
struct afs_call *call;
__be32 *bp;

@@ -437,16 +439,19 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);

if (op->flags & AFS_OPERATION_ASYNC)
call->async = true;

/* marshall the parameters */
bp = call->request;
bp[0] = htonl(FSFETCHDATA64);
bp[1] = htonl(vp->fid.vid);
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
bp[4] = htonl(upper_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->pos));
bp[4] = htonl(upper_32_bits(subreq->start + subreq->transferred));
bp[5] = htonl(lower_32_bits(subreq->start + subreq->transferred));
bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len));
bp[7] = htonl(lower_32_bits(subreq->len - subreq->transferred));

call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
@@ -458,9 +463,9 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
*/
void afs_fs_fetch_data(struct afs_operation *op)
{
struct netfs_io_subrequest *subreq = op->fetch.subreq;
struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call;
struct afs_read *req = op->fetch.req;
__be32 *bp;

if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags))
@@ -472,16 +477,14 @@ void afs_fs_fetch_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);

req->call_debug_id = call->debug_id;

/* marshall the parameters */
bp = call->request;
bp[0] = htonl(FSFETCHDATA);
bp[1] = htonl(vp->fid.vid);
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
bp[4] = htonl(lower_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->len));
bp[4] = htonl(lower_32_bits(subreq->start + subreq->transferred));
bp[5] = htonl(lower_32_bits(subreq->len - subreq->transferred));

call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
@@ -1733,6 +1736,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
.op = afs_FS_GetCapabilities,
.deliver = afs_deliver_fs_get_capabilities,
.done = afs_fileserver_probe_result,
.immediate_cancel = afs_fileserver_probe_result,
.destructor = afs_fs_get_capabilities_destructor,
};

fs/afs/inode.c (140 lines changed)
@@ -25,8 +25,94 @@
#include "internal.h"
#include "afs_fs.h"

void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op)
{
size_t size = strlen(op->create.symlink) + 1;
size_t dsize = 0;
char *p;

if (netfs_alloc_folioq_buffer(NULL, &vnode->directory, &dsize, size,
mapping_gfp_mask(vnode->netfs.inode.i_mapping)) < 0)
return;

vnode->directory_size = dsize;
p = kmap_local_folio(folioq_folio(vnode->directory, 0), 0);
memcpy(p, op->create.symlink, size);
kunmap_local(p);
set_bit(AFS_VNODE_DIR_READ, &vnode->flags);
netfs_single_mark_inode_dirty(&vnode->netfs.inode);
}

static void afs_put_link(void *arg)
{
struct folio *folio = virt_to_folio(arg);

kunmap_local(arg);
folio_put(folio);
}

const char *afs_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
struct folio *folio;
char *content;
ssize_t ret;

if (!dentry) {
/* RCU pathwalk. */
if (!test_bit(AFS_VNODE_DIR_READ, &vnode->flags) || !afs_check_validity(vnode))
return ERR_PTR(-ECHILD);
goto good;
}

if (test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
goto fetch;

ret = afs_validate(vnode, NULL);
if (ret < 0)
return ERR_PTR(ret);

if (!test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags) &&
test_bit(AFS_VNODE_DIR_READ, &vnode->flags))
goto good;

fetch:
ret = afs_read_single(vnode, NULL);
if (ret < 0)
return ERR_PTR(ret);
set_bit(AFS_VNODE_DIR_READ, &vnode->flags);

good:
folio = folioq_folio(vnode->directory, 0);
folio_get(folio);
content = kmap_local_folio(folio, 0);
set_delayed_call(callback, afs_put_link, content);
return content;
}

int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
DEFINE_DELAYED_CALL(done);
const char *content;
int len;

content = afs_get_link(dentry, d_inode(dentry), &done);
if (IS_ERR(content)) {
do_delayed_call(&done);
return PTR_ERR(content);
}

len = umin(strlen(content), buflen);
if (copy_to_user(buffer, content, len))
len = -EFAULT;
do_delayed_call(&done);
return len;
}
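
afs_get_link() and afs_readlink() above use the delayed_call idiom: the producer pins resources (here, the mapped folio) and registers a cleanup, and the consumer runs it once finished with the returned string. A userspace model of the idiom — the function names deliberately mirror the kernel's, but are reimplemented here for illustration:

#include <stdio.h>

struct delayed_call {
	void (*fn)(void *);
	void *arg;
};

static void set_delayed_call(struct delayed_call *d, void (*fn)(void *), void *arg)
{
	d->fn = fn;
	d->arg = arg;
}

static void do_delayed_call(struct delayed_call *d)
{
	if (d->fn)
		d->fn(d->arg);
	d->fn = NULL;            /* safe to call more than once */
}

static void drop_pin(void *arg)
{
	printf("unpinning %s\n", (const char *)arg);
}

int main(void)
{
	struct delayed_call done = { NULL, NULL };
	static char content[] = "/afs/example.org/target";

	set_delayed_call(&done, drop_pin, content);  /* producer side */
	printf("using %s\n", content);               /* consumer uses it */
	do_delayed_call(&done);                      /* then releases */
	return 0;
}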

static const struct inode_operations afs_symlink_inode_operations = {
.get_link = page_get_link,
.get_link = afs_get_link,
.readlink = afs_readlink,
};

static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *parent_vnode)
@@ -110,7 +196,9 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations;
inode->i_mapping->a_ops = &afs_dir_aops;
mapping_set_large_folios(inode->i_mapping);
__set_bit(NETFS_ICTX_SINGLE_NO_UPLOAD, &vnode->netfs.flags);
/* Assume locally cached directory data will be valid. */
__set_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
break;
case AFS_FTYPE_SYMLINK:
/* Symlinks with a mode of 0644 are actually mountpoints. */
@@ -122,13 +210,13 @@ static int afs_inode_init_from_status(struct afs_operation *op,
inode->i_mode = S_IFDIR | 0555;
inode->i_op = &afs_mntpt_inode_operations;
inode->i_fop = &afs_mntpt_file_operations;
inode->i_mapping->a_ops = &afs_symlink_aops;
} else {
inode->i_mode = S_IFLNK | status->mode;
inode->i_op = &afs_symlink_inode_operations;
inode->i_mapping->a_ops = &afs_symlink_aops;
}
inode->i_mapping->a_ops = &afs_dir_aops;
inode_nohighmem(inode);
mapping_set_release_always(inode->i_mapping);
break;
default:
dump_vnode(vnode, op->file[0].vnode != vnode ? op->file[0].vnode : NULL);
@@ -140,15 +228,17 @@ static int afs_inode_init_from_status(struct afs_operation *op,
afs_set_netfs_context(vnode);

vnode->invalid_before = status->data_version;
trace_afs_set_dv(vnode, status->data_version);
inode_set_iversion_raw(&vnode->netfs.inode, status->data_version);

if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
* didn't give us a callback) */
atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
afs_clear_cb_promise(vnode, afs_cb_promise_set_new_symlink);
} else {
vnode->cb_server = op->server;
atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at);
afs_set_cb_promise(vnode, vp->scb.callback.expires_at,
afs_cb_promise_set_new_inode);
}

write_sequnlock(&vnode->cb_lock);
@@ -207,12 +297,17 @@ static void afs_apply_status(struct afs_operation *op,
if (vp->update_ctime)
inode_set_ctime_to_ts(inode, op->ctime);

if (vnode->status.data_version != status->data_version)
if (vnode->status.data_version != status->data_version) {
trace_afs_set_dv(vnode, status->data_version);
data_changed = true;
}

vnode->status = *status;

if (vp->dv_before + vp->dv_delta != status->data_version) {
trace_afs_dv_mismatch(vnode, vp->dv_before, vp->dv_delta,
status->data_version);

if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE)
pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
@@ -223,12 +318,10 @@ static void afs_apply_status(struct afs_operation *op,
op->debug_id);

vnode->invalid_before = status->data_version;
if (vnode->status.type == AFS_FTYPE_DIR) {
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
afs_stat_v(vnode, n_inval);
} else {
if (vnode->status.type == AFS_FTYPE_DIR)
afs_invalidate_dir(vnode, afs_dir_invalid_dv_mismatch);
else
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
change_size = true;
data_changed = true;
unexpected_jump = true;
@@ -258,6 +351,8 @@ static void afs_apply_status(struct afs_operation *op,
inode_set_ctime_to_ts(inode, t);
inode_set_atime_to_ts(inode, t);
}
if (op->ops == &afs_fetch_data_operation)
op->fetch.subreq->rreq->i_size = status->size;
}
}

@@ -273,7 +368,7 @@ static void afs_apply_callback(struct afs_operation *op,
if (!afs_cb_is_broken(vp->cb_break_before, vnode)) {
if (op->volume->type == AFSVL_RWVOL)
vnode->cb_server = op->server;
atomic64_set(&vnode->cb_expires_at, cb->expires_at);
afs_set_cb_promise(vnode, cb->expires_at, afs_cb_promise_set_apply_cb);
}
}

@@ -435,7 +530,9 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
} __packed key;
struct afs_vnode_cache_aux aux;

if (vnode->status.type != AFS_FTYPE_FILE) {
if (vnode->status.type != AFS_FTYPE_FILE &&
vnode->status.type != AFS_FTYPE_DIR &&
vnode->status.type != AFS_FTYPE_SYMLINK) {
vnode->netfs.cache = NULL;
return;
}
@@ -637,6 +734,7 @@ int afs_drop_inode(struct inode *inode)
void afs_evict_inode(struct inode *inode)
{
struct afs_vnode_cache_aux aux;
struct afs_super_info *sbi = AFS_FS_S(inode->i_sb);
struct afs_vnode *vnode = AFS_FS_I(inode);

_enter("{%llx:%llu.%d}",
@@ -648,8 +746,22 @@ void afs_evict_inode(struct inode *inode)

ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);

if ((S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) &&
(inode->i_state & I_DIRTY) &&
!sbi->dyn_root) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.for_sync = true,
.range_end = LLONG_MAX,
};

afs_single_writepages(inode->i_mapping, &wbc);
}

netfs_wait_for_outstanding_io(inode);
truncate_inode_pages_final(&inode->i_data);
netfs_free_folioq_buffer(vnode->directory);

afs_set_cache_aux(vnode, &aux);
netfs_clear_inode_writeback(inode, &aux);

@@ -163,6 +163,7 @@ struct afs_call {
spinlock_t state_lock;
int error; /* error code */
u32 abort_code; /* Remote abort ID or 0 */
unsigned long long remaining; /* How much is left to receive */
unsigned int max_lifespan; /* Maximum lifespan in secs to set if not 0 */
unsigned request_size; /* size of request data */
unsigned reply_max; /* maximum size of reply */
@@ -201,11 +202,17 @@ struct afs_call_type {
/* clean up a call */
void (*destructor)(struct afs_call *call);

/* Async receive processing function */
void (*async_rx)(struct work_struct *work);

/* Work function */
void (*work)(struct work_struct *work);

/* Call done function (gets called immediately on success or failure) */
void (*done)(struct afs_call *call);

/* Handle a call being immediately cancelled. */
void (*immediate_cancel)(struct afs_call *call);
};

/*
@@ -232,28 +239,6 @@ static inline struct key *afs_file_key(struct file *file)
return af->key;
}

/*
* Record of an outstanding read operation on a vnode.
*/
struct afs_read {
loff_t pos; /* Where to start reading */
loff_t len; /* How much we're asking for */
loff_t actual_len; /* How much we're actually getting */
loff_t file_size; /* File size returned by server */
struct key *key; /* The key to use to reissue the read */
struct afs_vnode *vnode; /* The file being read into. */
struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */
afs_dataversion_t data_version; /* Version number returned by server */
refcount_t usage;
unsigned int call_debug_id;
unsigned int nr_pages;
int error;
void (*done)(struct afs_read *);
void (*cleanup)(struct afs_read *);
struct iov_iter *iter; /* Iterator representing the buffer */
struct iov_iter def_iter; /* Default iterator */
};

/*
* AFS superblock private data
* - there's one superblock per volume
@@ -702,13 +687,14 @@ struct afs_vnode {
struct afs_file_status status; /* AFS status info for this file */
afs_dataversion_t invalid_before; /* Child dentries are invalid before this */
struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */
struct mutex io_lock; /* Lock for serialising I/O on this mutex */
struct list_head io_lock_waiters; /* Threads waiting for the I/O lock */
struct rw_semaphore validate_lock; /* lock for validating this vnode */
struct rw_semaphore rmdir_lock; /* Lock for rmdir vs sillyrename */
struct key *silly_key; /* Silly rename key */
spinlock_t wb_lock; /* lock for wb_keys */
spinlock_t lock; /* waitqueue/flags lock */
unsigned long flags;
#define AFS_VNODE_IO_LOCK 0 /* Set if the I/O serialisation lock is held */
#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
@@ -719,7 +705,9 @@ struct afs_vnode {
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */
#define AFS_VNODE_DIR_READ 11 /* Set if we've read a dir's contents */

struct folio_queue *directory; /* Directory contents */
struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */
struct list_head granted_locks; /* locks granted on this file */
@@ -728,6 +716,7 @@ struct afs_vnode {
ktime_t locked_at; /* Time at which lock obtained */
enum afs_lock_state lock_state : 8;
afs_lock_type_t lock_type : 8;
unsigned int directory_size; /* Amount of space in ->directory */

/* outstanding callback notification on this file */
struct work_struct cb_work; /* Work for mmap'd files */
@@ -907,7 +896,7 @@ struct afs_operation {
bool new_negative;
} rename;
struct {
struct afs_read *req;
struct netfs_io_subrequest *subreq;
} fetch;
struct {
afs_lock_type_t type;
@@ -959,6 +948,7 @@ struct afs_operation {
#define AFS_OPERATION_TRIED_ALL 0x0400 /* Set if we've tried all the fileservers */
#define AFS_OPERATION_RETRY_SERVER 0x0800 /* Set if we should retry the current server */
#define AFS_OPERATION_DIR_CONFLICT 0x1000 /* Set if we detected a 3rd-party dir change */
#define AFS_OPERATION_ASYNC 0x2000 /* Set if should run asynchronously */
};

/*
@@ -983,6 +973,21 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
i_size_read(&vnode->netfs.inode), flags);
}

/*
* Directory iteration management.
*/
struct afs_dir_iter {
struct afs_vnode *dvnode;
union afs_xdr_dir_block *block;
struct folio_queue *fq;
unsigned int fpos;
int fq_slot;
unsigned int loop_check;
u8 nr_slots;
u8 bucket;
unsigned int prev_entry;
};

#include <trace/events/afs.h>

/*****************************************************************************/
@@ -1064,8 +1069,13 @@ extern const struct inode_operations afs_dir_inode_operations;
extern const struct address_space_operations afs_dir_aops;
extern const struct dentry_operations afs_fs_dentry_operations;

ssize_t afs_read_single(struct afs_vnode *dvnode, struct file *file);
ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
__acquires(&dvnode->validate_lock);
extern void afs_d_release(struct dentry *);
extern void afs_check_for_remote_deletion(struct afs_operation *);
int afs_single_writepages(struct address_space *mapping,
struct writeback_control *wbc);

/*
* dir_edit.c
@@ -1075,6 +1085,18 @@ extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *
extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
enum afs_edit_dir_reason why);
void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode);

/*
* dir_search.c
*/
unsigned int afs_dir_hash_name(const struct qstr *name);
bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name);
union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block);
int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name,
struct afs_fid *_fid);
int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name,
struct afs_fid *_fid, afs_dataversion_t *_dir_version);

/*
* dir_silly.c
@@ -1099,24 +1121,17 @@ extern void afs_dynroot_depopulate(struct super_block *);
* file.c
*/
extern const struct address_space_operations afs_file_aops;
extern const struct address_space_operations afs_symlink_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
extern const struct afs_operation_ops afs_fetch_data_operation;
extern const struct netfs_request_ops afs_req_ops;

extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *);
extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);

static inline struct afs_read *afs_get_read(struct afs_read *req)
{
refcount_inc(&req->usage);
return req;
}
void afs_fetch_data_async_rx(struct work_struct *work);
void afs_fetch_data_immediate_cancel(struct afs_call *call);

/*
* flock.c
@@ -1168,6 +1183,7 @@ extern void afs_fs_store_acl(struct afs_operation *);
extern struct afs_operation *afs_alloc_operation(struct key *, struct afs_volume *);
extern int afs_put_operation(struct afs_operation *);
extern bool afs_begin_vnode_operation(struct afs_operation *);
extern void afs_end_vnode_operation(struct afs_operation *op);
extern void afs_wait_for_operation(struct afs_operation *);
extern int afs_do_sync_operation(struct afs_operation *);

@@ -1205,6 +1221,10 @@ extern void afs_fs_probe_cleanup(struct afs_net *);
*/
extern const struct afs_operation_ops afs_fetch_status_operation;

void afs_init_new_symlink(struct afs_vnode *vnode, struct afs_operation *op);
const char *afs_get_link(struct dentry *dentry, struct inode *inode,
struct delayed_call *callback);
int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
@@ -1336,6 +1356,7 @@ extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
void afs_deferred_put_call(struct afs_call *call);
void afs_make_call(struct afs_call *call, gfp_t gfp);
void afs_deliver_to_call(struct afs_call *call);
void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
const struct afs_call_type *,
@@ -1346,6 +1367,28 @@ extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
extern int afs_extract_data(struct afs_call *, bool);
extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause);

static inline struct afs_call *afs_get_call(struct afs_call *call,
enum afs_call_trace why)
{
int r;

__refcount_inc(&call->ref, &r);

trace_afs_call(call->debug_id, why, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
return call;
}

static inline void afs_see_call(struct afs_call *call, enum afs_call_trace why)
{
int r = refcount_read(&call->ref);

trace_afs_call(call->debug_id, why, r,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
}
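
afs_get_call() above takes the pre-increment count from __refcount_inc() itself, so the traced value is exact even when several CPUs bump the count at once; a second read would be racy. The same idea in portable C with stdatomic (illustrative names only):

#include <stdatomic.h>
#include <stdio.h>

/* Fetch the old value from the atomic op, then log old + 1: the logged
 * count is the one this increment actually produced. */
static int ref_get_traced(_Atomic int *ref, const char *why)
{
	int old = atomic_fetch_add(ref, 1);

	fprintf(stderr, "ref %s -> %d\n", why, old + 1);
	return old + 1;
}

int main(void)
{
	_Atomic int ref = 1;

	ref_get_traced(&ref, "work");
	return 0;
}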

static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call,
gfp_t gfp)
{
@@ -1712,6 +1755,38 @@ static inline int afs_bad(struct afs_vnode *vnode, enum afs_file_error where)
return -EIO;
}

/*
* Set the callback promise on a vnode.
*/
static inline void afs_set_cb_promise(struct afs_vnode *vnode, time64_t expires_at,
enum afs_cb_promise_trace trace)
{
atomic64_set(&vnode->cb_expires_at, expires_at);
trace_afs_cb_promise(vnode, trace);
}

/*
* Clear the callback promise on a vnode, returning true if it was promised.
*/
static inline bool afs_clear_cb_promise(struct afs_vnode *vnode,
enum afs_cb_promise_trace trace)
{
trace_afs_cb_promise(vnode, trace);
return atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE;
}
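
The clear helper above hinges on atomic64_xchg(): swapping in the sentinel and testing the old value in one step means exactly one caller observes the live promise, however many race to clear it. A minimal C sketch of the same pattern (sentinel and names invented):

#include <stdatomic.h>
#include <stdbool.h>

#define NO_PROMISE (-1LL)

/* Swap in the sentinel and report whether a live promise was displaced,
 * so a caller can bump its break counter exactly once. */
static bool clear_promise(_Atomic long long *expires_at)
{
	return atomic_exchange(expires_at, NO_PROMISE) != NO_PROMISE;
}

int main(void)
{
	_Atomic long long expires_at = 12345;
	int breaks = 0;

	if (clear_promise(&expires_at))
		breaks++;            /* first clear counts... */
	if (clear_promise(&expires_at))
		breaks++;            /* ...a second clear does not */
	return breaks == 1 ? 0 : 1;
}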

/*
* Mark a directory as being invalid.
*/
static inline void afs_invalidate_dir(struct afs_vnode *dvnode,
enum afs_dir_invalid_trace trace)
{
if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
trace_afs_dir_invalid(dvnode, trace);
afs_stat_v(dvnode, n_inval);
}
}

/*****************************************************************************/
/*
* debug tracing

@@ -177,7 +177,7 @@ static int __init afs_init(void)
afs_wq = alloc_workqueue("afs", 0, 0);
if (!afs_wq)
goto error_afs_wq;
afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM, 0);
afs_async_calls = alloc_workqueue("kafsd", WQ_MEM_RECLAIM | WQ_UNBOUND, 0);
if (!afs_async_calls)
goto error_async;
afs_lock_manager = alloc_workqueue("kafs_lockd", WQ_MEM_RECLAIM, 0);

@@ -30,7 +30,7 @@ const struct file_operations afs_mntpt_file_operations = {

const struct inode_operations afs_mntpt_inode_operations = {
.lookup = afs_mntpt_lookup,
.readlink = page_readlink,
.readlink = afs_readlink,
.getattr = afs_getattr,
};

@@ -118,9 +118,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ctx->volnamesz = sizeof(afs_root_volume) - 1;
} else {
/* read the contents of the AFS special symlink */
struct page *page;
DEFINE_DELAYED_CALL(cleanup);
const char *content;
loff_t size = i_size_read(d_inode(mntpt));
char *buf;

if (src_as->cell)
ctx->cell = afs_use_cell(src_as->cell, afs_cell_trace_use_mntpt);
@@ -128,16 +128,16 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size < 2 || size > PAGE_SIZE - 1)
return -EINVAL;

page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
if (IS_ERR(page))
return PTR_ERR(page);
content = afs_get_link(mntpt, d_inode(mntpt), &cleanup);
if (IS_ERR(content)) {
do_delayed_call(&cleanup);
return PTR_ERR(content);
}

buf = kmap(page);
ret = -EINVAL;
if (buf[size - 1] == '.')
ret = vfs_parse_fs_string(fc, "source", buf, size - 1);
kunmap(page);
put_page(page);
if (content[size - 1] == '.')
ret = vfs_parse_fs_string(fc, "source", content, size - 1);
do_delayed_call(&cleanup);
if (ret < 0)
return ret;

@@ -240,7 +240,13 @@ static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size)
/* determine command to perform */
_debug("rootcell=%s", buf);

ret = afs_cell_init(net, buf);
ret = -EEXIST;
inode_lock(file_inode(file));
if (!net->ws_cell)
ret = afs_cell_init(net, buf);
else
printk("busy\n");
inode_unlock(file_inode(file));

out:
_leave(" = %d", ret);
@@ -99,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
write_seqlock(&vnode->cb_lock);
ASSERTCMP(cb_server, ==, vnode->cb_server);
vnode->cb_server = NULL;
if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
if (afs_clear_cb_promise(vnode, afs_cb_promise_clear_rotate_server))
vnode->cb_break++;
write_sequnlock(&vnode->cb_lock);
}
@@ -583,7 +583,7 @@ selected_server:
if (vnode->cb_server != server) {
vnode->cb_server = server;
vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
afs_clear_cb_promise(vnode, afs_cb_promise_clear_server_change);
}

retry_server:

@@ -149,7 +149,8 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->net = net;
call->debug_id = atomic_inc_return(&rxrpc_debug_id);
refcount_set(&call->ref, 1);
INIT_WORK(&call->async_work, afs_process_async_call);
INIT_WORK(&call->async_work, type->async_rx ?: afs_process_async_call);
INIT_WORK(&call->work, call->type->work);
INIT_WORK(&call->free_work, afs_deferred_free_worker);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
@@ -235,27 +236,12 @@ void afs_deferred_put_call(struct afs_call *call)
schedule_work(&call->free_work);
}

static struct afs_call *afs_get_call(struct afs_call *call,
enum afs_call_trace why)
{
int r;

__refcount_inc(&call->ref, &r);

trace_afs_call(call->debug_id, why, r + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
return call;
}

/*
* Queue the call for actual work.
*/
static void afs_queue_call_work(struct afs_call *call)
{
if (call->type->work) {
INIT_WORK(&call->work, call->type->work);

afs_get_call(call, afs_call_trace_work);
if (!queue_work(afs_wq, &call->work))
afs_put_call(call);
@@ -430,11 +416,16 @@ void afs_make_call(struct afs_call *call, gfp_t gfp)
return;

error_do_abort:
if (ret != -ECONNABORTED) {
if (ret != -ECONNABORTED)
rxrpc_kernel_abort_call(call->net->socket, rxcall,
RX_USER_ABORT, ret,
afs_abort_send_data_error);
} else {
if (call->async) {
afs_see_call(call, afs_call_trace_async_abort);
return;
}

if (ret == -ECONNABORTED) {
len = 0;
iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
@@ -445,8 +436,10 @@ error_do_abort:
call->error = ret;
trace_afs_call_done(call);
error_kill_call:
if (call->type->done)
call->type->done(call);
if (call->async)
afs_see_call(call, afs_call_trace_async_kill);
if (call->type->immediate_cancel)
call->type->immediate_cancel(call);

/* We need to dispose of the extra ref we grabbed for an async call.
* The call, however, might be queued on afs_async_calls and we need to
@@ -501,7 +494,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
/*
* deliver messages to a call
*/
static void afs_deliver_to_call(struct afs_call *call)
void afs_deliver_to_call(struct afs_call *call)
{
enum afs_call_state state;
size_t len;
@@ -602,7 +595,6 @@ local_abort:
abort_code = 0;
call_complete:
afs_set_call_complete(call, ret, remote_abort);
state = AFS_CALL_COMPLETE;
goto done;
}

@@ -803,6 +795,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
return -ENOTSUPP;

trace_afs_cb_call(call);
call->work.func = call->type->work;

/* pass responsibility for the remainder of this message off to the
* cache manager op */

@@ -663,7 +663,7 @@ static void afs_i_init_once(void *_vnode)

memset(vnode, 0, sizeof(*vnode));
inode_init_once(&vnode->netfs.inode);
mutex_init(&vnode->io_lock);
INIT_LIST_HEAD(&vnode->io_lock_waiters);
init_rwsem(&vnode->validate_lock);
spin_lock_init(&vnode->wb_lock);
spin_lock_init(&vnode->lock);
@@ -696,6 +696,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
vnode->volume = NULL;
vnode->lock_key = NULL;
vnode->permit_cache = NULL;
vnode->directory = NULL;
vnode->directory_size = 0;

vnode->flags = 1 << AFS_VNODE_UNSET;
vnode->lock_state = AFS_VNODE_LOCK_NONE;

@@ -120,22 +120,31 @@
bool afs_check_validity(const struct afs_vnode *vnode)
{
const struct afs_volume *volume = vnode->volume;
enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
time64_t deadline = ktime_get_real_seconds() + 10;

if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
return true;

if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
atomic64_read(&vnode->cb_expires_at) <= deadline ||
volume->cb_expires_at <= deadline ||
vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
_debug("inval");
return false;
}

return true;
if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
trace = afs_vnode_invalid_trace_cb_v_break;
else if (cb_expires_at == AFS_NO_CB_PROMISE)
trace = afs_vnode_invalid_trace_no_cb_promise;
else if (cb_expires_at <= deadline)
trace = afs_vnode_invalid_trace_expired;
else if (volume->cb_expires_at <= deadline)
trace = afs_vnode_invalid_trace_vol_expired;
else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
trace = afs_vnode_invalid_trace_cb_ro_snapshot;
else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
trace = afs_vnode_invalid_trace_cb_scrub;
else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
trace = afs_vnode_invalid_trace_zap_data;
else
return true;
trace_afs_vnode_invalid(vnode, trace);
return false;
}
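
The rewrite of afs_check_validity() replaces one opaque compound condition with an if/else-if chain that records which test failed, so a single trace point reports the first cause of invalidity. A reduced C sketch of the pattern with invented reason tags (not the kernel's trace enum):

#include <stdbool.h>

enum invalid_reason {
	OBJ_VALID,
	OBJ_NO_PROMISE,
	OBJ_EXPIRED,
	OBJ_ZAPPED,
};

/* Evaluate the same conditions in order, but remember *which* one failed
 * instead of collapsing them into a bare boolean. */
static enum invalid_reason classify(long long expires_at, long long now,
				    bool zap_data)
{
	enum invalid_reason reason;

	if (expires_at == -1)              /* no promise recorded at all */
		reason = OBJ_NO_PROMISE;
	else if (expires_at <= now + 10)   /* the 10-second deadline */
		reason = OBJ_EXPIRED;
	else if (zap_data)
		reason = OBJ_ZAPPED;
	else
		return OBJ_VALID;
	/* a trace call would fire here, once, with the first cause */
	return reason;
}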
|
||||
|
||||
/*
|
||||
|
@ -370,6 +370,7 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
|
||||
.name = "VL.GetCapabilities",
|
||||
.op = afs_VL_GetCapabilities,
|
||||
.deliver = afs_deliver_vl_get_capabilities,
|
||||
.immediate_cancel = afs_vlserver_probe_result,
|
||||
.done = afs_vlserver_probe_result,
|
||||
.destructor = afs_destroy_vl_get_capabilities,
|
||||
};
|
||||
|
@@ -182,8 +182,8 @@ void afs_issue_write(struct netfs_io_subrequest *subreq)
 */
void afs_begin_writeback(struct netfs_io_request *wreq)
{
	afs_get_writeback_key(wreq);
	wreq->io_streams[0].avail = true;
	if (S_ISREG(wreq->inode->i_mode))
		afs_get_writeback_key(wreq);
}

/*

@@ -196,6 +196,18 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st
		list_first_entry(&stream->subrequests,
				 struct netfs_io_subrequest, rreq_link);

	switch (wreq->origin) {
	case NETFS_READAHEAD:
	case NETFS_READPAGE:
	case NETFS_READ_GAPS:
	case NETFS_READ_SINGLE:
	case NETFS_READ_FOR_WRITE:
	case NETFS_DIO_READ:
		return;
	default:
		break;
	}

	switch (subreq->error) {
	case -EACCES:
	case -EPERM:

@@ -88,7 +88,7 @@ union afs_xdr_dir_block {

	struct {
		struct afs_xdr_dir_hdr hdr;
		u8 alloc_ctrs[AFS_DIR_MAX_BLOCKS];
		u8 alloc_ctrs[AFS_DIR_BLOCKS_WITH_CTR];
		__be16 hashtable[AFS_DIR_HASHTBL_SIZE];
	} meta;

@@ -352,19 +352,19 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call)
static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
{
	struct afs_operation *op = call->op;
	struct netfs_io_subrequest *subreq = op->fetch.subreq;
	struct afs_vnode_param *vp = &op->file[0];
	struct afs_read *req = op->fetch.req;
	const __be32 *bp;
	size_t count_before;
	int ret;

	_enter("{%u,%zu, %zu/%llu}",
	       call->unmarshall, call->iov_len, iov_iter_count(call->iter),
	       req->actual_len);
	       call->remaining);

	switch (call->unmarshall) {
	case 0:
		req->actual_len = 0;
		call->remaining = 0;
		afs_extract_to_tmp64(call);
		call->unmarshall++;
		fallthrough;

@@ -379,42 +379,39 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
		if (ret < 0)
			return ret;

		req->actual_len = be64_to_cpu(call->tmp64);
		_debug("DATA length: %llu", req->actual_len);
		call->remaining = be64_to_cpu(call->tmp64);
		_debug("DATA length: %llu", call->remaining);

		if (req->actual_len == 0)
		if (call->remaining == 0)
			goto no_more_data;

		call->iter = req->iter;
		call->iov_len = min(req->actual_len, req->len);
		call->iter = &subreq->io_iter;
		call->iov_len = min(call->remaining, subreq->len - subreq->transferred);
		call->unmarshall++;
		fallthrough;

		/* extract the returned data */
	case 2:
		count_before = call->iov_len;
		_debug("extract data %zu/%llu", count_before, req->actual_len);
		_debug("extract data %zu/%llu", count_before, call->remaining);

		ret = afs_extract_data(call, true);
		if (req->subreq) {
			req->subreq->transferred += count_before - call->iov_len;
			netfs_read_subreq_progress(req->subreq, false);
		}
		subreq->transferred += count_before - call->iov_len;
		if (ret < 0)
			return ret;

		call->iter = &call->def_iter;
		if (req->actual_len <= req->len)
		if (call->remaining)
			goto no_more_data;

		/* Discard any excess data the server gave us */
		afs_extract_discard(call, req->actual_len - req->len);
		afs_extract_discard(call, call->remaining);
		call->unmarshall = 3;
		fallthrough;

	case 3:
		_debug("extract discard %zu/%llu",
		       iov_iter_count(call->iter), req->actual_len - req->len);
		       iov_iter_count(call->iter), call->remaining);

		ret = afs_extract_data(call, true);
		if (ret < 0)

@@ -439,8 +436,8 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
		xdr_decode_YFSCallBack(&bp, call, &vp->scb);
		xdr_decode_YFSVolSync(&bp, &op->volsync);

		req->data_version = vp->scb.status.data_version;
		req->file_size = vp->scb.status.size;
		if (subreq->start + subreq->transferred >= vp->scb.status.size)
			__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);

		call->unmarshall++;
		fallthrough;

@@ -459,7 +456,9 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
static const struct afs_call_type yfs_RXYFSFetchData64 = {
	.name = "YFS.FetchData64",
	.op = yfs_FS_FetchData64,
	.async_rx = afs_fetch_data_async_rx,
	.deliver = yfs_deliver_fs_fetch_data64,
	.immediate_cancel = afs_fetch_data_immediate_cancel,
	.destructor = afs_flat_call_destructor,
};

@@ -468,14 +467,15 @@ static const struct afs_call_type yfs_RXYFSFetchData64 = {
 */
void yfs_fs_fetch_data(struct afs_operation *op)
{
	struct netfs_io_subrequest *subreq = op->fetch.subreq;
	struct afs_vnode_param *vp = &op->file[0];
	struct afs_read *req = op->fetch.req;
	struct afs_call *call;
	__be32 *bp;

	_enter(",%x,{%llx:%llu},%llx,%llx",
	_enter(",%x,{%llx:%llu},%llx,%zx",
	       key_serial(op->key), vp->fid.vid, vp->fid.vnode,
	       req->pos, req->len);
	       subreq->start + subreq->transferred,
	       subreq->len - subreq->transferred);

	call = afs_alloc_flat_call(op->net, &yfs_RXYFSFetchData64,
				   sizeof(__be32) * 2 +

@@ -487,15 +487,16 @@ void yfs_fs_fetch_data(struct afs_operation *op)
	if (!call)
		return afs_op_nomem(op);

	req->call_debug_id = call->debug_id;
	if (op->flags & AFS_OPERATION_ASYNC)
		call->async = true;

	/* marshall the parameters */
	bp = call->request;
	bp = xdr_encode_u32(bp, YFSFETCHDATA64);
	bp = xdr_encode_u32(bp, 0); /* RPC flags */
	bp = xdr_encode_YFSFid(bp, &vp->fid);
	bp = xdr_encode_u64(bp, req->pos);
	bp = xdr_encode_u64(bp, req->len);
	bp = xdr_encode_u64(bp, subreq->start + subreq->transferred);
	bp = xdr_encode_u64(bp, subreq->len - subreq->transferred);
	yfs_check_req(call, bp);

	call->fid = vp->fid;
@@ -176,7 +176,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
	    !(file->f_mode & FMODE_CAN_ODIRECT))
		return -EINVAL;

	old_cred = override_creds_light(ctx->cred);
	old_cred = override_creds(ctx->cred);
	if (is_sync_kiocb(iocb)) {
		rwf_t rwf = iocb_to_rw_flags(flags);

@@ -197,7 +197,7 @@ ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
		backing_aio_cleanup(aio, ret);
	}
out:
	revert_creds_light(old_cred);
	revert_creds(old_cred);

	if (ctx->accessed)
		ctx->accessed(iocb->ki_filp);

@@ -233,7 +233,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
	 */
	flags &= ~IOCB_DIO_CALLER_COMP;

	old_cred = override_creds_light(ctx->cred);
	old_cred = override_creds(ctx->cred);
	if (is_sync_kiocb(iocb)) {
		rwf_t rwf = iocb_to_rw_flags(flags);

@@ -264,7 +264,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
		backing_aio_cleanup(aio, ret);
	}
out:
	revert_creds_light(old_cred);
	revert_creds(old_cred);

	return ret;
}

@@ -281,9 +281,9 @@ ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb,
	if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
		return -EIO;

	old_cred = override_creds_light(ctx->cred);
	old_cred = override_creds(ctx->cred);
	ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags);
	revert_creds_light(old_cred);
	revert_creds(old_cred);

	if (ctx->accessed)
		ctx->accessed(iocb->ki_filp);

@@ -310,11 +310,11 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
	if (ret)
		return ret;

	old_cred = override_creds_light(ctx->cred);
	old_cred = override_creds(ctx->cred);
	file_start_write(out);
	ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags);
	file_end_write(out);
	revert_creds_light(old_cred);
	revert_creds(old_cred);

	if (ctx->end_write)
		ctx->end_write(iocb, ret);

@@ -338,9 +338,9 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,

	vma_set_file(vma, file);

	old_cred = override_creds_light(ctx->cred);
	old_cred = override_creds(ctx->cred);
	ret = call_mmap(vma->vm_file, vma);
	revert_creds_light(old_cred);
	revert_creds(old_cred);

	if (ctx->accessed)
		ctx->accessed(user_file);
@@ -90,7 +90,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN

config BCACHEFS_PATH_TRACEPOINTS
	bool "Extra btree_path tracepoints"
	depends on BCACHEFS_FS
	depends on BCACHEFS_FS && TRACING
	help
	  Enable extra tracepoints for debugging btree_path operations; we don't
	  normally want these enabled because they happen at very high rates.

@@ -82,6 +82,7 @@ bcachefs-y := \
	siphash.o \
	six.o \
	snapshot.o \
	str_hash.o \
	subvolume.o \
	super.o \
	super-io.o \
@@ -184,11 +184,6 @@ invalid:
	return ERR_PTR(-EINVAL);
}

#define acl_for_each_entry(acl, acl_e) \
	for (acl_e = acl->a_entries; \
	     acl_e < acl->a_entries + acl->a_count; \
	     acl_e++)

/*
 * Convert from in-memory to filesystem representation.
 */

@@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans,
{
	struct bkey_i_xattr *xattr;
	bch_acl_header *acl_header;
	const struct posix_acl_entry *acl_e;
	const struct posix_acl_entry *acl_e, *pe;
	void *outptr;
	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;

	acl_for_each_entry(acl, acl_e) {
	FOREACH_ACL_ENTRY(acl_e, acl, pe) {
		switch (acl_e->e_tag) {
		case ACL_USER:
		case ACL_GROUP:

@@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans,

	outptr = (void *) acl_header + sizeof(*acl_header);

	acl_for_each_entry(acl, acl_e) {
	FOREACH_ACL_ENTRY(acl_e, acl, pe) {
		bch_acl_entry *entry = outptr;

		entry->e_tag = cpu_to_le16(acl_e->e_tag);
@@ -198,7 +198,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
}

int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
			   struct bkey_validate_context from)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
	int ret = 0;

@@ -213,7 +213,7 @@ fsck_err:
}

int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
			   struct bkey_validate_context from)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

@@ -226,7 +226,7 @@ fsck_err:
}

int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
			   struct bkey_validate_context from)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

@@ -239,7 +239,7 @@ fsck_err:
}

int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
			   enum bch_validate_flags flags)
			   struct bkey_validate_context from)
{
	struct bch_alloc_v4 a;
	int ret = 0;

@@ -322,9 +322,9 @@ fsck_err:
void bch2_alloc_v4_swab(struct bkey_s k)
{
	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
	struct bch_backpointer *bp, *bps;

	a->journal_seq = swab64(a->journal_seq);
	a->journal_seq_nonempty = swab64(a->journal_seq_nonempty);
	a->journal_seq_empty = swab64(a->journal_seq_empty);
	a->flags = swab32(a->flags);
	a->dirty_sectors = swab32(a->dirty_sectors);
	a->cached_sectors = swab32(a->cached_sectors);

@@ -333,13 +333,6 @@ void bch2_alloc_v4_swab(struct bkey_s k)
	a->stripe = swab32(a->stripe);
	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
	a->stripe_sectors = swab32(a->stripe_sectors);

	bps = alloc_v4_backpointers(a);
	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
		bp->bucket_offset = swab40(bp->bucket_offset);
		bp->bucket_len = swab32(bp->bucket_len);
		bch2_bpos_swab(&bp->pos);
	}
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)

@@ -354,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
	prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
	bch2_prt_data_type(out, a->data_type);
	prt_newline(out);
	prt_printf(out, "journal_seq %llu\n", a->journal_seq);
	prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
	prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
	prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
	prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
	prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
	prt_printf(out, "stripe %u\n", a->stripe);
	prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
	prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
	prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);
	prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty);
	prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty);
	prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a));
	prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a));
	prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors);
	prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors);
	prt_printf(out, "cached_sectors %u\n", a->cached_sectors);
	prt_printf(out, "stripe %u\n", a->stripe);
	prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy);
	prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]);
	prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]);

	if (ca)
		prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca));

@@ -392,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
	struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

	*out = (struct bch_alloc_v4) {
		.journal_seq = u.journal_seq,
		.journal_seq_nonempty = u.journal_seq,
		.flags = u.need_discard,
		.gen = u.gen,
		.oldest_gen = u.oldest_gen,

@@ -517,7 +511,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
}

int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k,
			      enum bch_validate_flags flags)
			      struct bkey_validate_context from)
{
	int ret = 0;
||||
@ -664,74 +658,80 @@ int bch2_alloc_read(struct bch_fs *c)
|
||||
|
||||
/* Free space/discard btree: */
|
||||
|
||||
static int __need_discard_or_freespace_err(struct btree_trans *trans,
|
||||
struct bkey_s_c alloc_k,
|
||||
bool set, bool discard, bool repair)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0);
|
||||
enum bch_sb_error_id err_id = discard
|
||||
? BCH_FSCK_ERR_need_discard_key_wrong
|
||||
: BCH_FSCK_ERR_freespace_key_wrong;
|
||||
enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
|
||||
bch2_bkey_val_to_text(&buf, c, alloc_k);
|
||||
|
||||
int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
|
||||
"bucket incorrectly %sset in %s btree\n"
|
||||
" %s",
|
||||
set ? "" : "un",
|
||||
bch2_btree_id_str(btree),
|
||||
buf.buf);
|
||||
if (ret == -BCH_ERR_fsck_ignore ||
|
||||
ret == -BCH_ERR_fsck_errors_not_fixed)
|
||||
ret = 0;
|
||||
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define need_discard_or_freespace_err(...) \
|
||||
fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__))
|
||||
|
||||
#define need_discard_or_freespace_err_on(cond, ...) \
|
||||
(unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false)
|
||||
|
||||
static int bch2_bucket_do_index(struct btree_trans *trans,
|
||||
struct bch_dev *ca,
|
||||
struct bkey_s_c alloc_k,
|
||||
const struct bch_alloc_v4 *a,
|
||||
bool set)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c old;
|
||||
struct bkey_i *k;
|
||||
enum btree_id btree;
|
||||
enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
|
||||
enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
int ret;
|
||||
struct bpos pos;
|
||||
|
||||
if (a->data_type != BCH_DATA_free &&
|
||||
a->data_type != BCH_DATA_need_discard)
|
||||
return 0;
|
||||
|
||||
k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
|
||||
if (IS_ERR(k))
|
||||
return PTR_ERR(k);
|
||||
|
||||
bkey_init(&k->k);
|
||||
k->k.type = new_type;
|
||||
|
||||
switch (a->data_type) {
|
||||
case BCH_DATA_free:
|
||||
btree = BTREE_ID_freespace;
|
||||
k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
|
||||
bch2_key_resize(&k->k, 1);
|
||||
pos = alloc_freespace_pos(alloc_k.k->p, *a);
|
||||
break;
|
||||
case BCH_DATA_need_discard:
|
||||
btree = BTREE_ID_need_discard;
|
||||
k->k.p = alloc_k.k->p;
|
||||
pos = alloc_k.k->p;
|
||||
break;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
|
||||
old = bch2_bkey_get_iter(trans, &iter, btree,
|
||||
bkey_start_pos(&k->k),
|
||||
BTREE_ITER_intent);
|
||||
ret = bkey_err(old);
|
||||
struct btree_iter iter;
|
||||
struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent);
|
||||
int ret = bkey_err(old);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (ca->mi.freespace_initialized &&
|
||||
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
|
||||
bch2_trans_inconsistent_on(old.k->type != old_type, trans,
|
||||
"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
|
||||
" for %s",
|
||||
set ? "setting" : "clearing",
|
||||
bch2_btree_id_str(btree),
|
||||
iter.pos.inode,
|
||||
iter.pos.offset,
|
||||
bch2_bkey_types[old.k->type],
|
||||
bch2_bkey_types[old_type],
|
||||
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
|
||||
ret = -EIO;
|
||||
goto err;
|
||||
}
|
||||
need_discard_or_freespace_err_on(ca->mi.freespace_initialized &&
|
||||
!old.k->type != set,
|
||||
trans, alloc_k, set,
|
||||
btree == BTREE_ID_need_discard, false);
|
||||
|
||||
ret = bch2_trans_update(trans, &iter, k, 0);
|
||||
err:
|
||||
ret = bch2_btree_bit_mod_iter(trans, &iter, set);
|
||||
fsck_err:
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -858,7 +858,10 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
||||
if (flags & BTREE_TRIGGER_transactional) {
|
||||
alloc_data_type_set(new_a, new_a->data_type);
|
||||
|
||||
if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) {
|
||||
int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
|
||||
(int) data_type_is_empty(old_a->data_type);
|
||||
|
||||
if (is_empty_delta < 0) {
|
||||
new_a->io_time[READ] = bch2_current_io_time(c, READ);
|
||||
new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE);
|
||||
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
|
||||
@ -928,37 +931,55 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
||||
}
|
||||
|
||||
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
|
||||
u64 journal_seq = trans->journal_res.seq;
|
||||
u64 bucket_journal_seq = new_a->journal_seq;
|
||||
u64 transaction_seq = trans->journal_res.seq;
|
||||
BUG_ON(!transaction_seq);
|
||||
|
||||
if ((flags & BTREE_TRIGGER_insert) &&
|
||||
data_type_is_empty(old_a->data_type) !=
|
||||
data_type_is_empty(new_a->data_type) &&
|
||||
new.k->type == KEY_TYPE_alloc_v4) {
|
||||
struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
|
||||
if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq,
|
||||
trans, alloc_key_journal_seq_in_future,
|
||||
"bucket journal seq in future (currently at %llu)\n%s",
|
||||
journal_cur_seq(&c->journal),
|
||||
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)))
|
||||
new_a->journal_seq_nonempty = transaction_seq;
|
||||
|
||||
/*
|
||||
* If the btree updates referring to a bucket weren't flushed
|
||||
* before the bucket became empty again, then the we don't have
|
||||
* to wait on a journal flush before we can reuse the bucket:
|
||||
*/
|
||||
v->journal_seq = bucket_journal_seq =
|
||||
data_type_is_empty(new_a->data_type) &&
|
||||
(journal_seq == v->journal_seq ||
|
||||
bch2_journal_noflush_seq(&c->journal, v->journal_seq))
|
||||
? 0 : journal_seq;
|
||||
int is_empty_delta = (int) data_type_is_empty(new_a->data_type) -
|
||||
(int) data_type_is_empty(old_a->data_type);
|
||||
|
||||
/*
|
||||
* Record journal sequence number of empty -> nonempty transition:
|
||||
* Note that there may be multiple empty -> nonempty
|
||||
* transitions, data in a bucket may be overwritten while we're
|
||||
* still writing to it - so be careful to only record the first:
|
||||
* */
|
||||
if (is_empty_delta < 0 &&
|
||||
new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) {
|
||||
new_a->journal_seq_nonempty = transaction_seq;
|
||||
new_a->journal_seq_empty = 0;
|
||||
}
|
||||
|
||||
if (!data_type_is_empty(old_a->data_type) &&
|
||||
data_type_is_empty(new_a->data_type) &&
|
||||
bucket_journal_seq) {
|
||||
ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
|
||||
c->journal.flushed_seq_ondisk,
|
||||
new.k->p.inode, new.k->p.offset,
|
||||
bucket_journal_seq);
|
||||
if (bch2_fs_fatal_err_on(ret, c,
|
||||
"setting bucket_needs_journal_commit: %s", bch2_err_str(ret)))
|
||||
goto err;
|
||||
/*
|
||||
* Bucket becomes empty: mark it as waiting for a journal flush,
|
||||
* unless updates since empty -> nonempty transition were never
|
||||
* flushed - we may need to ask the journal not to flush
|
||||
* intermediate sequence numbers:
|
||||
*/
|
||||
if (is_empty_delta > 0) {
|
||||
if (new_a->journal_seq_nonempty == transaction_seq ||
|
||||
bch2_journal_noflush_seq(&c->journal,
|
||||
new_a->journal_seq_nonempty,
|
||||
transaction_seq)) {
|
||||
new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0;
|
||||
} else {
|
||||
new_a->journal_seq_empty = transaction_seq;
|
||||
|
||||
ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
|
||||
c->journal.flushed_seq_ondisk,
|
||||
new.k->p.inode, new.k->p.offset,
|
||||
transaction_seq);
|
||||
if (bch2_fs_fatal_err_on(ret, c,
|
||||
"setting bucket_needs_journal_commit: %s",
|
||||
bch2_err_str(ret)))
|
||||
goto err;
|
||||
}
|
||||
}
|
||||
|
||||
if (new_a->gen != old_a->gen) {
|
||||
@ -974,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
||||
|
||||
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
|
||||
#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr)
|
||||
#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
|
||||
#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk)
|
||||
|
||||
if (statechange(a->data_type == BCH_DATA_free) &&
|
||||
bucket_flushed(new_a))
|
||||
@ -1006,6 +1027,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
|
||||
rcu_read_unlock();
|
||||
}
|
||||
err:
|
||||
fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
bch2_dev_put(ca);
|
||||
return ret;
|
||||
@@ -1045,7 +1067,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
	 * btree node min/max is a closed interval, upto takes a half
	 * open interval:
	 */
	k = bch2_btree_iter_peek_upto(&iter2, end);
	k = bch2_btree_iter_peek_max(&iter2, end);
	next = iter2.pos;
	bch2_trans_iter_exit(iter->trans, &iter2);

@@ -1129,7 +1151,6 @@ int bch2_check_alloc_key(struct btree_trans *trans,
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	unsigned discard_key_type, freespace_key_type;
	unsigned gens_offset;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;

@@ -1149,64 +1170,30 @@ int bch2_check_alloc_key(struct btree_trans *trans,

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
	k = bch2_btree_iter_peek_slot(discard_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != discard_key_type,
			trans, need_discard_key_wrong,
			"incorrect key in need_discard btree (got %s should be %s)\n"
			" %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[discard_key_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = discard_key_type;
		update->k.p = discard_iter->pos;

		ret = bch2_trans_update(trans, discard_iter, update, 0);
	bool is_discarded = a->data_type == BCH_DATA_need_discard;
	if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded,
					     trans, alloc_k, !is_discarded, true, true)) {
		ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded);
		if (ret)
			goto err;
	}

	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != freespace_key_type,
			trans, freespace_key_wrong,
			"incorrect key in freespace btree (got %s should be %s)\n"
			" %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[freespace_key_type],
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type = freespace_key_type;
		update->k.p = freespace_iter->pos;
		bch2_key_resize(&update->k, 1);

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
	bool is_free = a->data_type == BCH_DATA_free;
	if (need_discard_or_freespace_err_on(!!k.k->type != is_free,
					     trans, alloc_k, !is_free, false, true)) {
		ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free);
		if (ret)
			goto err;
	}

@@ -1368,51 +1355,88 @@ fsck_err:
	return ret;
}

static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
								struct btree_iter *iter)
struct check_discard_freespace_key_async {
	struct work_struct work;
	struct bch_fs *c;
	struct bbpos pos;
};

static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos)
{
	struct btree_iter iter;
	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0);
	int ret = bkey_err(k);
	if (ret)
		return ret;

	u8 gen;
	ret = k.k->type != KEY_TYPE_set
		? bch2_check_discard_freespace_key(trans, &iter, &gen, false)
		: 0;
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static void check_discard_freespace_key_work(struct work_struct *work)
{
	struct check_discard_freespace_key_async *w =
		container_of(work, struct check_discard_freespace_key_async, work);

	bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
	bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
	kfree(w);
}

int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen,
				     bool async_repair)
{
	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter;
	struct bkey_s_c alloc_k;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 genbits;
	struct bpos pos;
	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
		? BCH_DATA_need_discard
		: BCH_DATA_free;
	struct printbuf buf = PRINTBUF;
	int ret;

	pos = iter->pos;
	pos.offset &= ~(~0ULL << 56);
	genbits = iter->pos.offset & (~0ULL << 56);
	struct bpos bucket = iter->pos;
	bucket.offset &= ~(~0ULL << 56);
	u64 genbits = iter->pos.offset & (~0ULL << 56);

	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
	ret = bkey_err(alloc_k);
	struct btree_iter alloc_iter;
	struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
						     BTREE_ID_alloc, bucket,
						     async_repair ? BTREE_ITER_cached : 0);
	int ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos),
			trans, need_discard_freespace_key_to_invalid_dev_bucket,
			"entry in %s btree for nonexistant dev:bucket %llu:%llu",
			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
		goto delete;
	if (!bch2_dev_bucket_exists(c, bucket)) {
		if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
			     "entry in %s btree for nonexistant dev:bucket %llu:%llu",
			     bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
			goto delete;
		ret = 1;
		goto out;
	}

	a = bch2_alloc_to_v4(alloc_k, &a_convert);
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (fsck_err_on(a->data_type != state ||
			(state == BCH_DATA_free &&
			 genbits != alloc_freespace_genbits(*a)),
			trans, need_discard_freespace_key_bad,
			"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			bch2_btree_id_str(iter->btree_id),
			iter->pos.inode,
			iter->pos.offset,
			a->data_type == state,
			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
		goto delete;
	if (a->data_type != state ||
	    (state == BCH_DATA_free &&
	     genbits != alloc_freespace_genbits(*a))) {
		if (fsck_err(trans, need_discard_freespace_key_bad,
			     "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			     (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			     bch2_btree_id_str(iter->btree_id),
			     iter->pos.inode,
			     iter->pos.offset,
			     a->data_type == state,
			     genbits >> 56, alloc_freespace_genbits(*a) >> 56))
			goto delete;
		ret = 1;
		goto out;
	}

	*gen = a->gen;
out:
fsck_err:
	bch2_set_btree_iter_dontneed(&alloc_iter);
@@ -1420,11 +1444,40 @@ fsck_err:
	printbuf_exit(&buf);
	return ret;
delete:
	ret = bch2_btree_delete_extent_at(trans, iter,
			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc);
	goto out;
	if (!async_repair) {
		ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
			bch2_trans_commit(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc) ?:
			-BCH_ERR_transaction_restart_commit;
		goto out;
	} else {
		/*
		 * We can't repair here when called from the allocator path: the
		 * commit will recurse back into the allocator
		 */
		struct check_discard_freespace_key_async *w =
			kzalloc(sizeof(*w), GFP_KERNEL);
		if (!w)
			goto out;

		if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
			kfree(w);
			goto out;
		}

		INIT_WORK(&w->work, check_discard_freespace_key_work);
		w->c = c;
		w->pos = BBPOS(iter->btree_id, iter->pos);
		queue_work(c->write_ref_wq, &w->work);
		goto out;
	}
}

static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter)
{
	u8 gen;
	int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false);
	return ret < 0 ? ret : 0;
}

/*

@@ -1581,7 +1634,7 @@ bkey_err:
	ret = for_each_btree_key(trans, iter,
			BTREE_ID_need_discard, POS_MIN,
			BTREE_ITER_prefetch, k,
		bch2_check_discard_freespace_key(trans, &iter));
		bch2_check_discard_freespace_key_fsck(trans, &iter));
	if (ret)
		goto err;

@@ -1594,7 +1647,7 @@ bkey_err:
			break;

		ret = bkey_err(k) ?:
			bch2_check_discard_freespace_key(trans, &iter);
			bch2_check_discard_freespace_key_fsck(trans, &iter);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			ret = 0;
			continue;

@@ -1757,7 +1810,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
				   struct bch_dev *ca,
				   struct btree_iter *need_discard_iter,
				   struct bpos *discard_pos_done,
				   struct discard_buckets_state *s)
				   struct discard_buckets_state *s,
				   bool fastpath)
{
	struct bch_fs *c = trans->c;
	struct bpos pos = need_discard_iter->pos;

@@ -1793,45 +1847,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
	if (ret)
		goto out;

	if (bch2_bucket_sectors_total(a->v)) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
				trans, "attempting to discard bucket with dirty data\n%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (a->v.data_type != BCH_DATA_need_discard) {
		if (data_type_is_empty(a->v.data_type) &&
		    BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
			a->v.gen++;
			SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
			goto write;
	if (need_discard_or_freespace_err(trans, k, true, true, true)) {
		ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false);
		if (ret)
			goto out;
		goto commit;
	}

		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
				trans, "bucket incorrectly set in need_discard btree\n"
				"%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
				trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
				a->v.journal_seq,
				c->journal.flushed_seq_ondisk,
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	if (!fastpath) {
		if (discard_in_flight_add(ca, iter.pos.offset, true))
			goto out;

		discard_locked = true;
	}

	if (discard_in_flight_add(ca, iter.pos.offset, true))
		goto out;

	discard_locked = true;

	if (!bkey_eq(*discard_pos_done, iter.pos) &&
	    ca->mi.discard && !c->opts.nochanges) {
		/*

@@ -1844,6 +1877,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
				     ca->mi.bucket_size,
				     GFP_KERNEL);
		*discard_pos_done = iter.pos;
		s->discarded++;

		ret = bch2_trans_relock_notrace(trans);
		if (ret)

@@ -1851,22 +1885,25 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
	}

	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
write:
	alloc_data_type_set(&a->v, a->v.data_type);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_WATERMARK_btree|
				  BCH_TRANS_COMMIT_no_enospc);
	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	if (ret)
		goto out;
commit:
	ret = bch2_trans_commit(trans, NULL, NULL,
				BCH_WATERMARK_btree|
				BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto out;

	count_event(c, bucket_discard);
	s->discarded++;
out:
fsck_err:
	if (discard_locked)
		discard_in_flight_remove(ca, iter.pos.offset);
	s->seen++;
	if (!ret)
		s->seen++;
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;

@@ -1886,11 +1923,11 @@ static void bch2_do_discards_work(struct work_struct *work)
	 * successful commit:
	 */
	ret = bch2_trans_run(c,
		for_each_btree_key_upto(trans, iter,
		for_each_btree_key_max(trans, iter,
				BTREE_ID_need_discard,
				POS(ca->dev_idx, 0),
				POS(ca->dev_idx, U64_MAX), 0, k,
			bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s)));
			bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));

	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
			      bch2_err_str(ret));

@@ -1923,27 +1960,29 @@ void bch2_do_discards(struct bch_fs *c)
		bch2_dev_do_discards(ca);
}

static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
static int bch2_do_discards_fast_one(struct btree_trans *trans,
				     struct bch_dev *ca,
				     u64 bucket,
				     struct bpos *discard_pos_done,
				     struct discard_buckets_state *s)
{
	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
	int ret = bkey_err(k);
	struct btree_iter need_discard_iter;
	struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter,
					BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0);
	int ret = bkey_err(discard_k);
	if (ret)
		goto err;
		return ret;

	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto err;
	if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set,
			    trans, discarding_bucket_not_in_need_discard_btree,
			    "attempting to discard bucket %u:%llu not in need_discard btree",
			    ca->dev_idx, bucket))
		goto out;

	BUG_ON(a->v.dirty_sectors);
	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	alloc_data_type_set(&a->v, a->v.data_type);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true);
out:
fsck_err:
	bch2_trans_iter_exit(trans, &need_discard_iter);
	return ret;
}

@@ -1951,6 +1990,10 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work);
	struct bch_fs *c = ca->fs;
	struct discard_buckets_state s = {};
	struct bpos discard_pos_done = POS_MAX;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

	while (1) {
		bool got_bucket = false;

@@ -1971,16 +2014,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
		if (!got_bucket)
			break;

		if (ca->mi.discard && !c->opts.nochanges)
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket_to_sector(ca, bucket),
					     ca->mi.bucket_size,
					     GFP_KERNEL);

		int ret = bch2_trans_commit_do(c, NULL, NULL,
					BCH_WATERMARK_btree|
					BCH_TRANS_COMMIT_no_enospc,
				bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
		ret = lockrestart_do(trans,
			bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s));
		bch_err_fn(c, ret);

		discard_in_flight_remove(ca, bucket);

@@ -1989,6 +2024,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
			break;
	}

	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));

	bch2_trans_put(trans);
	percpu_ref_put(&ca->io_ref);
	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}
@@ -2030,8 +2068,11 @@ static int invalidate_one_bucket(struct btree_trans *trans,
		return 1;

	if (!bch2_dev_bucket_exists(c, bucket)) {
		prt_str(&buf, "lru entry points to invalid bucket");
		goto err;
		if (fsck_err(trans, lru_entry_to_invalid_bucket,
			     "lru key points to nonexistent device:bucket %llu:%llu",
			     bucket.inode, bucket.offset))
			return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
		goto out;
	}

	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))

@@ -2072,28 +2113,9 @@ static int invalidate_one_bucket(struct btree_trans *trans,
	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
	--*nr_to_invalidate;
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
err:
	prt_str(&buf, "\n lru key: ");
	bch2_bkey_val_to_text(&buf, c, lru_k);

	prt_str(&buf, "\n lru entry: ");
	bch2_lru_pos_to_text(&buf, lru_iter->pos);

	prt_str(&buf, "\n alloc key: ");
	if (!a)
		bch2_bpos_to_text(&buf, bucket);
	else
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));

	bch_err(c, "%s", buf.buf);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
		bch2_inconsistent_error(c);
		ret = -EINVAL;
	}

	goto out;
}

static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,

@@ -2101,7 +2123,7 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter
{
	struct bkey_s_c k;
again:
	k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
	k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
	if (!k.k && !*wrapped) {
		bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
		*wrapped = true;

@@ -8,8 +8,6 @@
#include "debug.h"
#include "super.h"

enum bch_validate_flags;

/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U

@@ -245,10 +243,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s

int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);

int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c,
			   struct bkey_validate_context);
int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c,
			   struct bkey_validate_context);
int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c,
			   struct bkey_validate_context);
int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
			   struct bkey_validate_context);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

@@ -282,7 +284,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
})

int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c,
			      enum bch_validate_flags);
			      struct bkey_validate_context);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \

@@ -307,6 +309,8 @@ int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *,
int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
		       struct bkey_s_c, struct bkey_s,
		       enum btree_iter_update_trigger_flags);

int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_dev_do_discards(struct bch_dev *);

@@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)

struct bch_alloc_v4 {
	struct bch_val v;
	__u64 journal_seq;
	__u64 journal_seq_nonempty;
	__u32 flags;
	__u8 gen;
	__u8 oldest_gen;

@@ -70,7 +70,7 @@ struct bch_alloc_v4 {
	__u32 stripe;
	__u32 nr_external_backpointers;
	/* end of fields in original version of alloc_v4 */
	__u64 _fragmentation_lru; /* obsolete */
	__u64 journal_seq_empty;
	__u32 stripe_sectors;
	__u32 pad;
} __packed __aligned(8);
@@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
		return;
	}

	percpu_down_read(&c->mark_lock);
	spin_lock(&ob->lock);

	ob->valid = false;
	ob->data_type = 0;

	spin_unlock(&ob->lock);
	percpu_up_read(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	bch2_open_bucket_hash_remove(c, ob);

@@ -156,6 +152,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
	return ob;
}

static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
		return false;

	return bch2_is_superblock_bucket(ca, b);
}

static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
	BUG_ON(c->open_buckets_partial_nr >=

@@ -175,20 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
	closure_wake_up(&c->freelist_wait);
}

/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
		u64 b = ca->new_fs_bucket_idx++;

		if (!is_superblock_bucket(ca, b) &&
		    (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
			return b;
	}

	return -1;
}

static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
	switch (watermark) {

@@ -206,33 +196,40 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
	}
}

static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket,
					      enum bch_watermark watermark,
					      const struct bch_alloc_v4 *a,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
static inline bool may_alloc_bucket(struct bch_fs *c,
				    struct bpos bucket,
				    struct bucket_alloc_state *s)
{
	struct open_bucket *ob;

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
	if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
		s->skipped_open++;
		return NULL;
		return false;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
			c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) {
		s->skipped_need_journal_commit++;
		return NULL;
		return false;
	}

	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
		s->skipped_nocow++;
		return false;
	}

	return true;
}

static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket, u8 gen,
					      enum bch_watermark watermark,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
{
	if (unlikely(is_superblock_bucket(c, ca, bucket)))
		return NULL;

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

@@ -254,14 +251,13 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
		return NULL;
	}

	ob = bch2_open_bucket_alloc(c);
	struct open_bucket *ob = bch2_open_bucket_alloc(c);

	spin_lock(&ob->lock);

	ob->valid = true;
	ob->sectors_free = ca->mi.bucket_size;
	ob->dev = ca->dev_idx;
	ob->gen = a->gen;
	ob->gen = gen;
	ob->bucket = bucket;
	spin_unlock(&ob->lock);
@@ -276,111 +272,29 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
}

static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
					    enum bch_watermark watermark, u64 free_entry,
					    enum bch_watermark watermark,
					    struct bucket_alloc_state *s,
					    struct bkey_s_c freespace_k,
					    struct btree_iter *freespace_iter,
					    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	struct open_bucket *ob;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 b = free_entry & ~(~0ULL << 56);
	unsigned genbits = free_entry >> 56;
	struct printbuf buf = PRINTBUF;
	int ret;
	u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);

	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
			   " freespace key ",
			   ca->mi.first_bucket, ca->mi.nbuckets);
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}
	if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
		return NULL;

	k = bch2_bkey_get_iter(trans, &iter,
			       BTREE_ID_alloc, POS(ca->dev_idx, b),
			       BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret) {
		ob = ERR_PTR(ret);
		goto err;
	}
	u8 gen;
	int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret)
		return NULL;

	a = bch2_alloc_to_v4(k, &a_convert);

	if (a->data_type != BCH_DATA_free) {
		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
			ob = NULL;
			goto err;
		}

		prt_printf(&buf, "non free bucket in freespace btree\n"
			   " freespace key ");
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		prt_printf(&buf, "\n ");
		bch2_bkey_val_to_text(&buf, c, k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
			   " freespace key ",
			   genbits, alloc_freespace_genbits(*a) >> 56);
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		prt_printf(&buf, "\n ");
		bch2_bkey_val_to_text(&buf, c, k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
		struct bch_backpointer bp;
		struct bpos bp_pos = POS_MIN;

		ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1,
						&bp_pos, &bp,
						BTREE_ITER_nopreserve);
		if (ret) {
			ob = ERR_PTR(ret);
			goto err;
		}

		if (!bkey_eq(bp_pos, POS_MAX)) {
			/*
			 * Bucket may have data in it - we don't call
			 * bc2h_trans_inconnsistent() because fsck hasn't
			 * finished yet
			 */
			ob = NULL;
			goto err;
		}
	}

	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
	if (!ob)
		bch2_set_btree_iter_dontneed(&iter);
err:
	if (iter.path)
		bch2_set_btree_iter_dontneed(&iter);
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ob;
	return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
}

/*
 * This path is for before the freespace btree is initialized:
 *
 * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
 * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
 */
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,

@@ -389,10 +303,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
			struct bucket_alloc_state *s,
			struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, citer;
	struct bkey_s_c k, ck;
	struct open_bucket *ob = NULL;
	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
	u64 first_bucket = ca->mi.first_bucket;
	u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
	u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
	u64 alloc_cursor = alloc_start;

@@ -415,10 +330,6 @@ again:
		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
			break;

		if (ca->new_fs_bucket_idx &&
		    is_superblock_bucket(ca, k.k->p.offset))
			continue;

		if (s->btree_bitmap != BTREE_BITMAP_ANY &&
		    s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
				bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {

@@ -452,7 +363,10 @@ again:

		s->buckets_seen++;

		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
		ob = may_alloc_bucket(c, k.k->p, s)
			? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
					     watermark, s, cl)
			: NULL;
next:
		bch2_set_btree_iter_dontneed(&citer);
		bch2_trans_iter_exit(trans, &citer);
@ -489,20 +403,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
|
||||
u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
|
||||
u64 alloc_cursor = alloc_start;
|
||||
int ret;
|
||||
|
||||
BUG_ON(ca->new_fs_bucket_idx);
|
||||
again:
|
||||
for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
|
||||
POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
|
||||
if (k.k->p.inode != ca->dev_idx)
|
||||
break;
|
||||
for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
|
||||
POS(ca->dev_idx, alloc_cursor),
|
||||
POS(ca->dev_idx, U64_MAX),
|
||||
0, k, ret) {
|
||||
/*
|
||||
* peek normally dosen't trim extents - they can span iter.pos,
|
||||
* which is not what we want here:
|
||||
*/
|
||||
iter.k.size = iter.k.p.offset - iter.pos.offset;

for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
alloc_cursor < k.k->p.offset;
alloc_cursor++) {
while (iter.k.size) {
s->buckets_seen++;

u64 bucket = alloc_cursor & ~(~0ULL << 56);
u64 bucket = iter.pos.offset & ~(~0ULL << 56);
if (s->btree_bitmap != BTREE_BITMAP_ANY &&
s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
@ -511,32 +426,36 @@ again:
goto fail;

bucket = sector_to_bucket(ca,
round_up(bucket_to_sector(ca, bucket) + 1,
round_up(bucket_to_sector(ca, bucket + 1),
1ULL << ca->mi.btree_bitmap_shift));
u64 genbits = alloc_cursor >> 56;
alloc_cursor = bucket | (genbits << 56);
alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));

if (alloc_cursor > k.k->p.offset)
bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
s->skipped_mi_btree_bitmap++;
continue;
goto next;
}

ob = try_alloc_bucket(trans, ca, watermark,
alloc_cursor, s, k, cl);
ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
if (ob) {
if (!IS_ERR(ob))
*dev_alloc_cursor = iter.pos.offset;
bch2_set_btree_iter_dontneed(&iter);
break;
}
}

iter.k.size--;
iter.pos.offset++;
}
next:
if (ob || ret)
break;
}
fail:
bch2_trans_iter_exit(trans, &iter);

if (!ob && ret)
BUG_ON(ob && ret);

if (ret)
ob = ERR_PTR(ret);

if (!ob && alloc_start > ca->mi.first_bucket) {
@ -544,8 +463,6 @@ fail:
goto again;
}

*dev_alloc_cursor = alloc_cursor;

return ob;
}
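
Editorial note: both the removed and the new loop treat the 64-bit freespace key offset as a packed value - the low 56 bits hold the bucket number and the high 8 bits hold generation bits, which is why the code masks with ~(~0ULL << 56) and shifts by 56. A minimal sketch of that packing (helper names are ours, not bcachefs's; assumes the kernel's u64 from <linux/types.h>):

	/* Freespace-cursor packing sketch: low 56 bits = bucket, high 8 = genbits. */
	#define GENBITS_SHIFT	56

	static inline u64 cursor_bucket(u64 cursor)
	{
		return cursor & ~(~0ULL << GENBITS_SHIFT);	/* low 56 bits */
	}

	static inline u64 cursor_genbits(u64 cursor)
	{
		return cursor >> GENBITS_SHIFT;			/* high 8 bits */
	}

	static inline u64 make_cursor(u64 bucket, u64 genbits)
	{
		return bucket | (genbits << GENBITS_SHIFT);
	}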

@ -595,6 +512,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
* @watermark: how important is this allocation?
* @data_type: BCH_DATA_journal, btree, user...
* @cl: if not NULL, closure to be used to wait if buckets not available
* @nowait: if true, do not wait for buckets to become available
* @usage: for secondarily also returning the current device usage
*
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
@ -629,6 +547,10 @@ again:
bch2_dev_do_invalidates(ca);

if (!avail) {
if (watermark > BCH_WATERMARK_normal &&
c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
goto alloc;

if (cl && !waiting) {
closure_wait(&c->freelist_wait, cl);
waiting = true;
@ -711,9 +633,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
unsigned i;

for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
ret.devs[ret.nr++] = i;
ret.data[ret.nr++] = i;

bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
return ret;
}

@ -785,18 +707,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct closure *cl)
{
struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
int ret = -BCH_ERR_insufficient_devices;

BUG_ON(*nr_effective >= nr_replicas);

for (unsigned i = 0; i < devs_sorted.nr; i++) {
struct bch_dev_usage usage;
struct open_bucket *ob;

unsigned dev = devs_sorted.devs[i];
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
darray_for_each(devs_sorted, i) {
struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
if (!ca)
continue;

@ -805,8 +722,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}

ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
struct bch_dev_usage usage;
struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
if (!IS_ERR(ob))
bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
bch2_dev_put(ca);
@ -850,10 +768,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
struct closure *cl)
{
struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted;
struct ec_stripe_head *h;
struct open_bucket *ob;
unsigned i, ec_idx;
int ret = 0;

if (nr_replicas < 2)
@ -862,34 +776,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
if (ec_open_bucket(c, ptrs))
return 0;

h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
struct ec_stripe_head *h =
bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
if (IS_ERR(h))
return PTR_ERR(h);
if (!h)
return 0;

devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);

for (i = 0; i < devs_sorted.nr; i++)
for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
darray_for_each(devs_sorted, i)
for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
if (!h->s->blocks[ec_idx])
continue;

ob = c->open_buckets + h->s->blocks[ec_idx];
if (ob->dev == devs_sorted.devs[i] &&
!test_and_set_bit(ec_idx, h->s->blocks_allocated))
goto got_bucket;
}
goto out_put_head;
got_bucket:
ob->ec_idx = ec_idx;
ob->ec = h->s;
ec_stripe_new_get(h->s, STRIPE_REF_io);
struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
ob->ec_idx = ec_idx;
ob->ec = h->s;
ec_stripe_new_get(h->s, STRIPE_REF_io);

ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, ob);
out_put_head:
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, ob);
goto out;
}
}
out:
bch2_ec_stripe_head_put(c, h);
return ret;
}

@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *);

struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
u8 data[BCH_SB_MEMBERS_MAX];
};

struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
struct bch_devs_mask *);
void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);

long bch2_bucket_alloc_new_fs(struct bch_dev *);

static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
{
return bch2_dev_have_ref(c, ob->dev);

File diff suppressed because it is too large

@ -18,14 +18,14 @@ static inline u64 swab40(u64 x)
((x & 0xff00000000ULL) >> 32));
}
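
Editorial note: the hunk above shows only the closing line of swab40(). For orientation, a 40-bit byte swap has the following shape - a sketch reconstructed from the visible tail, not necessarily the file's exact body:

	static inline u64 swab40(u64 x)
	{
		/* reverse the five bytes of a 40-bit value */
		return (((x & 0x00000000ffULL) << 32)|
			((x & 0x000000ff00ULL) << 16)|
			((x & 0x0000ff0000ULL) >>  0)|
			((x & 0x00ff000000ULL) >> 16)|
			((x & 0xff00000000ULL) >> 32));
	}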

int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags);
void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k,
struct bkey_validate_context);
void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_backpointer_swab(struct bkey_s);

#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \
.key_validate = bch2_backpointer_validate, \
.val_to_text = bch2_backpointer_k_to_text, \
.val_to_text = bch2_backpointer_to_text, \
.swab = bch2_backpointer_swab, \
.min_val_size = 32, \
})
@ -43,22 +43,24 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos
return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector));
}

static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos,
u32 *bucket_offset)
{
u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;

return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset));
}

static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
{
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode);
struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
if (ca)
*bucket = bp_pos_to_bucket(ca, bp_pos);
rcu_read_unlock();
return ca != NULL;
}

static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
{
return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket),
c, "backpointer for missing device %llu", bp_pos.inode);
}

static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca,
struct bpos bucket,
u64 bucket_offset)
@ -80,31 +82,35 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca,
return ret;
}

int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *,
struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool);
static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket)
{
return bucket_pos_to_bp(ca, bucket, 0);
}

static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket)
{
return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0));
}
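
Editorial note: bucket_pos_to_bp_start() and bucket_pos_to_bp_end() bound the slice of the backpointers btree that belongs to one bucket. A sketch of the intended usage, borrowing the for_each_btree_key_max_norestart() iterator seen earlier in this series (the loop body is ours):

	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers,
					 bucket_pos_to_bp_start(ca, bucket),
					 bucket_pos_to_bp_end(ca, bucket),
					 0, k, ret) {
		/* every key visited here backpoints into `bucket` */
	}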

int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *,
struct bkey_s_c,
struct bkey_i_backpointer *,
bool);

static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bch_dev *ca,
struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
struct bkey_i_backpointer *bp,
bool insert)
{
if (unlikely(bch2_backpointers_no_use_write_buffer))
return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert);

struct bkey_i_backpointer bp_k;

bkey_backpointer_init(&bp_k.k_i);
bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset);
bp_k.v = bp;
return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);

if (!insert) {
bp_k.k.type = KEY_TYPE_deleted;
set_bkey_val_u64s(&bp_k.k, 0);
bp->k.type = KEY_TYPE_deleted;
set_bkey_val_u64s(&bp->k, 0);
}

return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i);
}

static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
@ -134,44 +140,29 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
}
}

static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
const union bch_extent_entry *entry,
struct bpos *bucket_pos, struct bch_backpointer *bp,
u64 sectors)
struct bkey_i_backpointer *bp)
{
u32 bucket_offset;
*bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset);
*bp = (struct bch_backpointer) {
bkey_backpointer_init(&bp->k_i);
bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
bp->v = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
.data_type = bch2_bkey_ptr_data_type(k, p, entry),
.bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
p.crc.offset,
.bucket_len = sectors,
.bucket_gen = p.ptr.gen,
.bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p),
.pos = k.k->p,
};
}

static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k, struct extent_ptr_decoded p,
const union bch_extent_entry *entry,
struct bpos *bucket_pos, struct bch_backpointer *bp)
{
u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);

__bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors);
}

int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int,
struct bpos *, struct bch_backpointer *, unsigned);
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *,
struct bpos, struct bch_backpointer,
unsigned);
struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *,
struct bpos, struct bch_backpointer);
struct bkey_buf;
struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer,
struct btree_iter *, unsigned, struct bkey_buf *);
struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
struct btree_iter *, struct bkey_buf *);

int bch2_check_btree_backpointers(struct bch_fs *);
int bch2_check_extents_to_backpointers(struct bch_fs *);

@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos)

static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
{
prt_str(out, bch2_btree_id_str(pos.btree));
bch2_btree_id_to_text(out, pos.btree);
prt_char(out, ':');
bch2_bpos_to_text(out, pos.pos);
}

@ -205,6 +205,7 @@
#include <linux/zstd.h>

#include "bcachefs_format.h"
#include "btree_journal_iter_types.h"
#include "disk_accounting_types.h"
#include "errcode.h"
#include "fifo.h"
@ -293,6 +294,8 @@ do { \

#define bch_info(c, fmt, ...) \
bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_info_ratelimited(c, fmt, ...) \
bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
@ -352,6 +355,12 @@ do { \
bch_info(c, fmt, ##__VA_ARGS__); \
} while (0)

#define bch_verbose_ratelimited(c, fmt, ...) \
do { \
if ((c)->opts.verbose) \
bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
} while (0)

#define pr_verbose_init(opts, fmt, ...) \
do { \
if (opt_get(opts, verbose)) \
@ -538,20 +547,20 @@ struct bch_dev {

/*
* Buckets:
* Per-bucket arrays are protected by c->mark_lock, bucket_lock and
* gc_gens_lock, for device resize - holding any is sufficient for
* access: Or rcu_read_lock(), but only for dev_ptr_stale():
* Per-bucket arrays are protected by either rcu_read_lock or
* state_lock, for device resize.
*/
GENRADIX(struct bucket) buckets_gc;
struct bucket_gens __rcu *bucket_gens;
u8 *oldest_gen;
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;

unsigned long *bucket_backpointer_mismatches;
unsigned long *bucket_backpointer_empty;

struct bch_dev_usage __percpu *usage;

/* Allocator: */
u64 new_fs_bucket_idx;
u64 alloc_cursor[3];

unsigned nr_open_buckets;
@ -606,6 +615,7 @@ struct bch_dev {
x(going_ro) \
x(write_disable_complete) \
x(clean_shutdown) \
x(recovery_running) \
x(fsck_running) \
x(initial_gc_unfixed) \
x(need_delete_dead_snapshots) \
@ -650,28 +660,6 @@ struct journal_seq_blacklist_table {
} entries[];
};

struct journal_keys {
/* must match layout in darray_types.h */
size_t nr, size;
struct journal_key {
u64 journal_seq;
u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
bool allocated;
bool overwritten;
struct bkey_i *k;
} *data;
/*
* Gap buffer: instead of all the empty space in the array being at the
* end of the buffer - from @nr to @size - the empty space is at @gap.
* This means that sequential insertions are O(n) instead of O(n^2).
*/
size_t gap;
atomic_t ref;
bool initial_ref_held;
};
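
Editorial note: the struct being removed above documents the gap-buffer trick used for journal keys - live elements sit at both ends of the array and the free space sits at @gap, so an insertion only shuffles elements when the gap has to move. A self-contained sketch of the technique, with hypothetical names and resizing elided:

	#include <stddef.h>
	#include <string.h>

	struct journal_key { unsigned long seq; };	/* minimal stand-in */

	struct gap_buf {
		struct journal_key *data;
		size_t nr, size;	/* elements used / total capacity */
		size_t gap;		/* logical index where the gap sits */
	};

	/* First array slot after the gap. */
	static size_t gap_end(struct gap_buf *b)
	{
		return b->gap + (b->size - b->nr);
	}

	/* Move the gap so it starts at logical index idx. */
	static void gap_move(struct gap_buf *b, size_t idx)
	{
		if (idx < b->gap)
			memmove(b->data + idx + (b->size - b->nr), b->data + idx,
				(b->gap - idx) * sizeof(*b->data));
		else if (idx > b->gap)
			memmove(b->data + b->gap, b->data + gap_end(b),
				(idx - b->gap) * sizeof(*b->data));
		b->gap = idx;
	}

	/* Insert at idx; repeated inserts at the same spot leave the gap
	 * in place, which is what makes sequential insertion cheap. */
	static void gap_insert(struct gap_buf *b, size_t idx, struct journal_key k)
	{
		gap_move(b, idx);	/* caller ensures nr < size */
		b->data[b->gap++] = k;
		b->nr++;
	}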

struct btree_trans_buf {
struct btree_trans *trans;
};
@ -680,6 +668,7 @@ struct btree_trans_buf {
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })

#define BCH_WRITE_REFS() \
x(journal) \
x(trans) \
x(write) \
x(promote) \
@ -692,6 +681,7 @@ struct btree_trans_buf {
x(dio_write) \
x(discard) \
x(discard_fast) \
x(check_discard_freespace_key) \
x(invalidate) \
x(delete_dead_snapshots) \
x(gc_gens) \
@ -734,6 +724,12 @@ struct bch_fs {
#else
struct percpu_ref writes;
#endif
/*
* Certain operations are only allowed in single threaded mode, during
* recovery, and we want to assert that this is the case:
*/
struct task_struct *recovery_task;

/*
* Analogous to c->writes, for asynchronous ops that don't necessarily
* need fs to be read-write
@ -764,6 +760,8 @@ struct bch_fs {
__uuid_t user_uuid;

u16 version;
u16 version_incompat;
u16 version_incompat_allowed;
u16 version_min;
u16 version_upgrade_complete;

@ -834,9 +832,10 @@ struct bch_fs {
struct work_struct btree_interior_update_work;

struct workqueue_struct *btree_node_rewrite_worker;

struct list_head pending_node_rewrites;
struct mutex pending_node_rewrites_lock;
struct list_head btree_node_rewrites;
struct list_head btree_node_rewrites_pending;
spinlock_t btree_node_rewrites_lock;
struct closure_waitlist btree_node_rewrites_wait;

/* btree_io.c: */
spinlock_t btree_write_error_lock;
@ -967,8 +966,7 @@ struct bch_fs {
struct rhashtable promote_table;

mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
mempool_t decompress_workspace;
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;

struct crypto_shash *sha256;
@ -1027,6 +1025,7 @@ struct bch_fs {
struct list_head vfs_inodes_list;
struct mutex vfs_inodes_lock;
struct rhashtable vfs_inodes_table;
struct rhltable vfs_inodes_by_inum_table;

/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
@ -1048,10 +1047,12 @@ struct bch_fs {
* for signaling to the toplevel code which pass we want to run now.
*/
enum bch_recovery_pass curr_recovery_pass;
enum bch_recovery_pass next_recovery_pass;
/* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
/* never rewinds version of curr_recovery_pass */
enum bch_recovery_pass recovery_pass_done;
spinlock_t recovery_pass_lock;
struct semaphore online_fsck_mutex;

/* DEBUG JUNK */
@ -1062,9 +1063,6 @@ struct bch_fs {
struct btree_node *verify_ondisk;
struct mutex verify_lock;

u64 *unused_inode_hints;
unsigned inode_shard_bits;

/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
@ -1086,8 +1084,6 @@ struct bch_fs {
u64 counters_on_mount[BCH_COUNTER_NR];
u64 __percpu *counters;

unsigned copy_gc_enabled:1;

struct bch2_time_stats times[BCH_TIME_STAT_NR];

struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];

@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k)
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33) \
x(accounting, 34)
x(accounting, 34) \
x(inode_alloc_cursor, 35)

enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@ -463,7 +464,8 @@ struct bch_backpointer {
__u8 btree_id;
__u8 level;
__u8 data_type;
__u64 bucket_offset:40;
__u8 bucket_gen;
__u32 pad;
__u32 bucket_len;
struct bpos pos;
} __packed __aligned(8);
@ -499,8 +501,6 @@ struct bch_sb_field {
#include "disk_groups_format.h"
#include "extents_format.h"
#include "ec_format.h"
#include "dirent_format.h"
#include "disk_groups_format.h"
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
@ -679,7 +679,13 @@ struct bch_sb_field_ext {
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
x(inode_has_child_snapshots, BCH_VERSION(1, 13))
x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \
x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \
x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \
x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \
x(inode_depth, BCH_VERSION(1, 17)) \
x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
x(autofix_errors, BCH_VERSION(1, 19))

enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -844,6 +850,10 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
struct bch_sb, flags[5], 0, 16);
LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
struct bch_sb, flags[5], 16, 32);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);

static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@ -896,21 +906,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
x(new_varint, 15) \
x(journal_no_flush, 16) \
x(alloc_v2, 17) \
x(extents_across_btree_nodes, 18)
x(extents_across_btree_nodes, 18) \
x(incompat_version_field, 19)

#define BCH_SB_FEATURES_ALWAYS \
((1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_btree_updates_journalled)|\
(1ULL << BCH_FEATURE_alloc_v2)|\
(1ULL << BCH_FEATURE_extents_across_btree_nodes))
(BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
BIT_ULL(BCH_FEATURE_alloc_v2)|\
BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))

#define BCH_SB_FEATURES_ALL \
(BCH_SB_FEATURES_ALWAYS| \
(1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush))
BIT_ULL(BCH_FEATURE_new_siphash)| \
BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
BIT_ULL(BCH_FEATURE_new_varint)| \
BIT_ULL(BCH_FEATURE_journal_no_flush))

enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@ -1032,7 +1043,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
x(crc64, 2) \
x(xxhash, 3)

enum bch_csum_opts {
enum bch_csum_opt {
#define x(t, n) BCH_CSUM_OPT_##t = n,
BCH_CSUM_OPTS()
#undef x
@ -1221,6 +1232,15 @@ struct jset_entry_log {
u8 d[];
} __packed __aligned(8);

static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
{
unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);

while (b && !l->d[b - 1])
--b;
return b;
}

struct jset_entry_datetime {
struct jset_entry entry;
__le64 seconds;
@ -1268,14 +1288,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
/* Btree: */

enum btree_id_flags {
BTREE_ID_EXTENTS = BIT(0),
BTREE_ID_SNAPSHOTS = BIT(1),
BTREE_ID_SNAPSHOT_FIELD = BIT(2),
BTREE_ID_DATA = BIT(3),
BTREE_IS_extents = BIT(0),
BTREE_IS_snapshots = BIT(1),
BTREE_IS_snapshot_field = BIT(2),
BTREE_IS_data = BIT(3),
BTREE_IS_write_buffer = BIT(4),
};

#define BCH_BTREE_IDS() \
x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
x(extents, 0, \
BTREE_IS_extents| \
BTREE_IS_snapshots| \
BTREE_IS_data, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_error)| \
BIT_ULL(KEY_TYPE_cookie)| \
@ -1283,17 +1307,20 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_reservation)| \
BIT_ULL(KEY_TYPE_reflink_p)| \
BIT_ULL(KEY_TYPE_inline_data)) \
x(inodes, 1, BTREE_ID_SNAPSHOTS, \
x(inodes, 1, \
BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_inode)| \
BIT_ULL(KEY_TYPE_inode_v2)| \
BIT_ULL(KEY_TYPE_inode_v3)| \
BIT_ULL(KEY_TYPE_inode_generation)) \
x(dirents, 2, BTREE_ID_SNAPSHOTS, \
x(dirents, 2, \
BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
BIT_ULL(KEY_TYPE_dirent)) \
x(xattrs, 3, BTREE_ID_SNAPSHOTS, \
x(xattrs, 3, \
BTREE_IS_snapshots, \
BIT_ULL(KEY_TYPE_whiteout)| \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_hash_whiteout)| \
@ -1307,7 +1334,9 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_quota)) \
x(stripes, 6, 0, \
BIT_ULL(KEY_TYPE_stripe)) \
x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \
x(reflink, 7, \
BTREE_IS_extents| \
BTREE_IS_data, \
BIT_ULL(KEY_TYPE_reflink_v)| \
BIT_ULL(KEY_TYPE_indirect_inline_data)| \
BIT_ULL(KEY_TYPE_error)) \
@ -1315,28 +1344,38 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_subvolume)) \
x(snapshots, 9, 0, \
BIT_ULL(KEY_TYPE_snapshot)) \
x(lru, 10, 0, \
x(lru, 10, \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(freespace, 11, BTREE_ID_EXTENTS, \
x(freespace, 11, \
BTREE_IS_extents, \
BIT_ULL(KEY_TYPE_set)) \
x(need_discard, 12, 0, \
BIT_ULL(KEY_TYPE_set)) \
x(backpointers, 13, 0, \
x(backpointers, 13, \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_backpointer)) \
x(bucket_gens, 14, 0, \
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
x(deleted_inodes, 16, \
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_logged_op_finsert)| \
BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \
x(rebalance_work, 18, \
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \
x(subvolume_children, 19, 0, \
BIT_ULL(KEY_TYPE_set)) \
x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \
x(accounting, 20, \
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_accounting)) \

enum btree_id {
@ -1361,6 +1400,8 @@ static inline bool btree_id_is_alloc(enum btree_id id)
case BTREE_ID_need_discard:
case BTREE_ID_freespace:
case BTREE_ID_bucket_gens:
case BTREE_ID_lru:
case BTREE_ID_accounting:
return true;
default:
return false;

@ -9,13 +9,6 @@
#include "util.h"
#include "vstructs.h"

enum bch_validate_flags {
BCH_VALIDATE_write = BIT(0),
BCH_VALIDATE_commit = BIT(1),
BCH_VALIDATE_journal = BIT(2),
BCH_VALIDATE_silent = BIT(3),
};

#if 0

/*

@ -28,7 +28,7 @@ const char * const bch2_bkey_types[] = {
};

static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
struct bkey_validate_context from)
{
return 0;
}
@ -42,7 +42,7 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k,
})

static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
struct bkey_validate_context from)
{
int ret = 0;

@ -59,7 +59,7 @@ fsck_err:
})

static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
struct bkey_validate_context from)
{
return 0;
}
@ -83,7 +83,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
})

static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
struct bkey_validate_context from)
{
return 0;
}
@ -124,7 +124,7 @@ const struct bkey_ops bch2_bkey_null_ops = {
};

int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
struct bkey_validate_context from)
{
if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
return 0;
@ -140,7 +140,7 @@ int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k,
if (!ops->key_validate)
return 0;

ret = ops->key_validate(c, k, flags);
ret = ops->key_validate(c, k, from);
fsck_err:
return ret;
}
@ -161,9 +161,10 @@ const char *bch2_btree_node_type_str(enum btree_node_type type)
}

int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
enum bch_validate_flags flags)
struct bkey_validate_context from)
{
enum btree_node_type type = __btree_node_type(from.level, from.btree);

if (test_bit(BCH_FS_no_invalid_checks, &c->flags))
return 0;

@ -177,7 +178,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
return 0;

bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
(type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) &&
(type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
@ -228,15 +229,15 @@ fsck_err:
}

int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
enum bch_validate_flags flags)
struct bkey_validate_context from)
{
return __bch2_bkey_validate(c, k, type, flags) ?:
bch2_bkey_val_validate(c, k, flags);
return __bch2_bkey_validate(c, k, from) ?:
bch2_bkey_val_validate(c, k, from);
}

int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
struct bkey_s_c k, enum bch_validate_flags flags)
struct bkey_s_c k,
struct bkey_validate_context from)
{
int ret = 0;

@ -22,7 +22,7 @@ extern const struct bkey_ops bch2_bkey_null_ops;
*/
struct bkey_ops {
int (*key_validate)(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags);
struct bkey_validate_context from);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
void (*swab)(struct bkey_s);
@ -48,13 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type)
: &bch2_bkey_null_ops;
}

int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
enum bch_validate_flags);
int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
enum bch_validate_flags);
int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c,
enum bch_validate_flags);
struct bkey_validate_context from);

void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);

@ -210,4 +210,32 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
BCH_BKEY_TYPES();
#undef x

enum bch_validate_flags {
BCH_VALIDATE_write = BIT(0),
BCH_VALIDATE_commit = BIT(1),
BCH_VALIDATE_silent = BIT(2),
};

#define BKEY_VALIDATE_CONTEXTS() \
x(unknown) \
x(superblock) \
x(journal) \
x(btree_root) \
x(btree_node) \
x(commit)

struct bkey_validate_context {
enum {
#define x(n) BKEY_VALIDATE_##n,
BKEY_VALIDATE_CONTEXTS()
#undef x
} from:8;
enum bch_validate_flags flags:8;
u8 level;
enum btree_id btree;
bool root:1;
unsigned journal_offset;
u64 journal_seq;
};

#endif /* _BCACHEFS_BKEY_TYPES_H */
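
Editorial note: BKEY_VALIDATE_CONTEXTS() is the usual x-macro pattern - the same list expands once into enum constants and can expand again into a matching name table for printing. A sketch of the companion string table (the table name here is ours; the tree may define its own):

	static const char * const bkey_validate_context_strs[] = {
	#define x(n)	[BKEY_VALIDATE_##n] = #n,
		BKEY_VALIDATE_CONTEXTS()
	#undef x
	};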
|
@ -222,7 +222,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b)
|
||||
struct btree_cache *bc = &c->btree_cache;
|
||||
|
||||
mutex_lock(&bc->lock);
|
||||
BUG_ON(!__btree_node_pinned(bc, b));
|
||||
if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
|
||||
set_btree_node_pinned(b);
|
||||
list_move(&b->list, &bc->live[1].list);
|
||||
@ -326,7 +325,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans,
|
||||
if (!IS_ERR_OR_NULL(b)) {
|
||||
mutex_lock(&c->btree_cache.lock);
|
||||
|
||||
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
__bch2_btree_node_hash_remove(&c->btree_cache, b);
|
||||
|
||||
bkey_copy(&b->key, new);
|
||||
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
|
||||
@ -1004,16 +1003,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
|
||||
return;
|
||||
|
||||
prt_printf(&buf,
|
||||
"btree node header doesn't match ptr\n"
|
||||
"btree %s level %u\n"
|
||||
"ptr: ",
|
||||
bch2_btree_id_str(b->c.btree_id), b->c.level);
|
||||
"btree node header doesn't match ptr: ");
|
||||
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
||||
prt_str(&buf, "\nptr: ");
|
||||
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
||||
|
||||
prt_printf(&buf, "\nheader: btree %s level %llu\n"
|
||||
"min ",
|
||||
bch2_btree_id_str(BTREE_NODE_ID(b->data)),
|
||||
BTREE_NODE_LEVEL(b->data));
|
||||
prt_str(&buf, "\nheader: ");
|
||||
bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data));
|
||||
prt_str(&buf, "\nmin ");
|
||||
bch2_bpos_to_text(&buf, b->data->min_key);
|
||||
|
||||
prt_printf(&buf, "\nmax ");
|
||||
@ -1133,7 +1130,7 @@ retry:
|
||||
|
||||
if (unlikely(btree_node_read_error(b))) {
|
||||
six_unlock_type(&b->c.lock, lock_type);
|
||||
return ERR_PTR(-BCH_ERR_btree_node_read_error);
|
||||
return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
|
||||
}
|
||||
|
||||
EBUG_ON(b->c.btree_id != path->btree_id);
|
||||
@ -1223,7 +1220,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
|
||||
|
||||
if (unlikely(btree_node_read_error(b))) {
|
||||
six_unlock_type(&b->c.lock, lock_type);
|
||||
return ERR_PTR(-BCH_ERR_btree_node_read_error);
|
||||
return ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
|
||||
}
|
||||
|
||||
EBUG_ON(b->c.btree_id != path->btree_id);
|
||||
@ -1305,7 +1302,7 @@ lock_node:
|
||||
|
||||
if (unlikely(btree_node_read_error(b))) {
|
||||
six_unlock_read(&b->c.lock);
|
||||
b = ERR_PTR(-BCH_ERR_btree_node_read_error);
|
||||
b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1398,13 +1395,31 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
|
||||
prt_printf(out, "(unknown btree %u)", btree);
|
||||
}
|
||||
|
||||
void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level)
|
||||
{
|
||||
prt_str(out, "btree=");
|
||||
bch2_btree_id_to_text(out, btree);
|
||||
prt_printf(out, " level=%u", level);
|
||||
}
|
||||
|
||||
void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
enum btree_id btree, unsigned level, struct bkey_s_c k)
|
||||
{
|
||||
bch2_btree_id_to_text(out, btree);
|
||||
prt_printf(out, " level %u/", level);
|
||||
struct btree_root *r = bch2_btree_id_root(c, btree);
|
||||
if (r)
|
||||
prt_printf(out, "%u", r->level);
|
||||
else
|
||||
prt_printf(out, "(unknown)");
|
||||
prt_printf(out, "\n ");
|
||||
|
||||
bch2_bkey_val_to_text(out, c, k);
|
||||
}
|
||||
|
||||
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
|
||||
{
|
||||
prt_printf(out, "%s level %u/%u\n ",
|
||||
bch2_btree_id_str(b->c.btree_id),
|
||||
b->c.level,
|
||||
bch2_btree_id_root(c, b->c.btree_id)->level);
|
||||
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
|
||||
__bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key));
|
||||
}
|
||||
|
||||
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
|
||||
@ -1478,8 +1493,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
|
||||
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
|
||||
prt_newline(out);
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
|
||||
prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
|
||||
bch2_btree_id_to_text(out, i);
|
||||
prt_printf(out, "\t");
|
||||
prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size);
|
||||
prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]);
|
||||
}
|
||||
|
||||
prt_newline(out);
|
||||
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
|
||||
|
@ -128,19 +128,27 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i
|
||||
} else {
|
||||
unsigned idx = id - BTREE_ID_NR;
|
||||
|
||||
EBUG_ON(idx >= c->btree_roots_extra.nr);
|
||||
/* This can happen when we're called from btree_node_scan */
|
||||
if (idx >= c->btree_roots_extra.nr)
|
||||
return NULL;
|
||||
|
||||
return &c->btree_roots_extra.data[idx];
|
||||
}
|
||||
}
|
||||
|
||||
static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
|
||||
{
|
||||
return bch2_btree_id_root(c, b->c.btree_id)->b;
|
||||
struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id);
|
||||
|
||||
return r ? r->b : NULL;
|
||||
}
|
||||
|
||||
const char *bch2_btree_id_str(enum btree_id);
|
||||
const char *bch2_btree_id_str(enum btree_id); /* avoid */
|
||||
void bch2_btree_id_to_text(struct printbuf *, enum btree_id);
|
||||
void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned);
|
||||
|
||||
void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *,
|
||||
enum btree_id, unsigned, struct bkey_s_c);
|
||||
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
|
||||
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
|
||||
void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
|
||||
|
@ -29,6 +29,7 @@
|
||||
#include "move.h"
|
||||
#include "recovery_passes.h"
|
||||
#include "reflink.h"
|
||||
#include "recovery.h"
|
||||
#include "replicas.h"
|
||||
#include "super-io.h"
|
||||
#include "trace.h"
|
||||
@ -56,8 +57,8 @@ void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p)
|
||||
{
|
||||
prt_str(out, bch2_gc_phase_strs[p->phase]);
|
||||
prt_char(out, ' ');
|
||||
bch2_btree_id_to_text(out, p->btree);
|
||||
prt_printf(out, " l=%u ", p->level);
|
||||
bch2_btree_id_level_to_text(out, p->btree, p->level);
|
||||
prt_char(out, ' ');
|
||||
bch2_bpos_to_text(out, p->pos);
|
||||
}
|
||||
|
||||
@ -209,8 +210,9 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
|
||||
if (bpos_eq(expected_start, cur->data->min_key))
|
||||
return 0;
|
||||
|
||||
prt_printf(&buf, " at btree %s level %u:\n parent: ",
|
||||
bch2_btree_id_str(b->c.btree_id), b->c.level);
|
||||
prt_printf(&buf, " at ");
|
||||
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
||||
prt_printf(&buf, ":\n parent: ");
|
||||
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
||||
|
||||
if (prev) {
|
||||
@ -277,8 +279,9 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
|
||||
if (bpos_eq(child->key.k.p, b->key.k.p))
|
||||
return 0;
|
||||
|
||||
prt_printf(&buf, "at btree %s level %u:\n parent: ",
|
||||
bch2_btree_id_str(b->c.btree_id), b->c.level);
|
||||
prt_printf(&buf, " at ");
|
||||
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
||||
prt_printf(&buf, ":\n parent: ");
|
||||
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
||||
|
||||
prt_str(&buf, "\n child: ");
|
||||
@ -341,14 +344,14 @@ again:
|
||||
ret = PTR_ERR_OR_ZERO(cur);
|
||||
|
||||
printbuf_reset(&buf);
|
||||
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1);
|
||||
prt_char(&buf, ' ');
|
||||
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
|
||||
|
||||
if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
|
||||
trans, btree_node_unreadable,
|
||||
"Topology repair: unreadable btree node at btree %s level %u:\n"
|
||||
trans, btree_node_read_error,
|
||||
"Topology repair: unreadable btree node at\n"
|
||||
" %s",
|
||||
bch2_btree_id_str(b->c.btree_id),
|
||||
b->c.level - 1,
|
||||
buf.buf)) {
|
||||
bch2_btree_node_evict(trans, cur_k.k);
|
||||
cur = NULL;
|
||||
@ -357,11 +360,9 @@ again:
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
if (!btree_id_is_alloc(b->c.btree_id)) {
|
||||
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
ret = bch2_btree_lost_data(c, b->c.btree_id);
|
||||
if (ret)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -370,7 +371,7 @@ again:
|
||||
break;
|
||||
|
||||
if (bch2_btree_node_is_stale(c, cur)) {
|
||||
bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
|
||||
bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf);
|
||||
six_unlock_read(&cur->c.lock);
|
||||
bch2_btree_node_evict(trans, cur_k.k);
|
||||
ret = bch2_journal_key_delete(c, b->c.btree_id,
|
||||
@ -478,14 +479,13 @@ again:
|
||||
}
|
||||
|
||||
printbuf_reset(&buf);
|
||||
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
|
||||
prt_newline(&buf);
|
||||
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
||||
|
||||
if (mustfix_fsck_err_on(!have_child,
|
||||
trans, btree_node_topology_interior_node_empty,
|
||||
"empty interior btree node at btree %s level %u\n"
|
||||
" %s",
|
||||
bch2_btree_id_str(b->c.btree_id),
|
||||
b->c.level, buf.buf))
|
||||
"empty interior btree node at %s", buf.buf))
|
||||
ret = DROP_THIS_NODE;
|
||||
err:
|
||||
fsck_err:
|
||||
@ -511,6 +511,7 @@ int bch2_check_topology(struct bch_fs *c)
|
||||
{
|
||||
struct btree_trans *trans = bch2_trans_get(c);
|
||||
struct bpos pulled_from_scan = POS_MIN;
|
||||
struct printbuf buf = PRINTBUF;
|
||||
int ret = 0;
|
||||
|
||||
bch2_trans_srcu_unlock(trans);
|
||||
@ -519,19 +520,22 @@ int bch2_check_topology(struct bch_fs *c)
|
||||
struct btree_root *r = bch2_btree_id_root(c, i);
|
||||
bool reconstructed_root = false;
|
||||
|
||||
printbuf_reset(&buf);
|
||||
bch2_btree_id_to_text(&buf, i);
|
||||
|
||||
if (r->error) {
|
||||
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
|
||||
ret = bch2_btree_lost_data(c, i);
|
||||
if (ret)
|
||||
break;
|
||||
reconstruct_root:
|
||||
bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
|
||||
bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
|
||||
|
||||
r->alive = false;
|
||||
r->error = 0;
|
||||
|
||||
if (!bch2_btree_has_scanned_nodes(c, i)) {
|
||||
mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
|
||||
"no nodes found for btree %s, continue?", bch2_btree_id_str(i));
|
||||
"no nodes found for btree %s, continue?", buf.buf);
|
||||
bch2_btree_root_alloc_fake_trans(trans, i, 0);
|
||||
} else {
|
||||
bch2_btree_root_alloc_fake_trans(trans, i, 1);
|
||||
@ -560,13 +564,14 @@ reconstruct_root:
|
||||
if (!reconstructed_root)
|
||||
goto reconstruct_root;
|
||||
|
||||
bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
|
||||
bch_err(c, "empty btree root %s", buf.buf);
|
||||
bch2_btree_root_alloc_fake_trans(trans, i, 0);
|
||||
r->alive = false;
|
||||
ret = 0;
|
||||
}
|
||||
}
|
||||
fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
bch2_trans_put(trans);
|
||||
return ret;
|
||||
}
|
||||
@ -713,6 +718,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
|
||||
{
|
||||
struct btree_trans *trans = bch2_trans_get(c);
|
||||
enum btree_id ids[BTREE_ID_NR];
|
||||
struct printbuf buf = PRINTBUF;
|
||||
unsigned i;
|
||||
int ret = 0;
|
||||
|
||||
@ -727,14 +733,9 @@ static int bch2_gc_btrees(struct bch_fs *c)
|
||||
continue;
|
||||
|
||||
ret = bch2_gc_btree(trans, btree, true);
|
||||
|
||||
if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
|
||||
trans, btree_node_read_error,
|
||||
"btree node read error for %s",
|
||||
bch2_btree_id_str(btree)))
|
||||
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
|
||||
}
|
||||
fsck_err:
|
||||
|
||||
printbuf_exit(&buf);
|
||||
bch2_trans_put(trans);
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
@ -802,7 +803,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
|
||||
old = bch2_alloc_to_v4(k, &old_convert);
|
||||
gc = new = *old;
|
||||
|
||||
percpu_down_read(&c->mark_lock);
|
||||
__bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset));
|
||||
|
||||
old_gc = gc;
|
||||
@ -813,7 +813,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
|
||||
gc.data_type = old->data_type;
|
||||
gc.dirty_sectors = old->dirty_sectors;
|
||||
}
|
||||
percpu_up_read(&c->mark_lock);
|
||||
|
||||
/*
|
||||
* gc.data_type doesn't yet include need_discard & need_gc_gen states -
|
||||
@ -831,11 +830,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
|
||||
* safe w.r.t. transaction restarts, so fixup the gc_bucket so
|
||||
* we don't run it twice:
|
||||
*/
|
||||
percpu_down_read(&c->mark_lock);
|
||||
struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
|
||||
gc_m->data_type = gc.data_type;
|
||||
gc_m->dirty_sectors = gc.dirty_sectors;
|
||||
percpu_up_read(&c->mark_lock);
|
||||
}
|
||||
|
||||
if (fsck_err_on(new.data_type != gc.data_type,
|
||||
@ -895,11 +892,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c)
|
||||
|
||||
for_each_member_device(c, ca) {
|
||||
ret = bch2_trans_run(c,
|
||||
for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
|
||||
for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
|
||||
POS(ca->dev_idx, ca->mi.first_bucket),
|
||||
POS(ca->dev_idx, ca->mi.nbuckets - 1),
|
||||
BTREE_ITER_slots|BTREE_ITER_prefetch, k,
|
||||
NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
|
||||
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
||||
bch2_alloc_write_key(trans, &iter, ca, k)));
|
||||
if (ret) {
|
||||
bch2_dev_put(ca);
|
||||
@ -928,98 +925,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_write_reflink_key(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_s_c k,
|
||||
size_t *idx)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
const __le64 *refcount = bkey_refcount_c(k);
|
||||
struct printbuf buf = PRINTBUF;
|
||||
struct reflink_gc *r;
|
||||
int ret = 0;
|
||||
|
||||
if (!refcount)
|
||||
return 0;
|
||||
|
||||
while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) &&
|
||||
r->offset < k.k->p.offset)
|
||||
++*idx;
|
||||
|
||||
if (!r ||
|
||||
r->offset != k.k->p.offset ||
|
||||
r->size != k.k->size) {
|
||||
bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
|
||||
trans, reflink_v_refcount_wrong,
|
||||
"reflink key has wrong refcount:\n"
|
||||
" %s\n"
|
||||
" should be %u",
|
||||
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
|
||||
r->refcount)) {
|
||||
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
|
||||
ret = PTR_ERR_OR_ZERO(new);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (!r->refcount)
|
||||
new->k.type = KEY_TYPE_deleted;
|
||||
else
|
||||
*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
|
||||
ret = bch2_trans_update(trans, iter, new, 0);
|
||||
}
|
||||
out:
|
||||
fsck_err:
|
||||
printbuf_exit(&buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_reflink_done(struct bch_fs *c)
|
||||
{
|
||||
size_t idx = 0;
|
||||
|
||||
int ret = bch2_trans_run(c,
|
||||
for_each_btree_key_commit(trans, iter,
|
||||
BTREE_ID_reflink, POS_MIN,
|
||||
BTREE_ITER_prefetch, k,
|
||||
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
|
||||
bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
|
||||
c->reflink_gc_nr = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_reflink_start(struct bch_fs *c)
|
||||
{
|
||||
c->reflink_gc_nr = 0;
|
||||
|
||||
int ret = bch2_trans_run(c,
|
||||
for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
|
||||
BTREE_ITER_prefetch, k, ({
|
||||
const __le64 *refcount = bkey_refcount_c(k);
|
||||
|
||||
if (!refcount)
|
||||
continue;
|
||||
|
||||
struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
|
||||
c->reflink_gc_nr++, GFP_KERNEL);
|
||||
if (!r) {
|
||||
ret = -BCH_ERR_ENOMEM_gc_reflink_start;
|
||||
break;
|
||||
}
|
||||
|
||||
r->offset = k.k->p.offset;
|
||||
r->size = k.k->size;
|
||||
r->refcount = 0;
|
||||
0;
|
||||
})));
|
||||
|
||||
bch_err_fn(c, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int bch2_gc_write_stripes_key(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_s_c k)
|
||||
@ -1171,7 +1076,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
|
||||
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
|
||||
return -EROFS;
|
||||
|
||||
percpu_down_read(&c->mark_lock);
|
||||
rcu_read_lock();
|
||||
bkey_for_each_ptr(ptrs, ptr) {
|
||||
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
|
||||
@ -1180,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
|
||||
|
||||
if (dev_ptr_stale(ca, ptr) > 16) {
|
||||
rcu_read_unlock();
|
||||
percpu_up_read(&c->mark_lock);
|
||||
goto update;
|
||||
}
|
||||
}
|
||||
@ -1195,7 +1098,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
|
||||
*gen = ptr->gen;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
percpu_up_read(&c->mark_lock);
|
||||
return 0;
|
||||
update:
|
||||
u = bch2_bkey_make_mut(trans, iter, &k, 0);
|
||||
@ -1224,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev
|
||||
return ret;
|
||||
|
||||
a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
|
||||
alloc_data_type_set(&a_mut->v, a_mut->v.data_type);
|
||||
|
||||
return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
|
||||
}
|
||||
@ -1337,9 +1238,16 @@ void bch2_gc_gens_async(struct bch_fs *c)
|
||||
bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
|
||||
}
|
||||
|
||||
void bch2_fs_gc_init(struct bch_fs *c)
|
||||
void bch2_fs_btree_gc_exit(struct bch_fs *c)
|
||||
{
|
||||
}
|
||||
|
||||
int bch2_fs_btree_gc_init(struct bch_fs *c)
|
||||
{
|
||||
seqcount_init(&c->gc_pos_lock);
|
||||
|
||||
INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
|
||||
|
||||
init_rwsem(&c->gc_lock);
|
||||
mutex_init(&c->gc_gens_lock);
|
||||
return 0;
|
||||
}
|
||||
|
@ -82,6 +82,8 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
|
||||
|
||||
int bch2_gc_gens(struct bch_fs *);
|
||||
void bch2_gc_gens_async(struct bch_fs *);
|
||||
void bch2_fs_gc_init(struct bch_fs *);
|
||||
|
||||
void bch2_fs_btree_gc_exit(struct bch_fs *);
|
||||
int bch2_fs_btree_gc_init(struct bch_fs *);
|
||||
|
||||
#endif /* _BCACHEFS_BTREE_GC_H */
|
||||
|
@ -25,9 +25,8 @@
|
||||
|
||||
static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
|
||||
{
|
||||
prt_printf(out, "btree=%s l=%u seq %llux\n",
|
||||
bch2_btree_id_str(BTREE_NODE_ID(bn)),
|
||||
(unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq);
|
||||
bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
|
||||
prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn));
|
||||
prt_str(out, "min: ");
|
||||
bch2_bpos_to_text(out, bn->min_key);
|
||||
prt_newline(out);
|
||||
@ -490,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
|
||||
if (b->nsets == MAX_BSETS &&
|
||||
!btree_node_write_in_flight(b) &&
|
||||
should_compact_all(c, b)) {
|
||||
bch2_btree_node_write(c, b, SIX_LOCK_write,
|
||||
BTREE_WRITE_init_next_bset);
|
||||
bch2_btree_node_write_trans(trans, b, SIX_LOCK_write,
|
||||
BTREE_WRITE_init_next_bset);
|
||||
reinit_iter = true;
|
||||
}
|
||||
|
||||
@ -832,13 +831,32 @@ fsck_err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b,
|
||||
struct bkey_s_c k,
|
||||
enum bch_validate_flags flags)
|
||||
{
|
||||
return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) {
|
||||
.from = BKEY_VALIDATE_btree_node,
|
||||
.level = b->c.level,
|
||||
.btree = b->c.btree_id,
|
||||
.flags = flags
|
||||
});
|
||||
}
|
||||
|
||||
static int bset_key_validate(struct bch_fs *c, struct btree *b,
|
||||
struct bkey_s_c k,
|
||||
bool updated_range, int rw)
|
||||
bool updated_range,
|
||||
enum bch_validate_flags flags)
|
||||
{
|
||||
return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?:
|
||||
(!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?:
|
||||
(rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0);
|
||||
struct bkey_validate_context from = (struct bkey_validate_context) {
|
||||
.from = BKEY_VALIDATE_btree_node,
|
||||
.level = b->c.level,
|
||||
.btree = b->c.btree_id,
|
||||
.flags = flags,
|
||||
};
|
||||
return __bch2_bkey_validate(c, k, from) ?:
|
||||
(!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?:
|
||||
(flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0);
|
||||
}

@@ -855,7 +873,21 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,

	struct bkey tmp;
	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
	return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent);
	return !__bch2_bkey_validate(c, u.s_c,
				     (struct bkey_validate_context) {
					.from	= BKEY_VALIDATE_btree_node,
					.level	= b->c.level,
					.btree	= b->c.btree_id,
					.flags	= BCH_VALIDATE_silent
				     });
}

static inline int btree_node_read_bkey_cmp(const struct btree *b,
					   const struct bkey_packed *l,
					   const struct bkey_packed *r)
{
	return bch2_bkey_cmp_packed(b, l, r)
		?: (int) bkey_deleted(r) - (int) bkey_deleted(l);
}
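
btree_node_read_bkey_cmp() chains its primary ordering and its tiebreaker with GNU C's `a ?: b` operator, which yields `a` unless it evaluates to zero. A small self-contained sketch of the same comparator shape (illustrative names; needs GCC or Clang for the `?:` extension):

	#include <stdio.h>
	#include <string.h>

	struct rec { const char *key; int deleted; };

	/* Primary key order first; on ties, order by deleted flag the same
	 * way as the helper above (r->deleted - l->deleted puts a deleted
	 * record before a live one with the same key). */
	static int rec_cmp(const struct rec *l, const struct rec *r)
	{
		return strcmp(l->key, r->key)
			?: (r->deleted - l->deleted);
	}

	int main(void)
	{
		struct rec a = { "apple", 1 }, b = { "apple", 0 };
		printf("%d\n", rec_cmp(&a, &b));	/* < 0: deleted sorts first */
		return 0;
	}

Changing the read-path comparator from `> 0` to `>= 0` above means equal keys are now also treated as misordered, so duplicates get flagged.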

@@ -918,7 +950,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
				  BSET_BIG_ENDIAN(i), write,
				  &b->format, k);

		if (prev && bkey_iter_cmp(b, prev, k) > 0) {
		if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) {
			struct bkey up = bkey_unpack_key(b, prev);

			printbuf_reset(&buf);

@@ -965,6 +997,7 @@ drop_this_key:
got_good_key:
		le16_add_cpu(&i->u64s, -next_good_key);
		memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
		set_btree_node_need_rewrite(b);
	}
fsck_err:
	printbuf_exit(&buf);

@@ -1038,39 +1071,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,

	while (b->written < (ptr_written ?: btree_sectors(c))) {
		unsigned sectors;
		struct nonce nonce;
		bool first = !b->written;
		bool csum_bad;

		if (!b->written) {
		if (first) {
			bne = NULL;
			i = &b->data->keys;
		} else {
			bne = write_block(b);
			i = &bne->keys;

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i, NULL,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));
			if (i->seq != b->data->keys.seq)
				break;
		}

		nonce = btree_nonce(i, b->written << 9);
		struct nonce nonce = btree_nonce(i, b->written << 9);
		bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));

		struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
		csum_bad = bch2_crc_cmp(b->data->csum, csum);
		if (csum_bad)
			bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
		btree_err_on(!good_csum_type,
			     bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))
			     ? -BCH_ERR_btree_node_read_err_must_retry
			     : -BCH_ERR_btree_node_read_err_want_retry,
			     c, ca, b, i, NULL,
			     bset_unknown_csum,
			     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

		btree_err_on(csum_bad,
			     -BCH_ERR_btree_node_read_err_want_retry,
			     c, ca, b, i, NULL,
			     bset_bad_csum,
			     "%s",
			     (printbuf_reset(&buf),
			      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
			      buf.buf));
		if (first) {
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
				bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
				if (csum_bad)
					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

		ret = bset_encrypt(c, i, b->written << 9);
		if (bch2_fs_fatal_err_on(ret, c,
				"decrypting btree node: %s", bch2_err_str(ret)))
			goto fsck_err;
				btree_err_on(csum_bad,
					     -BCH_ERR_btree_node_read_err_want_retry,
					     c, ca, b, i, NULL,
					     bset_bad_csum,
					     "%s",
					     (printbuf_reset(&buf),
					      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
					      buf.buf));

				ret = bset_encrypt(c, i, b->written << 9);
				if (bch2_fs_fatal_err_on(ret, c,
						"decrypting btree node: %s", bch2_err_str(ret)))
					goto fsck_err;
			}

			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
				     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),

@@ -1081,37 +1126,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,

			sectors = vstruct_sectors(b->data, c->block_bits);
		} else {
			bne = write_block(b);
			i = &bne->keys;
			if (good_csum_type) {
				struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
				bool csum_bad = bch2_crc_cmp(bne->csum, csum);
				if (ca && csum_bad)
					bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			if (i->seq != b->data->keys.seq)
				break;
				btree_err_on(csum_bad,
					     -BCH_ERR_btree_node_read_err_want_retry,
					     c, ca, b, i, NULL,
					     bset_bad_csum,
					     "%s",
					     (printbuf_reset(&buf),
					      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
					      buf.buf));

			btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i, NULL,
				     bset_unknown_csum,
				     "unknown checksum type %llu", BSET_CSUM_TYPE(i));

			nonce = btree_nonce(i, b->written << 9);
			struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
			csum_bad = bch2_crc_cmp(bne->csum, csum);
			if (ca && csum_bad)
				bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);

			btree_err_on(csum_bad,
				     -BCH_ERR_btree_node_read_err_want_retry,
				     c, ca, b, i, NULL,
				     bset_bad_csum,
				     "%s",
				     (printbuf_reset(&buf),
				      bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
				      buf.buf));

			ret = bset_encrypt(c, i, b->written << 9);
			if (bch2_fs_fatal_err_on(ret, c,
					"decrypting btree node: %s", bch2_err_str(ret)))
				goto fsck_err;
				ret = bset_encrypt(c, i, b->written << 9);
				if (bch2_fs_fatal_err_on(ret, c,
						"decrypting btree node: %s", bch2_err_str(ret)))
					goto fsck_err;
			}

			sectors = vstruct_sectors(bne, c->block_bits);
		}
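
In both branches above the checksum is verified before bset_encrypt() decrypts in place, because the checksum covers the encrypted on-disk bytes. A toy sketch of that ordering, with an XOR stand-in for the real stream cipher and a made-up checksum (illustrative only, not the bcachefs on-disk format):

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t toy_csum(const uint8_t *p, size_t len)
	{
		uint32_t sum = 0;
		while (len--)
			sum = sum * 31 + *p++;
		return sum;
	}

	/* XOR "cipher": encrypt and decrypt are the same operation,
	 * like a real stream cipher keystream. */
	static void toy_crypt(uint8_t *p, size_t len, uint8_t key)
	{
		while (len--)
			*p++ ^= key;
	}

	int main(void)
	{
		uint8_t buf[] = "hello";

		toy_crypt(buf, 5, 0x5a);		/* "write": encrypt */
		uint32_t disk_csum = toy_csum(buf, 5);	/* csum the encrypted bytes */

		/* "read": verify first, only then decrypt in place */
		if (toy_csum(buf, 5) != disk_csum) {
			fprintf(stderr, "bad checksum\n");
			return 1;
		}
		toy_crypt(buf, 5, 0x5a);
		printf("%.5s\n", (char *) buf);
		return 0;
	}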

@@ -1216,7 +1250,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
			struct bkey tmp;
			struct bkey_s u = __bkey_disassemble(b, k, &tmp);

			ret = bch2_bkey_val_validate(c, u.s_c, READ);
			ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
			if (ret == -BCH_ERR_fsck_delete_bkey ||
			    (bch2_inject_invalid_keys &&
			     !bversion_cmp(u.k->bversion, MAX_VERSION))) {

@@ -1226,6 +1260,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
				memmove_u64s_down(k, bkey_p_next(k),
						  (u64 *) vstruct_end(i) - (u64 *) k);
				set_btree_bset_end(b, b->set);
				set_btree_node_need_rewrite(b);
				continue;
			}
			if (ret)

@@ -1339,13 +1374,18 @@ start:
				       rb->start_time);
	bio_put(&rb->bio);

	if (saw_error &&
	if ((saw_error ||
	     btree_node_need_rewrite(b)) &&
	    !btree_node_read_error(b) &&
	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
		printbuf_reset(&buf);
		bch2_bpos_to_text(&buf, b->key.k.p);
		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
				    __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
		if (saw_error) {
			printbuf_reset(&buf);
			bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
			prt_str(&buf, " ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
			bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
					    __func__, buf.buf);
		}

		bch2_btree_node_rewrite_async(c, b);
	}

@@ -1933,7 +1973,12 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
	bool saw_error;

	int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
				     BKEY_TYPE_btree, WRITE);
				     (struct bkey_validate_context) {
					.from	= BKEY_VALIDATE_btree_node,
					.level	= b->c.level + 1,
					.btree	= b->c.btree_id,
					.flags	= BCH_VALIDATE_write,
				     });
	if (ret) {
		bch2_fs_inconsistent(c, "invalid btree node key before write");
		return ret;

@@ -2300,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
	}
}

void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b,
				 enum six_lock_type lock_type_held,
				 unsigned flags)
{
	struct bch_fs *c = trans->c;

	if (lock_type_held == SIX_LOCK_intent ||
	    (lock_type_held == SIX_LOCK_read &&
	     six_lock_tryupgrade(&b->c.lock))) {
		__bch2_btree_node_write(c, b, flags);

		/* don't cycle lock unnecessarily: */
		if (btree_node_just_written(b) &&
		    six_trylock_write(&b->c.lock)) {
			bch2_btree_post_write_cleanup(c, b);
			__bch2_btree_node_unlock_write(trans, b);
		}

		if (lock_type_held == SIX_LOCK_read)
			six_lock_downgrade(&b->c.lock);
	} else {
		__bch2_btree_node_write(c, b, flags);
		if (lock_type_held == SIX_LOCK_write &&
		    btree_node_just_written(b))
			bch2_btree_post_write_cleanup(c, b);
	}
}

static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
{
	struct bucket_table *tbl;

@@ -144,11 +144,13 @@ enum btree_write_flags {
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
			   enum six_lock_type, unsigned);
void bch2_btree_node_write_trans(struct btree_trans *, struct btree *,
				 enum six_lock_type, unsigned);

static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b,
					    enum six_lock_type lock_held)
{
	bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
	bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED);
}

bool bch2_btree_flush_all_reads(struct bch_fs *);

@@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
	BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
	       iter->pos.snapshot != iter->snapshot);

	BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
	       bkey_gt(iter->pos, iter->k.p));
	BUG_ON(iter->flags & BTREE_ITER_all_snapshots	? !bpos_eq(iter->pos, iter->k.p) :
	       !(iter->flags & BTREE_ITER_is_extents)	? !bkey_eq(iter->pos, iter->k.p) :
	       (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
		bkey_gt(iter->pos, iter->k.p)));
}

static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)

@@ -327,7 +329,7 @@ out:
void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
			    struct bpos pos)
{
	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);

	struct btree_path *path;
	struct trans_for_each_path_inorder_iter iter;

@@ -697,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans,
	bch2_trans_revalidate_updates_in_node(trans, b);
}

void bch2_trans_node_drop(struct btree_trans *trans,
			  struct btree *b)
{
	struct btree_path *path;
	unsigned i, level = b->c.level;

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:

@@ -720,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
				       unsigned long trace_ip)
{
	struct bch_fs *c = trans->c;
	struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b;
	struct btree_root *r = bch2_btree_id_root(c, path->btree_id);
	enum six_lock_type lock_type;
	unsigned i;
	int ret;

@@ -728,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
	EBUG_ON(path->nodes_locked);

	while (1) {
		b = READ_ONCE(*rootp);
		struct btree *b = READ_ONCE(r->b);
		if (unlikely(!b)) {
			BUG_ON(!r->error);
			return r->error;
		}

		path->level = READ_ONCE(b->c.level);

		if (unlikely(path->level < depth_want)) {

@@ -748,14 +768,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
		ret = btree_node_lock(trans, path, &b->c,
				      path->level, lock_type, trace_ip);
		if (unlikely(ret)) {
			if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed))
				continue;
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
				return ret;
			BUG();
		}

		if (likely(b == READ_ONCE(*rootp) &&
		if (likely(b == READ_ONCE(r->b) &&
			   b->c.level == path->level &&
			   !race_fault())) {
			for (i = 0; i < path->level; i++)

@@ -825,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p

	bch2_bkey_buf_init(&tmp);

	jiter->fail_if_too_many_whiteouts = true;

	while (nr-- && !ret) {
		if (!bch2_btree_node_relock(trans, path, path->level))
			break;

@@ -1000,7 +1020,7 @@ retry_all:

	bch2_trans_unlock(trans);
	cond_resched();
	trans_set_locked(trans);
	trans_set_locked(trans, false);

	if (unlikely(trans->memory_allocation_failure)) {
		struct closure cl;

@@ -1267,7 +1287,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
{
	int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);

	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	EBUG_ON(!trans->paths[path_idx].ref);

	trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos);

@@ -1427,17 +1447,31 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_
	      (void *) trans->last_begin_ip);
}

void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct printbuf buf = PRINTBUF;
	bch2_prt_backtrace(&buf, &trans->last_restarted_trace);
	panic("in transaction restart: %s, last restarted by\n%s",
	      bch2_err_str(trans->restarted),
	      buf.buf);
#else
	panic("in transaction restart: %s, last restarted by %pS\n",
	      bch2_err_str(trans->restarted),
	      (void *) trans->last_restarted_ip);
#endif
}

void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans)
void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans)
{
	panic("trans should be locked, unlocked by %pS\n",
	      (void *) trans->last_unlock_ip);
	if (trans->restarted)
		bch2_trans_in_restart_error(trans);

	if (!trans->locked)
		panic("trans should be locked, unlocked by %pS\n",
		      (void *) trans->last_unlock_ip);

	BUG();
}

noinline __cold

@@ -1450,10 +1484,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
	trans_for_each_update(trans, i) {
		struct bkey_s_c old = { &i->old_k, i->old_v };

		prt_printf(buf, "update: btree=%s cached=%u %pS\n",
			   bch2_btree_id_str(i->btree_id),
			   i->cached,
			   (void *) i->ip_allocated);
		prt_str(buf, "update: btree=");
		bch2_btree_id_to_text(buf, i->btree_id);
		prt_printf(buf, " cached=%u %pS\n",
			   i->cached,
			   (void *) i->ip_allocated);

		prt_printf(buf, "  old ");
		bch2_bkey_val_to_text(buf, trans->c, old);

@@ -1486,13 +1521,13 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra
{
	struct btree_path *path = trans->paths + path_idx;

	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ",
	prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ",
		   path_idx, path->ref, path->intent_ref,
		   path->preserve ? 'P' : ' ',
		   path->should_be_locked ? 'S' : ' ',
		   path->cached ? 'C' : 'B',
		   bch2_btree_id_str(path->btree_id),
		   path->level);
		   path->cached ? 'C' : 'B');
	bch2_btree_id_level_to_text(out, path->btree_id, path->level);
	prt_str(out, " pos ");
	bch2_bpos_to_text(out, path->pos);

	if (!path->cached && btree_node_locked(path, path->level)) {

@@ -1717,8 +1752,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
	struct trans_for_each_path_inorder_iter iter;
	btree_path_idx_t path_pos = 0, path_idx;

	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	bch2_trans_verify_locks(trans);

	btree_trans_sort_paths(trans);

@@ -1833,7 +1867,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *
			!bkey_eq(path->pos, ck->key.pos));

		*u = ck->k->k;
		k = bkey_i_to_s_c(ck->k);
		k = (struct bkey_s_c) { u, &ck->k->v };
	}

	return k;

@@ -1843,7 +1877,6 @@ hole:
	return (struct bkey_s_c) { u, NULL };
}

void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;

@@ -1870,7 +1903,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
	struct btree_trans *trans = iter->trans;
	int ret;

	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);

	iter->path = bch2_btree_path_set_pos(trans, iter->path,
					     btree_iter_search_key(iter),

@@ -1945,7 +1978,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
	int ret;

	EBUG_ON(trans->paths[iter->path].cached);
	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	bch2_btree_iter_verify(iter);

	ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);

@@ -2101,7 +2134,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
{
	struct btree_path *path = btree_iter_path(trans, iter);

	return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
	return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
					   path->level,
					   path->pos,
					   end_pos,

@@ -2124,21 +2157,47 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
}

static noinline
struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
					 struct btree_iter *iter,
					 struct bkey_s_c k)
void btree_trans_peek_journal(struct btree_trans *trans,
			      struct btree_iter *iter,
			      struct bkey_s_c *k)
{
	struct btree_path *path = btree_iter_path(trans, iter);
	struct bkey_i *next_journal =
		bch2_btree_journal_peek(trans, iter,
				k.k ? k.k->p : path_l(path)->b->key.k.p);
				k->k ? k->k->p : path_l(path)->b->key.k.p);
	if (next_journal) {
		iter->k = next_journal->k;
		*k = bkey_i_to_s_c(next_journal);
	}
}
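
btree_trans_peek_journal() now overlays the journal's key onto the caller's key through a pointer instead of returning a value, so "no journal key" simply leaves *k untouched. A minimal sketch of that out-parameter overlay pattern (hypothetical names, not the bcachefs API):

	#include <stdio.h>

	struct key { int valid; int pos; };

	static void peek_journal(struct key *k, int journal_pos)
	{
		if (journal_pos >= 0) {	/* journal has a newer key: overlay it */
			k->valid = 1;
			k->pos = journal_pos;
		}			/* else keep whatever the btree returned */
	}

	int main(void)
	{
		struct key k = { 1, 10 };

		peek_journal(&k, -1);	/* no journal key: k unchanged */
		peek_journal(&k, 7);	/* journal key at 7 wins */
		printf("pos %d\n", k.pos);
		return 0;
	}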

static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
						   struct btree_iter *iter,
						   struct bpos end_pos)
{
	struct btree_path *path = btree_iter_path(trans, iter);

	return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
					       path->level,
					       path->pos,
					       end_pos,
					       &iter->journal_idx);
}

static noinline
void btree_trans_peek_prev_journal(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bkey_s_c *k)
{
	struct btree_path *path = btree_iter_path(trans, iter);
	struct bkey_i *next_journal =
		bch2_btree_journal_peek_prev(trans, iter,
				k->k ? k->k->p : path_l(path)->b->key.k.p);

	if (next_journal) {
		iter->k = next_journal->k;
		k = bkey_i_to_s_c(next_journal);
		*k = bkey_i_to_s_c(next_journal);
	}

	return k;
}

/*

@@ -2154,8 +2213,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
	struct bkey_s_c k;
	int ret;

	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);

	if ((iter->flags & BTREE_ITER_key_cache_fill) &&
	    bpos_eq(iter->pos, pos))

@@ -2184,10 +2242,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
	btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path);

	k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
	if (k.k && !bkey_err(k)) {
		iter->k = u;
		k.k = &iter->k;
	}
	if (!k.k)
		return k;

	if ((iter->flags & BTREE_ITER_all_snapshots) &&
	    !bpos_eq(pos, k.k->p))
		return bkey_s_c_null;

	iter->k = u;
	k.k = &iter->k;
	return k;
}

@@ -2201,8 +2264,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
	bch2_btree_iter_verify(iter);

	while (1) {
		struct btree_path_level *l;

		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
					iter->flags & BTREE_ITER_intent,
					btree_iter_ip_allocated(iter));

@@ -2212,17 +2273,17 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out;
			break;
		}

		struct btree_path *path = btree_iter_path(trans, iter);
		l = path_l(path);
		struct btree_path_level *l = path_l(path);

		if (unlikely(!l->b)) {
			/* No btree nodes at requested level: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			k = bkey_s_c_null;
			goto out;
			break;
		}

		btree_path_set_should_be_locked(trans, path);

@@ -2233,15 +2294,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
		    k.k &&
		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
			k = k2;
			ret = bkey_err(k);
			if (ret) {
			if (bkey_err(k)) {
				bch2_btree_iter_set_pos(iter, iter->pos);
				goto out;
				break;
			}
		}

		if (unlikely(iter->flags & BTREE_ITER_with_journal))
			k = btree_trans_peek_journal(trans, iter, k);
			btree_trans_peek_journal(trans, iter, &k);

		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
			     trans->nr_updates))

@@ -2270,32 +2330,32 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
			/* End of btree: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			k = bkey_s_c_null;
			goto out;
			break;
		}
	}
out:
	bch2_btree_iter_verify(iter);

	bch2_btree_iter_verify(iter);
	return k;
}

/**
 * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
 * bch2_btree_iter_peek_max() - returns first key greater than or equal to
 * iterator's current position
 * @iter:	iterator to peek from
 * @end:	search limit: returns keys less than or equal to @end
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
{
	struct btree_trans *trans = iter->trans;
	struct bpos search_key = btree_iter_search_key(iter);
	struct bkey_s_c k;
	struct bpos iter_pos;
	struct bpos iter_pos = iter->pos;
	int ret;

	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	bch2_btree_iter_verify_entry_exit(iter);
	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));

	if (iter->update_path) {

@@ -2304,8 +2364,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
		iter->update_path = 0;
	}

	bch2_btree_iter_verify_entry_exit(iter);

	while (1) {
		k = __bch2_btree_iter_peek(iter, search_key);
		if (unlikely(!k.k))

@@ -2313,77 +2371,77 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
		if (unlikely(bkey_err(k)))
			goto out_no_locked;

		/*
		 * We need to check against @end before FILTER_SNAPSHOTS because
		 * if we get to a different inode that requested we might be
		 * seeing keys for a different snapshot tree that will all be
		 * filtered out.
		 *
		 * But we can't do the full check here, because bkey_start_pos()
		 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
		 * that's what we check against in extents mode:
		 */
		if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
			     ? bkey_gt(k.k->p, end)
			     : k.k->p.inode > end.inode))
			goto end;
		if (iter->flags & BTREE_ITER_filter_snapshots) {
			/*
			 * We need to check against @end before FILTER_SNAPSHOTS because
			 * if we get to a different inode that requested we might be
			 * seeing keys for a different snapshot tree that will all be
			 * filtered out.
			 *
			 * But we can't do the full check here, because bkey_start_pos()
			 * isn't monotonically increasing before FILTER_SNAPSHOTS, and
			 * that's what we check against in extents mode:
			 */
			if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
				     ? bkey_gt(k.k->p, end)
				     : k.k->p.inode > end.inode))
				goto end;

		if (iter->update_path &&
		    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
			bch2_path_put_nokeep(trans, iter->update_path,
					     iter->flags & BTREE_ITER_intent);
			iter->update_path = 0;
		}
			if (iter->update_path &&
			    !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
				bch2_path_put_nokeep(trans, iter->update_path,
						     iter->flags & BTREE_ITER_intent);
				iter->update_path = 0;
			}

		if ((iter->flags & BTREE_ITER_filter_snapshots) &&
		    (iter->flags & BTREE_ITER_intent) &&
		    !(iter->flags & BTREE_ITER_is_extents) &&
		    !iter->update_path) {
			struct bpos pos = k.k->p;
			if ((iter->flags & BTREE_ITER_intent) &&
			    !(iter->flags & BTREE_ITER_is_extents) &&
			    !iter->update_path) {
				struct bpos pos = k.k->p;

			if (pos.snapshot < iter->snapshot) {
				if (pos.snapshot < iter->snapshot) {
					search_key = bpos_successor(k.k->p);
					continue;
				}

				pos.snapshot = iter->snapshot;

				/*
				 * advance, same as on exit for iter->path, but only up
				 * to snapshot
				 */
				__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
				iter->update_path = iter->path;

				iter->update_path = bch2_btree_path_set_pos(trans,
							iter->update_path, pos,
							iter->flags & BTREE_ITER_intent,
							_THIS_IP_);
				ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
				if (unlikely(ret)) {
					k = bkey_s_c_err(ret);
					goto out_no_locked;
				}
			}

			/*
			 * We can never have a key in a leaf node at POS_MAX, so
			 * we don't have to check these successor() calls:
			 */
			if (!bch2_snapshot_is_ancestor(trans->c,
						       iter->snapshot,
						       k.k->p.snapshot)) {
				search_key = bpos_successor(k.k->p);
				continue;
			}

			pos.snapshot = iter->snapshot;

			/*
			 * advance, same as on exit for iter->path, but only up
			 * to snapshot
			 */
			__btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent);
			iter->update_path = iter->path;

			iter->update_path = bch2_btree_path_set_pos(trans,
						iter->update_path, pos,
						iter->flags & BTREE_ITER_intent,
						_THIS_IP_);
			ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags);
			if (unlikely(ret)) {
				k = bkey_s_c_err(ret);
				goto out_no_locked;
			if (bkey_whiteout(k.k) &&
			    !(iter->flags & BTREE_ITER_key_cache_fill)) {
				search_key = bkey_successor(iter, k.k->p);
				continue;
			}
		}

		/*
		 * We can never have a key in a leaf node at POS_MAX, so
		 * we don't have to check these successor() calls:
		 */
		if ((iter->flags & BTREE_ITER_filter_snapshots) &&
		    !bch2_snapshot_is_ancestor(trans->c,
					       iter->snapshot,
					       k.k->p.snapshot)) {
			search_key = bpos_successor(k.k->p);
			continue;
		}

		if (bkey_whiteout(k.k) &&
		    !(iter->flags & BTREE_ITER_all_snapshots)) {
			search_key = bkey_successor(iter, k.k->p);
			continue;
		}

		/*
		 * iter->pos should be monotonically increasing, and always be
		 * equal to the key we just returned - except extents can

@@ -2451,111 +2509,66 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
	return bch2_btree_iter_peek(iter);
}

/**
 * bch2_btree_iter_peek_prev() - returns first key less than or equal to
 * iterator's current position
 * @iter:	iterator to peek from
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
{
	struct btree_trans *trans = iter->trans;
	struct bpos search_key = iter->pos;
	struct bkey_s_c k;
	struct bkey saved_k;
	const struct bch_val *saved_v;
	btree_path_idx_t saved_path = 0;
	int ret;

	bch2_trans_verify_not_unlocked(trans);
	EBUG_ON(btree_iter_path(trans, iter)->cached ||
		btree_iter_path(trans, iter)->level);

	if (iter->flags & BTREE_ITER_with_journal)
		return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
	struct bkey_s_c k, k2;

	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);

	if (iter->flags & BTREE_ITER_filter_snapshots)
		search_key.snapshot = U32_MAX;

	while (1) {
		iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
						iter->flags & BTREE_ITER_intent,
						btree_iter_ip_allocated(iter));
					iter->flags & BTREE_ITER_intent,
					btree_iter_ip_allocated(iter));

		ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
		if (unlikely(ret)) {
			/* ensure that iter->k is consistent with iter->pos: */
			bch2_btree_iter_set_pos(iter, iter->pos);
			k = bkey_s_c_err(ret);
			goto out_no_locked;
			break;
		}

		struct btree_path *path = btree_iter_path(trans, iter);
		struct btree_path_level *l = path_l(path);

		k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
		if (!k.k ||
		    ((iter->flags & BTREE_ITER_is_extents)
		     ? bpos_ge(bkey_start_pos(k.k), search_key)
		     : bpos_gt(k.k->p, search_key)))
			k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
		if (unlikely(!l->b)) {
			/* No btree nodes at requested level: */
			bch2_btree_iter_set_pos(iter, SPOS_MAX);
			k = bkey_s_c_null;
			break;
		}

		btree_path_set_should_be_locked(trans, path);

		k = btree_path_level_peek_all(trans->c, l, &iter->k);
		if (!k.k || bpos_gt(k.k->p, search_key)) {
			k = btree_path_level_prev(trans, path, l, &iter->k);

			BUG_ON(k.k && bpos_gt(k.k->p, search_key));
		}

		if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
		    k.k &&
		    (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
			k = k2;
			if (bkey_err(k2)) {
				bch2_btree_iter_set_pos(iter, iter->pos);
				break;
			}
		}

		if (unlikely(iter->flags & BTREE_ITER_with_journal))
			btree_trans_peek_prev_journal(trans, iter, &k);

		if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
			     trans->nr_updates))
			bch2_btree_trans_peek_prev_updates(trans, iter, &k);

		if (likely(k.k)) {
			if (iter->flags & BTREE_ITER_filter_snapshots) {
				if (k.k->p.snapshot == iter->snapshot)
					goto got_key;

				/*
				 * If we have a saved candidate, and we're no
				 * longer at the same _key_ (not pos), return
				 * that candidate
				 */
				if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
					bch2_path_put_nokeep(trans, iter->path,
						      iter->flags & BTREE_ITER_intent);
					iter->path = saved_path;
					saved_path = 0;
					iter->k = saved_k;
					k.v = saved_v;
					goto got_key;
				}

				if (bch2_snapshot_is_ancestor(trans->c,
							      iter->snapshot,
							      k.k->p.snapshot)) {
					if (saved_path)
						bch2_path_put_nokeep(trans, saved_path,
						      iter->flags & BTREE_ITER_intent);
					saved_path = btree_path_clone(trans, iter->path,
								iter->flags & BTREE_ITER_intent,
								_THIS_IP_);
					path = btree_iter_path(trans, iter);
					trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
					saved_k = *k.k;
					saved_v = k.v;
				}

				search_key = bpos_predecessor(k.k->p);
				continue;
			}
got_key:
			if (bkey_whiteout(k.k) &&
			    !(iter->flags & BTREE_ITER_all_snapshots)) {
				search_key = bkey_predecessor(iter, k.k->p);
				if (iter->flags & BTREE_ITER_filter_snapshots)
					search_key.snapshot = U32_MAX;
				continue;
			}

			btree_path_set_should_be_locked(trans, path);
		if (likely(k.k && !bkey_deleted(k.k))) {
			break;
		} else if (k.k) {
			search_key = bpos_predecessor(k.k->p);
		} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
			/* Advance to previous leaf node: */
			search_key = bpos_predecessor(path->l[0].b->data->min_key);

@@ -2563,15 +2576,137 @@ got_key:
			/* Start of btree: */
			bch2_btree_iter_set_pos(iter, POS_MIN);
			k = bkey_s_c_null;
			goto out_no_locked;
			break;
		}
	}

	EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
	bch2_btree_iter_verify(iter);
	return k;
}

/**
 * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
 * iterator's current position
 * @iter:	iterator to peek from
 * @end:	search limit: returns keys greater than or equal to @end
 *
 * Returns:	key if found, or an error extractable with bkey_err().
 */
struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
{
	if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
	    !bkey_eq(iter->pos, POS_MAX)) {
		/*
		 * bkey_start_pos(), for extents, is not monotonically
		 * increasing until after filtering for snapshots:
		 *
		 * Thus, for extents we need to search forward until we find a
		 * real visible extent - easiest to just use peek_slot() (which
		 * internally uses peek() for extents)
		 */
		struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
		if (bkey_err(k))
			return k;

		if (!bkey_deleted(k.k) &&
		    (!(iter->flags & BTREE_ITER_is_extents) ||
		     bkey_lt(bkey_start_pos(k.k), iter->pos)))
			return k;
	}

	struct btree_trans *trans = iter->trans;
	struct bpos search_key = iter->pos;
	struct bkey_s_c k;
	btree_path_idx_t saved_path = 0;

	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	bch2_btree_iter_verify_entry_exit(iter);
	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));

	while (1) {
		k = __bch2_btree_iter_peek_prev(iter, search_key);
		if (unlikely(!k.k))
			goto end;
		if (unlikely(bkey_err(k)))
			goto out_no_locked;

		if (iter->flags & BTREE_ITER_filter_snapshots) {
			struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
			if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
				/*
				 * If we have a saved candidate, and we're past
				 * the last possible snapshot overwrite, return
				 * it:
				 */
				bch2_path_put_nokeep(trans, iter->path,
					      iter->flags & BTREE_ITER_intent);
				iter->path = saved_path;
				saved_path = 0;
				k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
				break;
			}

			/*
			 * We need to check against @end before FILTER_SNAPSHOTS because
			 * if we get to a different inode that requested we might be
			 * seeing keys for a different snapshot tree that will all be
			 * filtered out.
			 */
			if (unlikely(bkey_lt(k.k->p, end)))
				goto end;

			if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
				search_key = bpos_predecessor(k.k->p);
				continue;
			}

			if (k.k->p.snapshot != iter->snapshot) {
				/*
				 * Have a key visible in iter->snapshot, but
				 * might have overwrites: - save it and keep
				 * searching. Unless it's a whiteout - then drop
				 * our previous saved candidate:
				 */
				if (saved_path) {
					bch2_path_put_nokeep(trans, saved_path,
						      iter->flags & BTREE_ITER_intent);
					saved_path = 0;
				}

				if (!bkey_whiteout(k.k)) {
					saved_path = btree_path_clone(trans, iter->path,
								iter->flags & BTREE_ITER_intent,
								_THIS_IP_);
					trace_btree_path_save_pos(trans,
								  trans->paths + iter->path,
								  trans->paths + saved_path);
				}

				search_key = bpos_predecessor(k.k->p);
				continue;
			}

			if (bkey_whiteout(k.k)) {
				search_key = bkey_predecessor(iter, k.k->p);
				search_key.snapshot = U32_MAX;
				continue;
			}
		}

		EBUG_ON(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(k.k->p, iter->pos) :
			iter->flags & BTREE_ITER_is_extents	? bkey_ge(bkey_start_pos(k.k), iter->pos) :
								  bkey_gt(k.k->p, iter->pos));

		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_lt(k.k->p, end) :
			     iter->flags & BTREE_ITER_is_extents	? bkey_le(k.k->p, end) :
									  bkey_lt(k.k->p, end)))
			goto end;

		break;
	}

	/* Extents can straddle iter->pos: */
	if (bkey_lt(k.k->p, iter->pos))
		iter->pos = k.k->p;
	iter->pos = bpos_min(iter->pos, k.k->p);

	if (iter->flags & BTREE_ITER_filter_snapshots)
		iter->pos.snapshot = iter->snapshot;

@@ -2581,8 +2716,11 @@ out_no_locked:

	bch2_btree_iter_verify_entry_exit(iter);
	bch2_btree_iter_verify(iter);

	return k;
end:
	bch2_btree_iter_set_pos(iter, end);
	k = bkey_s_c_null;
	goto out_no_locked;
}

/**

@@ -2607,7 +2745,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
	struct bkey_s_c k;
	int ret;

	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	bch2_btree_iter_verify(iter);
	bch2_btree_iter_verify_entry_exit(iter);
	EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));

@@ -2632,6 +2770,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
		goto out_no_locked;
	}

	struct btree_path *path = btree_iter_path(trans, iter);
	if (unlikely(!btree_path_node(path, path->level)))
		return bkey_s_c_null;

	if ((iter->flags & BTREE_ITER_cached) ||
	    !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
		k = bkey_s_c_null;

@@ -2658,6 +2800,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
		k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
		if (unlikely(!k.k))
			goto out_no_locked;

		if (unlikely(k.k->type == KEY_TYPE_whiteout &&
			     (iter->flags & BTREE_ITER_filter_snapshots) &&
			     !(iter->flags & BTREE_ITER_key_cache_fill)))
			iter->k.type = KEY_TYPE_deleted;
	} else {
		struct bpos next;
		struct bpos end = iter->pos;

@@ -2671,7 +2818,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
			struct btree_iter iter2;

			bch2_trans_copy_iter(&iter2, iter);
			k = bch2_btree_iter_peek_upto(&iter2, end);
			k = bch2_btree_iter_peek_max(&iter2, end);

			if (k.k && !bkey_err(k)) {
				swap(iter->key_cache_path, iter2.key_cache_path);

@@ -2682,7 +2829,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
		} else {
			struct bpos pos = iter->pos;

			k = bch2_btree_iter_peek_upto(iter, end);
			k = bch2_btree_iter_peek_max(iter, end);
			if (unlikely(bkey_err(k)))
				bch2_btree_iter_set_pos(iter, pos);
			else

@@ -2902,7 +3049,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans,
			  unsigned flags)
{
	bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
			       bch2_btree_iter_flags(trans, btree_id, flags),
			       bch2_btree_iter_flags(trans, btree_id, 0, flags),
			       _RET_IP_);
}

@@ -2918,8 +3065,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
	flags |= BTREE_ITER_snapshot_field;
	flags |= BTREE_ITER_all_snapshots;

	if (!depth && btree_id_cached(trans->c, btree_id))
		flags |= BTREE_ITER_with_key_cache;

	bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
			       __bch2_btree_iter_flags(trans, btree_id, flags),
			       bch2_btree_iter_flags(trans, btree_id, depth, flags),
			       _RET_IP_);

	iter->min_depth = depth;

@@ -3122,14 +3272,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)

	trans->last_begin_ip = _RET_IP_;

	trans_set_locked(trans);
	trans_set_locked(trans, false);

	if (trans->restarted) {
		bch2_btree_path_traverse_all(trans);
		trans->notrace_relock_fail = false;
	}

	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	return trans->restart_count;
}

@@ -3228,7 +3378,7 @@ got_trans:
	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
	trans->srcu_lock_time	= jiffies;
	trans->srcu_held	= true;
	trans_set_locked(trans);
	trans_set_locked(trans, false);

	closure_init_stack_release(&trans->ref);
	return trans;

@@ -3262,6 +3412,9 @@ void bch2_trans_put(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;

	if (trans->restarted)
		bch2_trans_in_restart_error(trans);

	bch2_trans_unlock(trans);

	trans_for_each_update(trans, i)

@@ -3285,6 +3438,10 @@ void bch2_trans_put(struct btree_trans *trans)
	closure_return_sync(&trans->ref);
	trans->locking_wait.task = NULL;

#ifdef CONFIG_BCACHEFS_DEBUG
	darray_exit(&trans->last_restarted_trace);
#endif

	unsigned long *paths_allocated = trans->paths_allocated;
	trans->paths_allocated	= NULL;
	trans->paths		= NULL;

@@ -3338,8 +3495,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
	pid = owner ? owner->pid : 0;
	rcu_read_unlock();

	prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
		   b->level, bch2_btree_id_str(b->btree_id));
	prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
	bch2_btree_id_to_text(out, b->btree_id);
	prt_printf(out, " l=%u:", b->level);
	bch2_bpos_to_text(out, btree_node_pos(b));

	prt_printf(out, "\t locks %u:%u:%u held by pid %u",

@@ -3378,11 +3536,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
		if (!path->nodes_locked)
			continue;

		prt_printf(out, "  path %u %c l=%u %s:",
			   idx,
			   path->cached ? 'c' : 'b',
			   path->level,
			   bch2_btree_id_str(path->btree_id));
		prt_printf(out, "  path %u %c ",
			   idx,
			   path->cached ? 'c' : 'b');
		bch2_btree_id_to_text(out, path->btree_id);
		prt_printf(out, " l=%u:", path->level);
		bch2_bpos_to_text(out, path->pos);
		prt_newline(out);

@@ -3488,7 +3646,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
#ifdef CONFIG_LOCKDEP
	fs_reclaim_acquire(GFP_KERNEL);
	struct btree_trans *trans = bch2_trans_get(c);
	trans_set_locked(trans);
	trans_set_locked(trans, false);
	bch2_trans_put(trans);
	fs_reclaim_release(GFP_KERNEL);
#endif

@@ -23,6 +23,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path
{
	unsigned idx = path - trans->paths;

	EBUG_ON(idx >= trans->nr_paths);
	EBUG_ON(!test_bit(idx, trans->paths_allocated));
	if (unlikely(path->ref == U8_MAX)) {
		bch2_dump_trans_paths_updates(trans);

@@ -36,6 +37,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path

static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
{
	EBUG_ON(path - trans->paths >= trans->nr_paths);
	EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated));
	EBUG_ON(!path->ref);
	EBUG_ON(!path->intent_ref && intent);

@@ -234,12 +236,12 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
					      btree_path_idx_t,
					      unsigned, unsigned long);

static inline void bch2_trans_verify_not_unlocked(struct btree_trans *);
static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *);

static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
					  btree_path_idx_t path, unsigned flags)
{
	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);

	if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
		return 0;

@@ -324,38 +326,33 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
		bch2_trans_restart_error(trans, restart_count);
}

void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *);

static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans)
{
	if (trans->restarted)
		bch2_trans_in_restart_error(trans);
}

void __noreturn bch2_trans_unlocked_error(struct btree_trans *);

static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans)
{
	if (!trans->locked)
		bch2_trans_unlocked_error(trans);
	if (trans->restarted || !trans->locked)
		bch2_trans_unlocked_or_in_restart_error(trans);
}

__always_inline
static int btree_trans_restart_nounlock(struct btree_trans *trans, int err)
static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
{
	BUG_ON(err <= 0);
	BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));

	trans->restarted = err;
	trans->last_restarted_ip = _THIS_IP_;
	trans->last_restarted_ip = ip;
#ifdef CONFIG_BCACHEFS_DEBUG
	darray_exit(&trans->last_restarted_trace);
	bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
#endif
	return -err;
}

__always_inline
static int btree_trans_restart(struct btree_trans *trans, int err)
{
	btree_trans_restart_nounlock(trans, err);
	return -err;
	return btree_trans_restart_ip(trans, err, _THIS_IP_);
}
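
btree_trans_restart() records its caller's location by expanding _THIS_IP_ at its own call site and passing it down to btree_trans_restart_ip(). A userspace sketch of the same trick using GNU C's labels-as-values (assumed semantics of _THIS_IP_; all other names are illustrative):

	#include <stdio.h>

	/* Analogue of the kernel's _THIS_IP_: take the address of a local
	 * label, giving "the instruction pointer here". GNU C only. */
	#define THIS_IP ({ __label__ __here; __here: (unsigned long) &&__here; })

	static unsigned long last_restarted_ip;

	static int restart_ip(int err, unsigned long ip)
	{
		last_restarted_ip = ip;	/* record who asked for the restart */
		return -err;
	}

	/* Because this is a macro, THIS_IP expands at the *caller's*
	 * location, like btree_trans_restart() above. */
	#define restart(err) restart_ip((err), THIS_IP)

	int main(void)
	{
		restart(5);
		printf("restart requested at ip %#lx\n", last_restarted_ip);
		return 0;
	}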

bool bch2_btree_node_upgrade(struct btree_trans *,

@@ -375,6 +372,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *);

void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);

int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);

@@ -384,15 +382,21 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *);

struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);

static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
	return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
	return bch2_btree_iter_peek_max(iter, SPOS_MAX);
}

struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);

static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
	return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
}

struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);

struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);

@@ -443,10 +447,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna

void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);

static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
					       unsigned btree_id,
					       unsigned flags)
static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
					     unsigned btree_id,
					     unsigned level,
					     unsigned flags)
{
	if (level || !btree_id_cached(trans->c, btree_id)) {
		flags &= ~BTREE_ITER_cached;
		flags &= ~BTREE_ITER_with_key_cache;
	} else if (!(flags & BTREE_ITER_cached))
		flags |= BTREE_ITER_with_key_cache;

	if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) &&
	    btree_id_is_extents(btree_id))
		flags |= BTREE_ITER_is_extents;

@@ -465,19 +476,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,

	return flags;
}

static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans,
					     unsigned btree_id,
					     unsigned flags)
{
	if (!btree_id_cached(trans->c, btree_id)) {
		flags &= ~BTREE_ITER_cached;
		flags &= ~BTREE_ITER_with_key_cache;
	} else if (!(flags & BTREE_ITER_cached))
		flags |= BTREE_ITER_with_key_cache;

	return __bch2_btree_iter_flags(trans, btree_id, flags);
}

static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
					       struct btree_iter *iter,
					       unsigned btree_id, struct bpos pos,

@@ -514,7 +512,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
	if (__builtin_constant_p(btree_id) &&
	    __builtin_constant_p(flags))
		bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
				bch2_btree_iter_flags(trans, btree_id, flags),
				bch2_btree_iter_flags(trans, btree_id, 0, flags),
				_THIS_IP_);
	else
		bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags);

@@ -593,13 +591,18 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans,
	bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter,		\
				       _btree_id, _pos, _flags, KEY_TYPE_##_type))

static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k)
{
	unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k));
	memcpy(dst_v, src_k.v, b);
	if (unlikely(b < dst_size))
		memset(dst_v + b, 0, dst_size - b);
}

#define bkey_val_copy(_dst_v, _src_k)					\
do {									\
	unsigned b = min_t(unsigned, sizeof(*_dst_v),			\
			   bkey_val_bytes(_src_k.k));			\
	memcpy(_dst_v, _src_k.v, b);					\
	if (b < sizeof(*_dst_v))					\
		memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b);	\
	BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v));			\
	__bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c);		\
} while (0)
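
__bkey_val_copy() copies at most the destination size and zero-fills any remainder, so a short on-disk value never leaves stale bytes in the destination. A standalone sketch of that copy-and-pad idiom (generic C, not the bcachefs helper itself):

	#include <stdio.h>
	#include <string.h>

	static void copy_val(void *dst, size_t dst_size,
			     const void *src, size_t src_size)
	{
		size_t b = dst_size < src_size ? dst_size : src_size;

		memcpy(dst, src, b);
		if (b < dst_size)			/* zero-pad the tail */
			memset((char *) dst + b, 0, dst_size - b);
	}

	int main(void)
	{
		char src[3] = { 1, 2, 3 };
		char dst[8];

		copy_val(dst, sizeof(dst), src, sizeof(src));
		printf("%d %d %d %d\n", dst[0], dst[2], dst[3], dst[7]);	/* 1 3 0 0 */
		return 0;
	}

Moving the body into an inline function lets the macro keep only what must stay a macro: the sizeof() of the destination and the BUILD_BUG_ON() type check.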

static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
@@ -608,17 +611,10 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
                                            unsigned val_size, void *val)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
        ret = bkey_err(k);
        struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type);
        int ret = bkey_err(k);
        if (!ret) {
                unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size);

                memcpy(val, k.v, b);
                if (unlikely(b < sizeof(*val)))
                        memset((void *) val + b, 0, sizeof(*val) - b);
                __bkey_val_copy(val, val_size, k);
                bch2_trans_iter_exit(trans, &iter);
        }

@@ -677,12 +673,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
                bch2_btree_iter_peek(iter);
}

static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter,
static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
                                                             struct bpos end,
                                                             unsigned flags)
{
        if (!(flags & BTREE_ITER_slots))
                return bch2_btree_iter_peek_upto(iter, end);
                return bch2_btree_iter_peek_max(iter, end);

        if (bkey_gt(iter->pos, end))
                return bkey_s_c_null;
@@ -746,7 +742,7 @@ transaction_restart: \
        _ret2 ?: trans_was_restarted(_trans, _restart_count); \
})

#define for_each_btree_key_upto_continue(_trans, _iter, \
#define for_each_btree_key_max_continue(_trans, _iter, \
                        _end, _flags, _k, _do) \
({ \
        struct bkey_s_c _k; \
@@ -754,7 +750,7 @@ transaction_restart: \
 \
        do { \
                _ret3 = lockrestart_do(_trans, ({ \
                        (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
                        (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
                                                _end, (_flags)); \
                        if (!(_k).k) \
                                break; \
@@ -768,9 +764,9 @@ transaction_restart: \
})

#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \
        for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)
        for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do)

#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
#define for_each_btree_key_max(_trans, _iter, _btree_id, \
                                _start, _end, _flags, _k, _do) \
({ \
        bch2_trans_begin(trans); \
@@ -779,12 +775,12 @@ transaction_restart: \
        bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
                             (_start), (_flags)); \
 \
        for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\
        for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\
})

#define for_each_btree_key(_trans, _iter, _btree_id, \
                           _start, _flags, _k, _do) \
        for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
        for_each_btree_key_max(_trans, _iter, _btree_id, _start, \
                               SPOS_MAX, _flags, _k, _do)

#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
@@ -828,33 +824,33 @@ transaction_restart: \
                        (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
                                        (_journal_seq), (_commit_flags)))

#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \
#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \
                                  _start, _end, _iter_flags, _k, \
                                  _disk_res, _journal_seq, _commit_flags,\
                                  _do) \
        for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
        for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
                        (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
                                        (_journal_seq), (_commit_flags)))

struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);

#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \
#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
                           _start, _end, _flags, _k, _ret) \
        for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
                                  (_start), (_flags)); \
             (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\
             (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
             !((_ret) = bkey_err(_k)) && (_k).k; \
             bch2_btree_iter_advance(&(_iter)))

#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
        for (; \
             (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
             (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
             !((_ret) = bkey_err(_k)) && (_k).k; \
             bch2_btree_iter_advance(&(_iter)))

#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
                           _start, _flags, _k, _ret) \
        for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
        for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
                                          SPOS_MAX, _flags, _k, _ret)

#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
@@ -866,7 +862,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
             bch2_btree_iter_rewind(&(_iter)))

#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
        for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
        for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
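The _upto to _max renames above are mechanical, but the shape of these iteration macros deserves one concrete illustration. A hedged usage sketch, assuming an already-initialized btree_trans; the btree ID, bounds, and loop body are invented::

  struct btree_iter iter;

  int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
                          POS(inum, 0), POS(inum, U64_MAX), 0, k, ({
          /* the loop body is the _do argument; k is in scope here, and a
           * nonzero result stops the walk and is returned as ret: */
          pr_info("extent %llu:%llu\n", k.k->p.inode, k.k->p.offset);
          0;
  }));

The macro begins the transaction, initializes the iterator, and transparently retries the body on transaction restart, which is why the body must be restart-safe.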
|
||||
|
||||
/*
|
||||
* This should not be used in a fastpath, without first trying _do in
|
||||
|
@ -16,6 +16,17 @@
|
||||
* operations for the regular btree iter code to use:
|
||||
*/
|
||||
|
||||
static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos)
|
||||
{
|
||||
size_t gap_size = keys->size - keys->nr;
|
||||
|
||||
BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size);
|
||||
|
||||
if (pos >= keys->gap)
|
||||
pos -= gap_size;
|
||||
return pos;
|
||||
}
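pos_to_idx() and its inverse idx_to_pos() translate between a key's logical index (as if the array were dense) and its physical position in the gap buffer: positions at or past the gap are offset by the gap's width, and a position inside the gap is a bug. A standalone rendering of the same arithmetic, with the invariants as asserts; the layout in main() is invented::

  #include <assert.h>
  #include <stddef.h>

  /* Dense index -> physical position: entries at or past the gap are
   * shifted up by the gap's width. */
  static size_t idx_to_pos(size_t idx, size_t gap, size_t nr, size_t size)
  {
          size_t gap_size = size - nr;

          return idx < gap ? idx : idx + gap_size;
  }

  static size_t pos_to_idx(size_t pos, size_t gap, size_t nr, size_t size)
  {
          size_t gap_size = size - nr;

          assert(!(pos >= gap && pos < gap + gap_size)); /* not in the gap */
          return pos < gap ? pos : pos - gap_size;
  }

  int main(void)
  {
          /* 8 slots, 5 live entries, a 3-slot gap at position 2:
           * physical layout is [0][1][gap][gap][gap][2][3][4] */
          assert(idx_to_pos(1, 2, 5, 8) == 1);
          assert(idx_to_pos(2, 2, 5, 8) == 5);
          assert(pos_to_idx(7, 2, 5, 8) == 4);
          return 0;
  }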

static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
{
        size_t gap_size = keys->size - keys->nr;
@@ -61,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
}

/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id,
                                           unsigned level, struct bpos pos,
                                           struct bpos end_pos, size_t *idx)
{
@@ -84,18 +95,54 @@ search:
                }
        }

        struct bkey_i *ret = NULL;
        rcu_read_lock(); /* for overwritten_ranges */

        while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
                if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
                        return NULL;
                        break;

                if (k->overwritten) {
                        (*idx)++;
                        if (k->overwritten_range)
                                *idx = rcu_dereference(k->overwritten_range)->end;
                        else
                                *idx += 1;
                        continue;
                }

                if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
                        return k->k;
                if (__journal_key_cmp(btree_id, level, pos, k) <= 0) {
                        ret = k->k;
                        break;
                }

                (*idx)++;
                iters++;
                if (iters == 10) {
                        *idx = 0;
                        rcu_read_unlock();
                        goto search;
                }
        }

        rcu_read_unlock();
        return ret;
}

struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
                                               unsigned level, struct bpos pos,
                                               struct bpos end_pos, size_t *idx)
{
        struct journal_keys *keys = &c->journal_keys;
        unsigned iters = 0;
        struct journal_key *k;

        BUG_ON(*idx > keys->nr);
search:
        if (!*idx)
                *idx = __bch2_journal_key_search(keys, btree_id, level, pos);

        while (*idx &&
               __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
                (*idx)++;
                iters++;
                if (iters == 10) {
@@ -104,7 +151,36 @@ search:
                }
        }

        return NULL;
        struct bkey_i *ret = NULL;
        rcu_read_lock(); /* for overwritten_ranges */

        while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
                if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
                        break;

                if (k->overwritten) {
                        if (k->overwritten_range)
                                *idx = rcu_dereference(k->overwritten_range)->start - 1;
                        else
                                *idx -= 1;
                        continue;
                }

                if (__journal_key_cmp(btree_id, level, pos, k) >= 0) {
                        ret = k->k;
                        break;
                }

                --(*idx);
                iters++;
                if (iters == 10) {
                        *idx = 0;
                        goto search;
                }
        }

        rcu_read_unlock();
        return ret;
}

struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree,
@@ -112,11 +188,12 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
{
        size_t idx = 0;

        return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
        return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx);
}

static void journal_iter_verify(struct journal_iter *iter)
{
#ifdef CONFIG_BCACHEFS_DEBUG
        struct journal_keys *keys = iter->keys;
        size_t gap_size = keys->size - keys->nr;

@@ -126,10 +203,10 @@ static void journal_iter_verify(struct journal_iter *iter)
        if (iter->idx < keys->size) {
                struct journal_key *k = keys->data + iter->idx;

                int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
                          cmp_int(k->level, iter->level);
                BUG_ON(cmp < 0);
                int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
                BUG_ON(cmp > 0);
        }
#endif
}

static void journal_iters_fix(struct bch_fs *c)
@@ -182,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                 * Ensure these keys are done last by journal replay, to unblock
                 * journal reclaim:
                 */
                .journal_seq = U32_MAX,
                .journal_seq = U64_MAX,
        };
        struct journal_keys *keys = &c->journal_keys;
        size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
@@ -290,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
                bkey_deleted(&keys->data[idx].k->k));
}

static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos)
{
        struct journal_key *k = keys->data + pos;
        size_t idx = pos_to_idx(keys, pos);

        k->overwritten = true;

        struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL;
        struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL;

        bool prev_overwritten = prev && prev->overwritten;
        bool next_overwritten = next && next->overwritten;

        struct journal_key_range_overwritten *prev_range =
                prev_overwritten ? prev->overwritten_range : NULL;
        struct journal_key_range_overwritten *next_range =
                next_overwritten ? next->overwritten_range : NULL;

        BUG_ON(prev_range && prev_range->end != idx);
        BUG_ON(next_range && next_range->start != idx + 1);

        if (prev_range && next_range) {
                prev_range->end = next_range->end;

                keys->data[pos].overwritten_range = prev_range;
                for (size_t i = next_range->start; i < next_range->end; i++) {
                        struct journal_key *ip = keys->data + idx_to_pos(keys, i);
                        BUG_ON(ip->overwritten_range != next_range);
                        ip->overwritten_range = prev_range;
                }

                kfree_rcu_mightsleep(next_range);
        } else if (prev_range) {
                prev_range->end++;
                k->overwritten_range = prev_range;
                if (next_overwritten) {
                        prev_range->end++;
                        next->overwritten_range = prev_range;
                }
        } else if (next_range) {
                next_range->start--;
                k->overwritten_range = next_range;
                if (prev_overwritten) {
                        next_range->start--;
                        prev->overwritten_range = next_range;
                }
        } else if (prev_overwritten || next_overwritten) {
                struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL);
                if (!r)
                        return;

                r->start = idx - (size_t) prev_overwritten;
                r->end = idx + 1 + (size_t) next_overwritten;

                rcu_assign_pointer(k->overwritten_range, r);
                if (prev_overwritten)
                        prev->overwritten_range = r;
                if (next_overwritten)
                        next->overwritten_range = r;
        }
}
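__bch2_journal_key_overwritten() maintains an invariant: every maximal run of adjacent overwritten keys shares a single journal_key_range_overwritten describing the run, so iterators can skip the whole run in O(1). The four branches are the classic interval-coalescing cases: bridge two runs, grow the left run, grow the right run, or create a fresh run once a neighbor is also overwritten. A toy standalone model of the same case analysis (fixed-size array, invented names, no RCU)::

  #include <assert.h>
  #include <stdbool.h>
  #include <stdlib.h>

  struct run { size_t start, end; };        /* half-open [start, end) */

  #define N 8
  static bool        overwritten[N];
  static struct run *range[N];              /* NULL, or the run covering i */

  static void mark_overwritten(size_t i)
  {
          overwritten[i] = true;

          bool prev = i > 0     && overwritten[i - 1];
          bool next = i + 1 < N && overwritten[i + 1];
          struct run *pr = prev ? range[i - 1] : NULL;
          struct run *nr = next ? range[i + 1] : NULL;

          if (pr && nr) {                   /* bridge two existing runs */
                  pr->end = nr->end;
                  for (size_t j = nr->start; j < nr->end; j++)
                          range[j] = pr;
                  range[i] = pr;
                  free(nr);
          } else if (pr) {                  /* grow the left run rightward */
                  pr->end += next ? 2 : 1;
                  range[i] = pr;
                  if (next)
                          range[i + 1] = pr;
          } else if (nr) {                  /* grow the right run leftward */
                  nr->start -= prev ? 2 : 1;
                  range[i] = nr;
                  if (prev)
                          range[i - 1] = nr;
          } else if (prev || next) {        /* first run in this neighborhood */
                  struct run *r = malloc(sizeof(*r));

                  r->start = i - (size_t) prev;
                  r->end   = i + 1 + (size_t) next;
                  range[i] = r;
                  if (prev)
                          range[i - 1] = r;
                  if (next)
                          range[i + 1] = r;
          }
  }

  int main(void)
  {
          mark_overwritten(2);
          mark_overwritten(4);
          mark_overwritten(3);              /* bridges 2 and 4 into [2, 5) */
          assert(range[2] == range[4] && range[2]->start == 2 && range[2]->end == 5);
          return 0;
  }

A lone overwritten key carries no range at all, matching the kernel code, where a range is only allocated once two adjacent keys are both overwritten.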

void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
                                  unsigned level, struct bpos pos)
{
@@ -299,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
        if (idx < keys->size &&
            keys->data[idx].btree_id == btree &&
            keys->data[idx].level == level &&
            bpos_eq(keys->data[idx].k->k.p, pos))
                keys->data[idx].overwritten = true;
            bpos_eq(keys->data[idx].k->k.p, pos) &&
            !keys->data[idx].overwritten) {
                mutex_lock(&keys->overwrite_lock);
                __bch2_journal_key_overwritten(keys, idx);
                mutex_unlock(&keys->overwrite_lock);
        }
}

static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -314,24 +457,32 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
        struct bkey_s_c ret = bkey_s_c_null;

        journal_iter_verify(iter);

        rcu_read_lock();
        while (iter->idx < iter->keys->size) {
                struct journal_key *k = iter->keys->data + iter->idx;

                int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
                          cmp_int(k->level, iter->level);
                if (cmp > 0)
                int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k);
                if (cmp < 0)
                        break;
                BUG_ON(cmp);

                if (!k->overwritten)
                        return bkey_i_to_s_c(k->k);
                if (!k->overwritten) {
                        ret = bkey_i_to_s_c(k->k);
                        break;
                }

                bch2_journal_iter_advance(iter);
                if (k->overwritten_range)
                        iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
                else
                        bch2_journal_iter_advance(iter);
        }
        rcu_read_unlock();

        return bkey_s_c_null;
        return ret;
}

static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -382,6 +533,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
                : (level > 1 ? 1 : 16);

        iter.prefetch = false;
        iter.fail_if_too_many_whiteouts = true;
        bch2_bkey_buf_init(&tmp);

        while (nr--) {
@@ -400,6 +552,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter
struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
{
        struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
        size_t iters = 0;

        if (iter->prefetch && iter->journal.level)
                btree_and_journal_iter_prefetch(iter);
@@ -407,6 +560,11 @@ again:
        if (iter->at_end)
                return bkey_s_c_null;

        iters++;

        if (iters > 20 && iter->fail_if_too_many_whiteouts)
                return bkey_s_c_null;

        while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
               bpos_lt(btree_k.k->p, iter->pos))
                bch2_journal_iter_advance_btree(iter);
@@ -481,16 +639,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,

/* sort and dedup all keys in the journal: */

void bch2_journal_entries_free(struct bch_fs *c)
{
        struct journal_replay **i;
        struct genradix_iter iter;

        genradix_for_each(&c->journal_entries, iter, i)
                kvfree(*i);
        genradix_free(&c->journal_entries);
}

/*
 * When keys compare equal, oldest compares first:
 */
@@ -515,15 +663,26 @@ void bch2_journal_keys_put(struct bch_fs *c)

        move_gap(keys, keys->nr);

        darray_for_each(*keys, i)
        darray_for_each(*keys, i) {
                if (i->overwritten_range &&
                    (i == &darray_last(*keys) ||
                     i->overwritten_range != i[1].overwritten_range))
                        kfree(i->overwritten_range);

                if (i->allocated)
                        kfree(i->k);
        }

        kvfree(keys->data);
        keys->data = NULL;
        keys->nr = keys->gap = keys->size = 0;

        bch2_journal_entries_free(c);
        struct journal_replay **i;
        struct genradix_iter iter;

        genradix_for_each(&c->journal_entries, iter, i)
                kvfree(*i);
        genradix_free(&c->journal_entries);
}

static void __journal_keys_sort(struct journal_keys *keys)
@@ -628,8 +787,20 @@ void bch2_journal_keys_dump(struct bch_fs *c)

        darray_for_each(*keys, i) {
                printbuf_reset(&buf);
                prt_printf(&buf, "btree=");
                bch2_btree_id_to_text(&buf, i->btree_id);
                prt_printf(&buf, " l=%u ", i->level);
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k));
                pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf);
                pr_err("%s", buf.buf);
        }
        printbuf_exit(&buf);
}

void bch2_fs_journal_keys_init(struct bch_fs *c)
{
        struct journal_keys *keys = &c->journal_keys;

        atomic_set(&keys->ref, 1);
        keys->initial_ref_held = true;
        mutex_init(&keys->overwrite_lock);
}

@@ -26,16 +26,24 @@ struct btree_and_journal_iter {
        struct bpos pos;
        bool at_end;
        bool prefetch;
        bool fail_if_too_many_whiteouts;
};

static inline int __journal_key_btree_cmp(enum btree_id l_btree_id,
                                          unsigned l_level,
                                          const struct journal_key *r)
{
        return -cmp_int(l_level, r->level) ?:
                cmp_int(l_btree_id, r->btree_id);
}
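Note the comparator idiom used here: cmp_int() yields negative, zero, or positive, and the GNU a ?: b extension falls through to the next field only on a tie, composing a lexicographic compare; the leading minus in __journal_key_btree_cmp() makes higher levels sort first. A standalone sketch of the idiom (gcc/clang only, invented struct)::

  /* Build with gcc or clang; a ?: b is the GNU "elvis" extension. */
  #include <assert.h>

  #define cmp_int(l, r) (((l) > (r)) - ((l) < (r)))

  struct key { int level, btree, offset; };

  /* Lexicographic compare: descending level, then ascending btree,
   * then ascending offset; each field is consulted only on a tie. */
  static int key_cmp(const struct key *l, const struct key *r)
  {
          return -cmp_int(l->level, r->level) ?:
                  cmp_int(l->btree, r->btree) ?:
                  cmp_int(l->offset, r->offset);
  }

  int main(void)
  {
          struct key a = { .level = 1, .btree = 3, .offset = 10 };
          struct key b = { .level = 0, .btree = 3, .offset = 10 };

          assert(key_cmp(&a, &b) < 0);      /* higher level sorts first */
          b.level = 1;
          assert(key_cmp(&a, &b) == 0);
          return 0;
  }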

static inline int __journal_key_cmp(enum btree_id l_btree_id,
                                    unsigned l_level,
                                    struct bpos l_pos,
                                    const struct journal_key *r)
{
        return (cmp_int(l_btree_id, r->btree_id) ?:
                cmp_int(l_level, r->level) ?:
                bpos_cmp(l_pos, r->k->k.p));
        return __journal_key_btree_cmp(l_btree_id, l_level, r) ?:
                bpos_cmp(l_pos, r->k->k.p);
}

static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
@@ -43,7 +51,9 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour
        return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}

struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
                                           unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
                                               unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
                                           unsigned, struct bpos);
@@ -79,8 +89,6 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
        c->journal_keys.initial_ref_held = false;
}

void bch2_journal_entries_free(struct bch_fs *);

int bch2_journal_keys_sort(struct bch_fs *);

void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
@@ -89,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,

void bch2_journal_keys_dump(struct bch_fs *);

void bch2_fs_journal_keys_init(struct bch_fs *);

#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */

fs/bcachefs/btree_journal_iter_types.h (new file, 36 lines)
@@ -0,0 +1,36 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H
#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H

struct journal_key_range_overwritten {
        size_t start, end;
};

struct journal_key {
        u64 journal_seq;
        u32 journal_offset;
        enum btree_id btree_id:8;
        unsigned level:8;
        bool allocated;
        bool overwritten;
        struct journal_key_range_overwritten __rcu *
                overwritten_range;
        struct bkey_i *k;
};

struct journal_keys {
        /* must match layout in darray_types.h */
        size_t nr, size;
        struct journal_key *data;
        /*
         * Gap buffer: instead of all the empty space in the array being at the
         * end of the buffer - from @nr to @size - the empty space is at @gap.
         * This means that sequential insertions are O(n) instead of O(n^2).
         */
        size_t gap;
        atomic_t ref;
        bool initial_ref_held;
        struct mutex overwrite_lock;
};

#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */
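The comment on @gap above is the heart of this structure: the free space sits wherever the last insertion happened, and moving the gap to a new insertion point costs one memmove proportional to the distance moved, so runs of nearby (e.g. sequential) insertions are cheap. A standalone sketch of the move-gap-then-insert step (fixed capacity, invented names)::

  #include <assert.h>
  #include <string.h>

  #define CAP 8
  static int    buf[CAP];
  static size_t nr, gap;        /* nr live entries; gap starts at 'gap' */

  /* Slide the gap so it sits at logical index 'idx', then insert there. */
  static void gap_buffer_insert(size_t idx, int v)
  {
          size_t gap_size = CAP - nr;

          assert(nr < CAP && idx <= nr);
          if (idx < gap)                /* move gap left */
                  memmove(buf + idx + gap_size, buf + idx,
                          (gap - idx) * sizeof(*buf));
          else if (idx > gap)           /* move gap right */
                  memmove(buf + gap, buf + gap + gap_size,
                          (idx - gap) * sizeof(*buf));
          gap = idx;
          buf[gap++] = v;
          nr++;
  }

  int main(void)
  {
          for (int i = 0; i < 5; i++)
                  gap_buffer_insert(i, i * 10);   /* sequential: no memmove */
          gap_buffer_insert(2, 15);               /* one short memmove */
          assert(buf[2] == 15 && nr == 6);
          return 0;
  }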

@@ -197,7 +197,9 @@ out:
        return ck;
}

static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path,
static int btree_key_cache_create(struct btree_trans *trans,
                                  struct btree_path *path,
                                  struct btree_path *ck_path,
                                  struct bkey_s_c k)
{
        struct bch_fs *c = trans->c;
@@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
        key_u64s = min(256U, (key_u64s * 3) / 2);
        key_u64s = roundup_pow_of_two(key_u64s);

        struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s);
        struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s);
        int ret = PTR_ERR_OR_ZERO(ck);
        if (ret)
                return ret;
@@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
                ck = bkey_cached_reuse(bc);
                if (unlikely(!ck)) {
                        bch_err(c, "error allocating memory for key cache item, btree %s",
                                bch2_btree_id_str(path->btree_id));
                                bch2_btree_id_str(ck_path->btree_id));
                        return -BCH_ERR_ENOMEM_btree_key_cache_create;
                }
        }

        ck->c.level             = 0;
        ck->c.btree_id          = path->btree_id;
        ck->key.btree_id        = path->btree_id;
        ck->key.pos             = path->pos;
        ck->c.btree_id          = ck_path->btree_id;
        ck->key.btree_id        = ck_path->btree_id;
        ck->key.pos             = ck_path->pos;
        ck->flags               = 1U << BKEY_CACHED_ACCESSED;

        if (unlikely(key_u64s > ck->u64s)) {
                mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
                mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);

                struct bkey_i *new_k = allocate_dropping_locks(trans, ret,
                                kmalloc(key_u64s * sizeof(u64), _gfp));
@@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *

        bkey_reassemble(ck->k, k);

        ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c);
        if (unlikely(ret))
                goto err;

        ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params);

        bch2_btree_node_unlock_write(trans, path, path_l(path)->b);

        if (unlikely(ret)) /* raced with another fill? */
                goto err;

        atomic_long_inc(&bc->nr_keys);
        six_unlock_write(&ck->c.lock);

        enum six_lock_type lock_want = __btree_lock_want(path, 0);
        enum six_lock_type lock_want = __btree_lock_want(ck_path, 0);
        if (lock_want == SIX_LOCK_read)
                six_lock_downgrade(&ck->c.lock);
        btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want);
        path->uptodate = BTREE_ITER_UPTODATE;
        btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want);
        ck_path->uptodate = BTREE_ITER_UPTODATE;
        return 0;
err:
        bkey_cached_free(bc, ck);
        mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
        mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);

        return ret;
}
@@ -282,10 +291,8 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
                                         struct btree_path *ck_path,
                                         unsigned flags)
{
        if (flags & BTREE_ITER_cached_nofill) {
                ck_path->uptodate = BTREE_ITER_UPTODATE;
        if (flags & BTREE_ITER_cached_nofill)
                return 0;
        }

        struct bch_fs *c = trans->c;
        struct btree_iter iter;
@@ -293,6 +300,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
        int ret;

        bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos,
                             BTREE_ITER_intent|
                             BTREE_ITER_key_cache_fill|
                             BTREE_ITER_cached_nofill);
        iter.flags &= ~BTREE_ITER_with_journal;
@@ -306,9 +314,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
        if (unlikely(ret))
                goto out;

        ret = btree_key_cache_create(trans, ck_path, k);
        ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k);
        if (ret)
                goto err;

        if (trace_key_cache_fill_enabled()) {
                struct printbuf buf = PRINTBUF;

                bch2_bpos_to_text(&buf, ck_path->pos);
                prt_char(&buf, ' ');
                bch2_bkey_val_to_text(&buf, trans->c, k);
                trace_key_cache_fill(trans, buf.buf);
                printbuf_exit(&buf);
        }
out:
        /* We're not likely to need this iterator again: */
        bch2_set_btree_iter_dontneed(&iter);
@@ -424,8 +442,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
            !test_bit(JOURNAL_space_low, &c->journal.flags))
                commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

        ret = bch2_btree_iter_traverse(&b_iter) ?:
                bch2_trans_update(trans, &b_iter, ck->k,
        struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
        ret = bkey_err(btree_k);
        if (ret)
                goto err;

        /* * Check that we're not violating cache coherency rules: */
        BUG_ON(bkey_deleted(btree_k.k));

        ret = bch2_trans_update(trans, &b_iter, ck->k,
                                BTREE_UPDATE_key_cache_reclaim|
                                BTREE_UPDATE_internal_snapshot_node|
                                BTREE_TRIGGER_norun) ?:
@@ -433,7 +458,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                BCH_TRANS_COMMIT_no_check_rw|
                                BCH_TRANS_COMMIT_no_enospc|
                                commit_flags);
err:
        bch2_fs_fatal_err_on(ret &&
                             !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
                             !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
@@ -586,8 +611,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
        bkey_cached_free(bc, ck);

        mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
        btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
        path->should_be_locked = false;

        struct btree_path *path2;
        unsigned i;
        trans_for_each_path(trans, path2, i)
                if (path2->l[0].b == (void *) ck) {
                        __bch2_btree_path_unlock(trans, path2);
                        path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
                        path2->should_be_locked = false;
                        btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
                }

        bch2_trans_verify_locks(trans);
}

static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,

@@ -782,7 +782,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
                        return bch2_trans_relock_fail(trans, path, &f, trace);
        }

        trans_set_locked(trans);
        trans_set_locked(trans, true);
out:
        bch2_trans_verify_locks(trans);
        return 0;
@@ -818,6 +818,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
        bch2_trans_srcu_unlock(trans);
}

void bch2_trans_unlock_write(struct btree_trans *trans)
{
        struct btree_path *path;
        unsigned i;

        trans_for_each_path(trans, path, i)
                for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++)
                        if (btree_node_write_locked(path, l))
                                bch2_btree_node_unlock_write(trans, path, path->l[l].b);
}

int __bch2_trans_mutex_lock(struct btree_trans *trans,
                            struct mutex *lock)
{
@@ -856,6 +867,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
                       (want == BTREE_NODE_UNLOCKED ||
                        have != BTREE_NODE_WRITE_LOCKED) &&
                       want != have);

                BUG_ON(btree_node_locked(path, l) &&
                       path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
        }
}

@@ -16,6 +16,7 @@
void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);

void bch2_trans_unlock_noassert(struct btree_trans *);
void bch2_trans_unlock_write(struct btree_trans *);

static inline bool is_btree_node(struct btree_path *path, unsigned l)
{
@@ -75,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path,
        path->nodes_locked |= (type + 1) << (level << 1);
}

static inline void mark_btree_node_unlocked(struct btree_path *path,
                                            unsigned level)
{
        EBUG_ON(btree_node_write_locked(path, level));
        mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
}

static inline void mark_btree_node_locked(struct btree_trans *trans,
                                          struct btree_path *path,
                                          unsigned level,
@@ -124,19 +118,25 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,

/* unlock: */

void bch2_btree_node_unlock_write(struct btree_trans *,
                        struct btree_path *, struct btree *);

static inline void btree_node_unlock(struct btree_trans *trans,
                                     struct btree_path *path, unsigned level)
{
        int lock_type = btree_node_locked_type(path, level);

        EBUG_ON(level >= BTREE_MAX_DEPTH);
        EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED);

        if (lock_type != BTREE_NODE_UNLOCKED) {
                if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) {
                        bch2_btree_node_unlock_write(trans, path, path->l[level].b);
                        lock_type = BTREE_NODE_INTENT_LOCKED;
                }
                six_unlock_type(&path->l[level].b->c.lock, lock_type);
                btree_trans_lock_hold_time_update(trans, path, level);
                mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED);
        }
        mark_btree_node_unlocked(path, level);
}

static inline int btree_path_lowest_level_locked(struct btree_path *path)
@@ -162,36 +162,40 @@ static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
 * Updates the saved lock sequence number, so that bch2_btree_node_relock() will
 * succeed:
 */
static inline void
__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b)
{
        if (!b->c.lock.write_lock_recurse) {
                struct btree_path *linked;
                unsigned i;

                trans_for_each_path_with_node(trans, b, linked, i)
                        linked->l[b->c.level].lock_seq++;
        }

        six_unlock_write(&b->c.lock);
}

static inline void
bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
                                     struct btree *b)
{
        struct btree_path *linked;
        unsigned i;

        EBUG_ON(path->l[b->c.level].b != b);
        EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
        EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write);

        mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);

        trans_for_each_path_with_node(trans, b, linked, i)
                linked->l[b->c.level].lock_seq++;

        six_unlock_write(&b->c.lock);
        __bch2_btree_node_unlock_write(trans, b);
}

void bch2_btree_node_unlock_write(struct btree_trans *,
                        struct btree_path *, struct btree *);

int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);

/* lock: */

static inline void trans_set_locked(struct btree_trans *trans)
static inline void trans_set_locked(struct btree_trans *trans, bool try)
{
        if (!trans->locked) {
                lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_);
                lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_);
                trans->locked = true;
                trans->last_unlock_ip = 0;

@@ -282,7 +286,7 @@ static inline int btree_node_lock(struct btree_trans *trans,
        int ret = 0;

        EBUG_ON(level >= BTREE_MAX_DEPTH);
        bch2_trans_verify_not_unlocked(trans);
        bch2_trans_verify_not_unlocked_or_in_restart(trans);

        if (likely(six_trylock_type(&b->lock, type)) ||
            btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||

@@ -12,6 +12,7 @@
#include "recovery_passes.h"

#include <linux/kthread.h>
#include <linux/min_heap.h>
#include <linux/sort.h>

struct find_btree_nodes_worker {
@@ -22,17 +23,15 @@ struct find_btree_nodes_worker {

static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
{
        prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ",
                   bch2_btree_id_str(n->btree_id), n->level, n->seq,
                   n->journal_seq, n->cookie);
        bch2_btree_id_level_to_text(out, n->btree_id, n->level);
        prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ",
                   n->seq, n->journal_seq, n->cookie);
        bch2_bpos_to_text(out, n->min_key);
        prt_str(out, "-");
        bch2_bpos_to_text(out, n->max_key);

        if (n->range_updated)
                prt_str(out, " range updated");
        if (n->overwritten)
                prt_str(out, " overwritten");

        for (unsigned i = 0; i < n->nr_ptrs; i++) {
                prt_char(out, ' ');
@@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r)
                -found_btree_node_cmp_time(l, r);
}

static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg)
{
        return found_btree_node_cmp_pos(l, r) < 0;
}

static inline void found_btree_node_swap(void *_l, void *_r, void *arg)
{
        struct found_btree_node *l = _l;
        struct found_btree_node *r = _r;

        swap(*l, *r);
}

static const struct min_heap_callbacks found_btree_node_heap_cbs = {
        .less = found_btree_node_cmp_pos_less,
        .swp = found_btree_node_swap,
};

static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
                                struct bio *bio, struct btree_node *bn, u64 offset)
{
@@ -159,6 +176,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
                return;

        if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
                if (!c->chacha20)
                        return;

                struct nonce nonce = btree_nonce(&bn->keys, 0);
                unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;

@@ -292,55 +312,48 @@ err:
        return f->ret ?: ret;
}

static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
static bool nodes_overlap(const struct found_btree_node *l,
                          const struct found_btree_node *r)
{
        while (n + 1 < end &&
               found_btree_node_cmp_pos(n, n + 1) > 0) {
                swap(n[0], n[1]);
                n++;
        }
        return (l->btree_id == r->btree_id &&
                l->level == r->level &&
                bpos_gt(l->max_key, r->min_key));
}

static int handle_overwrites(struct bch_fs *c,
                             struct found_btree_node *start,
                             struct found_btree_node *end)
                             struct found_btree_node *l,
                             found_btree_nodes *nodes_heap)
{
        struct found_btree_node *n;
again:
        for (n = start + 1;
             n < end &&
             n->btree_id == start->btree_id &&
             n->level == start->level &&
             bpos_lt(n->min_key, start->max_key);
             n++) {
                int cmp = found_btree_node_cmp_time(start, n);
        struct found_btree_node *r;

        while ((r = min_heap_peek(nodes_heap)) &&
               nodes_overlap(l, r)) {
                int cmp = found_btree_node_cmp_time(l, r);

                if (cmp > 0) {
                        if (bpos_cmp(start->max_key, n->max_key) >= 0)
                                n->overwritten = true;
                        if (bpos_cmp(l->max_key, r->max_key) >= 0)
                                min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
                        else {
                                n->range_updated = true;
                                n->min_key = bpos_successor(start->max_key);
                                n->range_updated = true;
                                bubble_up(n, end);
                                goto again;
                                r->range_updated = true;
                                r->min_key = bpos_successor(l->max_key);
                                r->range_updated = true;
                                min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
                        }
                } else if (cmp < 0) {
                        BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
                        BUG_ON(bpos_eq(l->min_key, r->min_key));

                        start->max_key = bpos_predecessor(n->min_key);
                        start->range_updated = true;
                } else if (n->level) {
                        n->overwritten = true;
                        l->max_key = bpos_predecessor(r->min_key);
                        l->range_updated = true;
                } else if (r->level) {
                        min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
                } else {
                        if (bpos_cmp(start->max_key, n->max_key) >= 0)
                                n->overwritten = true;
                        if (bpos_cmp(l->max_key, r->max_key) >= 0)
                                min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL);
                        else {
                                n->range_updated = true;
                                n->min_key = bpos_successor(start->max_key);
                                n->range_updated = true;
                                bubble_up(n, end);
                                goto again;
                                r->range_updated = true;
                                r->min_key = bpos_successor(l->max_key);
                                r->range_updated = true;
                                min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
                        }
                }
        }
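The rewrite above replaces the bubble_up() in-array scan with a min-heap ordered by found_btree_node_cmp_pos(): each winner popped off the heap is compared against the heap top for as long as they overlap, and the losing node is either dropped (min_heap_pop()) or has its min_key clipped and is sifted back into place (min_heap_sift_down()). A loose standalone analogue of that sweep on integer intervals, with a linear extract-min standing in for the real heap; all names are invented::

  #include <assert.h>
  #include <stddef.h>

  struct interval { int start, end, time; }; /* [start, end), newer = larger time */

  /* Toy extract-min standing in for a real binary heap: */
  static int extract_min(struct interval *heap, size_t *nr, struct interval *out)
  {
          if (!*nr)
                  return 0;
          size_t min = 0;
          for (size_t i = 1; i < *nr; i++)
                  if (heap[i].start < heap[min].start)
                          min = i;
          *out = heap[min];
          heap[min] = heap[--*nr];
          return 1;
  }

  /* Resolve overlaps: newer intervals win, older ones get clipped. */
  static size_t resolve(struct interval *heap, size_t nr, struct interval *out)
  {
          size_t out_nr = 0;
          struct interval l;

          while (extract_min(heap, &nr, &l)) {
                  for (size_t i = 0; i < nr; ) {
                          if (heap[i].start >= l.end) {
                                  i++;
                          } else if (heap[i].time < l.time) {
                                  if (heap[i].end <= l.end)
                                          heap[i] = heap[--nr];    /* covered: drop */
                                  else
                                          heap[i++].start = l.end; /* clip the loser */
                          } else {
                                  l.end = heap[i].start;           /* l is older: clip l */
                                  i++;
                          }
                  }
                  out[out_nr++] = l;
          }
          return out_nr;
  }

  int main(void)
  {
          struct interval heap[] = {
                  { 0, 10, 1 },     /* older */
                  { 5, 15, 2 },     /* newer, overlapping: keeps [5, 15) */
          };
          struct interval out[2];
          size_t n = resolve(heap, 2, out);

          assert(n == 2 && out[0].end == 5 && out[1].start == 5);
          return 0;
  }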

@@ -352,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
{
        struct find_btree_nodes *f = &c->found_btree_nodes;
        struct printbuf buf = PRINTBUF;
        found_btree_nodes nodes_heap = {};
        size_t dst;
        int ret = 0;

@@ -406,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
                bch2_print_string_as_lines(KERN_INFO, buf.buf);
        }

        dst = 0;
        darray_for_each(f->nodes, i) {
                if (i->overwritten)
                        continue;
        swap(nodes_heap, f->nodes);

                ret = handle_overwrites(c, i, &darray_top(f->nodes));
        {
                /* darray must have same layout as a heap */
                min_heap_char real_heap;
                BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr));
                BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size));
                BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr));
                BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size));
        }

        min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL);

        if (nodes_heap.nr) {
                ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
                if (ret)
                        goto err;

                BUG_ON(i->overwritten);
                f->nodes.data[dst++] = *i;
                min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
        }
        f->nodes.nr = dst;

        if (c->opts.verbose) {
        while (true) {
                ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap);
                if (ret)
                        goto err;

                if (!nodes_heap.nr)
                        break;

                ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap));
                if (ret)
                        goto err;

                min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL);
        }

        for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++)
                BUG_ON(nodes_overlap(n, n + 1));

        if (0 && c->opts.verbose) {
                printbuf_reset(&buf);
                prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
                found_btree_nodes_to_text(&buf, c, f->nodes);
                bch2_print_string_as_lines(KERN_INFO, buf.buf);
        } else {
                bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
        }

        eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
err:
        darray_exit(&nodes_heap);
        printbuf_exit(&buf);
        return ret;
}
@@ -499,7 +541,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
        if (c->opts.verbose) {
                struct printbuf buf = PRINTBUF;

                prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
                prt_str(&buf, "recovery ");
                bch2_btree_id_level_to_text(&buf, btree, level);
                prt_str(&buf, " ");
                bch2_bpos_to_text(&buf, node_min);
                prt_str(&buf, " - ");
                bch2_bpos_to_text(&buf, node_max);
@@ -533,7 +577,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
                bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
                printbuf_exit(&buf);

                BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0));
                BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
                                          (struct bkey_validate_context) {
                                                .from  = BKEY_VALIDATE_btree_node,
                                                .level = level + 1,
                                                .btree = btree,
                                          }));

                ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
                if (ret)

@@ -6,7 +6,6 @@

struct found_btree_node {
        bool range_updated:1;
        bool overwritten:1;
        u8 btree_id;
        u8 level;
        unsigned sectors_written;

@@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans)
        return 0;
}

static inline void bch2_trans_unlock_write(struct btree_trans *trans)
static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans)
{
        if (likely(trans->write_locked)) {
                trans_for_each_update(trans, i)
@@ -249,7 +249,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
                new |= 1 << BTREE_NODE_need_write;
        } while (!try_cmpxchg(&b->flags, &old, new));

        btree_node_write_if_need(c, b, SIX_LOCK_read);
        btree_node_write_if_need(trans, b, SIX_LOCK_read);
        six_unlock_read(&b->c.lock);

        bch2_trans_put(trans);
@@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
        struct bkey_i *new_k;
        int ret;

        bch2_trans_unlock_write(trans);
        bch2_trans_unlock_updates_write(trans);
        bch2_trans_unlock(trans);

        new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
@@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans,
                               old, flags);
}

static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
                                 bool overwrite)
static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i)
{
        verify_update_old_key(trans, i);

@@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
                return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
                                        BTREE_TRIGGER_insert|
                                        BTREE_TRIGGER_overwrite|flags) ?: 1;
        } else if (overwrite && !i->overwrite_trigger_run) {
        } else if (!i->overwrite_trigger_run) {
                i->overwrite_trigger_run = true;
                return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
        } else if (!overwrite && !i->insert_trigger_run) {
        } else if (!i->insert_trigger_run) {
                i->insert_trigger_run = true;
                return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
        } else {
@@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}

static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
                              unsigned btree_id_start)
                              unsigned *btree_id_updates_start)
{
        for (int overwrite = 1; overwrite >= 0; --overwrite) {
                bool trans_trigger_run;
        bool trans_trigger_run;

                /*
                 * Running triggers will append more updates to the list of updates as
                 * we're walking it:
                 */
                do {
                        trans_trigger_run = false;
        /*
         * Running triggers will append more updates to the list of updates as
         * we're walking it:
         */
        do {
                trans_trigger_run = false;

                        for (unsigned i = btree_id_start;
                             i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
                             i++) {
                                if (trans->updates[i].btree_id != btree_id)
                                        continue;

                                int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
                                if (ret < 0)
                                        return ret;
                                if (ret)
                                        trans_trigger_run = true;
                for (unsigned i = *btree_id_updates_start;
                     i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
                     i++) {
                        if (trans->updates[i].btree_id < btree_id) {
                                *btree_id_updates_start = i;
                                continue;
                        }
                } while (trans_trigger_run);
        }

                        int ret = run_one_trans_trigger(trans, trans->updates + i);
                        if (ret < 0)
                                return ret;
                        if (ret)
                                trans_trigger_run = true;
                }
        } while (trans_trigger_run);

        trans_for_each_update(trans, i)
                BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
                       i->btree_id == btree_id &&
                       btree_node_type_has_trans_triggers(i->bkey_type) &&
                       (!i->insert_trigger_run || !i->overwrite_trigger_run));

        return 0;
}

static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
        unsigned btree_id = 0, btree_id_start = 0;
        unsigned btree_id = 0, btree_id_updates_start = 0;
        int ret = 0;

        /*
@@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
                if (btree_id == BTREE_ID_alloc)
                        continue;

                while (btree_id_start < trans->nr_updates &&
                       trans->updates[btree_id_start].btree_id < btree_id)
                        btree_id_start++;

                ret = run_btree_triggers(trans, btree_id, btree_id_start);
                ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
                if (ret)
                        return ret;
        }

        for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
                struct btree_insert_entry *i = trans->updates + idx;

                if (i->btree_id > BTREE_ID_alloc)
                        break;
                if (i->btree_id == BTREE_ID_alloc) {
                        ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
                        if (ret)
                                return ret;
                        break;
                }
        }
        btree_id_updates_start = 0;
        ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
        if (ret)
                return ret;

#ifdef CONFIG_BCACHEFS_DEBUG
        trans_for_each_update(trans, i)
@@ -609,14 +602,6 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
        return 0;
}

static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
{
        return (struct bversion) {
                .hi = res->seq >> 32,
                .lo = (res->seq << 32) | (res->offset + offset),
        };
}
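journal_pos_to_bversion(), deleted here in favor of bch2_accounting_trans_commit_hook(), packed a 64-bit journal sequence number and a journal offset into one version: the high 32 bits of seq in .hi, the low 32 bits of seq concatenated with the offset in .lo. A standalone round-trip of that packing, simplified to a single offset field; all names are invented::

  #include <assert.h>
  #include <stdint.h>

  struct version { uint32_t hi; uint64_t lo; };   /* 96 bits total */

  static struct version pack(uint64_t seq, uint32_t offset)
  {
          return (struct version) {
                  .hi = seq >> 32,
                  .lo = (seq << 32) | offset,
          };
  }

  int main(void)
  {
          struct version v = pack(0x123456789abcdef0ULL, 42);

          /* recover both fields from the packed form */
          uint64_t seq = ((uint64_t) v.hi << 32) | (v.lo >> 32);
          uint32_t off = (uint32_t) v.lo;

          assert(seq == 0x123456789abcdef0ULL && off == 42);
          return 0;
  }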
|
||||
|
||||
static inline int
|
||||
bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
||||
struct btree_insert_entry **stopped_at,
|
||||
@ -627,12 +612,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
||||
unsigned u64s = 0;
|
||||
int ret = 0;
|
||||
|
||||
bch2_trans_verify_not_unlocked(trans);
|
||||
bch2_trans_verify_not_in_restart(trans);
|
||||
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
||||
|
||||
if (race_fault()) {
|
||||
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
|
||||
return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
|
||||
return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -701,25 +685,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
||||
struct jset_entry *entry = trans->journal_entries;
|
||||
|
||||
percpu_down_read(&c->mark_lock);
|
||||
|
||||
for (entry = trans->journal_entries;
|
||||
entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
||||
entry = vstruct_next(entry))
|
||||
if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
|
||||
entry->start->k.type == KEY_TYPE_accounting) {
|
||||
BUG_ON(!trans->journal_res.ref);
|
||||
|
||||
struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
|
||||
|
||||
a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
|
||||
(u64 *) entry - (u64 *) trans->journal_entries);
|
||||
BUG_ON(bversion_zero(a->k.bversion));
|
||||
|
||||
if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
|
||||
ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal);
|
||||
if (ret)
|
||||
goto revert_fs_usage;
|
||||
}
|
||||
ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
|
||||
if (ret)
|
||||
goto revert_fs_usage;
|
||||
}
|
||||
percpu_up_read(&c->mark_lock);
|
||||
|
||||
@ -739,14 +712,29 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
||||
goto fatal_err;
|
||||
}
|
||||
|
||||
struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit };
|
||||
|
||||
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
|
||||
validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
|
||||
|
||||
for (struct jset_entry *i = trans->journal_entries;
|
||||
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
||||
i = vstruct_next(i)) {
|
||||
ret = bch2_journal_entry_validate(c, NULL, i,
|
||||
bcachefs_metadata_version_current,
|
||||
CPU_BIG_ENDIAN, validate_context);
|
||||
if (unlikely(ret)) {
|
||||
bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
|
||||
trans->fn);
|
||||
goto fatal_err;
|
||||
}
|
||||
}
|
||||
|
||||
trans_for_each_update(trans, i) {
|
||||
enum bch_validate_flags invalid_flags = 0;
|
||||
validate_context.level = i->level;
|
||||
validate_context.btree = i->btree_id;
|
||||
|
||||
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
|
||||
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
|
||||
|
||||
ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
|
||||
i->bkey_type, invalid_flags);
|
||||
ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context);
|
||||
if (unlikely(ret)){
|
||||
bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
|
||||
trans->fn, (void *) i->ip_allocated);
|
||||
@ -755,24 +743,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
|
||||
btree_insert_entry_checks(trans, i);
|
||||
}
|
||||
|
||||
for (struct jset_entry *i = trans->journal_entries;
|
||||
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
|
||||
i = vstruct_next(i)) {
|
||||
enum bch_validate_flags invalid_flags = 0;
|
||||
|
||||
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
|
||||
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
|
||||
|
||||
ret = bch2_journal_entry_validate(c, NULL, i,
|
||||
bcachefs_metadata_version_current,
|
||||
CPU_BIG_ENDIAN, invalid_flags);
|
||||
if (unlikely(ret)) {
|
||||
bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
|
||||
trans->fn);
|
||||
goto fatal_err;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
|
||||
struct journal *j = &c->journal;
|
||||
struct jset_entry *entry;
|
||||
@ -833,13 +803,9 @@ revert_fs_usage:
|
||||
entry2 != entry;
|
||||
entry2 = vstruct_next(entry2))
|
||||
if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
|
||||
entry2->start->k.type == KEY_TYPE_accounting) {
|
||||
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
|
||||
|
||||
bch2_accounting_neg(a);
|
||||
bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
|
||||
bch2_accounting_neg(a);
|
||||
}
|
||||
entry2->start->k.type == KEY_TYPE_accounting)
|
||||
bch2_accounting_trans_commit_revert(trans,
|
||||
bkey_i_to_accounting(entry2->start), flags);
|
||||
percpu_up_read(&c->mark_lock);
|
||||
return ret;
|
||||
}
|
||||
@ -902,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
|
||||
if (!ret && unlikely(trans->journal_replay_not_finished))
|
||||
bch2_drop_overwrites_from_journal(trans);
|
||||
|
||||
bch2_trans_unlock_write(trans);
|
||||
bch2_trans_unlock_updates_write(trans);
|
||||
|
||||
if (!ret && trans->journal_pin)
|
||||
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
|
||||
@ -994,24 +960,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static noinline int
|
||||
bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
int ret;
|
||||
|
||||
if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
|
||||
test_bit(BCH_FS_started, &c->flags))
|
||||
return -BCH_ERR_erofs_trans_commit;
|
||||
|
||||
ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
bch2_write_ref_get(c, BCH_WRITE_REF_trans);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is for updates done in the early part of fsck - btree_gc - before we've
|
||||
* gone RW. we only add the new key to the list of keys for journal replay to
|
||||
@ -1022,6 +970,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
|
||||
{
|
||||
struct bch_fs *c = trans->c;
|
||||
|
||||
BUG_ON(current != c->recovery_task);
|
||||
|
||||
trans_for_each_update(trans, i) {
|
||||
int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
|
||||
if (ret)
|
||||
@ -1047,8 +997,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
|
||||
struct bch_fs *c = trans->c;
|
||||
int ret = 0;
|
||||
|
||||
bch2_trans_verify_not_unlocked(trans);
|
||||
bch2_trans_verify_not_in_restart(trans);
|
||||
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
||||
|
||||
if (!trans->nr_updates &&
|
||||
!trans->journal_entries_u64s)
|
||||
@ -1058,16 +1007,13 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
|
||||
if (ret)
|
||||
goto out_reset;
|
||||
|
||||
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
|
||||
ret = do_bch2_trans_commit_to_journal_replay(trans);
|
||||
goto out_reset;
|
||||
}
|
||||
|
||||
if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
|
||||
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
|
||||
ret = bch2_trans_commit_get_rw_cold(trans, flags);
|
||||
if (ret)
|
||||
goto out_reset;
|
||||
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
|
||||
ret = do_bch2_trans_commit_to_journal_replay(trans);
|
||||
else
|
||||
ret = -BCH_ERR_erofs_trans_commit;
|
||||
goto out_reset;
|
||||
}
|
||||
|
||||
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
|
||||
@ -1112,8 +1058,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
|
||||
}
|
||||
retry:
|
||||
errored_at = NULL;
|
||||
bch2_trans_verify_not_unlocked(trans);
|
||||
bch2_trans_verify_not_in_restart(trans);
|
||||
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
||||
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
|
||||
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
|
||||
memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
|
||||
|
@ -513,6 +513,9 @@ struct btree_trans {
|
||||
u64 last_begin_time;
|
||||
unsigned long last_begin_ip;
|
||||
unsigned long last_restarted_ip;
|
||||
#ifdef CONFIG_BCACHEFS_DEBUG
|
||||
bch_stacktrace last_restarted_trace;
|
||||
#endif
|
||||
unsigned long last_unlock_ip;
|
||||
unsigned long srcu_lock_time;
|
||||
|
||||
@@ -787,53 +790,64 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type)
	return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
}

static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
	const u64 mask = 0
#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
	BCH_BTREE_IDS()
#undef x
	;

	return BIT_ULL(type) & mask;
}

static inline bool btree_id_is_extents(enum btree_id btree)
{
	return btree_node_type_is_extents(__btree_node_type(0, btree));
}

static inline bool btree_type_has_snapshots(enum btree_id id)
{
	const u64 mask = 0
#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr)
#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_IS_extents)) << nr)
	BCH_BTREE_IDS()
#undef x
	;

	return BIT_ULL(id) & mask;
	return BIT_ULL(btree) & mask;
}

static inline bool btree_type_has_snapshot_field(enum btree_id id)
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
	return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
}

static inline bool btree_type_has_snapshots(enum btree_id btree)
{
	const u64 mask = 0
#define x(name, nr, flags, ...)	|((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_IS_snapshots)) << nr)
	BCH_BTREE_IDS()
#undef x
	;

	return BIT_ULL(id) & mask;
	return BIT_ULL(btree) & mask;
}

static inline bool btree_type_has_ptrs(enum btree_id id)
static inline bool btree_type_has_snapshot_field(enum btree_id btree)
{
	const u64 mask = 0
#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_ID_DATA)) << nr)
#define x(name, nr, flags, ...)	|((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr)
	BCH_BTREE_IDS()
#undef x
	;

	return BIT_ULL(id) & mask;
	return BIT_ULL(btree) & mask;
}

static inline bool btree_type_has_ptrs(enum btree_id btree)
{
	const u64 mask = 0
#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_IS_data)) << nr)
	BCH_BTREE_IDS()
#undef x
	;

	return BIT_ULL(btree) & mask;
}

static inline bool btree_type_uses_write_buffer(enum btree_id btree)
{
	const u64 mask = 0
#define x(name, nr, flags, ...)	|((!!((flags) & BTREE_IS_write_buffer)) << nr)
	BCH_BTREE_IDS()
#undef x
	;

	return BIT_ULL(btree) & mask;
}
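(All of these helpers build a compile-time bitmask over BCH_BTREE_IDS() with a throwaway x() macro, then test a single bit at runtime. A minimal, self-contained sketch of that x-macro pattern follows; the id list and flag values here are invented for illustration and are not bcachefs's real ones::

	#include <stdint.h>
	#include <stdio.h>

	#define FLAG_EXTENTS	(1 << 0)
	#define FLAG_SNAPSHOTS	(1 << 1)

	/* x(name, nr, flags) -- hypothetical id list, two entries for the demo */
	#define DEMO_IDS()			\
		x(extents,   0, FLAG_EXTENTS)	\
		x(snapshots, 1, FLAG_SNAPSHOTS)

	static inline _Bool id_is_extents(unsigned id)
	{
		/* expands to 0 | (1 << 0) | (0 << 1): a constant mask */
		const uint64_t mask = 0
	#define x(name, nr, flags) | ((uint64_t) !!((flags) & FLAG_EXTENTS) << (nr))
		DEMO_IDS()
	#undef x
		;
		return (UINT64_C(1) << id) & mask;
	}

	int main(void)
	{
		printf("%d %d\n", id_is_extents(0), id_is_extents(1)); /* 1 0 */
		return 0;
	}

The design point is that the mask is a compile-time constant, so each predicate compiles down to a single AND.)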

struct btree_root {

@@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
	       !(ret = bkey_err(old_k)) &&
	       bkey_eq(old_pos, old_k.k->p)) {
		struct bpos whiteout_pos =
			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);;
			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);

		if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
		    snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
@@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
			     BTREE_ITER_intent|
			     BTREE_ITER_with_updates|
			     BTREE_ITER_not_extents);
	k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
	k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
	if ((ret = bkey_err(k)))
		goto err;
	if (!k.k)
@@ -323,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
		goto out;
next:
	bch2_btree_iter_advance(&iter);
	k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
	k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
	if ((ret = bkey_err(k)))
		goto err;
	if (!k.k)
@@ -588,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
			     enum btree_id btree, struct bpos end)
{
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent);
	k = bch2_btree_iter_prev(iter);
	ret = bkey_err(k);
	bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
	struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
	int ret = bkey_err(k);
	if (ret)
		goto err;

@@ -672,25 +669,17 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
			     bch2_btree_insert_trans(trans, id, k, iter_flags));
}

int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
				unsigned len, unsigned update_flags)
{
	struct bkey_i *k;

	k = bch2_trans_kmalloc(trans, sizeof(*k));
	if (IS_ERR(k))
		return PTR_ERR(k);

	bkey_init(&k->k);
	k->k.p = iter->pos;
	bch2_key_resize(&k->k, len);
	return bch2_trans_update(trans, iter, k, update_flags);
}

int bch2_btree_delete_at(struct btree_trans *trans,
			 struct btree_iter *iter, unsigned update_flags)
{
	return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
	int ret = PTR_ERR_OR_ZERO(k);
	if (ret)
		return ret;

	bkey_init(&k->k);
	k->k.p = iter->pos;
	return bch2_trans_update(trans, iter, k, update_flags);
}
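(Several of the rewritten helpers collapse the older `if (IS_ERR(k)) return PTR_ERR(k);` dance into a single `PTR_ERR_OR_ZERO()` check. A kernel-style sketch of that idiom, using a made-up allocator that reports failure as an ERR_PTR the way bch2_trans_kmalloc() does::

	#include <linux/err.h>
	#include <linux/slab.h>

	/* demo_alloc() is hypothetical, not a bcachefs function */
	static void *demo_alloc(size_t size)
	{
		void *p = kmalloc(size, GFP_KERNEL);

		return p ?: ERR_PTR(-ENOMEM);
	}

	static int demo_use(void)
	{
		void *k = demo_alloc(64);
		int ret = PTR_ERR_OR_ZERO(k);	/* 0 on success, -ENOMEM on failure */

		if (ret)
			return ret;

		/* ... use k ... */
		kfree(k);
		return 0;
	}

The payoff is one error variable and one early return instead of a separate IS_ERR()/PTR_ERR() pair at every call site.)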

int bch2_btree_delete(struct btree_trans *trans,
@@ -721,7 +710,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
	while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
	while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
		struct disk_reservation disk_res =
			bch2_disk_reservation_init(trans->c, 0);
		struct bkey_i delete;
@@ -794,8 +783,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
	return ret;
}

int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
		       struct bpos pos, bool set)
int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set)
{
	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
	int ret = PTR_ERR_OR_ZERO(k);
@@ -804,13 +792,21 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,

	bkey_init(&k->k);
	k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
	k->k.p = pos;
	k->k.p = iter->pos;
	if (iter->flags & BTREE_ITER_is_extents)
		bch2_key_resize(&k->k, 1);

	return bch2_trans_update(trans, iter, k, 0);
}

int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
		       struct bpos pos, bool set)
{
	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);

	ret = bch2_btree_iter_traverse(&iter) ?:
		bch2_trans_update(trans, &iter, k, 0);
	int ret = bch2_btree_iter_traverse(&iter) ?:
		bch2_btree_bit_mod_iter(trans, &iter, set);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
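(The reworked bch2_btree_bit_mod() shows the usual transaction-iterator shape: init an iterator, traverse, queue an update, and always exit the iterator before returning, with the GNU `?:` operator chaining the error paths. A self-contained sketch of that lifecycle, with demo_* placeholders standing in for the real API::

	#include <stdbool.h>

	struct demo_trans { int unused; };
	struct demo_iter  { int ok; };
	struct demo_pos   { long inode, offset; };

	static void demo_iter_init(struct demo_trans *t, struct demo_iter *i, struct demo_pos p)
	{ (void) t; (void) p; i->ok = 1; }
	static int demo_iter_traverse(struct demo_iter *i) { return i->ok ? 0 : -5; }
	static int demo_apply(struct demo_trans *t, struct demo_iter *i, bool set)
	{ (void) t; (void) i; (void) set; return 0; }
	static void demo_iter_exit(struct demo_trans *t, struct demo_iter *i)
	{ (void) t; i->ok = 0; }

	static int demo_mod(struct demo_trans *trans, struct demo_pos pos, bool set)
	{
		struct demo_iter iter;
		demo_iter_init(trans, &iter, pos);

		/* a ?: b evaluates b only when a == 0, so errors short-circuit */
		int ret = demo_iter_traverse(&iter) ?:
			demo_apply(trans, &iter, set);

		demo_iter_exit(trans, &iter);	/* always runs, success or error */
		return ret;
	}

The iterator exit on every path is what keeps the transaction's path table from leaking.)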
@@ -827,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
	return bch2_trans_update_buffered(trans, btree, &k);
}

static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
{
	unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
	prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);

	int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
	if (ret)
		return ret;

	struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
	int ret = PTR_ERR_OR_ZERO(e);
	ret = PTR_ERR_OR_ZERO(e);
	if (ret)
		return ret;

@@ -865,9 +868,8 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
		memcpy(l->d, buf.buf, buf.pos);
		c->journal.early_journal_entries.nr += jset_u64s(u64s);
	} else {
		ret = bch2_trans_commit_do(c, NULL, NULL,
					   BCH_TRANS_COMMIT_lazy_rw|commit_flags,
					   __bch2_trans_log_msg(trans, &buf, u64s));
		ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
					   bch2_trans_log_msg(trans, &buf));
	}
err:
	printbuf_exit(&buf);

@@ -24,7 +24,6 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
#define BCH_TRANS_COMMIT_FLAGS()					\
	x(no_enospc,	"don't check for enospc")			\
	x(no_check_rw,	"don't attempt to take a ref on c->writes")	\
	x(lazy_rw,	"go read-write if we haven't yet - only for use in recovery")	\
	x(no_journal_res, "don't take a journal reservation, instead "	\
			"pin journal entry referred to by trans->journal_res.seq")	\
	x(journal_reclaim, "operation required for journal reclaim; may return error"	\
@@ -47,8 +46,6 @@ enum bch_trans_commit_flags {

void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags);

int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
				unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);

@@ -66,6 +63,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
			    struct bpos, struct bpos, unsigned, u64 *);

int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool);
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);

@@ -161,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *,
			    struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned);

int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);

@@ -244,7 +243,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra
			KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))

static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
						  struct bkey_s_c *k, unsigned flags,
						  struct bkey_s_c *k,
						  enum btree_iter_update_trigger_flags flags,
						  unsigned type, unsigned min_bytes)
{
	struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes);
@@ -261,8 +261,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str
	return mut;
}

static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter,
						struct bkey_s_c *k, unsigned flags)
static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans,
						struct btree_iter *iter, struct bkey_s_c *k,
						enum btree_iter_update_trigger_flags flags)
{
	return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0);
}
@@ -274,7 +275,8 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc
static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
							  struct btree_iter *iter,
							  unsigned btree_id, struct bpos pos,
							  unsigned flags, unsigned type, unsigned min_bytes)
							  enum btree_iter_update_trigger_flags flags,
							  unsigned type, unsigned min_bytes)
{
	struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
			btree_id, pos, flags|BTREE_ITER_intent, type);
@@ -289,7 +291,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr
static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans,
							struct btree_iter *iter,
							unsigned btree_id, struct bpos pos,
							unsigned flags)
							enum btree_iter_update_trigger_flags flags)
{
	return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0);
}
@@ -297,7 +299,8 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran
static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
						 struct btree_iter *iter,
						 unsigned btree_id, struct bpos pos,
						 unsigned flags, unsigned type, unsigned min_bytes)
						 enum btree_iter_update_trigger_flags flags,
						 unsigned type, unsigned min_bytes)
{
	struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter,
			btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes);
@@ -318,7 +321,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans,
static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans,
						       struct btree_iter *iter,
						       unsigned btree_id, struct bpos pos,
						       unsigned flags, unsigned min_bytes)
						       enum btree_iter_update_trigger_flags flags,
						       unsigned min_bytes)
{
	return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes);
}
@@ -326,7 +330,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans
static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
					       struct btree_iter *iter,
					       unsigned btree_id, struct bpos pos,
					       unsigned flags)
					       enum btree_iter_update_trigger_flags flags)
{
	return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0);
}
@@ -337,7 +341,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans,
			KEY_TYPE_##_type, sizeof(struct bkey_i_##_type)))

static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter,
					       unsigned flags, unsigned type, unsigned val_size)
					       enum btree_iter_update_trigger_flags flags,
					       unsigned type, unsigned val_size)
{
	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size);
	int ret;

@@ -58,11 +58,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
		 !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
			  b->data->min_key));

	bch2_bkey_buf_init(&prev);
	bkey_init(&prev.k->k);
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);

	if (b == btree_node_root(c, b)) {
		if (!bpos_eq(b->data->min_key, POS_MIN)) {
			printbuf_reset(&buf);
			bch2_bpos_to_text(&buf, b->data->min_key);
			need_fsck_err(trans, btree_root_bad_min_key,
			log_fsck_err(trans, btree_root_bad_min_key,
				     "btree root with incorrect min_key: %s", buf.buf);
			goto topology_repair;
		}
@@ -70,18 +74,14 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
		if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
			printbuf_reset(&buf);
			bch2_bpos_to_text(&buf, b->data->max_key);
			need_fsck_err(trans, btree_root_bad_max_key,
			log_fsck_err(trans, btree_root_bad_max_key,
				     "btree root with incorrect max_key: %s", buf.buf);
			goto topology_repair;
		}
	}

	if (!b->c.level)
		return 0;

	bch2_bkey_buf_init(&prev);
	bkey_init(&prev.k->k);
	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
		goto out;

	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
@@ -97,16 +97,16 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
			bch2_topology_error(c);

			printbuf_reset(&buf);
			prt_str(&buf, "end of prev node doesn't match start of next node\n"),
			prt_printf(&buf, " in btree %s level %u node ",
				   bch2_btree_id_str(b->c.btree_id), b->c.level);
			prt_str(&buf, "end of prev node doesn't match start of next node\n in ");
			bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
			prt_str(&buf, " node ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
			prt_str(&buf, "\n prev ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
			prt_str(&buf, "\n next ");
			bch2_bkey_val_to_text(&buf, c, k);

			need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
			log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
			goto topology_repair;
		}

@@ -118,25 +118,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
		bch2_topology_error(c);

		printbuf_reset(&buf);
		prt_str(&buf, "empty interior node\n");
		prt_printf(&buf, " in btree %s level %u node ",
			   bch2_btree_id_str(b->c.btree_id), b->c.level);
		prt_str(&buf, "empty interior node\n in ");
		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
		prt_str(&buf, " node ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));

		need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
		log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
		goto topology_repair;
	} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
		bch2_topology_error(c);

		printbuf_reset(&buf);
		prt_str(&buf, "last child node doesn't end at end of parent node\n");
		prt_printf(&buf, " in btree %s level %u node ",
			   bch2_btree_id_str(b->c.btree_id), b->c.level);
		prt_str(&buf, "last child node doesn't end at end of parent node\n in ");
		bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
		prt_str(&buf, " node ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		prt_str(&buf, "\n last key ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));

		need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
		log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
		goto topology_repair;
	}
out:
@@ -146,13 +146,7 @@ fsck_err:
	printbuf_exit(&buf);
	return ret;
topology_repair:
	if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
		bch2_inconsistent_error(c);
		ret = -BCH_ERR_btree_need_topology_repair;
	} else {
		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
	}
	ret = bch2_topology_error(c);
	goto out;
}

@@ -244,7 +238,6 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree *b)
{
	struct bch_fs *c = trans->c;
	unsigned i, level = b->c.level;

	bch2_btree_node_lock_write_nofail(trans, path, &b->c);

@@ -255,13 +248,9 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
	mutex_unlock(&c->btree_cache.lock);

	six_unlock_write(&b->c.lock);
	mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
	mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
	bch2_trans_node_drop(trans, b);
}

static void bch2_btree_node_free_never_used(struct btree_update *as,
@@ -270,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
	struct btree_path *path;
	unsigned i, level = b->c.level;

	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
@@ -293,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,

	six_unlock_intent(&b->c.lock);

	trans_for_each_path(trans, path, i)
		if (path->l[level].b == b) {
			btree_node_unlock(trans, path, level);
			path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
		}
	bch2_trans_node_drop(trans, b);
}

static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
@@ -809,7 +792,7 @@ err:
	mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
	six_unlock_write(&b->c.lock);

	btree_node_write_if_need(c, b, SIX_LOCK_intent);
	btree_node_write_if_need(trans, b, SIX_LOCK_intent);
	btree_node_unlock(trans, path, b->c.level);
	bch2_path_put(trans, path_idx, true);
}
@@ -830,7 +813,7 @@ err:
		b = as->new_nodes[i];

		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
		btree_node_write_if_need(c, b, SIX_LOCK_read);
		btree_node_write_if_need(trans, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
	}

@@ -1366,9 +1349,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
	if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags)))
		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);

	if (bch2_bkey_validate(c, bkey_i_to_s_c(insert),
			       btree_node_type(b), BCH_VALIDATE_write) ?:
	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) {
	struct bkey_validate_context from = (struct bkey_validate_context) {
		.from	= BKEY_VALIDATE_btree_node,
		.level	= b->c.level,
		.btree	= b->c.btree_id,
		.flags	= BCH_VALIDATE_commit,
	};
	if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?:
	    bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) {
		bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__);
		dump_stack();
	}
@@ -1418,15 +1406,26 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	while (!bch2_keylist_empty(keys)) {
		insert = bch2_keylist_front(keys);

		if (bpos_gt(insert->k.p, b->key.k.p))
			break;

	for (;
	     insert != keys->top && bpos_le(insert->k.p, b->key.k.p);
	     insert = bkey_next(insert))
		bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
		bch2_keylist_pop_front(keys);

	if (bch2_btree_node_check_topology(trans, b)) {
		struct printbuf buf = PRINTBUF;

		for (struct bkey_i *k = keys->keys;
		     k != insert;
		     k = bkey_next(k)) {
			bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
			prt_newline(&buf);
		}

		panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf);
	}

	memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
	keys->top_p -= insert->_data - keys->keys_p;
}

static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
@@ -1575,8 +1574,6 @@ static void btree_split_insert_keys(struct btree_update *as,
		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);

		bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);

		BUG_ON(bch2_btree_node_check_topology(trans, b));
	}
}

@@ -1599,8 +1596,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
	if (ret)
		return ret;

	bch2_btree_interior_update_will_free_node(as, b);

	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
		struct btree *n[2];

@@ -1699,16 +1694,18 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
	if (ret)
		goto err;

	bch2_btree_interior_update_will_free_node(as, b);

	if (n3) {
		bch2_btree_update_get_open_buckets(as, n3);
		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
		bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0);
	}
	if (n2) {
		bch2_btree_update_get_open_buckets(as, n2);
		bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);
		bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0);
	}
	bch2_btree_update_get_open_buckets(as, n1);
	bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
	bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0);

	/*
	 * The old node must be freed (in memory) _before_ unlocking the new
@@ -1827,8 +1824,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t

	btree_update_updated_node(as, b);
	bch2_btree_node_unlock_write(trans, path, b);

	BUG_ON(bch2_btree_node_check_topology(trans, b));
	return 0;
split:
	/*
@@ -1905,7 +1900,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans *
	BUG_ON(ret);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
	bch2_trans_node_add(trans, path, n);
	six_unlock_intent(&n->c.lock);

@@ -1953,8 +1948,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
	u64 start_time = local_clock();
	int ret = 0;

	bch2_trans_verify_not_in_restart(trans);
	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);
	BUG_ON(!trans->paths[path].should_be_locked);
	BUG_ON(!btree_node_locked(&trans->paths[path], level));

@@ -2058,9 +2052,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,

	trace_and_count(c, btree_node_merge, trans, b);

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	n = bch2_btree_node_alloc(as, trans, b->c.level);

	SET_BTREE_NODE_SEQ(n->data,
@@ -2096,10 +2087,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
	if (ret)
		goto err_free_update;

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	bch2_trans_verify_paths(trans);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, trans->paths + path, b);
	bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
@@ -2150,8 +2144,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
	if (ret)
		goto out;

	bch2_btree_interior_update_will_free_node(as, b);

	n = bch2_btree_node_alloc_replacement(as, trans, b);

	bch2_btree_build_aux_trees(n);
@@ -2175,8 +2167,10 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
	if (ret)
		goto err;

	bch2_btree_interior_update_will_free_node(as, b);

	bch2_btree_update_get_open_buckets(as, n);
	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
	bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);

	bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);

@@ -2201,42 +2195,50 @@ struct async_btree_rewrite {
	struct list_head	list;
	enum btree_id		btree_id;
	unsigned		level;
	struct bpos		pos;
	__le64			seq;
	struct bkey_buf		key;
};

static int async_btree_node_rewrite_trans(struct btree_trans *trans,
					  struct async_btree_rewrite *a)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct btree *b;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
	bch2_trans_node_iter_init(trans, &iter,
				  a->btree_id, a->key.k->k.p,
				  BTREE_MAX_DEPTH, a->level, 0);
	b = bch2_btree_iter_peek_node(&iter);
	ret = PTR_ERR_OR_ZERO(b);
	struct btree *b = bch2_btree_iter_peek_node(&iter);
	int ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto out;

	if (!b || b->data->keys.seq != a->seq) {
	bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
	ret = found
		? bch2_btree_node_rewrite(trans, &iter, b, 0)
		: -ENOENT;

#if 0
	/* Tracepoint... */
	if (!ret || ret == -ENOENT) {
		struct bch_fs *c = trans->c;
		struct printbuf buf = PRINTBUF;

		if (b)
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
		else
			prt_str(&buf, "(null");
		bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s",
			 __func__, a->seq, buf.buf);
		if (!ret) {
			prt_printf(&buf, "rewrite node:\n  ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
		} else {
			prt_printf(&buf, "node to rewrite not found:\n  want: ");
			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
			prt_printf(&buf, "\n  got:  ");
			if (b)
				bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
			else
				prt_str(&buf, "(null)");
		}
		bch_info(c, "%s", buf.buf);
		printbuf_exit(&buf);
		goto out;
	}

	ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
#endif
out:
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

@@ -2247,81 +2249,97 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
	struct bch_fs *c = a->c;

	int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
	bch_err_fn_ratelimited(c, ret);
	if (ret != -ENOENT)
		bch_err_fn_ratelimited(c, ret);

	spin_lock(&c->btree_node_rewrites_lock);
	list_del(&a->list);
	spin_unlock(&c->btree_node_rewrites_lock);

	closure_wake_up(&c->btree_node_rewrites_wait);

	bch2_bkey_buf_exit(&a->key, c);
	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
	kfree(a);
}

void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a;
	int ret;

	a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a) {
		bch_err(c, "%s: error allocating memory", __func__);
	struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a)
		return;
	}

	a->c		= c;
	a->btree_id	= b->c.btree_id;
	a->level	= b->c.level;
	a->pos		= b->key.k.p;
	a->seq		= b->data->keys.seq;
	INIT_WORK(&a->work, async_btree_node_rewrite_work);

	if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
		mutex_lock(&c->pending_node_rewrites_lock);
		list_add(&a->list, &c->pending_node_rewrites);
		mutex_unlock(&c->pending_node_rewrites_lock);
		return;
	bch2_bkey_buf_init(&a->key);
	bch2_bkey_buf_copy(&a->key, c, &b->key);

	bool now = false, pending = false;

	spin_lock(&c->btree_node_rewrites_lock);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
		list_add(&a->list, &c->btree_node_rewrites);
		now = true;
	} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
		list_add(&a->list, &c->btree_node_rewrites_pending);
		pending = true;
	}
	spin_unlock(&c->btree_node_rewrites_lock);

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
		if (test_bit(BCH_FS_started, &c->flags)) {
			bch_err(c, "%s: error getting c->writes ref", __func__);
			kfree(a);
			return;
		}

		ret = bch2_fs_read_write_early(c);
		bch_err_msg(c, ret, "going read-write");
		if (ret) {
			kfree(a);
			return;
		}

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
	if (now) {
		queue_work(c->btree_node_rewrite_worker, &a->work);
	} else if (pending) {
		/* bch2_do_pending_node_rewrites will execute */
	} else {
		bch2_bkey_buf_exit(&a->key, c);
		kfree(a);
	}
}

	queue_work(c->btree_node_rewrite_worker, &a->work);
void bch2_async_btree_node_rewrites_flush(struct bch_fs *c)
{
	closure_wait_event(&c->btree_node_rewrites_wait,
			   list_empty(&c->btree_node_rewrites));
}

void bch2_do_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;
	while (1) {
		spin_lock(&c->btree_node_rewrites_lock);
		struct async_btree_rewrite *a =
			list_pop_entry(&c->btree_node_rewrites_pending,
				       struct async_btree_rewrite, list);
		if (a)
			list_add(&a->list, &c->btree_node_rewrites);
		spin_unlock(&c->btree_node_rewrites_lock);

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);
		if (!a)
			break;

		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
		queue_work(c->btree_node_rewrite_worker, &a->work);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}

void bch2_free_pending_node_rewrites(struct bch_fs *c)
{
	struct async_btree_rewrite *a, *n;
	while (1) {
		spin_lock(&c->btree_node_rewrites_lock);
		struct async_btree_rewrite *a =
			list_pop_entry(&c->btree_node_rewrites_pending,
				       struct async_btree_rewrite, list);
		spin_unlock(&c->btree_node_rewrites_lock);

	mutex_lock(&c->pending_node_rewrites_lock);
	list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) {
		list_del(&a->list);
		if (!a)
			break;

		bch2_bkey_buf_exit(&a->key, c);
		kfree(a);
	}
	mutex_unlock(&c->pending_node_rewrites_lock);
}
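(Both drain loops above follow the same shape: pop one entry under the spinlock, drop the lock, then operate on the entry outside it, so the lock is never held across the expensive work. A self-contained sketch of that pattern, with plain pthreads standing in for the kernel spinlock and a made-up node type::

	#include <pthread.h>
	#include <stdlib.h>

	struct node {
		struct node *next;
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct node *pending;	/* singly linked list head */

	static void drain_pending(void (*fn)(struct node *))
	{
		while (1) {
			pthread_mutex_lock(&lock);
			struct node *n = pending;	/* pop one entry under the lock */
			if (n)
				pending = n->next;
			pthread_mutex_unlock(&lock);

			if (!n)
				break;

			fn(n);		/* the real work happens outside the lock */
			free(n);
		}
	}

Popping one element per lock acquisition keeps the critical section O(1) even when the drain itself is long.)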

static int __bch2_btree_node_update_key(struct btree_trans *trans,
@@ -2575,8 +2593,9 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update
	prt_printf(out, "%ps: ", (void *) as->ip_started);
	bch2_trans_commit_flags_to_text(out, as->flags);

	prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
		   bch2_btree_id_str(as->btree_id),
	prt_str(out, " ");
	bch2_btree_id_to_text(out, as->btree_id);
	prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
		   as->update_level_start,
		   as->update_level_end,
		   bch2_btree_update_modes[as->mode],
@@ -2677,6 +2696,9 @@ void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c)

void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	WARN_ON(!list_empty(&c->btree_node_rewrites));
	WARN_ON(!list_empty(&c->btree_node_rewrites_pending));

	if (c->btree_node_rewrite_worker)
		destroy_workqueue(c->btree_node_rewrite_worker);
	if (c->btree_interior_update_worker)
@@ -2692,8 +2714,9 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	INIT_LIST_HEAD(&c->pending_node_rewrites);
	mutex_init(&c->pending_node_rewrites_lock);
	INIT_LIST_HEAD(&c->btree_node_rewrites);
	INIT_LIST_HEAD(&c->btree_node_rewrites_pending);
	spin_lock_init(&c->btree_node_rewrites_lock);
}

int bch2_fs_btree_interior_update_init(struct bch_fs *c)

@@ -159,7 +159,7 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
					      unsigned level,
					      unsigned flags)
{
	bch2_trans_verify_not_unlocked(trans);
	bch2_trans_verify_not_unlocked_or_in_restart(trans);

	return bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
						   btree_prev_sib) ?:
@@ -334,6 +334,7 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
					struct jset_entry *, unsigned long);

void bch2_async_btree_node_rewrites_flush(struct bch_fs *);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);

@@ -19,8 +19,6 @@
static int bch2_btree_write_buffer_journal_flush(struct journal *,
					struct journal_entry_pin *, u64);

static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);

static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
	return (cmp_int(l->hi, r->hi) ?:
@@ -314,6 +312,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
	darray_for_each(wb->sorted, i) {
		struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];

		BUG_ON(!btree_type_uses_write_buffer(k->btree));

		for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
			prefetch(&wb->flushing.keys.data[n->idx]);

@@ -481,21 +481,55 @@ err:
	return ret;
}

static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
	struct journal_keys_to_wb dst;
	int ret = 0;

	bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));

	for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
		jset_entry_for_each_key(entry, k) {
			ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
			if (ret)
				goto out;
		}

		entry->type = BCH_JSET_ENTRY_btree_keys;
	}
out:
	ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
	return ret;
}

static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq)
{
	struct journal *j = &c->journal;
	struct journal_buf *buf;
	bool blocked;
	int ret = 0;

	while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) {
	while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) {
		ret = bch2_journal_keys_to_write_buffer(c, buf);

		if (!blocked && !ret) {
			spin_lock(&j->lock);
			buf->need_flush_to_write_buffer = false;
			spin_unlock(&j->lock);
		}

		mutex_unlock(&j->buf_lock);

		if (blocked) {
			bch2_journal_unblock(j);
			break;
		}
	}

	return ret;
}
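(fetch_wb_keys_from_journal() drains journal buffers up to a caller-supplied sequence bound and stops early when the producer side is blocked. A simplified, runnable model of that bounded-drain control flow; everything here is a stand-in, not the real journal API::

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct buf { uint64_t seq; bool done; };

	static struct buf bufs[] = { {1, false}, {2, false}, {3, false} };

	/* hand out the next unfinished buffer with seq <= max_seq */
	static struct buf *next_buf_upto(uint64_t max_seq, bool *blocked)
	{
		*blocked = false;
		for (unsigned i = 0; i < sizeof(bufs) / sizeof(bufs[0]); i++)
			if (!bufs[i].done && bufs[i].seq <= max_seq)
				return &bufs[i];
		return NULL;
	}

	static int drain_upto(uint64_t max_seq)
	{
		struct buf *b;
		bool blocked;
		int ret = 0;

		while (!ret && (b = next_buf_upto(max_seq, &blocked))) {
			b->done = true;	/* "consume" the buffer */
			if (blocked)	/* producer held up: stop and let it run */
				break;
		}
		return ret;
	}

	int main(void)
	{
		drain_upto(2);
		printf("%d %d %d\n", bufs[0].done, bufs[1].done, bufs[2].done); /* 1 1 0 */
		return 0;
	}

The bound keeps the flush from chasing a moving target: only buffers at or below max_seq are touched.)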

static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq,
					bool *did_work)
{
	struct bch_fs *c = trans->c;
@@ -505,7 +539,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
	do {
		bch2_trans_unlock(trans);

		fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
		fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq);

		*did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;

@@ -518,8 +552,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
		mutex_unlock(&wb->flushing.lock);
	} while (!ret &&
		 (fetch_from_journal_err ||
		  (wb->inc.pin.seq && wb->inc.pin.seq <= seq) ||
		  (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq)));
		  (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) ||
		  (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq)));

	return ret;
}
@@ -600,6 +634,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
	bch2_bkey_buf_init(&tmp);

	if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) {
		if (trace_write_buffer_maybe_flush_enabled()) {
			struct printbuf buf = PRINTBUF;

			bch2_bkey_val_to_text(&buf, c, referring_k);
			trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf);
			printbuf_exit(&buf);
		}

		bch2_bkey_buf_reassemble(&tmp, c, referring_k);

		if (bkey_is_btree_ptr(referring_k.k)) {
@@ -771,31 +813,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_
	return ret;
}

static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
	struct journal_keys_to_wb dst;
	int ret = 0;

	bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));

	for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
		jset_entry_for_each_key(entry, k) {
			ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
			if (ret)
				goto out;
		}

		entry->type = BCH_JSET_ENTRY_btree_keys;
	}

	spin_lock(&c->journal.lock);
	buf->need_flush_to_write_buffer = false;
	spin_unlock(&c->journal.lock);
out:
	ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret;
	return ret;
}

static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
{
	if (wb->keys.size >= new_size)

@@ -18,7 +18,9 @@
#include "error.h"
#include "inode.h"
#include "movinggc.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"
@@ -260,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	percpu_down_read(&c->mark_lock);

	bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
		ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
		if (ret)
@@ -362,7 +362,6 @@ found:
			bch_info(c, "new key %s", buf.buf);
		}

		percpu_up_read(&c->mark_lock);
		struct btree_iter iter;
		bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
					  BTREE_ITER_intent|BTREE_ITER_all_snapshots);
@@ -371,8 +370,6 @@ found:
					  BTREE_UPDATE_internal_snapshot_node|
					  BTREE_TRIGGER_norun);
		bch2_trans_iter_exit(trans, &iter);
		percpu_down_read(&c->mark_lock);

		if (ret)
			goto err;

@@ -380,7 +377,6 @@ found:
			bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
	}
err:
	percpu_up_read(&c->mark_lock);
	printbuf_exit(&buf);
	return ret;
}
@@ -401,8 +397,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
	BUG_ON(!sectors);

	if (gen_after(ptr->gen, b_gen)) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      ptr_gen_newer_than_bucket_gen,
		bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
		log_fsck_err(trans, ptr_gen_newer_than_bucket_gen,
			     "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
			     "while marking %s",
			     ptr->dev, bucket_nr, b_gen,
@@ -415,8 +411,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
	}

	if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      ptr_too_stale,
		bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
		log_fsck_err(trans, ptr_too_stale,
			     "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			     "while marking %s",
			     ptr->dev, bucket_nr, b_gen,
@@ -435,8 +431,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
	}

	if (b_gen != ptr->gen) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      stale_dirty_ptr,
		bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
		log_fsck_err(trans, stale_dirty_ptr,
			     "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
			     "while marking %s",
			     ptr->dev, bucket_nr, b_gen,
@@ -451,8 +447,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
	}

	if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      ptr_bucket_data_type_mismatch,
		bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
		log_fsck_err(trans, ptr_bucket_data_type_mismatch,
			     "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
			     "while marking %s",
			     ptr->dev, bucket_nr, b_gen,
@@ -466,8 +462,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
	}

	if ((u64) *bucket_sectors + sectors > U32_MAX) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      bucket_sector_count_overflow,
		bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
		log_fsck_err(trans, bucket_sector_count_overflow,
			     "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
			     "while marking %s",
			     ptr->dev, bucket_nr, b_gen,
@@ -485,7 +481,9 @@ out:
	printbuf_exit(&buf);
	return ret;
err:
fsck_err:
	bch2_dump_trans_updates(trans);
	bch2_inconsistent_error(c);
	ret = -BCH_ERR_bucket_ref_update;
	goto out;
}
@@ -543,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,
			  struct bkey_s_c k,
			  const struct extent_ptr_decoded *p,
			  s64 sectors, enum bch_data_type ptr_data_type,
			  struct bch_alloc_v4 *a)
			  struct bch_alloc_v4 *a,
			  bool insert)
{
	u32 *dst_sectors = p->has_ec	? &a->stripe_sectors :
		!p->ptr.cached		? &a->dirty_sectors :
@@ -553,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca,

	if (ret)
		return ret;

	alloc_data_type_set(a, ptr_data_type);
	if (insert)
		alloc_data_type_set(a, ptr_data_type);
	return 0;
}

@@ -570,8 +569,10 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p);
	*sectors = insert ? abs_sectors : -abs_sectors;
	struct bkey_i_backpointer bp;
	bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp);

	*sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len;

	struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
	if (unlikely(!ca)) {
@@ -580,41 +581,36 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
		goto err;
	}

	struct bpos bucket;
	struct bch_backpointer bp;
	__bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors);
	struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);

	if (flags & BTREE_TRIGGER_transactional) {
		struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
		ret = PTR_ERR_OR_ZERO(a) ?:
			__mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v);
			__mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert);
		if (ret)
			goto err;

		if (!p.ptr.cached) {
			ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert);
			ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
			if (ret)
				goto err;
		}
	}

	if (flags & BTREE_TRIGGER_gc) {
		percpu_down_read(&c->mark_lock);
		struct bucket *g = gc_bucket(ca, bucket.offset);
		if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
					    p.ptr.dev,
					    (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			ret = -BCH_ERR_trigger_pointer;
			goto err_unlock;
			goto err;
		}

		bucket_lock(g);
		struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old;
		ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new);
		ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert);
		alloc_to_bucket(g, new);
		bucket_unlock(g);
err_unlock:
		percpu_up_read(&c->mark_lock);

		if (!ret)
			ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
@@ -951,6 +947,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
					     enum bch_data_type type,
					     unsigned sectors)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	int ret = 0;

@@ -960,8 +957,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
		return PTR_ERR(a);

	if (a->v.data_type && type && a->v.data_type != type) {
		bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			      bucket_metadata_type_mismatch,
		bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
		log_fsck_err(trans, bucket_metadata_type_mismatch,
			     "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
			     "while marking %s",
			     iter.pos.inode, iter.pos.offset, a->v.gen,
@@ -979,6 +976,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
		ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	}
err:
fsck_err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
@@ -990,11 +988,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
	struct bch_fs *c = trans->c;
	int ret = 0;

	percpu_down_read(&c->mark_lock);
	struct bucket *g = gc_bucket(ca, b);
	if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s",
				    ca->dev_idx, bch2_data_type_str(data_type)))
		goto err_unlock;
		goto err;

	bucket_lock(g);
	struct bch_alloc_v4 old = bucket_m_to_alloc(*g);
@@ -1004,26 +1001,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
			"different types of data in same bucket: %s, %s",
			bch2_data_type_str(g->data_type),
			bch2_data_type_str(data_type)))
		goto err;
		goto err_unlock;

	if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
			"bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size",
			ca->dev_idx, b, g->gen,
			bch2_data_type_str(g->data_type ?: data_type),
			g->dirty_sectors, sectors))
		goto err;
		goto err_unlock;

	g->data_type = data_type;
	g->dirty_sectors += sectors;
	struct bch_alloc_v4 new = bucket_m_to_alloc(*g);
	bucket_unlock(g);
	percpu_up_read(&c->mark_lock);
	ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
	return ret;
err:
	bucket_unlock(g);
err_unlock:
	percpu_up_read(&c->mark_lock);
	bucket_unlock(g);
err:
	return -BCH_ERR_metadata_bucket_inconsistency;
}

@@ -1155,6 +1150,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
	return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional);
}

bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	u64 b_offset	= bucket_to_sector(ca, b);
	u64 b_end	= bucket_to_sector(ca, b + 1);
	unsigned i;

	if (!b)
		return true;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);
		u64 end = offset + (1 << layout->sb_max_size_bits);

		if (!(offset >= b_end || end <= b_offset))
			return true;
	}

	for (i = 0; i < ca->journal.nr; i++)
		if (b == ca->journal.buckets[i])
			return true;

	return false;
}
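(The overlap test in bch2_is_superblock_bucket() uses the standard trick that two half-open ranges intersect exactly when neither lies entirely past the other. A tiny standalone version of that predicate::

	#include <assert.h>
	#include <stdbool.h>
	#include <stdint.h>

	/* half-open ranges [a_start, a_end) and [b_start, b_end) */
	static bool ranges_overlap(uint64_t a_start, uint64_t a_end,
				   uint64_t b_start, uint64_t b_end)
	{
		return !(a_start >= b_end || a_end <= b_start);
	}

	int main(void)
	{
		assert(ranges_overlap(0, 10, 5, 15));	/* partial overlap */
		assert(!ranges_overlap(0, 10, 10, 20));	/* touching, no overlap */
		return 0;
	}

Negating the "strictly disjoint" condition avoids the four-way case analysis a direct overlap check would need.)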

/* Disk reservations: */

#define SECTORS_CACHE	1024
@@ -1238,7 +1258,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c)
	for_each_member_device(c, ca) {
		BUG_ON(ca->buckets_nouse);

		ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
		ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) *
					     sizeof(unsigned long),
					     GFP_KERNEL|__GFP_ZERO);
		if (!ca->buckets_nouse) {
@@ -1264,10 +1284,15 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
	bool resize = ca->bucket_gens != NULL;
	int ret;

	BUG_ON(resize && ca->buckets_nouse);
	if (resize)
		lockdep_assert_held(&c->state_lock);

	if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets,
				     GFP_KERNEL|__GFP_ZERO))) {
	if (resize && ca->buckets_nouse)
		return -BCH_ERR_no_resize_with_buckets_nouse;

	bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
				    GFP_KERNEL|__GFP_ZERO);
	if (!bucket_gens) {
		ret = -BCH_ERR_ENOMEM_bucket_gens;
		goto err;
	}
@@ -1277,19 +1302,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
	bucket_gens->nbuckets_minus_first =
		bucket_gens->nbuckets - bucket_gens->first_bucket;

	if (resize) {
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);

	if (resize) {
		size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);

		bucket_gens->nbuckets = min(bucket_gens->nbuckets,
					    old_bucket_gens->nbuckets);
		bucket_gens->nbuckets_minus_first =
			bucket_gens->nbuckets - bucket_gens->first_bucket;
		memcpy(bucket_gens->b,
		       old_bucket_gens->b,
		       n);
		       bucket_gens->nbuckets);
	}

	rcu_assign_pointer(ca->bucket_gens, bucket_gens);
@@ -1297,11 +1319,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)

	nbuckets = ca->mi.nbuckets;

	if (resize) {
		percpu_up_write(&c->mark_lock);
		up_write(&ca->bucket_lock);
	}

	ret = 0;
err:
	if (bucket_gens)

@@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b)

static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
{
	return genradix_ptr(&ca->buckets_gc, b);
	return bucket_valid(ca, b)
		? genradix_ptr(&ca->buckets_gc, b)
		: NULL;
}

static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{
	return rcu_dereference_check(ca->bucket_gens,
				     !ca->fs ||
				     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
				     lockdep_is_held(&ca->fs->state_lock) ||
				     lockdep_is_held(&ca->bucket_lock));
				     lockdep_is_held(&ca->fs->state_lock));
}

static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
@@ -308,26 +307,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *,
				  enum btree_iter_update_trigger_flags);
int bch2_trans_mark_dev_sbs(struct bch_fs *);

static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	u64 b_offset	= bucket_to_sector(ca, b);
	u64 b_end	= bucket_to_sector(ca, b + 1);
	unsigned i;

	if (!b)
		return true;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);
		u64 end = offset + (1 << layout->sb_max_size_bits);

		if (!(offset >= b_end || end <= b_offset))
			return true;
	}

	return false;
}
bool bch2_is_superblock_bucket(struct bch_dev *, u64);

static inline const char *bch2_data_type_str(enum bch_data_type type)
{
@@ -24,7 +24,7 @@ struct bucket_gens {
	u16		first_bucket;
	size_t		nbuckets;
	size_t		nbuckets_minus_first;
	u8		b[];
	u8		b[] __counted_by(nbuckets);
};
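(The __counted_by() annotation added above ties the flexible array member to the field that holds its element count, so FORTIFY_SOURCE and UBSAN can bounds-check accesses. A minimal usage sketch with generic names, falling back to a no-op where the attribute is unsupported::

	#include <stdlib.h>

	#ifndef __counted_by
	#if defined(__has_attribute)
	#if __has_attribute(counted_by)
	#define __counted_by(m) __attribute__((counted_by(m)))
	#else
	#define __counted_by(m)
	#endif
	#else
	#define __counted_by(m)
	#endif
	#endif

	struct gen_table {
		size_t		nr;
		unsigned char	gens[] __counted_by(nr);
	};

	static struct gen_table *gen_table_alloc(size_t nr)
	{
		struct gen_table *t = calloc(1, sizeof(*t) + nr);

		if (t)
			t->nr = nr;	/* set the count the annotation refers to */
		return t;
	}

The count field must be assigned before the array is indexed, which is why allocation helpers set it immediately.)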

struct bch_dev_usage {

fs/bcachefs/chardev.c
@@ -6,11 +6,11 @@
 #include "buckets.h"
 #include "chardev.h"
 #include "disk_accounting.h"
+#include "fsck.h"
 #include "journal.h"
 #include "move.h"
-#include "recovery_passes.h"
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
 #include "thread_with_file.h"
 
@@ -127,130 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
 }
 #endif
 
-struct fsck_thread {
-	struct thread_with_stdio thr;
-	struct bch_fs	*c;
-	struct bch_opts	opts;
-};
-
-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
-{
-	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
-	kfree(thr);
-}
-
-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
-{
-	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
-	struct bch_fs *c = thr->c;
-
-	int ret = PTR_ERR_OR_ZERO(c);
-	if (ret)
-		return ret;
-
-	ret = bch2_fs_start(thr->c);
-	if (ret)
-		goto err;
-
-	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
-		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
-		ret |= 1;
-	}
-	if (test_bit(BCH_FS_error, &c->flags)) {
-		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
-		ret |= 4;
-	}
-err:
-	bch2_fs_stop(c);
-	return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
-	.exit		= bch2_fsck_thread_exit,
-	.fn		= bch2_fsck_offline_thread_fn,
-};
-
-static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
-{
-	struct bch_ioctl_fsck_offline arg;
-	struct fsck_thread *thr = NULL;
-	darray_str(devs) = {};
-	long ret = 0;
-
-	if (copy_from_user(&arg, user_arg, sizeof(arg)))
-		return -EFAULT;
-
-	if (arg.flags)
-		return -EINVAL;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	for (size_t i = 0; i < arg.nr_devs; i++) {
-		u64 dev_u64;
-		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
-		if (ret)
-			goto err;
-
-		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
-		ret = PTR_ERR_OR_ZERO(dev_str);
-		if (ret)
-			goto err;
-
-		ret = darray_push(&devs, dev_str);
-		if (ret) {
-			kfree(dev_str);
-			goto err;
-		}
-	}
-
-	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
-	if (!thr) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	thr->opts = bch2_opts_empty();
-
-	if (arg.opts) {
-		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-		ret =   PTR_ERR_OR_ZERO(optstr) ?:
-			bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
-		if (!IS_ERR(optstr))
-			kfree(optstr);
-
-		if (ret)
-			goto err;
-	}
-
-	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
-	opt_set(thr->opts, read_only, 1);
-	opt_set(thr->opts, ratelimit_errors, 0);
-
-	/* We need request_key() to be called before we punt to kthread: */
-	opt_set(thr->opts, nostart, true);
-
-	bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
-
-	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
-
-	if (!IS_ERR(thr->c) &&
-	    thr->c->opts.errors == BCH_ON_ERROR_panic)
-		thr->c->opts.errors = BCH_ON_ERROR_ro;
-
-	ret = __bch2_run_thread_with_stdio(&thr->thr);
-out:
-	darray_for_each(devs, i)
-		kfree(*i);
-	darray_exit(&devs);
-	return ret;
-err:
-	if (thr)
-		bch2_fsck_thread_exit(&thr->thr);
-	pr_err("ret %s", bch2_err_str(ret));
-	goto out;
-}
-
 static long bch2_global_ioctl(unsigned cmd, void __user *arg)
 {
 	long ret;
@@ -775,99 +651,6 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
 	return ret;
 }
 
-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
-{
-	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
-	struct bch_fs *c = thr->c;
-
-	c->stdio_filter = current;
-	c->stdio = &thr->thr.stdio;
-
-	/*
-	 * XXX: can we figure out a way to do this without mucking with c->opts?
-	 */
-	unsigned old_fix_errors = c->opts.fix_errors;
-	if (opt_defined(thr->opts, fix_errors))
-		c->opts.fix_errors = thr->opts.fix_errors;
-	else
-		c->opts.fix_errors = FSCK_FIX_ask;
-
-	c->opts.fsck = true;
-	set_bit(BCH_FS_fsck_running, &c->flags);
-
-	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
-	int ret = bch2_run_online_recovery_passes(c);
-
-	clear_bit(BCH_FS_fsck_running, &c->flags);
-	bch_err_fn(c, ret);
-
-	c->stdio = NULL;
-	c->stdio_filter = NULL;
-	c->opts.fix_errors = old_fix_errors;
-
-	up(&c->online_fsck_mutex);
-	bch2_ro_ref_put(c);
-	return ret;
-}
-
-static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
-	.exit		= bch2_fsck_thread_exit,
-	.fn		= bch2_fsck_online_thread_fn,
-};
-
-static long bch2_ioctl_fsck_online(struct bch_fs *c,
-				   struct bch_ioctl_fsck_online arg)
-{
-	struct fsck_thread *thr = NULL;
-	long ret = 0;
-
-	if (arg.flags)
-		return -EINVAL;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	if (!bch2_ro_ref_tryget(c))
-		return -EROFS;
-
-	if (down_trylock(&c->online_fsck_mutex)) {
-		bch2_ro_ref_put(c);
-		return -EAGAIN;
-	}
-
-	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
-	if (!thr) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	thr->c = c;
-	thr->opts = bch2_opts_empty();
-
-	if (arg.opts) {
-		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
-
-		ret =   PTR_ERR_OR_ZERO(optstr) ?:
-			bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
-		if (!IS_ERR(optstr))
-			kfree(optstr);
-
-		if (ret)
-			goto err;
-	}
-
-	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
-err:
-	if (ret < 0) {
-		bch_err_fn(c, ret);
-		if (thr)
-			bch2_fsck_thread_exit(&thr->thr);
-		up(&c->online_fsck_mutex);
-		bch2_ro_ref_put(c);
-	}
-	return ret;
-}
-
 #define BCH_IOCTL(_name, _argtype)					\
 do {									\
 	_argtype i;							\
fs/bcachefs/checksum.c
@@ -2,6 +2,7 @@
 #include "bcachefs.h"
 #include "checksum.h"
+#include "errcode.h"
 #include "error.h"
 #include "super.h"
 #include "super-io.h"
 
@@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
 	if (!bch2_csum_type_is_encryption(type))
 		return 0;
 
+	if (bch2_fs_inconsistent_on(!c->chacha20,
+				    c, "attempting to encrypt without encryption key"))
+		return -BCH_ERR_no_encryption_key;
+
 	return do_encrypt(c->chacha20, nonce, data, len);
 }
 
@@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
 	size_t sgl_len = 0;
 	int ret = 0;
 
 	if (!bch2_csum_type_is_encryption(type))
 		return 0;
+	if (bch2_fs_inconsistent_on(!c->chacha20,
+				    c, "attempting to encrypt without encryption key"))
+		return -BCH_ERR_no_encryption_key;
 
 	darray_init(&sgl);
 
fs/bcachefs/checksum.h
@@ -109,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool);
 void bch2_fs_encryption_exit(struct bch_fs *);
 int bch2_fs_encryption_init(struct bch_fs *);
 
-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type,
						       bool data)
 {
 	switch (type) {
fs/bcachefs/compress.c
@@ -2,13 +2,33 @@
 #include "bcachefs.h"
 #include "checksum.h"
 #include "compress.h"
+#include "error.h"
 #include "extents.h"
+#include "opts.h"
 #include "super-io.h"
 
 #include <linux/lz4.h>
 #include <linux/zlib.h>
 #include <linux/zstd.h>
 
+static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type)
+{
+	switch (type) {
+	case BCH_COMPRESSION_TYPE_none:
+	case BCH_COMPRESSION_TYPE_incompressible:
+		return BCH_COMPRESSION_OPT_none;
+	case BCH_COMPRESSION_TYPE_lz4_old:
+	case BCH_COMPRESSION_TYPE_lz4:
+		return BCH_COMPRESSION_OPT_lz4;
+	case BCH_COMPRESSION_TYPE_gzip:
+		return BCH_COMPRESSION_OPT_gzip;
+	case BCH_COMPRESSION_TYPE_zstd:
+		return BCH_COMPRESSION_OPT_zstd;
+	default:
+		BUG();
+	}
+}
+
 /* Bounce buffer: */
 struct bbuf {
	void		*b;
@@ -158,6 +178,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 	void *workspace;
 	int ret;
 
+	enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
+	mempool_t *workspace_pool = &c->compress_workspace[opt];
+	if (unlikely(!mempool_initialized(workspace_pool))) {
+		if (fsck_err(c, compression_type_not_marked_in_sb,
+			     "compression type %s set but not marked in superblock",
+			     __bch2_compression_types[crc.compression_type]))
+			ret = bch2_check_set_has_compressed_data(c, opt);
+		else
+			ret = -BCH_ERR_compression_workspace_not_initialized;
+		if (ret)
+			goto out;
+	}
+
 	src_data = bio_map_or_bounce(c, src, READ);
 
 	switch (crc.compression_type) {
@@ -176,13 +209,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
			.avail_out	= dst_len,
		};
 
-		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+		workspace = mempool_alloc(workspace_pool, GFP_NOFS);
 
		zlib_set_workspace(&strm, workspace);
		zlib_inflateInit2(&strm, -MAX_WBITS);
		ret = zlib_inflate(&strm, Z_FINISH);
 
-		mempool_free(workspace, &c->decompress_workspace);
+		mempool_free(workspace, workspace_pool);
 
		if (ret != Z_STREAM_END)
			goto err;
@@ -195,14 +228,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
		if (real_src_len > src_len - 4)
			goto err;
 
-		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS);
+		workspace = mempool_alloc(workspace_pool, GFP_NOFS);
		ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
 
		ret = zstd_decompress_dctx(ctx,
				dst_data,	dst_len,
				src_data.b + 4, real_src_len);
 
-		mempool_free(workspace, &c->decompress_workspace);
+		mempool_free(workspace, workspace_pool);
 
		if (ret != dst_len)
			goto err;
@@ -212,6 +245,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
		BUG();
	}
	ret = 0;
+fsck_err:
out:
	bio_unmap_or_unbounce(c, src_data);
	return ret;
@@ -394,8 +428,21 @@ static unsigned __bio_compress(struct bch_fs *c,
	unsigned pad;
	int ret = 0;
 
-	BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
-	BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+	/* bch2_compression_decode catches unknown compression types: */
+	BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR);
+
+	mempool_t *workspace_pool = &c->compress_workspace[compression.type];
+	if (unlikely(!mempool_initialized(workspace_pool))) {
+		if (fsck_err(c, compression_opt_not_marked_in_sb,
+			     "compression opt %s set but not marked in superblock",
+			     bch2_compression_opts[compression.type])) {
+			ret = bch2_check_set_has_compressed_data(c, compression.type);
+			if (ret) /* memory allocation failure, don't compress */
+				return 0;
+		} else {
+			return 0;
+		}
+	}
 
	/* If it's only one block, don't bother trying to compress: */
	if (src->bi_iter.bi_size <= c->opts.block_size)
@@ -404,7 +451,7 @@ static unsigned __bio_compress(struct bch_fs *c,
	dst_data = bio_map_or_bounce(c, dst, WRITE);
	src_data = bio_map_or_bounce(c, src, READ);
 
-	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS);
+	workspace = mempool_alloc(workspace_pool, GFP_NOFS);
 
	*src_len = src->bi_iter.bi_size;
	*dst_len = dst->bi_iter.bi_size;
@@ -447,7 +494,7 @@ static unsigned __bio_compress(struct bch_fs *c,
		*src_len = round_down(*src_len, block_bytes(c));
	}
 
-	mempool_free(workspace, &c->compress_workspace[compression_type]);
+	mempool_free(workspace, workspace_pool);
 
	if (ret)
		goto err;
@@ -477,6 +524,9 @@ out:
err:
	ret = BCH_COMPRESSION_TYPE_incompressible;
	goto out;
+fsck_err:
+	ret = 0;
+	goto out;
}
 
unsigned bch2_bio_compress(struct bch_fs *c,
@@ -559,7 +609,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
{
	unsigned i;
 
-	mempool_exit(&c->decompress_workspace);
	for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
		mempool_exit(&c->compress_workspace[i]);
	mempool_exit(&c->compression_bounce[WRITE]);
@@ -568,7 +617,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
 
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
-	size_t decompress_workspace_size = 0;
	ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
						 c->opts.encoded_extent_max);
 
@@ -576,19 +624,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 
	struct {
		unsigned			feature;
-		enum bch_compression_type	type;
+		enum bch_compression_opts	type;
		size_t				compress_workspace;
-		size_t				decompress_workspace;
	} compression_types[] = {
-		{ BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
-			max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
-			0 },
-		{ BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
-			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
-			zlib_inflate_workspacesize(), },
-		{ BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
-			c->zstd_workspace_size,
-			zstd_dctx_workspace_bound() },
+		{ BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4,
+			max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+		{ BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip,
+			max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+			    zlib_inflate_workspacesize()) },
+		{ BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd,
+			max(c->zstd_workspace_size,
+			    zstd_dctx_workspace_bound()) },
	}, *i;
	bool have_compressed = false;
 
@@ -613,9 +659,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
	for (i = compression_types;
	     i < compression_types + ARRAY_SIZE(compression_types);
	     i++) {
-		decompress_workspace_size =
-			max(decompress_workspace_size, i->decompress_workspace);
-
		if (!(features & (1 << i->feature)))
			continue;
 
@@ -628,11 +671,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
			return -BCH_ERR_ENOMEM_compression_workspace_init;
	}
 
-	if (!mempool_initialized(&c->decompress_workspace) &&
-	    mempool_init_kvmalloc_pool(&c->decompress_workspace,
-				       1, decompress_workspace_size))
-		return -BCH_ERR_ENOMEM_decompression_workspace_init;
-
	return 0;
}
 
fs/bcachefs/darray.h
@@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
	for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
 
#define darray_for_each_reverse(_d, _i)					\
-	for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+	for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
 
#define darray_init(_d)							\
do {									\
fs/bcachefs/data_update.c
@@ -110,11 +110,8 @@ static void trace_move_extent_fail2(struct data_update *m,
{
	struct bch_fs *c = m->op.c;
	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-	const union bch_extent_entry *entry;
-	struct bch_extent_ptr *ptr;
-	struct extent_ptr_decoded p;
	struct printbuf buf = PRINTBUF;
-	unsigned i, rewrites_found = 0;
+	unsigned rewrites_found = 0;
 
	if (!trace_move_extent_fail_enabled())
		return;
@@ -122,27 +119,25 @@ static void trace_move_extent_fail2(struct data_update *m,
	prt_str(&buf, msg);
 
	if (insert) {
-		i = 0;
+		const union bch_extent_entry *entry;
+		struct bch_extent_ptr *ptr;
+		struct extent_ptr_decoded p;
+
+		unsigned ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
-			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached)
-				rewrites_found |= 1U << i;
-			i++;
+				rewrites_found |= ptr_bit;
+			ptr_bit <<= 1;
		}
	}
 
-	prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u",
-		   (m->data_opts.rewrite_ptrs & (1 << 0)) != 0,
-		   (m->data_opts.rewrite_ptrs & (1 << 1)) != 0,
-		   (m->data_opts.rewrite_ptrs & (1 << 2)) != 0,
-		   (m->data_opts.rewrite_ptrs & (1 << 3)) != 0);
+	prt_str(&buf, "rewrites found:\t");
+	bch2_prt_u64_base2(&buf, rewrites_found);
+	prt_newline(&buf);
 
-	prt_printf(&buf, "\nrewrites found: %u%u%u%u",
-		   (rewrites_found & (1 << 0)) != 0,
-		   (rewrites_found & (1 << 1)) != 0,
-		   (rewrites_found & (1 << 2)) != 0,
-		   (rewrites_found & (1 << 3)) != 0);
+	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
 
	prt_str(&buf, "\nold: ");
	bch2_bkey_val_to_text(&buf, c, old);
@@ -194,7 +189,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
		struct bpos next_pos;
		bool should_check_enospc;
		s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-		unsigned rewrites_found = 0, durability, i;
+		unsigned rewrites_found = 0, durability, ptr_bit;
 
		bch2_trans_begin(trans);
 
@@ -231,16 +226,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
		 *
		 * Fist, drop rewrite_ptrs from @new:
		 */
-		i = 0;
+		ptr_bit = 1;
		bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) {
-			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
+			if ((ptr_bit & m->data_opts.rewrite_ptrs) &&
			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
			    !ptr->cached) {
				bch2_extent_ptr_set_cached(c, &m->op.opts,
							   bkey_i_to_s(insert), ptr);
-				rewrites_found |= 1U << i;
+				rewrites_found |= ptr_bit;
			}
-			i++;
+			ptr_bit <<= 1;
		}
 
		if (m->data_opts.rewrite_ptrs &&
@@ -323,8 +318,11 @@ restart_drop_extra_replicas:
		 * it's been hard to reproduce, so this should give us some more
		 * information when it does occur:
		 */
-		int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id),
-						 BCH_VALIDATE_commit);
+		int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert),
+						 (struct bkey_validate_context) {
+							.btree	= m->btree_id,
+							.flags	= BCH_VALIDATE_commit,
+						 });
		if (invalid) {
			struct printbuf buf = PRINTBUF;
 
@@ -362,7 +360,7 @@ restart_drop_extra_replicas:
					k.k->p, bkey_start_pos(&insert->k)) ?:
			bch2_insert_snapshot_whiteouts(trans, m->btree_id,
					k.k->p, insert->k.p) ?:
-			bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
+			bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
			bch2_trans_update(trans, &iter, insert,
				BTREE_UPDATE_internal_snapshot_node) ?:
			bch2_trans_commit(trans, &op->res,
@@ -540,7 +538,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
	prt_newline(out);
 
	prt_str(out, "compression:\t");
-	bch2_compression_opt_to_text(out, background_compression(*io_opts));
+	bch2_compression_opt_to_text(out, io_opts->background_compression);
	prt_newline(out);
 
	prt_str(out, "opts.replicas:\t");
@@ -614,7 +612,7 @@ int bch2_data_update_init(struct btree_trans *trans,
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
-	unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
+	unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
	int ret = 0;
 
	/*
@@ -622,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans,
	 * and we have to check for this because we go rw before repairing the
	 * snapshots table - just skip it, we can move it later.
	 */
-	if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+	if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
		return -BCH_ERR_data_update_done;
 
	if (!bkey_get_dev_refs(c, k))
@@ -652,22 +650,22 @@ int bch2_data_update_init(struct btree_trans *trans,
			BCH_WRITE_DATA_ENCODED|
			BCH_WRITE_MOVE|
			m->data_opts.write_flags;
-	m->op.compression_opt	= background_compression(io_opts);
+	m->op.compression_opt	= io_opts.background_compression;
	m->op.watermark		= m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
 
	unsigned durability_have = 0, durability_removing = 0;
 
-	i = 0;
+	unsigned ptr_bit = 1;
	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (!p.ptr.cached) {
			rcu_read_lock();
-			if (BIT(i) & m->data_opts.rewrite_ptrs) {
+			if (ptr_bit & m->data_opts.rewrite_ptrs) {
				if (crc_is_compressed(p.crc))
					reserve_sectors += k.k->size;
 
				m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p);
				durability_removing += bch2_extent_ptr_desired_durability(c, &p);
-			} else if (!(BIT(i) & m->data_opts.kill_ptrs)) {
+			} else if (!(ptr_bit & m->data_opts.kill_ptrs)) {
				bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
				durability_have += bch2_extent_ptr_durability(c, &p);
			}
@@ -687,7 +685,7 @@ int bch2_data_update_init(struct btree_trans *trans,
		if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
			m->op.incompressible = true;
 
-		i++;
+		ptr_bit <<= 1;
	}
 
	unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
@@ -750,14 +748,14 @@ out:
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-	unsigned i = 0;
+	unsigned ptr_bit = 1;
 
	bkey_for_each_ptr(ptrs, ptr) {
-		if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) {
-			opts->kill_ptrs |= 1U << i;
-			opts->rewrite_ptrs ^= 1U << i;
+		if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) {
+			opts->kill_ptrs |= ptr_bit;
+			opts->rewrite_ptrs ^= ptr_bit;
		}
 
-		i++;
+		ptr_bit <<= 1;
	}
}
fs/bcachefs/debug.c
@@ -472,7 +472,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
	if (!out->nr_tabstops)
		printbuf_tabstop_push(out, 32);
 
-	prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level);
+	prt_printf(out, "%px ", b);
+	bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level);
+	prt_printf(out, "\n");
 
	printbuf_indent_add(out, 2);
 
fs/bcachefs/dirent.c
@@ -101,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
};
 
int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
-			 enum bch_validate_flags flags)
+			 struct bkey_validate_context from)
{
	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
	struct qstr d_name = bch2_dirent_get_name(d);
@@ -120,7 +120,7 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
	 * Check new keys don't exceed the max length
	 * (older keys may be larger.)
	 */
-	bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
+	bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX,
			 c, dirent_name_too_long,
			 "dirent name too big (%u > %u)",
			 d_name.len, BCH_NAME_MAX);
@@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
	} else {
		target->subvol	= le32_to_cpu(d.v->d_child_subvol);
 
-		ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s);
+		ret = bch2_subvolume_get(trans, target->subvol, true, &s);
 
		target->inum	= le64_to_cpu(s.inode);
	}
@@ -500,7 +500,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32
	struct bkey_s_c k;
	int ret;
 
-	for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
+	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
			   SPOS(dir, 0, snapshot),
			   POS(dir, U64_MAX), 0, k, ret)
		if (k.k->type == KEY_TYPE_dirent) {
@@ -549,7 +549,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
	bch2_bkey_buf_init(&sk);
 
	int ret = bch2_trans_run(c,
-		for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents,
+		for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents,
						   POS(inum.inum, ctx->pos),
						   POS(inum.inum, U64_MAX),
						   inum.subvol, 0, k, ({
fs/bcachefs/dirent.h
@@ -4,10 +4,10 @@
 
#include "str_hash.h"
 
-enum bch_validate_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
 
-int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c,
+			 struct bkey_validate_context);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
#define bch2_bkey_ops_dirent ((struct bkey_ops) {	\
fs/bcachefs/disk_accounting.c
@@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_
	memcpy_u64s_small(acc->v.d, d, nr);
}
 
+static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
+
int bch2_disk_accounting_mod(struct btree_trans *trans,
			     struct disk_accounting_pos *k,
			     s64 *d, unsigned nr, bool gc)
@@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans,
 
	accounting_key_init(&k_i.k, k, d, nr);
 
-	return likely(!gc)
-		? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k)
-		: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+	if (unlikely(gc)) {
+		int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+		if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
+			ret = drop_locks_do(trans,
+				bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
+				bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
+		return ret;
+	} else {
+		return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
+	}
}
 
int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
@@ -127,14 +136,15 @@ static inline bool is_zero(char *start, char *end)
#define field_end(p, member)	(((void *) (&p.member)) + sizeof(p.member))
 
int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
-			     enum bch_validate_flags flags)
+			     struct bkey_validate_context from)
{
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, k.k->p);
	void *end = &acc_k + 1;
	int ret = 0;
 
-	bkey_fsck_err_on(bversion_zero(k.k->bversion),
+	bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
+			 bversion_zero(k.k->bversion),
			 c, accounting_key_version_0,
			 "accounting key with version=0");
 
@@ -217,7 +227,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
		prt_printf(out, "id=%u", k->snapshot.id);
		break;
	case BCH_DISK_ACCOUNTING_btree:
-		prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id));
+		prt_str(out, "btree=");
+		bch2_btree_id_to_text(out, k->btree.id);
		break;
	}
}
@@ -243,10 +254,10 @@ void bch2_accounting_swab(struct bkey_s k)
}
 
static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
-					    struct disk_accounting_pos acc)
+					    struct disk_accounting_pos *acc)
{
-	unsafe_memcpy(r, &acc.replicas,
-		      replicas_entry_bytes(&acc.replicas),
+	unsafe_memcpy(r, &acc->replicas,
+		      replicas_entry_bytes(&acc->replicas),
		      "variable length struct");
}
 
@@ -257,7 +268,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
 
	switch (acc_k.type) {
	case BCH_DISK_ACCOUNTING_replicas:
-		__accounting_to_replicas(r, acc_k);
+		__accounting_to_replicas(r, &acc_k);
		return true;
	default:
		return false;
@@ -322,6 +333,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
 
	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);
+
+	if (trace_accounting_mem_insert_enabled()) {
+		struct printbuf buf = PRINTBUF;
+
+		bch2_accounting_to_text(&buf, c, a.s_c);
+		trace_accounting_mem_insert(c, buf.buf);
+		printbuf_exit(&buf);
+	}
	return 0;
err:
	free_percpu(n.v[1]);
@@ -461,32 +480,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc
	return ret;
}
 
-void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c)
-{
-	struct bch_accounting_mem *acc = &c->accounting;
-
-	percpu_down_read(&c->mark_lock);
-	out->atomic++;
-
-	eytzinger0_for_each(i, acc->k.nr) {
-		struct disk_accounting_pos acc_k;
-		bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos);
-
-		bch2_accounting_key_to_text(out, &acc_k);
-
-		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-		bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
-		prt_str(out, ":");
-		for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
-			prt_printf(out, " %llu", v[j]);
-		prt_newline(out);
-	}
-
-	--out->atomic;
-	percpu_up_read(&c->mark_lock);
-}
-
static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
{
	darray_for_each(acc->k, e) {
@@ -625,7 +618,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
	switch (acc.type) {
	case BCH_DISK_ACCOUNTING_replicas: {
		struct bch_replicas_padded r;
-		__accounting_to_replicas(&r.e, acc);
+		__accounting_to_replicas(&r.e, &acc);
 
		for (unsigned i = 0; i < r.e.nr_devs; i++)
			if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
@@ -699,11 +692,45 @@ int bch2_accounting_read(struct bch_fs *c)
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
 
-	int ret = for_each_btree_key(trans, iter,
-				BTREE_ID_accounting, POS_MIN,
+	/*
+	 * We might run more than once if we rewind to start topology repair or
+	 * btree node scan - and those might cause us to get different results,
+	 * so we can't just skip if we've already run.
+	 *
+	 * Instead, zero out any accounting we have:
+	 */
+	percpu_down_write(&c->mark_lock);
+	darray_for_each(acc->k, e)
+		percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+	for_each_member_device(c, ca)
+		percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+	percpu_memset(c->usage, 0, sizeof(*c->usage));
+	percpu_up_write(&c->mark_lock);
+
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+	iter.flags &= ~BTREE_ITER_with_journal;
+	int ret = for_each_btree_key_continue(trans, iter,
				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			struct bkey u;
			struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);
+
+			if (k.k->type != KEY_TYPE_accounting)
+				continue;
+
+			struct disk_accounting_pos acc_k;
+			bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+			if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+				break;
+
+			if (!bch2_accounting_is_mem(acc_k)) {
+				struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+				bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+				continue;
+			}
+
			accounting_read_key(trans, k);
		}));
	if (ret)
@@ -715,6 +742,12 @@ int bch2_accounting_read(struct bch_fs *c)
 
	darray_for_each(*keys, i) {
		if (i->k->k.type == KEY_TYPE_accounting) {
+			struct disk_accounting_pos acc_k;
+			bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
+
+			if (!bch2_accounting_is_mem(acc_k))
+				continue;
+
			struct bkey_s_c k = bkey_i_to_s_c(i->k);
			unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
						sizeof(acc->k.data[0]),
@@ -748,15 +781,16 @@ int bch2_accounting_read(struct bch_fs *c)
	keys->gap = keys->nr = dst - keys->data;
 
	percpu_down_write(&c->mark_lock);
-	unsigned i = 0;
-	while (i < acc->k.nr) {
-		unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
 
+	darray_for_each_reverse(acc->k, i) {
		struct disk_accounting_pos acc_k;
-		bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
+		bpos_to_disk_accounting_pos(&acc_k, i->pos);
 
		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-		bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
+		memset(v, 0, sizeof(v));
+
+		for (unsigned j = 0; j < i->nr_counters; j++)
+			v[j] = percpu_u64_get(i->v[0] + j);
 
		/*
		 * If the entry counters are zeroed, it should be treated as
@@ -765,26 +799,25 @@ int bch2_accounting_read(struct bch_fs *c)
		 * Remove it, so that if it's re-added it gets re-marked in the
		 * superblock:
		 */
-		ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
			? -BCH_ERR_remove_disk_accounting_entry
-			: bch2_disk_accounting_validate_late(trans, acc_k,
-							v, acc->k.data[idx].nr_counters);
+			: bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters);
 
		if (ret == -BCH_ERR_remove_disk_accounting_entry) {
-			free_percpu(acc->k.data[idx].v[0]);
-			free_percpu(acc->k.data[idx].v[1]);
-			darray_remove_item(&acc->k, &acc->k.data[idx]);
-			eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
-					accounting_pos_cmp, NULL);
+			free_percpu(i->v[0]);
+			free_percpu(i->v[1]);
+			darray_remove_item(&acc->k, i);
			ret = 0;
			continue;
		}
 
		if (ret)
			goto fsck_err;
-		i++;
	}
 
+	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+			accounting_pos_cmp, NULL);
+
	preempt_disable();
	struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
 
@@ -804,7 +837,7 @@ int bch2_accounting_read(struct bch_fs *c)
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type:
		rcu_read_lock();
-		struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev);
+		struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
		if (ca) {
			struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
			percpu_u64_set(&d->buckets, v[0]);
@@ -881,10 +914,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
 
		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-			continue;
+			break;
 
-		if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+		if (!bch2_accounting_is_mem(acc_k)) {
+			struct disk_accounting_pos next = { .type = acc_k.type + 1 };
+			bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
			continue;
+		}
 
		bch2_accounting_mem_read(c, k.k->p, v, nr);
 
@@ -910,7 +946,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type: {
		rcu_read_lock();
-		struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
+		struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
		if (!ca) {
			rcu_read_unlock();
			continue;
fs/bcachefs/disk_accounting.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_DISK_ACCOUNTING_H
#define _BCACHEFS_DISK_ACCOUNTING_H
 
+#include "btree_update.h"
#include "eytzinger.h"
#include "sb-members.h"
 
@@ -62,27 +63,32 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage
 
static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p)
{
-	acc->_pad = p;
+	BUILD_BUG_ON(sizeof(*acc) != sizeof(p));
+
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-	bch2_bpos_swab(&acc->_pad);
+	acc->_pad = p;
+#else
+	memcpy_swab(acc, &p, sizeof(p));
#endif
}
 
-static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k)
+static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc)
{
-	struct bpos ret = k->_pad;
-
+	struct bpos p;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-	bch2_bpos_swab(&ret);
+	p = acc->_pad;
+#else
+	memcpy_swab(&p, acc, sizeof(p));
#endif
-	return ret;
+	return p;
}
 
int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
			     s64 *, unsigned, bool);
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
 
-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
+int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
+			     struct bkey_validate_context);
void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_accounting_swab(struct bkey_s);
@@ -112,6 +118,12 @@ enum bch_accounting_mode {
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
 
+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
+{
+	return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR &&
+		acc.type != BCH_DISK_ACCOUNTING_inum;
+}
+
/*
 * Update in memory counters so they match the btree update we're doing; called
 * from transaction commit path
@@ -126,9 +138,10 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
	bpos_to_disk_accounting_pos(&acc_k, a.k->p);
	bool gc = mode == BCH_ACCOUNTING_gc;
 
-	EBUG_ON(gc && !acc->gc_running);
+	if (gc && !acc->gc_running)
+		return 0;
 
-	if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
+	if (!bch2_accounting_is_mem(acc_k))
		return 0;
 
	if (mode == BCH_ACCOUNTING_normal) {
@@ -141,7 +154,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type:
		rcu_read_lock();
-		struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev);
+		struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
		if (ca) {
			this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
			this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
@@ -204,9 +217,45 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
	bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
}
 
+static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
+{
+	EBUG_ON(!res->ref);
+
+	return (struct bversion) {
+		.hi = res->seq >> 32,
+		.lo = (res->seq << 32) | (res->offset + offset),
+	};
+}
+
+static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
+						    struct bkey_i_accounting *a,
+						    unsigned commit_flags)
+{
+	a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
+						(u64 *) a - (u64 *) trans->journal_entries);
+
+	EBUG_ON(bversion_zero(a->k.bversion));
+
+	return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
+		? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
+		: 0;
+}
+
+static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans,
+						       struct bkey_i_accounting *a_i,
+						       unsigned commit_flags)
+{
+	if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
+		struct bkey_s_accounting a = accounting_i_to_s(a_i);
+
+		bch2_accounting_neg(a);
+		bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
+		bch2_accounting_neg(a);
+	}
+}
+
int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *);
int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned);
-void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *);
 
int bch2_gc_accounting_start(struct bch_fs *);
int bch2_gc_accounting_done(struct bch_fs *);
fs/bcachefs/ec.c (275 changed lines)
@@ -26,6 +26,7 @@
#include "util.h"
 
#include <linux/sort.h>
+#include <linux/string_choices.h>
 
#ifdef __KERNEL__
 
@@ -109,7 +110,7 @@ struct ec_bio {
/* Stripes btree keys: */
 
int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
-			 enum bch_validate_flags flags)
+			 struct bkey_validate_context from)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	int ret = 0;
@@ -129,7 +130,7 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
			 "invalid csum granularity (%u >= 64)",
			 s->csum_granularity_bits);
 
-	ret = bch2_bkey_ptrs_validate(c, k, flags);
+	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}
@@ -304,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans,
	}
 
	if (flags & BTREE_TRIGGER_gc) {
-		percpu_down_read(&c->mark_lock);
		struct bucket *g = gc_bucket(ca, bucket.offset);
		if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
					    ptr->dev,
					    (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
			ret = -BCH_ERR_mark_stripe;
-			goto err_unlock;
+			goto err;
		}
 
		bucket_lock(g);
@@ -318,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
		ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags);
		alloc_to_bucket(g, new);
		bucket_unlock(g);
-err_unlock:
-		percpu_up_read(&c->mark_lock);
 
		if (!ret)
			ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags);
	}
@@ -732,7 +731,7 @@ static void ec_block_endio(struct bio *bio)
			       ? BCH_MEMBER_ERROR_write
			       : BCH_MEMBER_ERROR_read,
			       "erasure coding %s error: %s",
-			       bio_data_dir(bio) ? "write" : "read",
+			       str_write_read(bio_data_dir(bio)),
			       bch2_blk_status_to_str(bio->bi_status)))
		clear_bit(ec_bio->idx, ec_bio->buf->valid);
 
@@ -909,7 +908,7 @@ err:
		bch2_bkey_val_to_text(&msgbuf, c, orig_k);
		bch_err_ratelimited(c,
			"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
-		printbuf_exit(&msgbuf);;
+		printbuf_exit(&msgbuf);
		ret = -BCH_ERR_stripe_reconstruct;
		goto out;
	}
@@ -1266,11 +1265,11 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
					 struct bch_dev *ca,
					 struct bpos bucket, u8 gen,
					 struct ec_stripe_buf *s,
-					 struct bpos *bp_pos)
+					 struct bkey_s_c_backpointer bp,
+					 struct bkey_buf *last_flushed)
{
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_fs *c = trans->c;
-	struct bch_backpointer bp;
	struct btree_iter iter;
	struct bkey_s_c k;
	const struct bch_extent_ptr *ptr_c;
@@ -1279,33 +1278,26 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
	struct bkey_i *n;
	int ret, dev, block;
 
-	ret = bch2_get_next_backpointer(trans, ca, bucket, gen,
-					bp_pos, &bp, BTREE_ITER_cached);
-	if (ret)
-		return ret;
-	if (bpos_eq(*bp_pos, SPOS_MAX))
-		return 0;
-
-	if (bp.level) {
+	if (bp.v->level) {
		struct printbuf buf = PRINTBUF;
		struct btree_iter node_iter;
		struct btree *b;
 
-		b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp);
+		b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed);
		bch2_trans_iter_exit(trans, &node_iter);
 
		if (!b)
			return 0;
 
		prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b);
-		bch2_backpointer_to_text(&buf, &bp);
+		bch2_bkey_val_to_text(&buf, c, bp.s_c);
 
		bch2_fs_inconsistent(c, "%s", buf.buf);
		printbuf_exit(&buf);
		return -EIO;
	}
 
-	k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent);
+	k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
	ret = bkey_err(k);
	if (ret)
		return ret;
@@ -1364,7 +1356,6 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
	struct bch_fs *c = trans->c;
	struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
	struct bch_extent_ptr ptr = v->ptrs[block];
-	struct bpos bp_pos = POS_MIN;
	int ret = 0;
 
	struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
@@ -1373,19 +1364,27 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
	struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
 
-	while (1) {
-		ret = commit_do(trans, NULL, NULL,
-				BCH_TRANS_COMMIT_no_check_rw|
-				BCH_TRANS_COMMIT_no_enospc,
-			ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos));
-		if (ret)
-			break;
-		if (bkey_eq(bp_pos, POS_MAX))
-			break;
-
-		bp_pos = bpos_nosnap_successor(bp_pos);
-	}
+	struct bkey_buf last_flushed;
+	bch2_bkey_buf_init(&last_flushed);
+	bkey_init(&last_flushed.k->k);
+
+	ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
+			bucket_pos_to_bp_start(ca, bucket_pos),
+			bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
+			NULL, NULL,
+			BCH_TRANS_COMMIT_no_check_rw|
+			BCH_TRANS_COMMIT_no_enospc, ({
+		if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
+			break;
+
+		if (bp_k.k->type != KEY_TYPE_backpointer)
+			continue;
+
+		ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
+					bkey_s_c_to_backpointer(bp_k), &last_flushed);
+	}));
+
+	bch2_bkey_buf_exit(&last_flushed, c);
	bch2_dev_put(ca);
	return ret;
}
@@ -1707,7 +1706,7 @@ static void ec_stripe_key_init(struct bch_fs *c,
	set_bkey_val_u64s(&s->k, u64s);
}
 
-static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
{
	struct ec_stripe_new *s;
 
@@ -1715,7 +1714,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
 
	s = kzalloc(sizeof(*s), GFP_KERNEL);
	if (!s)
-		return -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+		return NULL;
 
	mutex_init(&s->lock);
	closure_init(&s->iodone, NULL);
@@ -1730,10 +1729,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
	ec_stripe_key_init(c, &s->new_stripe.key,
			   s->nr_data, s->nr_parity,
			   h->blocksize, h->disk_label);
-
-	h->s = s;
-	h->nr_created++;
-	return 0;
+	return s;
}
 
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
@@ -1878,25 +1874,26 @@ err:
	return h;
}
 
-static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h,
+static int new_stripe_alloc_buckets(struct btree_trans *trans,
+				    struct ec_stripe_head *h, struct ec_stripe_new *s,
				    enum bch_watermark watermark, struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs = h->devs;
	struct open_bucket *ob;
	struct open_buckets buckets;
-	struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
+	struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
	unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
	bool have_cache = true;
	int ret = 0;
 
-	BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity);
-	BUG_ON(v->nr_redundant != h->s->nr_parity);
+	BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
+	BUG_ON(v->nr_redundant != s->nr_parity);
 
	/* * We bypass the sector allocator which normally does this: */
	bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
 
-	for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
+	for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
		/*
		 * Note: we don't yet repair invalid blocks (failed/removed
		 * devices) when reusing stripes - we still need a codepath to
@@ -1906,21 +1903,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
		if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
			__clear_bit(v->ptrs[i].dev, devs.d);
 
-		if (i < h->s->nr_data)
+		if (i < s->nr_data)
			nr_have_data++;
		else
			nr_have_parity++;
	}
 
-	BUG_ON(nr_have_data > h->s->nr_data);
-	BUG_ON(nr_have_parity > h->s->nr_parity);
+	BUG_ON(nr_have_data > s->nr_data);
+	BUG_ON(nr_have_parity > s->nr_parity);
 
	buckets.nr = 0;
-	if (nr_have_parity < h->s->nr_parity) {
+	if (nr_have_parity < s->nr_parity) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
						  &h->parity_stripe,
						  &devs,
-						  h->s->nr_parity,
+						  s->nr_parity,
						  &nr_have_parity,
						  &have_cache, 0,
						  BCH_DATA_parity,
@@ -1928,14 +1925,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
						  cl);
 
		open_bucket_for_each(c, &buckets, ob, i) {
-			j = find_next_zero_bit(h->s->blocks_gotten,
-					       h->s->nr_data + h->s->nr_parity,
-					       h->s->nr_data);
-			BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
+			j = find_next_zero_bit(s->blocks_gotten,
+					       s->nr_data + s->nr_parity,
+					       s->nr_data);
+			BUG_ON(j >= s->nr_data + s->nr_parity);
 
-			h->s->blocks[j] = buckets.v[i];
+			s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
-			__set_bit(j, h->s->blocks_gotten);
+			__set_bit(j, s->blocks_gotten);
		}
 
		if (ret)
@@ -1943,11 +1940,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
	}
 
	buckets.nr = 0;
-	if (nr_have_data < h->s->nr_data) {
+	if (nr_have_data < s->nr_data) {
		ret = bch2_bucket_alloc_set_trans(trans, &buckets,
						  &h->block_stripe,
						  &devs,
-						  h->s->nr_data,
+						  s->nr_data,
						  &nr_have_data,
						  &have_cache, 0,
						  BCH_DATA_user,
@@ -1955,13 +1952,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
						  cl);
 
		open_bucket_for_each(c, &buckets, ob, i) {
-			j = find_next_zero_bit(h->s->blocks_gotten,
-					       h->s->nr_data, 0);
-			BUG_ON(j >= h->s->nr_data);
+			j = find_next_zero_bit(s->blocks_gotten,
+					       s->nr_data, 0);
+			BUG_ON(j >= s->nr_data);
 
-			h->s->blocks[j] = buckets.v[i];
+			s->blocks[j] = buckets.v[i];
			v->ptrs[j] = bch2_ob_ptr(c, ob);
-			__set_bit(j, h->s->blocks_gotten);
+			__set_bit(j, s->blocks_gotten);
		}
 
		if (ret)
@@ -2007,12 +2004,54 @@ static s64 get_existing_stripe(struct bch_fs *c,
	return ret;
}
 
-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h)
+static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s)
+{
+	struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
+	struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v;
+	unsigned i;
+
+	BUG_ON(existing_v->nr_redundant != s->nr_parity);
+	s->nr_data = existing_v->nr_blocks -
+		existing_v->nr_redundant;
+
+	int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
+	if (ret) {
+		bch2_stripe_close(c, s);
+		return ret;
+	}
+
+	BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
+
+	/*
+	 * Free buckets we initially allocated - they might conflict with
+	 * blocks from the stripe we're reusing:
+	 */
+	for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) {
+		bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]);
+		s->blocks[i] = 0;
+	}
+	memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten));
+	memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated));
+
+	for (unsigned i = 0; i < existing_v->nr_blocks; i++) {
+		if (stripe_blockcount_get(existing_v, i)) {
+			__set_bit(i, s->blocks_gotten);
+			__set_bit(i, s->blocks_allocated);
+		}
+
+		ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone);
+	}
+
+	bkey_copy(&s->new_stripe.key, &s->existing_stripe.key);
+	s->have_existing_stripe = true;
+
+	return 0;
+}
+
+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h,
+				       struct ec_stripe_new *s)
{
	struct bch_fs *c = trans->c;
-	struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v;
-	struct bch_stripe *existing_v;
-	unsigned i;
	s64 idx;
	int ret;
 
@@ -2024,56 +2063,19 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
	if (idx < 0)
		return -BCH_ERR_stripe_alloc_blocked;
 
-	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+	ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
			     "reading stripe key: %s", bch2_err_str(ret));
	if (ret) {
-		bch2_stripe_close(c, h->s);
+		bch2_stripe_close(c, s);
		return ret;
	}
 
-	existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v;
-
-	BUG_ON(existing_v->nr_redundant != h->s->nr_parity);
-	h->s->nr_data = existing_v->nr_blocks -
-		existing_v->nr_redundant;
-
-	ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize);
-	if (ret) {
-		bch2_stripe_close(c, h->s);
-		return ret;
-	}
-
-	BUG_ON(h->s->existing_stripe.size != h->blocksize);
-	BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors));
-
-	/*
-	 * Free buckets we initially allocated - they might conflict with
-	 * blocks from the stripe we're reusing:
-	 */
-	for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) {
-		bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]);
-		h->s->blocks[i] = 0;
-	}
-	memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten));
-	memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated));
-
-	for (i = 0; i < existing_v->nr_blocks; i++) {
-		if (stripe_blockcount_get(existing_v, i)) {
-			__set_bit(i, h->s->blocks_gotten);
-			__set_bit(i, h->s->blocks_allocated);
-		}
-
-		ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
-	}
-
-	bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key);
-	h->s->have_existing_stripe = true;
-
-	return 0;
+	return init_new_stripe_from_existing(c, s);
}
 
-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h)
+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h,
+					 struct ec_stripe_new *s)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
@@ -2082,15 +2084,19 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
	int ret;
 
-	if (!h->s->res.sectors) {
-		ret = bch2_disk_reservation_get(c, &h->s->res,
+	if (!s->res.sectors) {
+		ret = bch2_disk_reservation_get(c, &s->res,
				h->blocksize,
-				h->s->nr_parity,
+				s->nr_parity,
				BCH_DISK_RESERVATION_NOFAIL);
		if (ret)
			return ret;
	}
 
+	/*
+	 * Allocate stripe slot
+	 * XXX: we're going to need a bitrange btree of free stripes
+	 */
	for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos,
			   BTREE_ITER_slots|BTREE_ITER_intent, k, ret) {
		if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
@@ -2105,7 +2111,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
		}
 
		if (bkey_deleted(k.k) &&
-		    bch2_try_open_stripe(c, h->s, k.k->p.offset))
+		    bch2_try_open_stripe(c, s, k.k->p.offset))
			break;
	}
 
@@ -2116,16 +2122,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
 
	ret = ec_stripe_mem_alloc(trans, &iter);
	if (ret) {
-		bch2_stripe_close(c, h->s);
+		bch2_stripe_close(c, s);
		goto err;
	}
 
-	h->s->new_stripe.key.k.p = iter.pos;
+	s->new_stripe.key.k.p = iter.pos;
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
err:
-	bch2_disk_reservation_put(c, &h->s->res);
+	bch2_disk_reservation_put(c, &s->res);
	goto out;
}
 
@@ -2156,22 +2162,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
		return h;
 
	if (!h->s) {
-		ret = ec_new_stripe_alloc(c, h);
-		if (ret) {
+		h->s = ec_new_stripe_alloc(c, h);
+		if (!h->s) {
+			ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
			bch_err(c, "failed to allocate new stripe");
			goto err;
		}
+
+		h->nr_created++;
	}
 
-	if (h->s->allocated)
+	struct ec_stripe_new *s = h->s;
+
+	if (s->allocated)
		goto allocated;
 
-	if (h->s->have_existing_stripe
|
||||
if (s->have_existing_stripe)
|
||||
goto alloc_existing;
|
||||
|
||||
/* First, try to allocate a full stripe: */
|
||||
ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?:
|
||||
__bch2_ec_stripe_head_reserve(trans, h);
|
||||
ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
|
||||
__bch2_ec_stripe_head_reserve(trans, h, s);
|
||||
if (!ret)
|
||||
goto allocate_buf;
|
||||
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
|
||||
@ -2183,15 +2194,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
|
||||
* existing stripe:
|
||||
*/
|
||||
while (1) {
|
||||
ret = __bch2_ec_stripe_head_reuse(trans, h);
|
||||
ret = __bch2_ec_stripe_head_reuse(trans, h, s);
|
||||
if (!ret)
|
||||
break;
|
||||
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
|
||||
goto err;
|
||||
|
||||
if (watermark == BCH_WATERMARK_copygc) {
|
||||
ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?:
|
||||
__bch2_ec_stripe_head_reserve(trans, h);
|
||||
ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
|
||||
__bch2_ec_stripe_head_reserve(trans, h, s);
|
||||
if (ret)
|
||||
goto err;
|
||||
goto allocate_buf;
|
||||
@ -2209,19 +2220,19 @@ alloc_existing:
|
||||
* Retry allocating buckets, with the watermark for this
|
||||
* particular write:
|
||||
*/
|
||||
ret = new_stripe_alloc_buckets(trans, h, watermark, cl);
|
||||
ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
allocate_buf:
|
||||
ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize);
|
||||
ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
h->s->allocated = true;
|
||||
s->allocated = true;
|
||||
allocated:
|
||||
BUG_ON(!h->s->idx);
|
||||
BUG_ON(!h->s->new_stripe.data[0]);
|
||||
BUG_ON(!s->idx);
|
||||
BUG_ON(!s->new_stripe.data[0]);
|
||||
BUG_ON(trans->restarted);
|
||||
return h;
|
||||
err:
|
||||
@ -2286,7 +2297,7 @@ err:
|
||||
int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
|
||||
{
|
||||
return bch2_trans_run(c,
|
||||
for_each_btree_key_upto_commit(trans, iter,
|
||||
for_each_btree_key_max_commit(trans, iter,
|
||||
BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
|
||||
BTREE_ITER_intent, k,
|
||||
NULL, NULL, 0, ({
|
||||
@ -2449,11 +2460,9 @@ void bch2_fs_ec_exit(struct bch_fs *c)
|
||||
|
||||
while (1) {
|
||||
mutex_lock(&c->ec_stripe_head_lock);
|
||||
h = list_first_entry_or_null(&c->ec_stripe_head_list,
|
||||
struct ec_stripe_head, list);
|
||||
if (h)
|
||||
list_del(&h->list);
|
||||
h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list);
|
||||
mutex_unlock(&c->ec_stripe_head_lock);
|
||||
|
||||
if (!h)
|
||||
break;
|
||||
|
||||
|
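The bch2_fs_ec_exit() hunk above collapses an open-coded pop (list_first_entry_or_null() followed by list_del()) into a single list_pop_entry() call. The helper itself is not shown in this diff; a minimal sketch of how such a macro can be built from the standard <linux/list.h> primitives, where the exact definition is an assumption based only on the call site::

    #define list_pop_entry(head, type, member)                          \
    ({                                                                  \
            type *_ret = list_first_entry_or_null(head, type, member); \
                                                                        \
            if (_ret)           /* detach the entry before handing it  \
                                 * back, so the list stays consistent */\
                    list_del(&_ret->member);                            \
            _ret;                                                       \
    })

The statement-expression form lets the caller treat it as a function returning the popped entry, or NULL when the list is empty.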
@@ -6,9 +6,8 @@
#include "buckets_types.h"
#include "extents_types.h"

enum bch_validate_flags;

int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags);
int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c,
			 struct bkey_validate_context);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
			 struct bkey_s_c);
int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
@@ -54,7 +54,8 @@
	x(ENOMEM,	ENOMEM_compression_bounce_read_init)	\
	x(ENOMEM,	ENOMEM_compression_bounce_write_init)	\
	x(ENOMEM,	ENOMEM_compression_workspace_init)	\
	x(ENOMEM,	ENOMEM_decompression_workspace_init)	\
	x(ENOMEM,	ENOMEM_backpointer_mismatches_bitmap)	\
	x(EIO,		compression_workspace_not_initialized)	\
	x(ENOMEM,	ENOMEM_bucket_gens)			\
	x(ENOMEM,	ENOMEM_buckets_nouse)			\
	x(ENOMEM,	ENOMEM_usage_init)			\
@@ -116,6 +117,8 @@
	x(ENOENT,	ENOENT_dirent_doesnt_match_inode)	\
	x(ENOENT,	ENOENT_dev_not_found)			\
	x(ENOENT,	ENOENT_dev_idx_not_found)		\
	x(ENOENT,	ENOENT_inode_no_backpointer)		\
	x(ENOENT,	ENOENT_no_snapshot_tree_subvol)		\
	x(ENOTEMPTY,	ENOTEMPTY_dir_not_empty)		\
	x(ENOTEMPTY,	ENOTEMPTY_subvol_not_empty)		\
	x(EEXIST,	EEXIST_str_hash_set)			\
@@ -148,6 +151,7 @@
	x(BCH_ERR_transaction_restart,	transaction_restart_split_race)		\
	x(BCH_ERR_transaction_restart,	transaction_restart_write_buffer_flush)	\
	x(BCH_ERR_transaction_restart,	transaction_restart_nested)		\
	x(BCH_ERR_transaction_restart,	transaction_restart_commit)		\
	x(0,				no_btree_node)				\
	x(BCH_ERR_no_btree_node,	no_btree_node_relock)			\
	x(BCH_ERR_no_btree_node,	no_btree_node_upgrade)			\
@@ -164,7 +168,6 @@
	x(BCH_ERR_btree_insert_fail,	btree_insert_need_journal_res)		\
	x(BCH_ERR_btree_insert_fail,	btree_insert_need_journal_reclaim)	\
	x(0,				backpointer_to_overwritten_btree_node)	\
	x(0,				lock_fail_root_changed)			\
	x(0,				journal_reclaim_would_deadlock)		\
	x(EINVAL,			fsck)					\
	x(BCH_ERR_fsck,			fsck_fix)				\
@@ -173,7 +176,9 @@
	x(BCH_ERR_fsck,			fsck_errors_not_fixed)			\
	x(BCH_ERR_fsck,			fsck_repair_unimplemented)		\
	x(BCH_ERR_fsck,			fsck_repair_impossible)			\
	x(0,				restart_recovery)			\
	x(EINVAL,			restart_recovery)			\
	x(EINVAL,			not_in_recovery)			\
	x(EINVAL,			cannot_rewind_recovery)			\
	x(0,				data_update_done)			\
	x(EINVAL,			device_state_not_allowed)		\
	x(EINVAL,			member_info_missing)			\
@@ -192,7 +197,9 @@
	x(EINVAL,			opt_parse_error)			\
	x(EINVAL,			remove_with_metadata_missing_unimplemented)\
	x(EINVAL,			remove_would_lose_data)			\
	x(EINVAL,			btree_iter_with_journal_not_supported)	\
	x(EINVAL,			no_resize_with_buckets_nouse)		\
	x(EINVAL,			inode_unpack_error)			\
	x(EINVAL,			varint_decode_error)			\
	x(EROFS,			erofs_trans_commit)			\
	x(EROFS,			erofs_no_writes)			\
	x(EROFS,			erofs_journal_err)			\
@@ -241,7 +248,10 @@
	x(BCH_ERR_invalid_sb,		invalid_sb_downgrade)			\
	x(BCH_ERR_invalid,		invalid_bkey)				\
	x(BCH_ERR_operation_blocked,	nocow_lock_blocked)			\
	x(EIO,				journal_shutdown)			\
	x(EIO,				journal_flush_err)			\
	x(EIO,				btree_node_read_err)			\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_cached)		\
	x(EIO,				sb_not_downgraded)			\
	x(EIO,				btree_node_write_all_failed)		\
	x(EIO,				btree_node_read_error)			\
@@ -257,6 +267,8 @@
	x(EIO,				no_device_to_read_from)			\
	x(EIO,				missing_indirect_extent)		\
	x(EIO,				invalidate_stripe_to_dev)		\
	x(EIO,				no_encryption_key)			\
	x(EIO,				insufficient_journal_devices)		\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_fixable)		\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
@@ -305,6 +317,7 @@ static inline long bch2_err_class(long err)

#define BLK_STS_REMOVED		((__force blk_status_t)128)

#include <linux/blk_types.h>
const char *bch2_blk_status_to_str(blk_status_t);

#endif /* _BCACHEFS_ERRCODE_H */
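For context, the x() rows being added above are entries in an "x-macro" table: the same list is expanded once to declare a BCH_ERR_* enum constant per row and again to map each private code back to its parent class (an errno or another BCH_ERR_* code). A reduced, self-contained sketch of the pattern, with two made-up rows and an assumed base value; the real table lives in errcode.h::

    #define BCH_ERRCODES()                          \
            x(ENOMEM,       ENOMEM_bucket_gens)     \
            x(EIO,          journal_shutdown)

    enum bch_errcode {
            BCH_ERR_START = 2048,   /* assumed: well above valid errnos */
    /* each x(class, err) row expands to one enum constant: */
    #define x(class, err) BCH_ERR_##err,
            BCH_ERRCODES()
    #undef x
            BCH_ERR_MAX
    };

Re-expanding the same list with a different x() definition is what lets a single table also generate the class-lookup and error-to-string helpers without duplicating the rows.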
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "error.h"
#include "fs-common.h"
#include "journal.h"
#include "recovery_passes.h"
#include "super.h"
@@ -33,7 +35,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
int bch2_topology_error(struct bch_fs *c)
{
	set_bit(BCH_FS_topology_error, &c->flags);
	if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
	if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
		bch2_inconsistent_error(c);
		return -BCH_ERR_btree_need_topology_repair;
	} else {
@@ -218,6 +220,30 @@ static const u8 fsck_flags_extra[] = {
#undef x
};

static int do_fsck_ask_yn(struct bch_fs *c,
			  struct btree_trans *trans,
			  struct printbuf *question,
			  const char *action)
{
	prt_str(question, ", ");
	prt_str(question, action);

	if (bch2_fs_stdio_redirect(c))
		bch2_print(c, "%s", question->buf);
	else
		bch2_print_string_as_lines(KERN_ERR, question->buf);

	int ask = bch2_fsck_ask_yn(c, trans);

	if (trans) {
		int ret = bch2_trans_relock(trans);
		if (ret)
			return ret;
	}

	return ask;
}

int __bch2_fsck_err(struct bch_fs *c,
		    struct btree_trans *trans,
		    enum bch_fsck_flags flags,
@@ -226,7 +252,7 @@ int __bch2_fsck_err(struct bch_fs *c,
{
	struct fsck_err_state *s = NULL;
	va_list args;
	bool print = true, suppressing = false, inconsistent = false;
	bool print = true, suppressing = false, inconsistent = false, exiting = false;
	struct printbuf buf = PRINTBUF, *out = &buf;
	int ret = -BCH_ERR_fsck_ignore;
	const char *action_orig = "fix?", *action = action_orig;
@@ -256,9 +282,10 @@ int __bch2_fsck_err(struct bch_fs *c,
		!trans &&
		bch2_current_has_btree_trans(c));

	if ((flags & FSCK_CAN_FIX) &&
	    test_bit(err, c->sb.errors_silent))
		return -BCH_ERR_fsck_fix;
	if (test_bit(err, c->sb.errors_silent))
		return flags & FSCK_CAN_FIX
			? -BCH_ERR_fsck_fix
			: -BCH_ERR_fsck_ignore;

	bch2_sb_error_count(c, err);

@@ -289,16 +316,14 @@ int __bch2_fsck_err(struct bch_fs *c,
		 */
		if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
			ret = s->ret;
			mutex_unlock(&c->fsck_error_msgs_lock);
			goto err;
			goto err_unlock;
		}

		kfree(s->last_msg);
		s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
		if (!s->last_msg) {
			mutex_unlock(&c->fsck_error_msgs_lock);
			ret = -ENOMEM;
			goto err;
			goto err_unlock;
		}

		if (c->opts.ratelimit_errors &&
@@ -318,13 +343,19 @@ int __bch2_fsck_err(struct bch_fs *c,
	prt_printf(out, bch2_log_msg(c, ""));
#endif

	if ((flags & FSCK_CAN_FIX) &&
	    (flags & FSCK_AUTOFIX) &&
	if ((flags & FSCK_AUTOFIX) &&
	    (c->opts.errors == BCH_ON_ERROR_continue ||
	     c->opts.errors == BCH_ON_ERROR_fix_safe)) {
		prt_str(out, ", ");
		prt_actioning(out, action);
		ret = -BCH_ERR_fsck_fix;
		if (flags & FSCK_CAN_FIX) {
			prt_actioning(out, action);
			ret = -BCH_ERR_fsck_fix;
		} else {
			prt_str(out, ", continuing");
			ret = -BCH_ERR_fsck_ignore;
		}

		goto print;
	} else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
		if (c->opts.errors != BCH_ON_ERROR_continue ||
		    !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
@@ -348,31 +379,18 @@ int __bch2_fsck_err(struct bch_fs *c,
			: c->opts.fix_errors;

		if (fix == FSCK_FIX_ask) {
			prt_str(out, ", ");
			prt_str(out, action);

			if (bch2_fs_stdio_redirect(c))
				bch2_print(c, "%s", out->buf);
			else
				bch2_print_string_as_lines(KERN_ERR, out->buf);
			print = false;

			int ask = bch2_fsck_ask_yn(c, trans);
			ret = do_fsck_ask_yn(c, trans, out, action);
			if (ret < 0)
				goto err_unlock;

			if (trans) {
				ret = bch2_trans_relock(trans);
				if (ret) {
					mutex_unlock(&c->fsck_error_msgs_lock);
					goto err;
				}
			}

			if (ask >= YN_ALLNO && s)
				s->fix = ask == YN_ALLNO
			if (ret >= YN_ALLNO && s)
				s->fix = ret == YN_ALLNO
					? FSCK_FIX_no
					: FSCK_FIX_yes;

			ret = ask & 1
			ret = ret & 1
				? -BCH_ERR_fsck_fix
				: -BCH_ERR_fsck_ignore;
		} else if (fix == FSCK_FIX_yes ||
@@ -385,9 +403,7 @@ int __bch2_fsck_err(struct bch_fs *c,
			prt_str(out, ", not ");
			prt_actioning(out, action);
		}
	} else if (flags & FSCK_NEED_FSCK) {
		prt_str(out, " (run fsck to correct)");
	} else {
	} else if (!(flags & FSCK_CAN_IGNORE)) {
		prt_str(out, " (repair unimplemented)");
	}

@@ -396,14 +412,13 @@ int __bch2_fsck_err(struct bch_fs *c,
	     !(flags & FSCK_CAN_IGNORE)))
		ret = -BCH_ERR_fsck_errors_not_fixed;

	bool exiting =
		test_bit(BCH_FS_fsck_running, &c->flags) &&
		(ret != -BCH_ERR_fsck_fix &&
		 ret != -BCH_ERR_fsck_ignore);

	if (exiting)
	if (test_bit(BCH_FS_fsck_running, &c->flags) &&
	    (ret != -BCH_ERR_fsck_fix &&
	     ret != -BCH_ERR_fsck_ignore)) {
		exiting = true;
		print = true;

	}
print:
	if (print) {
		if (bch2_fs_stdio_redirect(c))
			bch2_print(c, "%s\n", out->buf);
@@ -419,17 +434,24 @@ int __bch2_fsck_err(struct bch_fs *c,
	if (s)
		s->ret = ret;

	mutex_unlock(&c->fsck_error_msgs_lock);

	if (inconsistent)
		bch2_inconsistent_error(c);

	if (ret == -BCH_ERR_fsck_fix) {
		set_bit(BCH_FS_errors_fixed, &c->flags);
	} else {
		set_bit(BCH_FS_errors_not_fixed, &c->flags);
		set_bit(BCH_FS_error, &c->flags);
	/*
	 * We don't yet track whether the filesystem currently has errors, for
	 * log_fsck_err()s: that would require us to track for every error type
	 * which recovery pass corrects it, to get the fsck exit status correct:
	 */
	if (flags & FSCK_CAN_FIX) {
		if (ret == -BCH_ERR_fsck_fix) {
			set_bit(BCH_FS_errors_fixed, &c->flags);
		} else {
			set_bit(BCH_FS_errors_not_fixed, &c->flags);
			set_bit(BCH_FS_error, &c->flags);
		}
	}
err_unlock:
	mutex_unlock(&c->fsck_error_msgs_lock);
err:
	if (action != action_orig)
		kfree(action);
@@ -437,28 +459,52 @@ err:
	return ret;
}

static const char * const bch2_bkey_validate_contexts[] = {
#define x(n) #n,
	BKEY_VALIDATE_CONTEXTS()
#undef x
	NULL
};

int __bch2_bkey_fsck_err(struct bch_fs *c,
			 struct bkey_s_c k,
			 enum bch_validate_flags validate_flags,
			 struct bkey_validate_context from,
			 enum bch_sb_error_id err,
			 const char *fmt, ...)
{
	if (validate_flags & BCH_VALIDATE_silent)
	if (from.flags & BCH_VALIDATE_silent)
		return -BCH_ERR_fsck_delete_bkey;

	unsigned fsck_flags = 0;
	if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)))
	if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
		if (test_bit(err, c->sb.errors_silent))
			return -BCH_ERR_fsck_delete_bkey;

		fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
	}
	if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
		fsck_flags |= fsck_flags_extra[err];

	struct printbuf buf = PRINTBUF;
	va_list args;
	prt_printf(&buf, "invalid bkey in %s",
		   bch2_bkey_validate_contexts[from.from]);

	if (from.from == BKEY_VALIDATE_journal)
		prt_printf(&buf, " journal seq=%llu offset=%u",
			   from.journal_seq, from.journal_offset);

	prt_str(&buf, " btree=");
	bch2_btree_id_to_text(&buf, from.btree);
	prt_printf(&buf, " level=%u: ", from.level);

	prt_str(&buf, "invalid bkey ");
	bch2_bkey_val_to_text(&buf, c, k);
	prt_str(&buf, "\n ");

	va_list args;
	va_start(args, fmt);
	prt_vprintf(&buf, fmt, args);
	va_end(args);

	prt_str(&buf, ": delete?");

	int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
@@ -483,3 +529,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c)

	mutex_unlock(&c->fsck_error_msgs_lock);
}

int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
{
	u32 restart_count = trans->restart_count;
	int ret = 0;

	/* XXX: we don't yet attempt to print paths when we don't know the subvol */
	if (inum.subvol)
		ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
	if (!inum.subvol || ret)
		prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);

	return trans_was_restarted(trans, restart_count);
}

int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
				   subvol_inum inum, u64 offset)
{
	int ret = bch2_inum_err_msg_trans(trans, out, inum);
	prt_printf(out, " offset %llu: ", offset);
	return ret;
}

void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
{
	bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
}

void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
			      subvol_inum inum, u64 offset)
{
	bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
}
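Several validate hooks in this series (see bch2_stripe_validate() above and the extents.c changes below) trade a bare enum bch_validate_flags for a struct bkey_validate_context passed by value. The struct itself is not part of this hunk; going only by the fields the new code reads - from.flags, from.from, from.journal_seq, from.journal_offset, from.btree, from.level - it looks roughly like the following sketch, in which the exact types, names of the enums, and field ordering are assumptions::

    struct bkey_validate_context {
            unsigned            from;           /* BKEY_VALIDATE_*: btree node, journal, ... */
            unsigned            flags;          /* BCH_VALIDATE_* (write, commit, silent) */
            unsigned            level;          /* btree level the key was found at */
            unsigned            btree;          /* enum btree_id */
            unsigned long long  journal_seq;    /* set when from == BKEY_VALIDATE_journal */
            unsigned            journal_offset;
    };

Bundling the provenance into one struct is what lets __bch2_bkey_fsck_err() above print *where* an invalid bkey came from, instead of just that it was invalid.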
@@ -45,32 +45,11 @@ int bch2_topology_error(struct bch_fs *);
	bch2_inconsistent_error(c);					\
})

#define bch2_fs_inconsistent_on(cond, c, ...)				\
#define bch2_fs_inconsistent_on(cond, ...)				\
({									\
	bool _ret = unlikely(!!(cond));					\
									\
	if (_ret)							\
		bch2_fs_inconsistent(c, __VA_ARGS__);			\
	_ret;								\
})

/*
 * Later we might want to mark only the particular device inconsistent, not the
 * entire filesystem:
 */

#define bch2_dev_inconsistent(ca, ...)					\
do {									\
	bch_err(ca, __VA_ARGS__);					\
	bch2_inconsistent_error((ca)->fs);				\
} while (0)

#define bch2_dev_inconsistent_on(cond, ca, ...)				\
({									\
	bool _ret = unlikely(!!(cond));					\
									\
	if (_ret)							\
		bch2_dev_inconsistent(ca, __VA_ARGS__);			\
		bch2_fs_inconsistent(__VA_ARGS__);			\
	_ret;								\
})

@@ -123,9 +102,9 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,

void bch2_flush_fsck_errs(struct bch_fs *);

#define __fsck_err(c, _flags, _err_type, ...)				\
#define fsck_err_wrap(_do)						\
({									\
	int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__);	\
	int _ret = _do;							\
	if (_ret != -BCH_ERR_fsck_fix &&				\
	    _ret != -BCH_ERR_fsck_ignore) {				\
		ret = _ret;						\
@@ -135,6 +114,8 @@ void bch2_flush_fsck_errs(struct bch_fs *);
	_ret == -BCH_ERR_fsck_fix;					\
})

#define __fsck_err(...)		fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))

/* These macros return true if error should be fixed: */

/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
@@ -149,12 +130,6 @@ void bch2_flush_fsck_errs(struct bch_fs *);
	(unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\
})

#define need_fsck_err_on(cond, c, _err_type, ...)			\
	__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)

#define need_fsck_err(c, _err_type, ...)				\
	__fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)

#define mustfix_fsck_err(c, _err_type, ...)				\
	__fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)

@@ -167,11 +142,22 @@ void bch2_flush_fsck_errs(struct bch_fs *);
#define fsck_err_on(cond, c, _err_type, ...)				\
	__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)

#define log_fsck_err(c, _err_type, ...)					\
	__fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)

#define log_fsck_err_on(cond, ...)					\
({									\
	bool _ret = unlikely(!!(cond));					\
	if (_ret)							\
		log_fsck_err(__VA_ARGS__);				\
	_ret;								\
})

enum bch_validate_flags;
__printf(5, 6)
int __bch2_bkey_fsck_err(struct bch_fs *,
			 struct bkey_s_c,
			 enum bch_validate_flags,
			 struct bkey_validate_context from,
			 enum bch_sb_error_id,
			 const char *, ...);

@@ -181,7 +167,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *,
 */
#define bkey_fsck_err(c, _err_type, _err_msg, ...)			\
do {									\
	int _ret = __bch2_bkey_fsck_err(c, k, flags,			\
	int _ret = __bch2_bkey_fsck_err(c, k, from,			\
				BCH_FSCK_ERR_##_err_type,		\
				_err_msg, ##__VA_ARGS__);		\
	if (_ret != -BCH_ERR_fsck_fix &&				\
@@ -252,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
	_ret;								\
})

int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);

void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);

#endif /* _BCACHEFS_ERROR_H */
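The fsck_err_on()/log_fsck_err() family above is designed to be called from check code that declares a local ret and an fsck_err: label, which the wrapper jumps to on anything other than "fixed" or "ignored". A hypothetical caller, purely to show the shape - the error id and the repair helper here are invented, not real bcachefs identifiers::

    static int check_some_key(struct bch_fs *c, struct bkey_s_c k)
    {
            int ret = 0;

            /* returns true when the error should be repaired: */
            if (fsck_err_on(bkey_deleted(k.k), c,
                            some_invented_err_type,
                            "unexpected deleted key"))
                    ret = repair_some_key(c, k);    /* invented repair helper */
    fsck_err:
            return ret;
    }

The new fsck_err_wrap() factoring means the same jump-on-serious-error logic can now wrap any expression returning a fsck error code, not only bch2_fsck_err() itself.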
@@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
		break;
	case KEY_TYPE_reflink_p: {
		struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
		u64 idx = le64_to_cpu(p.v->idx);
		u64 idx = REFLINK_P_IDX(p.v);
		unsigned sectors = bpos_min(*end, p.k->p).offset -
			bkey_start_offset(p.k);
		struct btree_iter iter;
@@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans,

	bch2_trans_copy_iter(&copy, iter);

	for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) {
	for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
		unsigned offset = 0;

		if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
@@ -21,6 +21,7 @@
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "rebalance.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
@@ -88,6 +89,14 @@ static inline bool ptr_better(struct bch_fs *c,
		u64 l1 = dev_latency(c, p1.ptr.dev);
		u64 l2 = dev_latency(c, p2.ptr.dev);

		/*
		 * Square the latencies, to bias more in favor of the faster
		 * device - we never want to stop issuing reads to the slower
		 * device altogether, so that we can update our latency numbers:
		 */
		l1 *= l1;
		l2 *= l2;

		/* Pick at random, biased in favor of the faster device: */

		return bch2_rand_range(l1 + l2) > l1;
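The comment added above describes the read path's device choice: square both devices' measured latencies, then pick pseudo-randomly with weight proportional to the *other* device's squared latency, so the slower device still serves an occasional read and its latency estimate never goes stale. The same idea in a standalone, runnable form, with rand_range() standing in for bch2_rand_range()::

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* Stand-in for bch2_rand_range(): uniform value in [0, max). */
    static uint64_t rand_range(uint64_t max)
    {
            return max ? (uint64_t) rand() % max : 0;
    }

    /* True if device 1 should serve this read, given each device's latency. */
    static bool pick_dev1(uint64_t lat1, uint64_t lat2)
    {
            uint64_t l1 = lat1 * lat1;      /* squaring sharpens the bias */
            uint64_t l2 = lat2 * lat2;

            /*
             * P(dev1) ~ l2 / (l1 + l2): the slower device 2 is, the more
             * often device 1 wins - but device 2 is never starved entirely.
             */
            return rand_range(l1 + l2) > l1;
    }

With latencies of 1ms and 3ms, squaring turns a 3:1 preference into a 9:1 preference, while still sending roughly one read in ten to the slow device to keep its latency measurement current.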
@@ -169,7 +178,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */

int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
			    enum bch_validate_flags flags)
			    struct bkey_validate_context from)
{
	int ret = 0;

@@ -177,7 +186,7 @@ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k,
			 c, btree_ptr_val_too_big,
			 "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);

	ret = bch2_bkey_ptrs_validate(c, k, flags);
	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}
@@ -189,7 +198,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
}

int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
			       enum bch_validate_flags flags)
			       struct bkey_validate_context from)
{
	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
	int ret = 0;
@@ -203,12 +212,13 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k,
			 c, btree_ptr_v2_min_key_bad,
			 "min_key > key");

	if (flags & BCH_VALIDATE_write)
	if ((from.flags & BCH_VALIDATE_write) &&
	    c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written)
		bkey_fsck_err_on(!bp.v->sectors_written,
				 c, btree_ptr_v2_written_0,
				 "sectors_written == 0");

	ret = bch2_bkey_ptrs_validate(c, k, flags);
	ret = bch2_bkey_ptrs_validate(c, k, from);
fsck_err:
	return ret;
}
@@ -395,7 +405,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */

int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k,
			      enum bch_validate_flags flags)
			      struct bkey_validate_context from)
{
	struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
	int ret = 0;
@@ -1120,6 +1130,57 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc)
	bch2_prt_compression_type(out, crc->compression_type);
}

static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c,
					  const struct bch_extent_rebalance *r)
{
	prt_str(out, "rebalance:");

	prt_printf(out, " replicas=%u", r->data_replicas);
	if (r->data_replicas_from_inode)
		prt_str(out, " (inode)");

	prt_str(out, " checksum=");
	bch2_prt_csum_opt(out, r->data_checksum);
	if (r->data_checksum_from_inode)
		prt_str(out, " (inode)");

	if (r->background_compression || r->background_compression_from_inode) {
		prt_str(out, " background_compression=");
		bch2_compression_opt_to_text(out, r->background_compression);

		if (r->background_compression_from_inode)
			prt_str(out, " (inode)");
	}

	if (r->background_target || r->background_target_from_inode) {
		prt_str(out, " background_target=");
		if (c)
			bch2_target_to_text(out, c, r->background_target);
		else
			prt_printf(out, "%u", r->background_target);

		if (r->background_target_from_inode)
			prt_str(out, " (inode)");
	}

	if (r->promote_target || r->promote_target_from_inode) {
		prt_str(out, " promote_target=");
		if (c)
			bch2_target_to_text(out, c, r->promote_target);
		else
			prt_printf(out, "%u", r->promote_target);

		if (r->promote_target_from_inode)
			prt_str(out, " (inode)");
	}

	if (r->erasure_code || r->erasure_code_from_inode) {
		prt_printf(out, " ec=%u", r->erasure_code);
		if (r->erasure_code_from_inode)
			prt_str(out, " (inode)");
	}
}

void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			    struct bkey_s_c k)
{
@@ -1155,18 +1216,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
				 (u64) ec->idx, ec->block);
			break;
		}
		case BCH_EXTENT_ENTRY_rebalance: {
			const struct bch_extent_rebalance *r = &entry->rebalance;

			prt_str(out, "rebalance: target ");
			if (c)
				bch2_target_to_text(out, c, r->target);
			else
				prt_printf(out, "%u", r->target);
			prt_str(out, " compression ");
			bch2_compression_opt_to_text(out, r->compression);
		case BCH_EXTENT_ENTRY_rebalance:
			bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
			break;
		}

		default:
			prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
			return;
@@ -1178,13 +1231,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,

static int extent_ptr_validate(struct bch_fs *c,
			       struct bkey_s_c k,
			       enum bch_validate_flags flags,
			       struct bkey_validate_context from,
			       const struct bch_extent_ptr *ptr,
			       unsigned size_ondisk,
			       bool metadata)
{
	int ret = 0;

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr2)
		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
				 c, ptr_to_duplicate_device,
				 "multiple pointers to same device (%u)", ptr->dev);

	/* bad pointers are repaired by check_fix_ptrs(): */
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
@@ -1199,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c,
	unsigned bucket_size = ca->mi.bucket_size;
	rcu_read_unlock();

	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	bkey_for_each_ptr(ptrs, ptr2)
		bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev,
				 c, ptr_to_duplicate_device,
				 "multiple pointers to same device (%u)", ptr->dev);


	bkey_fsck_err_on(bucket >= nbuckets,
			 c, ptr_after_last_bucket,
			 "pointer past last bucket (%llu > %llu)", bucket, nbuckets);
@@ -1221,7 +1273,7 @@ fsck_err:
}

int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
			    enum bch_validate_flags flags)
			    struct bkey_validate_context from)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
@@ -1248,7 +1300,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,

		switch (extent_entry_type(entry)) {
		case BCH_EXTENT_ENTRY_ptr:
			ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false);
			ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false);
			if (ret)
				return ret;

@@ -1270,9 +1322,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
		case BCH_EXTENT_ENTRY_crc128:
			crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));

			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_too_small,
					 "checksum offset + key size > uncompressed size");
			bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type),
					 c, ptr_crc_csum_type_unknown,
					 "invalid checksum type");
@@ -1280,6 +1329,19 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
					 c, ptr_crc_compression_type_unknown,
					 "invalid compression type");

			bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_too_small,
					 "checksum offset + key size > uncompressed size");
			bkey_fsck_err_on(crc_is_encoded(crc) &&
					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
					 (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
					 c, ptr_crc_uncompressed_size_too_big,
					 "too large encoded extent");
			bkey_fsck_err_on(!crc_is_compressed(crc) &&
					 crc.compressed_size != crc.uncompressed_size,
					 c, ptr_crc_uncompressed_size_mismatch,
					 "not compressed but compressed != uncompressed size");

			if (bch2_csum_type_is_encryption(crc.csum_type)) {
				if (nonce == UINT_MAX)
					nonce = crc.offset + crc.nonce;
@@ -1293,12 +1355,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
					 "redundant crc entry");
			crc_since_last_ptr = true;

			bkey_fsck_err_on(crc_is_encoded(crc) &&
					 (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
					 (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)),
					 c, ptr_crc_uncompressed_size_too_big,
					 "too large encoded extent");

			size_ondisk = crc.compressed_size;
			break;
		case BCH_EXTENT_ENTRY_stripe_ptr:
@@ -1391,166 +1447,6 @@ void bch2_ptr_swab(struct bkey_s k)
	}
}

const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;

	bkey_extent_entry_for_each(ptrs, entry)
		if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
			return &entry->rebalance;

	return NULL;
}

unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
				       unsigned target, unsigned compression)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	unsigned rewrite_ptrs = 0;

	if (compression) {
		unsigned compression_type = bch2_compression_opt_to_type(compression);
		const union bch_extent_entry *entry;
		struct extent_ptr_decoded p;
		unsigned i = 0;

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				rewrite_ptrs = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				rewrite_ptrs |= 1U << i;
			i++;
		}
	}
incompressible:
	if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
		unsigned i = 0;

		bkey_for_each_ptr(ptrs, ptr) {
			if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
				rewrite_ptrs |= 1U << i;
			i++;
		}
	}

	return rewrite_ptrs;
}

bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);

	/*
	 * If it's an indirect extent, we don't delete the rebalance entry when
	 * done so that we know what options were applied - check if it still
	 * needs work done:
	 */
	if (r &&
	    k.k->type == KEY_TYPE_reflink_v &&
	    !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
		r = NULL;

	return r != NULL;
}

static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
					      unsigned target, unsigned compression)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	u64 sectors = 0;

	if (compression) {
		unsigned compression_type = bch2_compression_opt_to_type(compression);

		bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
			if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
			    p.ptr.unwritten) {
				sectors = 0;
				goto incompressible;
			}

			if (!p.ptr.cached && p.crc.compression_type != compression_type)
				sectors += p.crc.compressed_size;
		}
	}
incompressible:
	if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
		bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
			if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target))
				sectors += p.crc.compressed_size;
	}

	return sectors;
}

u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
	const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);

	return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0;
}

int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
				  struct bch_io_opts *opts)
{
	struct bkey_s k = bkey_i_to_s(_k);
	struct bch_extent_rebalance *r;
	unsigned target = opts->background_target;
	unsigned compression = background_compression(*opts);
	bool needs_rebalance;

	if (!bkey_extent_is_direct_data(k.k))
		return 0;

	/* get existing rebalance entry: */
	r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
	if (r) {
		if (k.k->type == KEY_TYPE_reflink_v) {
			/*
			 * indirect extents: existing options take precedence,
			 * so that we don't move extents back and forth if
			 * they're referenced by different inodes with different
			 * options:
			 */
			if (r->target)
				target = r->target;
			if (r->compression)
				compression = r->compression;
		}

		r->target	= target;
		r->compression	= compression;
	}

	needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);

	if (needs_rebalance && !r) {
		union bch_extent_entry *new = bkey_val_end(k);

		new->rebalance.type		= 1U << BCH_EXTENT_ENTRY_rebalance;
		new->rebalance.compression	= compression;
		new->rebalance.target		= target;
		new->rebalance.unused		= 0;
		k.k->u64s += extent_entry_u64s(new);
	} else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
		/*
		 * For indirect extents, don't delete the rebalance entry when
		 * we're finished so that we know we specifically moved it or
		 * compressed it to its current location/compression type
		 */
		extent_entry_drop(k, (union bch_extent_entry *) r);
	}

	return 0;
}

/* Generic extent code: */

int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1610,7 +1506,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
		case KEY_TYPE_reflink_p: {
			struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);

			le64_add_cpu(&p.v->idx, sub);
			SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub);
			break;
		}
		case KEY_TYPE_inline_data:
@@ -8,7 +8,6 @@

struct bch_fs;
struct btree_trans;
enum bch_validate_flags;

/* extent entries: */

@@ -410,12 +409,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */

int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c,
			    enum bch_validate_flags);
			    struct bkey_validate_context);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
			    struct bkey_s_c);

int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c,
			       enum bch_validate_flags);
			       struct bkey_validate_context);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
			      int, struct bkey_s);
@@ -452,7 +451,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */

int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c,
			      enum bch_validate_flags);
			      struct bkey_validate_context);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);

@@ -696,7 +695,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
			    struct bkey_s_c);
int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
			    enum bch_validate_flags);
			    struct bkey_validate_context);

static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
				      struct bch_extent_ptr ptr2)
@@ -710,15 +709,6 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,

void bch2_ptr_swab(struct bkey_s);

const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
				       unsigned, unsigned);
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);

int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
				  struct bch_io_opts *);

/* Generic extent code: */

enum bch_extent_overlap {
@@ -201,19 +201,8 @@ struct bch_extent_stripe_ptr {
#endif
};

struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64			type:6,
				unused:34,
				compression:8, /* enum bch_compression_opt */
				target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			target:16,
				compression:8,
				unused:34,
				type:6;
#endif
};
/* bch_extent_rebalance: */
#include "rebalance_format.h"

union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
@@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans,
		if (!snapshot_src.inum) {
			/* Inode wasn't specified, just snapshot: */
			struct bch_subvolume s;

			ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
						 BTREE_ITER_cached, &s);
			ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s);
			if (ret)
				goto err;

@@ -172,6 +170,10 @@ int bch2_create_trans(struct btree_trans *trans,
			new_inode->bi_dir_offset	= dir_offset;
		}

		if (S_ISDIR(mode) &&
		    !new_inode->bi_subvol)
			new_inode->bi_depth = dir_u->bi_depth + 1;

		inode_iter.flags &= ~BTREE_ITER_all_snapshots;
		bch2_btree_iter_set_snapshot(&inode_iter, snapshot);

@@ -512,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans,
		dst_dir_u->bi_nlink++;
	}

	if (S_ISDIR(src_inode_u->bi_mode) &&
	    !src_inode_u->bi_subvol)
		src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;

	if (mode == BCH_RENAME_EXCHANGE &&
	    S_ISDIR(dst_inode_u->bi_mode) &&
	    !dst_inode_u->bi_subvol)
		dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;

	if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
		dst_dir_u->bi_nlink--;
		src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
@@ -548,3 +559,94 @@ err:
	bch2_trans_iter_exit(trans, &src_dir_iter);
	return ret;
}

static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
{
	bch2_printbuf_make_room(out, n);

	unsigned can_print = min(n, printbuf_remaining(out));

	b += n;

	for (unsigned i = 0; i < can_print; i++)
		out->buf[out->pos++] = *((char *) --b);

	printbuf_nul_terminate(out);
}

static inline void prt_str_reversed(struct printbuf *out, const char *s)
{
	prt_bytes_reversed(out, s, strlen(s));
}

static inline void reverse_bytes(void *b, size_t n)
{
	char *e = b + n, *s = b;

	while (s < e) {
		--e;
		swap(*s, *e);
		s++;
	}
}

/* XXX: we don't yet attempt to print paths when we don't know the subvol */
int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
{
	unsigned orig_pos = path->pos;
	int ret = 0;

	while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
		 inum.inum == BCACHEFS_ROOT_INO)) {
		struct bch_inode_unpacked inode;
		ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
		if (ret)
			goto disconnected;

		if (!inode.bi_dir && !inode.bi_dir_offset) {
			ret = -BCH_ERR_ENOENT_inode_no_backpointer;
			goto disconnected;
		}

		inum.subvol	= inode.bi_parent_subvol ?: inum.subvol;
		inum.inum	= inode.bi_dir;

		u32 snapshot;
		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
		if (ret)
			goto disconnected;

		struct btree_iter d_iter;
		struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
				BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
				0, dirent);
		ret = bkey_err(d.s_c);
		if (ret)
			goto disconnected;

		struct qstr dirent_name = bch2_dirent_get_name(d);
		prt_bytes_reversed(path, dirent_name.name, dirent_name.len);

		prt_char(path, '/');

		bch2_trans_iter_exit(trans, &d_iter);
	}

	if (orig_pos == path->pos)
		prt_char(path, '/');
out:
	ret = path->allocation_failure ? -ENOMEM : 0;
	if (ret)
		goto err;

	reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
	return 0;
err:
	return ret;
disconnected:
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		goto err;

	prt_str_reversed(path, "(disconnected)");
	goto out;
}
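bch2_inum_to_path() above discovers path components leaf-first by following each inode's dirent backpointer toward the root, so it emits every component byte-reversed into the buffer and then reverses the whole region once at the end, producing the path root-first with no second buffer. The trick in isolation, stripped of the printbuf machinery::

    #include <stdio.h>
    #include <string.h>

    /* Append s to buf back-to-front, advancing *pos. */
    static void append_reversed(char *buf, size_t *pos, const char *s)
    {
            for (size_t n = strlen(s); n--;)
                    buf[(*pos)++] = s[n];
    }

    static void reverse_bytes(char *b, size_t n)
    {
            for (size_t i = 0; i < n / 2; i++) {
                    char tmp = b[i];

                    b[i] = b[n - 1 - i];
                    b[n - 1 - i] = tmp;
            }
    }

    int main(void)
    {
            char buf[64];
            size_t pos = 0;

            /* Components found walking up from the leaf: "file", then "dir". */
            append_reversed(buf, &pos, "file");
            buf[pos++] = '/';
            append_reversed(buf, &pos, "dir");
            buf[pos++] = '/';

            reverse_bytes(buf, pos);        /* "elif/rid/" -> "/dir/file" */
            buf[pos] = '\0';
            printf("%s\n", buf);
            return 0;
    }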
@@ -42,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *,
bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
			  struct bch_inode_unpacked *);

int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);

#endif /* _BCACHEFS_FS_COMMON_H */
@@ -164,7 +164,8 @@ static void bchfs_read(struct btree_trans *trans,
			     BTREE_ITER_slots);
	while (1) {
		struct bkey_s_c k;
		unsigned bytes, sectors, offset_into_extent;
		unsigned bytes, sectors;
		s64 offset_into_extent;
		enum btree_id data_btree = BTREE_ID_extents;

		bch2_trans_begin(trans);
@@ -197,7 +198,7 @@ static void bchfs_read(struct btree_trans *trans,

		k = bkey_i_to_s_c(sk.k);

		sectors = min(sectors, k.k->size - offset_into_extent);
		sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);

		if (readpages_iter) {
			ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
@@ -230,10 +231,12 @@ err:
	bch2_trans_iter_exit(trans, &iter);

	if (ret) {
		bch_err_inum_offset_ratelimited(c,
				iter.pos.inode,
				iter.pos.offset << 9,
				"read error %i from btree lookup", ret);
		struct printbuf buf = PRINTBUF;
		bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
		prt_printf(&buf, "read error %i from btree lookup", ret);
		bch_err_ratelimited(c, "%s", buf.buf);
		printbuf_exit(&buf);

		rbio->bio.bi_status = BLK_STS_IOERR;
		bio_endio(&rbio->bio);
	}
@@ -248,6 +251,7 @@ void bch2_readahead(struct readahead_control *ractl)
	struct bch_io_opts opts;
	struct folio *folio;
	struct readpages_iter readpages_iter;
	struct blk_plug plug;

	bch2_inode_opts_get(&opts, c, &inode->ei_inode);

@@ -255,6 +259,16 @@ void bch2_readahead(struct readahead_control *ractl)
	if (ret)
		return;

	/*
	 * Besides being a general performance optimization, plugging helps with
	 * avoiding btree transaction srcu warnings - submitting a bio can
	 * block, and we don't want to do that with the transaction locked.
	 *
	 * However, plugged bios are submitted when we schedule; we ideally
	 * would have our own scheduler hook to call unlock_long() before
	 * scheduling.
	 */
	blk_start_plug(&plug);
	bch2_pagecache_add_get(inode);

	struct btree_trans *trans = bch2_trans_get(c);
@@ -281,7 +295,7 @@ void bch2_readahead(struct readahead_control *ractl)
	bch2_trans_put(trans);

	bch2_pagecache_add_put(inode);

	blk_finish_plug(&plug);
	darray_exit(&readpages_iter.folios);
}
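The plugging added above (and in bch2_read_single_folio() and the direct-IO read path below) follows the standard block-layer pattern: bios submitted between blk_start_plug() and blk_finish_plug() are queued on the current task and only issued in a batch when the plug is finished or the task schedules, which is what keeps bio submission from blocking while the btree transaction is locked. The pattern itself, as kernel-context pseudocode rather than a runnable program::

    struct blk_plug plug;

    blk_start_plug(&plug);
    /* submit_bio() calls made here are deferred onto the plug ... */
    blk_finish_plug(&plug);     /* ... and issued here at the latest */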
@@ -296,9 +310,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bch_read_bio *rbio;
	struct bch_io_opts opts;
	struct blk_plug plug;
	int ret;
	DECLARE_COMPLETION_ONSTACK(done);

	BUG_ON(folio_test_uptodate(folio));
	BUG_ON(folio_test_dirty(folio));

	if (!bch2_folio_create(folio, GFP_KERNEL))
		return -ENOMEM;

@@ -313,7 +331,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
	rbio->bio.bi_iter.bi_sector = folio_sector(folio);
	BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));

	blk_start_plug(&plug);
	bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
	blk_finish_plug(&plug);
	wait_for_completion(&done);

	ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -605,15 +625,6 @@ do_io:
	BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
			      sectors << 9, offset << 9));

	/* Check for writing past i_size: */
	WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
		  round_up(i_size, block_bytes(c)) &&
		  !test_bit(BCH_FS_emergency_ro, &c->flags),
		  "writing past i_size: %llu > %llu (unrounded %llu)\n",
		  bio_end_sector(&w->io->op.wbio.bio) << 9,
		  round_up(i_size, block_bytes(c)),
		  i_size);

	w->io->op.res.sectors += reserved_sectors;
	w->io->op.i_sectors_delta -= dirty_sectors;
	w->io->op.new_i_size = i_size;
@@ -669,7 +680,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping,
	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
				FGP_WRITEBEGIN | fgf_set_order(len),
				mapping_gfp_mask(mapping));
	if (IS_ERR_OR_NULL(folio))
	if (IS_ERR(folio))
		goto err_unlock;

	offset = pos - folio_pos(folio);
@@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
	struct bch_io_opts opts;
	struct dio_read *dio;
	struct bio *bio;
	struct blk_plug plug;
	loff_t offset = req->ki_pos;
	bool sync = is_sync_kiocb(req);
	size_t shorten;
@@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
	 */
	dio->should_dirty = iter_is_iovec(iter);

	blk_start_plug(&plug);

	goto start;
	while (iter->count) {
		bio = bio_alloc_bioset(NULL,
@@ -160,6 +163,8 @@ start:
			bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
	}

	blk_finish_plug(&plug);

	iter->count += shorten;

	if (sync) {