2020-05-12 16:54:17 -07:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2005-11-02 14:58:39 +11:00
|
|
|
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
|
2013-04-24 18:58:02 +10:00
|
|
|
* Copyright (c) 2013 Red Hat, Inc.
|
2005-11-02 14:58:39 +11:00
|
|
|
* All Rights Reserved.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
#ifndef __XFS_DA_BTREE_H__
|
|
|
|
#define __XFS_DA_BTREE_H__
|
|
|
|
|
|
|
|
struct xfs_inode;
|
|
|
|
struct xfs_trans;
|
|
|
|
|
2014-06-06 15:01:58 +10:00
|
|
|
/*
|
|
|
|
* Directory/attribute geometry information. There will be one of these for each
|
|
|
|
* data fork type, and it will be passed around via the xfs_da_args. Global
|
|
|
|
* structures will be attached to the xfs_mount.
|
|
|
|
*/
|
|
|
|
struct xfs_da_geometry {
|
2019-11-08 14:52:07 -08:00
|
|
|
unsigned int blksize; /* da block size in bytes */
|
|
|
|
unsigned int fsbcount; /* da block size in filesystem blocks */
|
2014-06-06 15:01:58 +10:00
|
|
|
uint8_t fsblog; /* log2 of _filesystem_ block size */
|
|
|
|
uint8_t blklog; /* log2 of da block size */
|
2019-11-08 14:57:49 -08:00
|
|
|
unsigned int node_hdr_size; /* danode header size in bytes */
|
2019-11-08 14:52:07 -08:00
|
|
|
unsigned int node_ents; /* # of entries in a danode */
|
|
|
|
|
unsigned int magicpct; /* 37% of block size in bytes */
|
2014-06-06 15:01:58 +10:00
|
|
|
xfs_dablk_t datablk; /* blockno of dir data v2 */
|
2019-11-08 14:57:51 -08:00
|
|
|
unsigned int leaf_hdr_size; /* dir2 leaf header size */
|
2019-11-08 14:57:51 -08:00
|
|
|
unsigned int leaf_max_ents; /* # of entries in dir2 leaf */
|
2014-06-06 15:01:58 +10:00
|
|
|
xfs_dablk_t leafblk; /* blockno of leaf data v2 */
|
2019-11-08 15:01:29 -08:00
|
|
|
unsigned int free_hdr_size; /* dir2 free header size */
|
2019-11-08 15:01:30 -08:00
|
|
|
unsigned int free_max_bests; /* # of bests entries in dir2 free */
|
2014-06-06 15:01:58 +10:00
|
|
|
xfs_dablk_t freeblk; /* blockno of free data v2 */
|
2022-03-29 06:14:00 +00:00
|
|
|
xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
|
2019-11-08 15:05:38 -08:00
|
|
|
|
|
|
|
xfs_dir2_data_aoff_t data_first_offset; /* NOTE(review): presumably offset of the first entry in a dir data block -- confirm */
|
|
|
|
size_t data_entry_offset; /* NOTE(review): presumably offset at which entries start in a dir data block -- confirm */
|
2014-06-06 15:01:58 +10:00
|
|
|
};
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*========================================================================
|
|
|
|
* Btree searching and modification structure definitions.
|
|
|
|
*========================================================================*/
|
|
|
|
|
2008-05-21 16:41:01 +10:00
|
|
|
/*
|
|
|
|
* Search comparison results
|
|
|
|
*/
|
|
|
|
enum xfs_dacmp {
|
|
|
|
XFS_CMP_DIFFERENT, /* names are completely different; first enumerator, so its value is 0 */
|
|
|
|
XFS_CMP_EXACT, /* names are exactly the same */
|
|
|
|
XFS_CMP_CASE /* names are same but differ in case */
|
|
|
|
};
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Structure to ease passing around component names.
|
|
|
|
*/
|
|
|
|
typedef struct xfs_da_args {
|
2014-06-06 15:01:58 +10:00
|
|
|
struct xfs_da_geometry *geo; /* da block geometry */
|
2017-06-16 11:00:05 -07:00
|
|
|
const uint8_t *name; /* string (maybe not NULL terminated) */
|
2005-04-16 15:20:36 -07:00
|
|
|
int namelen; /* length of string (maybe no NULL) */
|
2017-06-16 11:00:05 -07:00
|
|
|
uint8_t filetype; /* filetype of inode for directories */
|
2020-02-26 17:30:33 -08:00
|
|
|
void *value; /* set of bytes (maybe contain NULLs) */
|
2005-04-16 15:20:36 -07:00
|
|
|
int valuelen; /* length of value */
|
2020-02-26 17:30:43 -08:00
|
|
|
unsigned int attr_filter; /* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
|
2020-02-26 17:30:42 -08:00
|
|
|
unsigned int attr_flags; /* XATTR_{CREATE,REPLACE} */
|
2005-04-16 15:20:36 -07:00
|
|
|
xfs_dahash_t hashval; /* hash value of name */
|
|
|
|
xfs_ino_t inumber; /* input/output inode number */
|
|
|
|
struct xfs_inode *dp; /* directory inode to manipulate */
|
|
|
|
struct xfs_trans *trans; /* current trans (changes over time) */
|
|
|
|
xfs_extlen_t total; /* total blocks needed, for 1st bmap */
|
|
|
|
int whichfork; /* data or attribute fork */
|
|
|
|
xfs_dablk_t blkno; /* blkno of attr leaf of interest */
|
|
|
|
int index; /* index of attr of interest in blk */
|
|
|
|
xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
|
|
|
|
int rmtblkcnt; /* remote attr value block count */
|
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. This enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removed with the state of the new
attribute that we just added. [Spotted by Brian Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 07:37:31 +10:00
|
|
|
int rmtvaluelen; /* remote attr value length in bytes */
|
2005-04-16 15:20:36 -07:00
|
|
|
xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
|
|
|
|
int index2; /* index of 2nd attr in blk */
|
|
|
|
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
|
|
|
|
int rmtblkcnt2; /* remote attr value block count */
|
xfs: remote attribute overwrite causes transaction overrun
Commit e461fcb ("xfs: remote attribute lookups require the value
length") passes the remote attribute length in the xfs_da_args
structure on lookup so that CRC calculations and validity checking
can be performed correctly by related code. This, unfortunately has
the side effect of changing the args->valuelen parameter in cases
where it shouldn't.
That is, when we replace a remote attribute, the incoming
replacement stores the value and length in args->value and
args->valuelen, but then the lookup which finds the existing remote
attribute overwrites args->valuelen with the length of the remote
attribute being replaced. Hence when we go to create the new
attribute, we create it of the size of the existing remote
attribute, not the size it is supposed to be. When the new attribute
is much smaller than the old attribute, this results in a
transaction overrun and an ASSERT() failure on a debug kernel:
XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331
Fix this by keeping the remote attribute value length separate to
the attribute value length in the xfs_da_args structure. This enables
us to pass the length of the remote attribute to be removed without
overwriting the new attribute's length.
Also, ensure that when we save remote block contexts for a later
rename we zero the original state variables so that we don't confuse
the state of the attribute to be removed with the state of the new
attribute that we just added. [Spotted by Brian Foster.]
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-05-06 07:37:31 +10:00
|
|
|
int rmtvaluelen2; /* remote attr value length in bytes */
|
2022-04-21 10:46:47 +10:00
|
|
|
uint32_t op_flags; /* operation flags */
|
2008-05-21 16:41:01 +10:00
|
|
|
enum xfs_dacmp cmpresult; /* name compare result for lookups */
|
2024-04-15 14:54:34 -07:00
|
|
|
xfs_ino_t owner; /* inode that owns the dir/attr data */
|
2005-04-16 15:20:36 -07:00
|
|
|
} xfs_da_args_t;
|
|
|
|
|
2008-05-21 16:42:05 +10:00
|
|
|
/*
|
|
|
|
* Operation flags:
|
|
|
|
*/
|
2022-04-21 10:46:47 +10:00
|
|
|
#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
|
xfs: use XFS_DA_OP flags in deferred attr ops
We currently store the high level attr operation in
args->attr_flags. This field contains what the VFS is telling us to
do, but doesn't necessarily match what we are doing in the low level
modification state machine. e.g. XATTR_REPLACE implies both
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a
remove and adding a new attr.
However, deep in the individual state machine operations, we check
errors against this high level VFS op flags, not the low level
XFS_DA_OP flags. Indeed, we don't even have a low level flag for
a REMOVE operation, so the only way we know we are doing a remove
is the complete absence of XATTR_REPLACE, XATTR_CREATE,
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other
flags in these fields, this is a pain to check if we need to.
As the XFS_DA_OP flags are only needed once the deferred operations
are set up, set these flags appropriately when we set the initial
operation state. We also introduce a XFS_DA_OP_REMOVE flag to make
it easy to know that we are doing a remove operation.
With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE
in low level lookup operations, and manipulate the low level flags
according to the low level context that is operating. e.g. log
recovery does not have a VFS xattr operation state to copy into
args->attr_flags, and the low level state machine ops we do for
recovery do not match the high level VFS operations that were in
progress when the system failed...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
|
|
|
#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
|
2022-04-21 10:46:47 +10:00
|
|
|
#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
|
|
|
|
#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
|
|
|
|
#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
|
|
|
|
#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
|
xfs: use XFS_DA_OP flags in deferred attr ops
We currently store the high level attr operation in
args->attr_flags. This field contains what the VFS is telling us to
do, but doesn't necessarily match what we are doing in the low level
modification state machine. e.g. XATTR_REPLACE implies both
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a
remove and adding a new attr.
However, deep in the individual state machine operations, we check
errors against this high level VFS op flags, not the low level
XFS_DA_OP flags. Indeed, we don't even have a low level flag for
a REMOVE operation, so the only way we know we are doing a remove
is the complete absence of XATTR_REPLACE, XATTR_CREATE,
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other
flags in these fields, this is a pain to check if we need to.
As the XFS_DA_OP flags are only needed once the deferred operations
are set up, set these flags appropriately when we set the initial
operation state. We also introduce a XFS_DA_OP_REMOVE flag to make
it easy to know that we are doing a remove operation.
With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE
in low level lookup operations, and manipulate the low level flags
according to the low level context that is operating. e.g. log
recovery does not have a VFS xattr operation state to copy into
args->attr_flags, and the low level state machine ops we do for
recovery do not match the high level VFS operations that were in
progress when the system failed...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
|
|
|
#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
|
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
|
|
|
#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
|
xfs: fix TOCTOU race involving the new logged xattrs control knob
I found a race involving the larp control knob, aka the debugging knob
that lets developers enable logging of extended attribute updates:
Thread 1 Thread 2
echo 0 > /sys/fs/xfs/debug/larp
setxattr(REPLACE)
xfs_has_larp (returns false)
xfs_attr_set
echo 1 > /sys/fs/xfs/debug/larp
xfs_attr_defer_replace
xfs_attr_init_replace_state
xfs_has_larp (returns true)
xfs_attr_init_remove_state
<oops, wrong DAS state!>
This isn't a particularly severe problem right now because xattr logging
is only enabled when CONFIG_XFS_DEBUG=y, and developers *should* know
what they're doing.
However, the eventual intent is that callers should be able to ask for
the assistance of the log in persisting xattr updates. This capability
might not be required for /all/ callers, which means that dynamic
control must work correctly. Once an xattr update has decided whether
or not to use logged xattrs, it needs to stay in that mode until the end
of the operation regardless of what subsequent parallel operations might
do.
Therefore, it is an error to continue sampling xfs_globals.larp once
xfs_attr_change has made a decision about larp, and it was not correct
for me to have told Allison that ->create_intent functions can sample
the global log incompat feature bitfield to decide to elide a log item.
Instead, create a new op flag for the xfs_da_args structure, and convert
all other callers of xfs_has_larp and xfs_sb_version_haslogxattrs within
the attr update state machine to look for the operations flag.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
2022-06-05 18:51:22 -07:00
|
|
|
/*
 * Per-operation record that this xattr update uses logged (intent-tracked)
 * attrs.  The attr update state machine looks at this flag rather than
 * re-sampling the larp debug knob, so an update that has picked a mode stays
 * in that mode until it finishes.
 */
#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
|
2008-05-21 16:42:05 +10:00
|
|
|
|
2009-12-14 23:14:59 +00:00
|
|
|
#define XFS_DA_OP_FLAGS \
|
|
|
|
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
|
xfs: use XFS_DA_OP flags in deferred attr ops
We currently store the high level attr operation in
args->attr_flags. This field contains what the VFS is telling us to
do, but doesn't necessarily match what we are doing in the low level
modification state machine. e.g. XATTR_REPLACE implies both
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a
remove and adding a new attr.
However, deep in the individual state machine operations, we check
errors against this high level VFS op flags, not the low level
XFS_DA_OP flags. Indeed, we don't even have a low level flag for
a REMOVE operation, so the only way we know we are doing a remove
is the complete absence of XATTR_REPLACE, XATTR_CREATE,
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other
flags in these fields, this is a pain to check if we need to.
As the XFS_DA_OP flags are only needed once the deferred operations
are set up, set these flags appropriately when we set the initial
operation state. We also introduce a XFS_DA_OP_REMOVE flag to make
it easy to know that we are doing a remove operation.
With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE
in low level lookup operations, and manipulate the low level flags
according to the low level context that is operating. e.g. log
recovery does not have a VFS xattr operation state to copy into
args->attr_flags, and the low level state machine ops we do for
recovery do not match the high level VFS operations that were in
progress when the system failed...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
|
|
|
{ XFS_DA_OP_REPLACE, "REPLACE" }, \
|
2009-12-14 23:14:59 +00:00
|
|
|
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
|
|
|
|
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
|
xfs: allocate xattr buffer on demand
When doing file lookups and checking for permissions, we end up in
xfs_get_acl() to see if there are any ACLs on the inode. This
requires an xattr lookup, and to do that we have to supply a buffer
large enough to hold a maximum sized xattr.
On workloads where we are accessing a wide range of cache cold files
under memory pressure (e.g. NFS fileservers) we end up spending a
lot of time allocating the buffer. The buffer is 64k in length, so
is a contiguous multi-page allocation, and if that then fails we
fall back to vmalloc(). Hence the allocation here is /expensive/
when we are looking up hundreds of thousands of files a second.
Initial numbers from a bpf trace show average time in xfs_get_acl()
is ~32us, with ~19us of that in the memory allocation. Note these
are average times, so they are going to be affected by the worst
case allocations more than the common fast case...
To avoid this, we could just do a "null" lookup to see if the ACL
xattr exists and then only do the allocation if it exists. This,
however, optimises the path for the "no ACL present" case at the
expense of the "acl present" case. i.e. we can halve the time in
xfs_get_acl() for the no acl case (i.e down to ~10-15us), but that
then increases the ACL case by 30% (i.e. up to 40-45us).
To solve this and speed up both cases, drive the xattr buffer
allocation into the attribute code once we know what the actual
xattr length is. For the no-xattr case, we avoid the allocation
completely, speeding up that case. For the common ACL case, we'll
end up with a fast heap allocation (because it'll be smaller than a
page), and only for the rarer "we have a remote xattr" will we have
a multi-page allocation occur. Hence the common ACL case will be
much faster, too.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-08-29 09:04:10 -07:00
|
|
|
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
|
xfs: use XFS_DA_OP flags in deferred attr ops
We currently store the high level attr operation in
args->attr_flags. This field contains what the VFS is telling us to
do, but doesn't necessarily match what we are doing in the low level
modification state machine. e.g. XATTR_REPLACE implies both
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME because it is doing both a
remove and adding a new attr.
However, deep in the individual state machine operations, we check
errors against this high level VFS op flags, not the low level
XFS_DA_OP flags. Indeed, we don't even have a low level flag for
a REMOVE operation, so the only way we know we are doing a remove
is the complete absence of XATTR_REPLACE, XATTR_CREATE,
XFS_DA_OP_ADDNAME and XFS_DA_OP_RENAME. And because there are other
flags in these fields, this is a pain to check if we need to.
As the XFS_DA_OP flags are only needed once the deferred operations
are set up, set these flags appropriately when we set the initial
operation state. We also introduce a XFS_DA_OP_REMOVE flag to make
it easy to know that we are doing a remove operation.
With these, we can remove the use of XATTR_REPLACE and XATTR_CREATE
in low level lookup operations, and manipulate the low level flags
according to the low level context that is operating. e.g. log
recovery does not have a VFS xattr operation state to copy into
args->attr_flags, and the low level state machine ops we do for
recovery do not match the high level VFS operations that were in
progress when the system failed...
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
|
|
|
{ XFS_DA_OP_NOTIME, "NOTIME" }, \
|
xfs: ATTR_REPLACE algorithm with LARP enabled needs rework
We can't use the same algorithm for replacing an existing attribute
when logging attributes. The existing algorithm is essentially:
1. create new attr w/ INCOMPLETE
2. atomically flip INCOMPLETE flags between old + new attribute
3. remove old attr which is marked w/ INCOMPLETE
This algorithm guarantees that we see either the old or new
attribute, and if we fail after the atomic flag flip, we don't have
to recover the removal of the old attr because we never see
INCOMPLETE attributes in lookups.
For logged attributes, however, this does not work. The logged
attribute intents do not track the work that has been done as the
transaction rolls, and hence the only recovery mechanism we have is
"run the replace operation from scratch".
This is further exacerbated by the attempt to avoid needing the
INCOMPLETE flag to create an atomic swap. This means we can create
a second active attribute of the same name before we remove the
original. If we fail at any point after the create but before the
removal has completed, we end up with duplicate attributes in
the attr btree and recovery only tries to replace one of them.
There are several other failure modes where we can leave partially
allocated remote attributes that expose stale data, partially free
remote attributes that enable UAF based stale data exposure, etc.
To fix this, we need a different algorithm for replace operations
when LARP is enabled. Luckily, it's not that complex if we take the
right first step. That is, the first thing we log is the attri
intent with the new name/value pair and mark the old attr as
INCOMPLETE in the same transaction.
From there, we then remove the old attr and keep relogging the
new name/value in the intent, such that we always know that we have
to create the new attr in recovery. Once the old attr is removed,
we then run a normal ATTR_CREATE operation relogging the intent as
we go. If the new attr is local, then it gets created in a single
atomic transaction that also logs the final intent done. If the new
attr is remote, then we set INCOMPLETE on the new attr while we
allocate and set the remote value, and then we clear the INCOMPLETE
flag in the last transaction that logs the final intent done.
If we fail at any point in this algorithm, log recovery will always
see the same state on disk: the new name/value in the intent, and
either an INCOMPLETE attr or no attr in the attr btree. If we find
an INCOMPLETE attr, we run the full replace starting with removing
the INCOMPLETE attr. If we don't find it, then we simply create the
new attr.
Notably, recovery of a failed create that has an INCOMPLETE flag set
is now the same - we start with the lookup of the INCOMPLETE attr,
and if that exists then we do the full replace recovery process,
otherwise we just create the new attr.
Hence changing the way we do the replace operation when LARP is
enabled allows us to use the same log recovery algorithm for both
the ATTR_CREATE and ATTR_REPLACE operations. This is also the same
algorithm we use for runtime ATTR_REPLACE operations (except for the
step setting up the initial conditions).
The result is that:
- ATTR_CREATE uses the same algorithm regardless of whether LARP is
enabled or not
- ATTR_REPLACE with larp=0 is identical to the old algorithm
- ATTR_REPLACE with larp=1 runs an unmodified attr removal algorithm
from the larp=0 code and then runs the unmodified ATTR_CREATE
code.
- log recovery when larp=1 runs the same ATTR_REPLACE algorithm as
it uses at runtime.
Because the state machine is now quite clean, changing the algorithm
is really just a case of changing the initial state and how the
states link together for the ATTR_REPLACE case. Hence it's not a
huge amount of code for what is a fairly substantial rework
of the attr logging and recovery algorithm....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-12 15:12:56 +10:00
|
|
|
{ XFS_DA_OP_REMOVE, "REMOVE" }, \
|
xfs: fix TOCTOU race involving the new logged xattrs control knob
I found a race involving the larp control knob, aka the debugging knob
that lets developers enable logging of extended attribute updates:
Thread 1 Thread 2
echo 0 > /sys/fs/xfs/debug/larp
setxattr(REPLACE)
xfs_has_larp (returns false)
xfs_attr_set
echo 1 > /sys/fs/xfs/debug/larp
xfs_attr_defer_replace
xfs_attr_init_replace_state
xfs_has_larp (returns true)
xfs_attr_init_remove_state
<oops, wrong DAS state!>
This isn't a particularly severe problem right now because xattr logging
is only enabled when CONFIG_XFS_DEBUG=y, and developers *should* know
what they're doing.
However, the eventual intent is that callers should be able to ask for
the assistance of the log in persisting xattr updates. This capability
might not be required for /all/ callers, which means that dynamic
control must work correctly. Once an xattr update has decided whether
or not to use logged xattrs, it needs to stay in that mode until the end
of the operation regardless of what subsequent parallel operations might
do.
Therefore, it is an error to continue sampling xfs_globals.larp once
xfs_attr_change has made a decision about larp, and it was not correct
for me to have told Allison that ->create_intent functions can sample
the global log incompat feature bitfield to decide to elide a log item.
Instead, create a new op flag for the xfs_da_args structure, and convert
all other callers of xfs_has_larp and xfs_sb_version_haslogxattrs within
the attr update state machine to look for the operations flag.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
2022-06-05 18:51:22 -07:00
|
|
|
{ XFS_DA_OP_RECOVERY, "RECOVERY" }, \
|
|
|
|
{ XFS_DA_OP_LOGGED, "LOGGED" }
|
2009-12-14 23:14:59 +00:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Storage for holding state during Btree searches and split/join ops.
|
|
|
|
*
|
|
|
|
* Only need space for 5 intermediate nodes. With a minimum of 62-way
|
|
|
|
* fanout to the Btree, we can support over 900 million directory blocks,
|
|
|
|
* which is slightly more than enough.
|
|
|
|
*/
|
|
|
|
/*
 * State kept for a single block during a search or split/join: the buffer
 * holding it, where it lives (da and on-disk block numbers), the index of
 * interest inside it, its last hash value and its magic number.
 */
typedef struct xfs_da_state_blk {
|
2012-06-22 18:50:14 +10:00
|
|
|
struct xfs_buf *bp; /* buffer containing block */
|
2005-04-16 15:20:36 -07:00
|
|
|
xfs_dablk_t blkno; /* filesystem blkno of buffer */
|
|
|
|
xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */
|
|
|
|
int index; /* relevant index into block */
|
|
|
|
xfs_dahash_t hashval; /* last hash value in block */
|
|
|
|
int magic; /* blk's magic number, ie: blk type */
|
|
|
|
} xfs_da_state_blk_t;
|
|
|
|
|
|
|
|
/*
 * A path through the Btree: a fixed-size array of per-block state plus the
 * count of levels currently in use.
 */
typedef struct xfs_da_state_path {
|
|
|
|
int active; /* number of active levels */
|
|
|
|
xfs_da_state_blk_t blk[XFS_DA_NODE_MAXDEPTH]; /* block state slots; NOTE(review): presumably one per level, first `active` in use -- confirm */
|
|
|
|
} xfs_da_state_path_t;
|
|
|
|
|
|
|
|
/*
 * Working state for one Btree search or split/join operation: the caller's
 * args, the mount, the primary and alternate paths, and bookkeeping for the
 * extra block used by double-splits on leaves.
 */
typedef struct xfs_da_state {
|
|
|
|
xfs_da_args_t *args; /* filename arguments */
|
|
|
|
struct xfs_mount *mp; /* filesystem mount point */
|
|
|
|
xfs_da_state_path_t path; /* search/split paths */
|
|
|
|
xfs_da_state_path_t altpath; /* alternate path for join */
|
|
|
|
unsigned char inleaf; /* insert into 1->lf, 0->splf (NOTE(review): presumably lf = existing leaf, splf = split-off leaf -- confirm) */
|
|
|
|
unsigned char extravalid; /* T/F: extrablk is in use */
|
|
|
|
unsigned char extraafter; /* T/F: extrablk is after new */
|
2009-03-29 09:55:42 +02:00
|
|
|
xfs_da_state_blk_t extrablk; /* for double-splits on leaves */
|
2005-04-16 15:20:36 -07:00
|
|
|
/* for dirv2 extrablk is data */
|
|
|
|
} xfs_da_state_t;
|
|
|
|
|
2019-11-08 14:52:06 -08:00
|
|
|
/*
|
|
|
|
* In-core version of the node header to abstract the differences in the v2 and
|
|
|
|
* v3 disk format of the headers. Callers need to convert to/from disk format as
|
|
|
|
* appropriate.
|
|
|
|
*/
|
|
|
|
/*
 * In-core node header: fixed-width copies of the header fields plus a
 * pointer to the on-disk entries that follow the (variable size) on-disk
 * header.  Filled in and written back by the xfs_da3_node_hdr_from_disk()
 * and xfs_da3_node_hdr_to_disk() helpers declared later in this header.
 */
struct xfs_da3_icnode_hdr {
|
|
|
|
uint32_t forw; /* NOTE(review): presumably forward sibling block - confirm */
|
|
|
|
uint32_t back; /* NOTE(review): presumably backward sibling block - confirm */
|
|
|
|
uint16_t magic; /* NOTE(review): presumably the block's magic number - confirm */
|
|
|
|
uint16_t count; /* NOTE(review): presumably # of entries in btree[] - confirm */
|
|
|
|
uint16_t level; /* NOTE(review): presumably this node's level in the tree - confirm */
|
2019-11-08 14:57:48 -08:00
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pointer to the on-disk format entries, which are behind the
|
|
|
|
* variable size (v4 vs v5) header in the on-disk block.
|
|
|
|
*/
|
|
|
|
struct xfs_da_node_entry *btree;
|
2019-11-08 14:52:06 -08:00
|
|
|
|
};
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Utility macros to aid in logging changed structure fields.
|
|
|
|
*/
|
|
|
|
/*
 * Byte offset of ADDR from BASE, computed as a char-pointer difference.
 * Both arguments are evaluated exactly once.
 */
#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
|
|
|
|
/*
 * Expands to TWO comma-separated uint values, not one expression: the
 * offset of the first byte and the offset of the last byte (inclusive,
 * hence the -1) of a SIZE-byte region at ADDR, both relative to BASE.
 * The expansion is deliberately not wrapped in parentheses so the pair can
 * be used where two consecutive values are expected.  BASE and ADDR are
 * each evaluated twice, so avoid arguments with side effects.
 */
#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
|
|
|
|
(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
|
|
|
|
(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
|
|
|
|
|
|
|
|
/*========================================================================
|
2008-10-30 17:05:38 +11:00
|
|
|
* Function prototypes.
|
2005-04-16 15:20:36 -07:00
|
|
|
*========================================================================*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines used for growing the Btree.
|
|
|
|
*/
|
2013-04-24 18:58:02 +10:00
|
|
|
/*
 * Declared among the Btree-growing routines; definition not in this header.
 * NOTE(review): bpp looks like an out-parameter for the new node's buffer
 * and the int return is presumably 0 or a negative errno - confirm against
 * the definition.
 */
int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
|
|
|
|
int level, struct xfs_buf **bpp, int whichfork);
|
|
|
|
/*
 * Declared among the Btree-growing routines; works on a caller-supplied
 * xfs_da_state.  NOTE(review): int return presumably an error code - confirm.
 */
int xfs_da3_split(xfs_da_state_t *state);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines used for shrinking the Btree.
|
|
|
|
*/
|
2013-04-24 18:58:02 +10:00
|
|
|
/*
 * Declared among the Btree-shrinking routines; works on a caller-supplied
 * xfs_da_state.  NOTE(review): int return presumably an error code - confirm.
 */
int xfs_da3_join(xfs_da_state_t *state);
|
|
|
|
/*
 * Declared among the Btree-shrinking routines; takes the operation state
 * and the path to operate on, and returns nothing.
 * NOTE(review): presumably updates the hash values recorded along the
 * given path - confirm against the definition.
 */
void xfs_da3_fixhashpath(struct xfs_da_state *state,
|
|
|
|
struct xfs_da_state_path *path_to_fix);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines used for finding things in the Btree.
|
|
|
|
*/
|
2013-04-24 18:58:02 +10:00
|
|
|
/*
 * Declared among the Btree lookup routines.  Takes two outputs-by-position:
 * the int return and the int pointed to by result.
 * NOTE(review): which carries the error code and which the lookup outcome
 * is not visible here - confirm against the definition.
 */
int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
|
|
|
|
/*
 * Declared among the Btree lookup routines; operates on a path belonging
 * to the given state.
 * NOTE(review): forward and release look like boolean controls (direction,
 * and whether to release buffers) and result like an out-parameter -
 * confirm against the definition.
 */
int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
|
2005-04-16 15:20:36 -07:00
|
|
|
int forward, int release, int *result);
|
|
|
|
/*
|
|
|
|
* Utility routines.
|
|
|
|
*/
|
2013-04-24 18:58:02 +10:00
|
|
|
/*
 * Takes the operation state plus two per-block state entries, an old and a
 * new block.
 * NOTE(review): presumably links new_blk into the sibling chain next to
 * old_blk - confirm against the definition.
 */
int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
|
2005-04-16 15:20:36 -07:00
|
|
|
xfs_da_state_blk_t *new_blk);
|
2013-04-24 18:58:02 +10:00
|
|
|
/*
 * Addresses the block by da block number (xfs_dablk_t), in the given fork
 * of inode dp, under transaction tp.
 * NOTE(review): bpp is presumably where the buffer is returned - confirm.
 */
int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
|
2019-11-20 09:46:04 -08:00
|
|
|
xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork);
|
|
|
|
/*
 * Same parameter shape as xfs_da3_node_read(), except the block is given
 * as an xfs_daddr_t (mappedbno) instead of an xfs_dablk_t.
 * NOTE(review): presumably for callers that already hold the disk address
 * and can skip the mapping step - confirm.
 */
int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
|
|
|
|
xfs_daddr_t mappedbno, struct xfs_buf **bpp,
|
|
|
|
int whichfork);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Utility routines: inode grow/shrink, buffer access, name hash/compare,
* state and node-header helpers.
|
|
|
|
*/
|
2019-11-20 10:18:50 -08:00
|
|
|
|
2022-04-21 10:46:47 +10:00
|
|
|
/*
 * Flag bit 0 (unsigned).
 * NOTE(review): presumably passed in the `flags` argument of
 * xfs_da_read_buf()/xfs_da_reada_buf() to say that a hole in the mapping
 * is not an error - confirm against the callers.
 */
#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
|
2019-11-20 10:18:50 -08:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * NOTE(review): new_blkno looks like an out-parameter receiving the da
 * block number of the newly added space - confirm against the definition.
 */
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
|
2011-07-13 13:43:49 +02:00
|
|
|
/*
 * Variant of xfs_da_grow_inode() that takes a file offset pointer and an
 * explicit count.
 * NOTE(review): whether *bno is input, output or both, and the unit of
 * count, are not visible here - confirm against the definition.
 */
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
|
|
|
|
int count);
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * Addresses da block bno in the given fork of inode dp, under transaction
 * trans.  Unlike xfs_da_read_buf() it takes no flags and no buffer ops.
 * NOTE(review): presumably returns the buffer through bp without reading
 * it from disk - confirm against the definition.
 */
int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
|
2019-11-20 09:46:05 -08:00
|
|
|
xfs_dablk_t bno, struct xfs_buf **bp, int whichfork);
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * Addresses da block bno in the given fork of inode dp, under transaction
 * trans, with caller-supplied flags and buffer ops.
 * NOTE(review): flags presumably takes XFS_DABUF_* bits and bpp presumably
 * receives the buffer - confirm against the definition.
 */
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
|
2019-11-20 09:46:04 -08:00
|
|
|
xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp,
|
|
|
|
int whichfork, const struct xfs_buf_ops *ops);
|
2017-02-02 15:13:58 -08:00
|
|
|
/*
 * Takes the same block address, flags, fork and ops as xfs_da_read_buf(),
 * but no transaction and no buffer out-parameter.
 * NOTE(review): presumably issues readahead only - confirm against the
 * definition.
 */
int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
|
2019-11-20 09:46:02 -08:00
|
|
|
unsigned int flags, int whichfork,
|
|
|
|
const struct xfs_buf_ops *ops);
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * Takes the da block number of the block being removed together with its
 * buffer.
 * NOTE(review): ownership of dead_buf after the call is not visible here -
 * confirm against the definition.
 */
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
|
2012-06-22 18:50:14 +10:00
|
|
|
struct xfs_buf *dead_buf);
|
2023-12-05 13:59:00 +08:00
|
|
|
/*
 * Takes a destination buffer, a source buffer and a size; returns nothing.
 * NOTE(review): presumably copies size bytes of block contents from src to
 * dst - confirm against the definition.
 */
void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src,
|
|
|
|
size_t size);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2017-06-16 11:00:05 -07:00
|
|
|
/*
 * Takes a read-only byte string and its length, returns a uint.
 * NOTE(review): presumably the hash of the name - the algorithm is in the
 * definition, not in this header.
 */
uint xfs_da_hashname(const uint8_t *name_string, int name_length);
|
2008-05-21 16:41:01 +10:00
|
|
|
/*
 * Takes the operation args plus a read-only name and its length, and
 * returns an enum xfs_dacmp.
 * NOTE(review): presumably compares the given name against the name held
 * in args - confirm against the definition.
 */
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
|
2010-01-20 10:47:17 +11:00
|
|
|
const unsigned char *name, int len);
|
2008-05-21 16:41:01 +10:00
|
|
|
|
|
|
|
|
2020-07-22 09:23:18 -07:00
|
|
|
/*
 * Returns a pointer to a struct xfs_da_state for the given args.
 * NOTE(review): presumably paired with xfs_da_state_free(); whether it can
 * return NULL is not visible here - confirm.
 */
struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * NOTE(review): presumably releases a state obtained from
 * xfs_da_state_alloc() - confirm against the definition.
 */
void xfs_da_state_free(xfs_da_state_t *state);
|
2022-05-22 15:59:34 +10:00
|
|
|
/*
 * Takes an existing state and a set of args; returns nothing.
 * NOTE(review): presumably re-initialises the state for reuse with args -
 * confirm against the definition.
 */
void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2019-11-08 14:53:00 -08:00
|
|
|
/*
 * Disk to in-core direction: `from` is the on-disk struct xfs_da_intnode
 * and `to` is the in-core struct xfs_da3_icnode_hdr.
 */
void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
|
|
|
|
struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
|
2019-11-08 14:57:48 -08:00
|
|
|
/*
 * In-core to disk direction: `from` is the in-core struct
 * xfs_da3_icnode_hdr and `to` is the on-disk struct xfs_da_intnode.
 */
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
|
|
|
|
struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
|
2024-04-15 14:54:36 -07:00
|
|
|
/*
 * Takes a buffer and an owner inode number, returns an xfs_failaddr_t.
 * NOTE(review): presumably NULL when the header is acceptable and a failure
 * address otherwise - confirm against the definition.
 */
xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner);
|
2024-04-15 14:54:38 -07:00
|
|
|
/*
 * Same signature as xfs_da3_header_check().
 * NOTE(review): presumably the variant specific to da node blocks -
 * confirm against the definition.
 */
xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner);
|
2019-11-08 14:53:00 -08:00
|
|
|
|
2021-10-12 11:09:23 -07:00
|
|
|
/*
 * Declaration only; the object is defined elsewhere.
 * NOTE(review): presumably the cache that xfs_da_state_alloc() and
 * xfs_da_state_free() allocate from - confirm.
 */
extern struct kmem_cache *xfs_da_state_cache;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
#endif /* __XFS_DA_BTREE_H__ */
|