linux-stable/fs/xfs/libxfs/xfs_da_btree.h

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
* Copyright (c) 2013 Red Hat, Inc.
* All Rights Reserved.
*/
#ifndef __XFS_DA_BTREE_H__
#define __XFS_DA_BTREE_H__
struct xfs_inode;
struct xfs_trans;
/*
* Directory/attribute geometry information. There will be one of these for each
* data fork type, and it will be passed around via the xfs_da_args. Global
* structures will be attached to the xfs_mount.
*/
struct xfs_da_geometry {
	unsigned int	blksize;	/* da block size in bytes */
	unsigned int	fsbcount;	/* da block size in filesystem blocks */
	uint8_t		fsblog;		/* log2 of _filesystem_ block size */
	uint8_t		blklog;		/* log2 of da block size */
	unsigned int	node_hdr_size;	/* danode header size in bytes */
	unsigned int	node_ents;	/* # of entries in a danode */
	unsigned int	magicpct;	/* 37% of block size in bytes */
	xfs_dablk_t	datablk;	/* blockno of dir data v2 */
	unsigned int	leaf_hdr_size;	/* dir2 leaf header size */
	unsigned int	leaf_max_ents;	/* # of entries in dir2 leaf */
	xfs_dablk_t	leafblk;	/* blockno of leaf data v2 */
	unsigned int	free_hdr_size;	/* dir2 free header size */
	unsigned int	free_max_bests;	/* # of bests entries in dir2 free */
	xfs_dablk_t	freeblk;	/* blockno of free data v2 */
	xfs_extnum_t	max_extents;	/* Max. extents in corresponding fork */

	xfs_dir2_data_aoff_t data_first_offset;
	size_t		data_entry_offset;
};
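
/*
 * The size fields above are redundant encodings of the same geometry; a
 * sketch of the invariants implied by the field comments (illustrative
 * only, not code taken from this file):
 *
 *	ASSERT(geo->blksize == 1u << geo->blklog);
 *	ASSERT(geo->fsbcount == 1u << (geo->blklog - geo->fsblog));
 */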
/*========================================================================
* Btree searching and modification structure definitions.
*========================================================================*/
/*
* Search comparison results
*/
enum xfs_dacmp {
	XFS_CMP_DIFFERENT,	/* names are completely different */
	XFS_CMP_EXACT,		/* names are exactly the same */
	XFS_CMP_CASE		/* names are same but differ in case */
};
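
/*
 * A lookup consumer typically collapses this down to "found or not", with
 * XFS_CMP_CASE additionally reporting the on-disk name when the caller set
 * XFS_DA_OP_CILOOKUP (defined below).  Rough sketch, illustrative only:
 *
 *	switch (cmpresult) {
 *	case XFS_CMP_EXACT:
 *	case XFS_CMP_CASE:
 *		found = true;
 *		break;
 *	case XFS_CMP_DIFFERENT:
 *		found = false;
 *		break;
 *	}
 */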
/*
* Structure to ease passing around component names.
*/
typedef struct xfs_da_args {
	struct xfs_da_geometry *geo;	/* da block geometry */
	const uint8_t	*name;		/* string (maybe not NULL terminated) */
	int		namelen;	/* length of string (maybe no NULL) */
	uint8_t		filetype;	/* filetype of inode for directories */
	void		*value;		/* set of bytes (maybe contain NULLs) */
	int		valuelen;	/* length of value */
	unsigned int	attr_filter;	/* XFS_ATTR_{ROOT,SECURE,INCOMPLETE} */
	unsigned int	attr_flags;	/* XATTR_{CREATE,REPLACE} */
	xfs_dahash_t	hashval;	/* hash value of name */
	xfs_ino_t	inumber;	/* input/output inode number */
	struct xfs_inode *dp;		/* directory inode to manipulate */
	struct xfs_trans *trans;	/* current trans (changes over time) */
	xfs_extlen_t	total;		/* total blocks needed, for 1st bmap */
	int		whichfork;	/* data or attribute fork */
	xfs_dablk_t	blkno;		/* blkno of attr leaf of interest */
	int		index;		/* index of attr of interest in blk */
	xfs_dablk_t	rmtblkno;	/* remote attr value starting blkno */
	int		rmtblkcnt;	/* remote attr value block count */
	int		rmtvaluelen;	/* remote attr value length in bytes */
	xfs_dablk_t	blkno2;		/* blkno of 2nd attr leaf of interest */
	int		index2;		/* index of 2nd attr in blk */
	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
	int		rmtblkcnt2;	/* remote attr value block count */
	int		rmtvaluelen2;	/* remote attr value length in bytes */
	uint32_t	op_flags;	/* operation flags */
	enum xfs_dacmp	cmpresult;	/* name compare result for lookups */
	xfs_ino_t	owner;		/* inode that owns the dir/attr data */
} xfs_da_args_t;
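
/*
 * Callers generally zero an xfs_da_args on the stack and fill in only the
 * fields the operation needs.  A rough sketch for a name lookup in the attr
 * fork (values chosen for illustration, not mandated by this header):
 *
 *	struct xfs_da_args args = {
 *		.geo		= mp->m_attr_geo,
 *		.dp		= dp,
 *		.name		= name,
 *		.namelen	= namelen,
 *		.whichfork	= XFS_ATTR_FORK,
 *		.op_flags	= XFS_DA_OP_OKNOENT,
 *	};
 *	args.hashval = xfs_da_hashname(args.name, args.namelen);
 */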
/*
* Operation flags:
*/
#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
#define XFS_DA_OP_LOGGED (1u << 8) /* Use intent items to track op */
#define XFS_DA_OP_FLAGS \
	{ XFS_DA_OP_JUSTCHECK,	"JUSTCHECK" }, \
	{ XFS_DA_OP_REPLACE,	"REPLACE" }, \
	{ XFS_DA_OP_ADDNAME,	"ADDNAME" }, \
	{ XFS_DA_OP_OKNOENT,	"OKNOENT" }, \
	{ XFS_DA_OP_CILOOKUP,	"CILOOKUP" }, \
	{ XFS_DA_OP_NOTIME,	"NOTIME" }, \
	{ XFS_DA_OP_REMOVE,	"REMOVE" }, \
	{ XFS_DA_OP_RECOVERY,	"RECOVERY" }, \
	{ XFS_DA_OP_LOGGED,	"LOGGED" }
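
/*
 * The { value, "name" } pairs above are in the form expected by the
 * __print_flags() tracepoint helper, e.g. as a trace event printk argument
 * (sketch):
 *
 *	__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)
 */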
/*
* Storage for holding state during Btree searches and split/join ops.
*
* Only need space for 5 intermediate nodes. With a minimum of 62-way
* fanout to the Btree, we can support over 900 million directory blocks,
* which is slightly more than enough.
*/
typedef struct xfs_da_state_blk {
	struct xfs_buf		*bp;		/* buffer containing block */
	xfs_dablk_t		blkno;		/* filesystem blkno of buffer */
	xfs_daddr_t		disk_blkno;	/* on-disk blkno (in BBs) of buffer */
	int			index;		/* relevant index into block */
	xfs_dahash_t		hashval;	/* last hash value in block */
	int			magic;		/* blk's magic number, ie: blk type */
} xfs_da_state_blk_t;
typedef struct xfs_da_state_path {
	int			active;		/* number of active levels */
	xfs_da_state_blk_t	blk[XFS_DA_NODE_MAXDEPTH];
} xfs_da_state_path_t;
typedef struct xfs_da_state {
	xfs_da_args_t		*args;		/* filename arguments */
	struct xfs_mount	*mp;		/* filesystem mount point */
	xfs_da_state_path_t	path;		/* search/split paths */
	xfs_da_state_path_t	altpath;	/* alternate path for join */
	unsigned char		inleaf;		/* insert into 1->lf, 0->splf */
	unsigned char		extravalid;	/* T/F: extrablk is in use */
	unsigned char		extraafter;	/* T/F: extrablk is after new */
	xfs_da_state_blk_t	extrablk;	/* for double-splits on leaves */
						/* for dirv2 extrablk is data */
} xfs_da_state_t;
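
/*
 * A search fills path.blk[0..active-1] from the root down to the leaf, so a
 * caller can revisit the blocks it descended through.  Sketch below is
 * illustrative only; do_something() stands in for whatever per-level work
 * the caller performs:
 *
 *	for (level = 0; level < state->path.active; level++) {
 *		xfs_da_state_blk_t *blk = &state->path.blk[level];
 *
 *		do_something(blk->bp, blk->index, blk->hashval);
 *	}
 */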
/*
* In-core version of the node header to abstract the differences in the v2 and
* v3 disk format of the headers. Callers need to convert to/from disk format as
* appropriate.
*/
struct xfs_da3_icnode_hdr {
	uint32_t		forw;
	uint32_t		back;
	uint16_t		magic;
	uint16_t		count;
	uint16_t		level;

	/*
	 * Pointer to the on-disk format entries, which are behind the
	 * variable size (v4 vs v5) header in the on-disk block.
	 */
	struct xfs_da_node_entry *btree;
};
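
/*
 * A typical decode of an on-disk node block into this in-core form (sketch;
 * bp would be a buffer obtained via xfs_da3_node_read()):
 *
 *	struct xfs_da3_icnode_hdr ichdr;
 *	xfs_dahash_t		hashval;
 *	int			i;
 *
 *	xfs_da3_node_hdr_from_disk(mp, &ichdr, bp->b_addr);
 *	for (i = 0; i < ichdr.count; i++)
 *		hashval = be32_to_cpu(ichdr.btree[i].hashval);
 */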
/*
* Utility macros to aid in logging changed structure fields.
*/
#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
#define XFS_DA_LOGRANGE(BASE, ADDR, SIZE) \
		(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
		(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
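
/*
 * XFS_DA_LOGRANGE() expands to a "first byte, last byte" pair relative to
 * BASE, which slots directly into the (first, last) arguments of
 * xfs_trans_log_buf(), e.g. (sketch):
 *
 *	xfs_trans_log_buf(args->trans, bp,
 *			XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
 */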
/*========================================================================
* Function prototypes.
*========================================================================*/
/*
* Routines used for growing the Btree.
*/
int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno,
		int level, struct xfs_buf **bpp, int whichfork);
int xfs_da3_split(xfs_da_state_t *state);
/*
* Routines used for shrinking the Btree.
*/
int xfs_da3_join(xfs_da_state_t *state);
void xfs_da3_fixhashpath(struct xfs_da_state *state,
		struct xfs_da_state_path *path_to_to_fix);
/*
* Routines used for finding things in the Btree.
*/
int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result);
int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
		int forward, int release, int *result);
/*
* Utility routines.
*/
int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
		xfs_da_state_blk_t *new_blk);
int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
		xfs_dablk_t bno, struct xfs_buf **bpp, int whichfork);
int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
		xfs_daddr_t mappedbno, struct xfs_buf **bpp,
		int whichfork);
/*
* Utility routines.
*/
#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
		int count);
int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
		xfs_dablk_t bno, struct xfs_buf **bp, int whichfork);
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
		xfs_dablk_t bno, unsigned int flags, struct xfs_buf **bpp,
		int whichfork, const struct xfs_buf_ops *ops);
int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
		unsigned int flags, int whichfork,
		const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
		struct xfs_buf *dead_buf);
void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src,
		size_t size);
uint xfs_da_hashname(const uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
		const unsigned char *name, int len);
struct xfs_da_state *xfs_da_state_alloc(struct xfs_da_args *args);
void xfs_da_state_free(xfs_da_state_t *state);
void xfs_da_state_reset(struct xfs_da_state *state, struct xfs_da_args *args);
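
/*
 * Rough lifecycle of the state object around a single lookup (sketch, error
 * handling elided):
 *
 *	struct xfs_da_state	*state = xfs_da_state_alloc(args);
 *	int			retval, error;
 *
 *	error = xfs_da3_node_lookup_int(state, &retval);
 *	...
 *	xfs_da_state_free(state);
 */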
void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp,
		struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from);
void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp,
		struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from);
xfs_failaddr_t xfs_da3_header_check(struct xfs_buf *bp, xfs_ino_t owner);
xfs_failaddr_t xfs_da3_node_header_check(struct xfs_buf *bp, xfs_ino_t owner);
extern struct kmem_cache *xfs_da_state_cache;
#endif /* __XFS_DA_BTREE_H__ */