2020-05-12 16:54:17 -07:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
2005-11-02 14:58:39 +11:00
|
|
|
* Copyright (c) 2000-2001,2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
#ifndef __XFS_BTREE_H__
|
|
|
|
#define __XFS_BTREE_H__
|
|
|
|
|
|
|
|
struct xfs_buf;
|
|
|
|
struct xfs_inode;
|
|
|
|
struct xfs_mount;
|
|
|
|
struct xfs_trans;
|
2020-03-11 10:42:34 -07:00
|
|
|
struct xfs_ifork;
|
2021-06-02 10:48:24 +10:00
|
|
|
struct xfs_perag;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-10-30 16:54:12 +11:00
|
|
|
/*
|
|
|
|
* Generic key, ptr and record wrapper structures.
|
|
|
|
*
|
|
|
|
* These are disk format structures, and are converted where necessary
|
|
|
|
* by the btree specific code that needs to interpret them.
|
|
|
|
*/
|
|
|
|
union xfs_btree_ptr {
|
|
|
|
__be32 s; /* short form ptr */
|
|
|
|
__be64 l; /* long form ptr */
|
|
|
|
};
|
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
/*
|
2016-09-19 10:24:36 +10:00
|
|
|
* The in-core btree key. Overlapping btrees actually store two keys
|
|
|
|
* per pointer, so we reserve enough memory to hold both. The __*bigkey
|
|
|
|
* items should never be accessed directly.
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
*/
|
2016-09-19 10:24:36 +10:00
|
|
|
union xfs_btree_key {
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
struct xfs_bmbt_key bmbt;
|
|
|
|
xfs_bmdr_key_t bmbr; /* bmbt root block */
|
|
|
|
xfs_alloc_key_t alloc;
|
|
|
|
struct xfs_inobt_key inobt;
|
2016-09-19 10:24:36 +10:00
|
|
|
struct xfs_rmap_key rmap;
|
|
|
|
struct xfs_rmap_key __rmap_bigkey[2];
|
2016-10-03 09:11:18 -07:00
|
|
|
struct xfs_refcount_key refc;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
};
|
|
|
|
|
2008-10-30 16:54:12 +11:00
|
|
|
union xfs_btree_rec {
|
2016-08-03 11:36:07 +10:00
|
|
|
struct xfs_bmbt_rec bmbt;
|
|
|
|
xfs_bmdr_rec_t bmbr; /* bmbt root block */
|
|
|
|
struct xfs_alloc_rec alloc;
|
|
|
|
struct xfs_inobt_rec inobt;
|
|
|
|
struct xfs_rmap_rec rmap;
|
2016-10-03 09:11:18 -07:00
|
|
|
struct xfs_refcount_rec refc;
|
2008-10-30 16:54:12 +11:00
|
|
|
};
|
|
|
|
|
2013-10-23 10:51:50 +11:00
|
|
|
/*
 * This nonsense is to make -wlint happy: each lookup constant is the
 * corresponding XFS_LOOKUP_*i value cast to xfs_lookup_t.
 */
#define	XFS_LOOKUP_EQ	((xfs_lookup_t)XFS_LOOKUP_EQi)
#define	XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
#define	XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)
|
|
|
|
|
2024-02-22 12:35:16 -08:00
|
|
|
struct xfs_btree_ops;
|
|
|
|
uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
|
2017-01-27 23:16:38 -08:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * For logging record fields.
 *
 * Each XFS_BB_* flag is a single bit.  XFS_BB_ALL_BITS is the mask of the
 * first XFS_BB_NUM_BITS flags (MAGIC through RIGHTSIB); XFS_BB_ALL_BITS_CRC
 * is the mask of the first XFS_BB_NUM_BITS_CRC flags (MAGIC through OWNER).
 */
#define	XFS_BB_MAGIC		(1u << 0)
#define	XFS_BB_LEVEL		(1u << 1)
#define	XFS_BB_NUMRECS		(1u << 2)
#define	XFS_BB_LEFTSIB		(1u << 3)
#define	XFS_BB_RIGHTSIB		(1u << 4)
#define	XFS_BB_BLKNO		(1u << 5)
#define	XFS_BB_LSN		(1u << 6)
#define	XFS_BB_UUID		(1u << 7)
#define	XFS_BB_OWNER		(1u << 8)
#define	XFS_BB_NUM_BITS		5
#define	XFS_BB_ALL_BITS		((1u << XFS_BB_NUM_BITS) - 1)
#define	XFS_BB_NUM_BITS_CRC	9
#define	XFS_BB_ALL_BITS_CRC	((1u << XFS_BB_NUM_BITS_CRC) - 1)
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-10-30 16:55:03 +11:00
|
|
|
/*
 * Generic stats interface
 *
 * Both macros index the stats via the cursor's bc_ops->statoff plus the
 * __XBTS_<stat> constant, and pass the cursor's bc_mp through.
 */
#define XFS_BTREE_STATS_INC(cur, stat)	\
	XFS_STATS_INC_OFF((cur)->bc_mp, \
		(cur)->bc_ops->statoff + __XBTS_ ## stat)
#define XFS_BTREE_STATS_ADD(cur, stat, val)	\
	XFS_STATS_ADD_OFF((cur)->bc_mp, \
		(cur)->bc_ops->statoff + __XBTS_ ## stat, val)
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2023-04-11 19:00:10 -07:00
|
|
|
/* Relationship between a high key field @x and a low key field @y. */
enum xbtree_key_contig {
	XBTREE_KEY_GAP = 0,		/* at least one value lies between */
	XBTREE_KEY_CONTIGUOUS,		/* @y immediately follows @x */
	XBTREE_KEY_OVERLAP,		/* @x is at or beyond @y */
};

/*
 * Decide if these two numeric btree key fields are contiguous, overlapping,
 * or if there's a gap between them.  @x should be the field from the high
 * key and @y should be the field from the low key.
 *
 * The comparison deliberately never computes @x + 1: incrementing
 * UINT64_MAX would wrap to zero and misreport an overlap as a gap (or as
 * contiguous with a zero @y).
 */
static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
{
	if (x >= y)
		return XBTREE_KEY_OVERLAP;

	/* x < y here, so the subtraction cannot underflow. */
	if (y - x == 1)
		return XBTREE_KEY_CONTIGUOUS;

	return XBTREE_KEY_GAP;
}
|
|
|
|
|
2024-02-22 12:35:36 -08:00
|
|
|
/* Byte sizes of the long (__be64) and short (__be32) btree pointer forms. */
#define XFS_BTREE_LONG_PTR_LEN		(sizeof(__be64))
#define XFS_BTREE_SHORT_PTR_LEN		(sizeof(__be32))
|
|
|
|
|
2024-02-22 12:36:17 -08:00
|
|
|
/* Kind of btree, stored in xfs_btree_ops.type. */
enum xfs_btree_type {
	XFS_BTREE_TYPE_AG,	/* AG-rooted */
	XFS_BTREE_TYPE_INODE,	/* inode-rooted */
	XFS_BTREE_TYPE_MEM,	/* NOTE(review): presumably in-memory; confirm */
};
|
|
|
|
|
2008-10-30 16:53:59 +11:00
|
|
|
struct xfs_btree_ops {
|
2024-02-22 12:39:47 -08:00
|
|
|
const char *name;
|
|
|
|
|
2024-02-22 12:36:17 -08:00
|
|
|
/* Type of btree - AG-rooted or inode-rooted */
|
|
|
|
enum xfs_btree_type type;
|
|
|
|
|
2024-02-22 12:34:29 -08:00
|
|
|
/* XFS_BTGEO_* flags that determine the geometry of the btree */
|
|
|
|
unsigned int geom_flags;
|
|
|
|
|
2024-02-22 12:35:36 -08:00
|
|
|
/* size of the key, pointer, and record structures */
|
|
|
|
size_t key_len;
|
|
|
|
size_t ptr_len;
|
|
|
|
size_t rec_len;
|
2008-10-30 16:55:34 +11:00
|
|
|
|
2024-02-22 12:35:20 -08:00
|
|
|
/* LRU refcount to set on each btree buffer created */
|
|
|
|
unsigned int lru_refs;
|
|
|
|
|
2024-02-22 12:35:21 -08:00
|
|
|
/* offset of btree stats array */
|
|
|
|
unsigned int statoff;
|
|
|
|
|
2024-02-22 12:39:47 -08:00
|
|
|
/* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */
|
|
|
|
unsigned int sick_mask;
|
|
|
|
|
2008-10-30 16:53:59 +11:00
|
|
|
/* cursor operations */
|
|
|
|
struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
|
2008-10-30 16:57:40 +11:00
|
|
|
void (*update_cursor)(struct xfs_btree_cur *src,
|
|
|
|
struct xfs_btree_cur *dst);
|
2008-10-30 16:55:13 +11:00
|
|
|
|
2008-10-30 16:57:16 +11:00
|
|
|
/* update btree root pointer */
|
|
|
|
void (*set_root)(struct xfs_btree_cur *cur,
|
2021-08-12 09:49:03 -07:00
|
|
|
const union xfs_btree_ptr *nptr, int level_change);
|
2008-10-30 16:57:16 +11:00
|
|
|
|
2008-10-30 16:57:03 +11:00
|
|
|
/* block allocation / freeing */
|
|
|
|
int (*alloc_block)(struct xfs_btree_cur *cur,
|
2021-08-12 09:53:27 -07:00
|
|
|
const union xfs_btree_ptr *start_bno,
|
2008-10-30 16:57:03 +11:00
|
|
|
union xfs_btree_ptr *new_bno,
|
2014-04-14 19:03:53 +10:00
|
|
|
int *stat);
|
2008-10-30 16:57:51 +11:00
|
|
|
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
|
2008-10-30 16:57:03 +11:00
|
|
|
|
2008-10-30 16:55:23 +11:00
|
|
|
/* records in block/level */
|
2008-10-30 16:58:01 +11:00
|
|
|
int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
|
2008-10-30 16:55:23 +11:00
|
|
|
int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
|
|
|
|
|
2008-10-30 16:57:40 +11:00
|
|
|
/* records on disk. Matter for the root in inode case. */
|
|
|
|
int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
|
|
|
|
|
2008-10-30 16:56:09 +11:00
|
|
|
/* init values of btree structures */
|
|
|
|
void (*init_key_from_rec)(union xfs_btree_key *key,
|
2021-08-10 17:02:16 -07:00
|
|
|
const union xfs_btree_rec *rec);
|
2008-10-30 16:57:40 +11:00
|
|
|
void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_rec *rec);
|
2008-10-30 16:56:09 +11:00
|
|
|
void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *ptr);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
void (*init_high_key_from_rec)(union xfs_btree_key *key,
|
2021-08-10 17:02:16 -07:00
|
|
|
const union xfs_btree_rec *rec);
|
2008-10-30 16:56:09 +11:00
|
|
|
|
|
|
|
/* difference between key value and cursor value */
|
2017-06-16 11:00:05 -07:00
|
|
|
int64_t (*key_diff)(struct xfs_btree_cur *cur,
|
2021-08-10 17:02:15 -07:00
|
|
|
const union xfs_btree_key *key);
|
2008-10-30 16:56:09 +11:00
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
/*
|
|
|
|
* Difference between key2 and key1 -- positive if key1 > key2,
|
2023-04-11 19:00:11 -07:00
|
|
|
* negative if key1 < key2, and zero if equal. If the @mask parameter
|
|
|
|
* is non NULL, each key field to be used in the comparison must
|
|
|
|
* contain a nonzero value.
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
*/
|
2017-06-16 11:00:05 -07:00
|
|
|
int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
|
2021-08-10 17:02:15 -07:00
|
|
|
const union xfs_btree_key *key1,
|
2023-04-11 19:00:11 -07:00
|
|
|
const union xfs_btree_key *key2,
|
|
|
|
const union xfs_btree_key *mask);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 11:08:36 +10:00
|
|
|
|
2012-11-14 17:54:40 +11:00
|
|
|
const struct xfs_buf_ops *buf_ops;
|
2012-11-14 17:53:49 +11:00
|
|
|
|
2008-10-30 16:58:32 +11:00
|
|
|
/* check that k1 is lower than k2 */
|
|
|
|
int (*keys_inorder)(struct xfs_btree_cur *cur,
|
2021-08-10 17:02:17 -07:00
|
|
|
const union xfs_btree_key *k1,
|
|
|
|
const union xfs_btree_key *k2);
|
2008-10-30 16:58:32 +11:00
|
|
|
|
|
|
|
/* check that r1 is lower than r2 */
|
|
|
|
int (*recs_inorder)(struct xfs_btree_cur *cur,
|
2021-08-10 17:02:17 -07:00
|
|
|
const union xfs_btree_rec *r1,
|
|
|
|
const union xfs_btree_rec *r2);
|
2023-04-11 19:00:10 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Are these two btree keys immediately adjacent?
|
|
|
|
*
|
|
|
|
* Given two btree keys @key1 and @key2, decide if it is impossible for
|
|
|
|
* there to be a third btree key K satisfying the relationship
|
|
|
|
* @key1 < K < @key2. To determine if two btree records are
|
|
|
|
* immediately adjacent, @key1 should be the high key of the first
|
|
|
|
* record and @key2 should be the low key of the second record.
|
2023-04-11 19:00:11 -07:00
|
|
|
* If the @mask parameter is non NULL, each key field to be used in the
|
|
|
|
* comparison must contain a nonzero value.
|
2023-04-11 19:00:10 -07:00
|
|
|
*/
|
|
|
|
enum xbtree_key_contig (*keys_contiguous)(struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key1,
|
2023-04-11 19:00:11 -07:00
|
|
|
const union xfs_btree_key *key2,
|
|
|
|
const union xfs_btree_key *mask);
|
2008-10-30 16:53:59 +11:00
|
|
|
};
|
|
|
|
|
2024-02-22 12:34:29 -08:00
|
|
|
/* btree geometry flags */
|
xfs: Avoid races with cnt_btree lastrec updates
A concurrent file creation and little writing could unexpectedly return
-ENOSPC error since there is a race window that the allocator could get
the wrong agf->agf_longest.
Write file process steps:
1) Find the entry that best meets the conditions, then calculate the start
address and length of the remaining part of the entry after allocation.
2) Delete this entry and update the -current- agf->agf_longest.
3) Insert the remaining unused parts of this entry based on the
calculations in 1), and update the agf->agf_longest again if necessary.
Create file process steps:
1) Check whether there are free inodes in the inode chunk.
2) If there is no free inode, check whether there is space for creating
inode chunks, perform the no-lock judgment first.
3) If the judgment succeeds, the judgment is performed again with agf lock
held. Otherwise, an error is returned directly.
If the write process is in step 2) but not go to 3) yet, the create file
process goes to 2) at this time, it may be mistaken for no space,
resulting in the file system still has space but the file creation fails.
We have sent two different commits to the community in order to fix this
problem[1][2]. Unfortunately, both solutions have flaws. In [2], I
discussed with Dave and Darrick, realized that a better solution to this
problem requires the "last cnt record tracking" to be ripped out of the
generic btree code. And surprisingly, Dave directly provided his fix code.
This patch includes appropriate modifications based on his tmp-code to
address this issue.
The entire fix can be roughly divided into two parts:
1) Delete the code related to lastrec-update in the generic btree code.
2) Place the process of updating longest freespace with cntbt separately
to the end of the cntbt modifications. Move the cursor to the rightmost
firstly, and update the longest free extent based on the record.
Note that we can not update the longest with xfs_alloc_get_rec() after
find the longest record, as xfs_verify_agbno() may not pass because
pag->block_count is updated on the outside. Therefore, use
xfs_btree_get_rec() as a replacement.
[1] https://lore.kernel.org/all/20240419061848.1032366-2-yebin10@huawei.com
[2] https://lore.kernel.org/all/20240604071121.3981686-1-wozizhi@huawei.com
Reported-by: Ye Bin <yebin10@huawei.com>
Signed-off-by: Zizhi Wo <wozizhi@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-07-01 14:02:36 +08:00
|
|
|
/* Geometry flag: overlapping intervals. */
#define XFS_BTGEO_OVERLAPPING		(1U << 0)
|
2008-10-30 16:56:32 +11:00
|
|
|
|
|
|
|
|
2016-08-03 11:10:21 +10:00
|
|
|
union xfs_btree_irec {
|
|
|
|
struct xfs_alloc_rec_incore a;
|
|
|
|
struct xfs_bmbt_irec b;
|
|
|
|
struct xfs_inobt_rec_incore i;
|
2016-08-03 11:39:05 +10:00
|
|
|
struct xfs_rmap_irec r;
|
2016-10-03 09:11:18 -07:00
|
|
|
struct xfs_refcount_irec rc;
|
|
|
|
};
|
|
|
|
|
2021-09-16 12:24:04 -07:00
|
|
|
/* Readahead info bits stored in xfs_btree_level.ra. */
#define XFS_BTCUR_LEFTRA	(1 << 0) /* left sibling has been read-ahead */
#define XFS_BTCUR_RIGHTRA	(1 << 1) /* right sibling has been read-ahead */

/*
 * State kept for one level: a buffer pointer, a key/record number, and
 * sibling readahead info.
 */
struct xfs_btree_level {
	struct xfs_buf		*bp;	/* buffer pointer */
	uint16_t		ptr;	/* key/record number */
	uint16_t		ra;	/* readahead info, XFS_BTCUR_*RA bits */
};
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* Btree cursor structure.
|
|
|
|
* This collects all information needed by the btree code in one place.
|
|
|
|
*/
|
2021-09-16 12:18:47 -07:00
|
|
|
struct xfs_btree_cur
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct xfs_trans *bc_tp; /* transaction we're in, if any */
|
|
|
|
struct xfs_mount *bc_mp; /* file system mount struct */
|
2008-10-30 16:53:59 +11:00
|
|
|
const struct xfs_btree_ops *bc_ops;
|
2021-09-27 14:26:19 -07:00
|
|
|
struct kmem_cache *bc_cache; /* cursor cache */
|
2021-10-12 09:41:47 -07:00
|
|
|
unsigned int bc_flags; /* btree features - below */
|
2016-08-03 11:10:21 +10:00
|
|
|
union xfs_btree_irec bc_rec; /* current insert/search record value */
|
2021-10-12 09:41:47 -07:00
|
|
|
uint8_t bc_nlevels; /* number of levels in the tree */
|
2021-09-16 12:26:56 -07:00
|
|
|
uint8_t bc_maxlevels; /* maximum levels for this btree type */
|
2024-11-03 20:18:44 -08:00
|
|
|
struct xfs_group *bc_group;
|
2021-06-02 10:48:24 +10:00
|
|
|
|
2024-02-22 12:37:03 -08:00
|
|
|
/* per-type information */
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
struct xfs_inode *ip;
|
|
|
|
short forksize;
|
|
|
|
char whichfork;
|
|
|
|
struct xbtree_ifakeroot *ifake; /* for staging cursor */
|
|
|
|
} bc_ino;
|
|
|
|
struct {
|
|
|
|
struct xfs_buf *agbp;
|
|
|
|
struct xbtree_afakeroot *afake; /* for staging cursor */
|
|
|
|
} bc_ag;
|
2024-02-22 12:43:35 -08:00
|
|
|
struct {
|
|
|
|
struct xfbtree *xfbtree;
|
|
|
|
} bc_mem;
|
2024-02-22 12:37:03 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
/* per-format private data */
|
2005-04-16 15:20:36 -07:00
|
|
|
union {
|
2024-02-22 12:37:03 -08:00
|
|
|
struct {
|
|
|
|
int allocated;
|
|
|
|
} bc_bmap; /* bmapbt */
|
|
|
|
struct {
|
|
|
|
unsigned int nr_ops; /* # record updates */
|
|
|
|
unsigned int shape_changes; /* # of extent splits */
|
|
|
|
} bc_refc; /* refcountbt */
|
2020-03-10 17:57:07 -07:00
|
|
|
};
|
2021-09-16 12:24:04 -07:00
|
|
|
|
|
|
|
/* Must be at the end of the struct! */
|
|
|
|
struct xfs_btree_level bc_levels[];
|
2021-09-16 12:18:47 -07:00
|
|
|
};
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2021-09-16 12:24:04 -07:00
|
|
|
/*
|
|
|
|
* Compute the size of a btree cursor that can handle a btree of a given
|
|
|
|
* height. The bc_levels array handles node and leaf blocks, so its size
|
|
|
|
* is exactly nlevels.
|
|
|
|
*/
|
|
|
|
static inline size_t
|
|
|
|
xfs_btree_cur_sizeof(unsigned int nlevels)
|
|
|
|
{
|
2023-05-22 14:18:13 -07:00
|
|
|
return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels);
|
2021-09-16 12:24:04 -07:00
|
|
|
}
|
|
|
|
|
2024-02-22 12:34:29 -08:00
|
|
|
/* cursor state flags */

/*
 * The root of this btree is a fakeroot structure so that we can stage a btree
 * rebuild without leaving it accessible via primary metadata.  The ops struct
 * is dynamically allocated and must be freed when the cursor is deleted.
 */
#define XFS_BTREE_STAGING		(1U << 0)

/* We are converting a delalloc reservation (only for bmbt btrees) */
#define XFS_BTREE_BMBT_WASDEL		(1U << 1)

/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */
#define XFS_BTREE_BMBT_INVALID_OWNER	(1U << 2)

/* Cursor is active (only for allocbt btrees) */
#define XFS_BTREE_ALLOCBT_ACTIVE	(1U << 3)
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * NOTE(review): presumably the values callers pass as the "error" argument
 * of xfs_btree_del_cursor() -- confirm against the callers.
 */
#define XFS_BTREE_NOERROR	0
#define XFS_BTREE_ERROR		1

/*
 * Convert from buffer to btree block header: reinterprets the buffer's
 * b_addr as a struct xfs_btree_block pointer.
 */
#define XFS_BUF_TO_BLOCK(bp)	((struct xfs_btree_block *)((bp)->b_addr))
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2024-02-22 12:40:57 -08:00
|
|
|
/*
 * Low-level checkers.  The block checker yields an xfs_failaddr_t; the
 * pointer checker yields an int.
 */
xfs_failaddr_t
__xfs_btree_check_block(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp);

int
__xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level);

/*
 * Check that block header is ok.
 *
 * @cur:   btree cursor
 * @block: generic btree block pointer
 * @level: level of the btree block
 * @bp:    buffer containing block, if any
 */
int xfs_btree_check_block(struct xfs_btree_cur *cur,
		struct xfs_btree_block *block, int level, struct xfs_buf *bp);
|
|
|
|
|
|
|
|
/*
 * Delete the btree cursor.
 *
 * @cur:   btree cursor
 * @error: del because of error
 */
void xfs_btree_del_cursor(struct xfs_btree_cur *cur, int error);

/*
 * Duplicate the btree cursor.
 * Allocate a new one, copy the record, re-get the buffers.
 *
 * @cur:  input cursor
 * @ncur: output cursor
 *
 * The int return value is an error code.
 */
int xfs_btree_dup_cursor(struct xfs_btree_cur *cur,
		struct xfs_btree_cur **ncur);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
 *
 * @fields:  bitmask of fields
 * @offsets: table of field offsets
 * @nbits:   number of bits to inspect
 * @first:   output: first byte offset
 * @last:    output: last byte offset
 */
void xfs_btree_offsets(uint32_t fields, const short *offsets, int nbits,
		int *first, int *last);
|
|
|
|
|
2012-11-13 16:40:27 -06:00
|
|
|
/*
|
|
|
|
* Initialise a new btree block header
|
|
|
|
*/
|
2024-02-22 12:35:17 -08:00
|
|
|
void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp,
|
2024-02-22 12:35:16 -08:00
|
|
|
const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs,
|
|
|
|
__u64 owner);
|
2024-02-22 12:35:17 -08:00
|
|
|
void xfs_btree_init_block(struct xfs_mount *mp,
|
2024-02-22 12:35:16 -08:00
|
|
|
struct xfs_btree_block *buf, const struct xfs_btree_ops *ops,
|
2024-02-22 12:35:19 -08:00
|
|
|
__u16 level, __u16 numrecs, __u64 owner);
|
2013-04-21 14:53:46 -05:00
|
|
|
|
2008-10-30 16:55:45 +11:00
|
|
|
/*
|
|
|
|
* Common btree core entry points.
|
|
|
|
*/
|
|
|
|
int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
|
2008-10-30 16:55:58 +11:00
|
|
|
int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
|
2008-10-30 16:56:09 +11:00
|
|
|
int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
|
2008-10-30 16:56:32 +11:00
|
|
|
int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
|
2008-10-30 16:57:28 +11:00
|
|
|
int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
|
2008-10-30 16:57:40 +11:00
|
|
|
int xfs_btree_insert(struct xfs_btree_cur *, int *);
|
2008-10-30 16:58:01 +11:00
|
|
|
int xfs_btree_delete(struct xfs_btree_cur *, int *);
|
2008-10-30 16:58:11 +11:00
|
|
|
int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
|
2017-06-16 11:00:05 -07:00
|
|
|
int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
|
xfs: recovery of swap extents operations for CRC filesystems
This is the recovery side of the btree block owner change operation
performed by swapext on CRC enabled filesystems. We detect that an
owner change is needed by the flag that has been placed on the inode
log format flag field. Because the inode recovery is being replayed
after the buffers that make up the BMBT in the given checkpoint, we
can walk all the buffers and directly modify them when we see the
flag set on an inode.
Because the inode can be relogged and hence present in multiple
checkpoints with the "change owner" flag set, we could do multiple
passes across the inode to do this change. While this isn't optimal,
we can't directly ignore the flag as there may be multiple
independent swap extent operations being replayed on the same inode
in different checkpoints so we can't ignore them.
Further, because the owner change operation uses ordered buffers, we
might have buffers that are newer on disk than the current
checkpoint and so already have the owner changed in them. Hence we
cannot just peek at a buffer in the tree and check that it has the
correct owner and assume that the change was completed.
So, for the moment just brute force the owner change every time we
see an inode with the flag set. Note that we have to be careful here
because the owner of the buffers may point to either the old owner
or the new owner. Currently the verifier can't verify the owner
directly, so there is no failure case here right now. If we verify
the owner exactly in future, then we'll have to take this into
account.
This was tested in terms of normal operation via xfstests - all of
the fsr tests now pass without failure. However, we really need to
modify xfs/227 to stress v3 inodes correctly to ensure we fully
cover this case for v5 filesystems.
In terms of recovery testing, I used a hacked version of xfs_fsr
that held the temp inode open for a few seconds before exiting so
that the filesystem could be shut down with an open owner change
recovery flags set on at least the temp inode. fsr leaves the temp
inode unlinked and in btree format, so this was necessary for the
owner change to be reliably replayed.
logprint confirmed the tmp inode in the log had the correct flag set:
INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88
INODE: #regs:3 ino:0x44 flags:0x209 dsize:88
^^^^^
0x200 is set, indicating a data fork owner change needed to be
replayed on inode 0x44. A printk in the recovery code confirmed that
the inode change was recovered:
XFS (vdc): Mounting Filesystem
XFS (vdc): Starting recovery (logdev: internal)
recovering owner change ino 0x44
XFS (vdc): Version 5 superblock detected. This kernel L support enabled!
Use of these features in this kernel is at your own risk!
XFS (vdc): Ending recovery (logdev: internal)
The script used to test this was:
$ cat ./recovery-fsr.sh
#!/bin/bash
dev=/dev/vdc
mntpt=/mnt/scratch
testfile=$mntpt/testfile
umount $mntpt
mkfs.xfs -f -m crc=1 $dev
mount $dev $mntpt
chmod 777 $mntpt
for i in `seq 10000 -1 0`; do
xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1
done
xfs_bmap -vp $testfile |head -20
xfs_fsr -d -v $testfile &
sleep 10
/home/dave/src/xfstests-dev/src/godown -f $mntpt
wait
umount $mntpt
xfs_logprint -t $dev |tail -20
time mount $dev $mntpt
xfs_bmap -vp $testfile
umount $mntpt
$
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 10:23:45 +10:00
|
|
|
struct list_head *buffer_list);
|
2008-10-30 16:55:45 +11:00
|
|
|
|
2013-04-21 14:53:46 -05:00
|
|
|
/*
 * btree block CRC helpers, in fsblock and agblock flavours.  The calc
 * variants return nothing; the verify variants return a bool.
 */
void	xfs_btree_fsblock_calc_crc(struct xfs_buf *bp);
bool	xfs_btree_fsblock_verify_crc(struct xfs_buf *bp);
void	xfs_btree_agblock_calc_crc(struct xfs_buf *bp);
bool	xfs_btree_agblock_verify_crc(struct xfs_buf *bp);
|
2013-04-21 14:53:46 -05:00
|
|
|
|
2008-10-30 16:58:21 +11:00
|
|
|
/*
 * Internal btree helpers also used by xfs_bmap.c.
 */
void	xfs_btree_log_block(struct xfs_btree_cur *cur, struct xfs_buf *bp,
		uint32_t);
void	xfs_btree_log_recs(struct xfs_btree_cur *cur, struct xfs_buf *bp,
		int, int);
|
|
|
|
|
2008-10-30 16:55:34 +11:00
|
|
|
/*
|
|
|
|
* Helpers.
|
|
|
|
*/
|
2021-08-12 09:56:49 -07:00
|
|
|
static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block)
|
2008-10-30 16:55:45 +11:00
|
|
|
{
|
|
|
|
return be16_to_cpu(block->bb_numrecs);
|
|
|
|
}
|
|
|
|
|
2008-10-30 16:56:43 +11:00
|
|
|
static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
|
2017-06-16 11:00:05 -07:00
|
|
|
uint16_t numrecs)
|
2008-10-30 16:56:43 +11:00
|
|
|
{
|
|
|
|
block->bb_numrecs = cpu_to_be16(numrecs);
|
|
|
|
}
|
|
|
|
|
2021-08-12 09:56:49 -07:00
|
|
|
static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
|
2008-10-30 16:55:34 +11:00
|
|
|
{
|
|
|
|
return be16_to_cpu(block->bb_level);
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
/*
 * Min and max functions for extlen, agblock, fileoff, and filblks types.
 * Each wraps min_t()/max_t() with the matching xfs type.
 */
#define XFS_EXTLEN_MIN(a, b)	min_t(xfs_extlen_t, (a), (b))
#define XFS_EXTLEN_MAX(a, b)	max_t(xfs_extlen_t, (a), (b))

#define XFS_AGBLOCK_MIN(a, b)	min_t(xfs_agblock_t, (a), (b))
#define XFS_AGBLOCK_MAX(a, b)	max_t(xfs_agblock_t, (a), (b))

#define XFS_FILEOFF_MIN(a, b)	min_t(xfs_fileoff_t, (a), (b))
#define XFS_FILEOFF_MAX(a, b)	max_t(xfs_fileoff_t, (a), (b))

#define XFS_FILBLKS_MIN(a, b)	min_t(xfs_filblks_t, (a), (b))
#define XFS_FILBLKS_MAX(a, b)	max_t(xfs_filblks_t, (a), (b))
|
2005-11-02 14:38:42 +11:00
|
|
|
|
2024-02-22 12:40:58 -08:00
|
|
|
/*
 * Buffer verifier helpers for agblock, fsblock and memblock btree blocks.
 * Each yields an xfs_failaddr_t.
 */
xfs_failaddr_t
xfs_btree_agblock_v5hdr_verify(
	struct xfs_buf		*bp);

xfs_failaddr_t
xfs_btree_agblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs);

xfs_failaddr_t
xfs_btree_fsblock_v5hdr_verify(
	struct xfs_buf		*bp,
	uint64_t		owner);

xfs_failaddr_t
xfs_btree_fsblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs);

xfs_failaddr_t
xfs_btree_memblock_verify(
	struct xfs_buf		*bp,
	unsigned int		max_recs);
|
2018-01-08 10:51:00 -08:00
|
|
|
|
2021-10-13 11:10:45 -07:00
|
|
|
/*
 * Geometry computations.  Each takes a table of limits plus a record or
 * block count.
 */
unsigned int
xfs_btree_compute_maxlevels(
	const unsigned int	*limits,
	unsigned long long	records);

unsigned long long
xfs_btree_calc_size(
	const unsigned int	*limits,
	unsigned long long	records);

unsigned int
xfs_btree_space_to_height(
	const unsigned int	*limits,
	unsigned long long	blocks);
|
2016-01-04 16:13:21 +11:00
|
|
|
|
2019-08-28 14:37:57 -07:00
|
|
|
/*
 * Query range iterator callback.
 *
 * Return 0 to continue iterating, and non-zero to stop iterating.  Any
 * non-zero value will be passed up to the _query_range caller.  The special
 * value -ECANCELED can be used to stop iteration, because _query_range never
 * generates that error code on its own.
 */
typedef int (*xfs_btree_query_range_fn)(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv);

int
xfs_btree_query_range(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low_rec,
	const union xfs_btree_irec	*high_rec,
	xfs_btree_query_range_fn	fn,
	void				*priv);

int
xfs_btree_query_all(
	struct xfs_btree_cur		*cur,
	xfs_btree_query_range_fn	fn,
	void				*priv);
|
2016-08-03 11:10:21 +10:00
|
|
|
|
2016-08-03 11:10:55 +10:00
|
|
|
/* Callback type taken by xfs_btree_visit_blocks(). */
typedef int (*xfs_btree_visit_blocks_fn)(
	struct xfs_btree_cur	*cur,
	int			level,
	void			*data);
|
2019-10-28 16:12:35 -07:00
|
|
|
/*
 * Flag bits for block visiting:
 *   RECORDS - visit record blocks
 *   LEAVES  - visit leaf blocks
 *   ALL     - visit all blocks (both bits set)
 */
#define XFS_BTREE_VISIT_RECORDS		(1 << 0)
#define XFS_BTREE_VISIT_LEAVES		(1 << 1)
#define XFS_BTREE_VISIT_ALL \
	(XFS_BTREE_VISIT_RECORDS | XFS_BTREE_VISIT_LEAVES)
|
2016-08-03 11:10:55 +10:00
|
|
|
int xfs_btree_visit_blocks(struct xfs_btree_cur *cur,
|
2019-10-28 16:12:35 -07:00
|
|
|
xfs_btree_visit_blocks_fn fn, unsigned int flags, void *data);
|
2016-08-03 11:10:55 +10:00
|
|
|
|
2024-12-02 10:57:26 -08:00
|
|
|
int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_filblks_t *blocks);
|
2016-09-19 10:25:20 +10:00
|
|
|
|
2017-06-16 11:00:07 -07:00
|
|
|
/* Address lookups for entry @n of @block: record, key, high key, pointer. */
union xfs_btree_rec *xfs_btree_rec_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);
union xfs_btree_key *xfs_btree_key_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);
union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);
union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n,
		struct xfs_btree_block *block);

/* Block getters for a given cursor level. */
int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level,
		const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp);
struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
		int level, struct xfs_buf **bpp);

/* Pointer predicates and comparisons. */
bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
		const union xfs_btree_ptr *ptr);
int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
		const union xfs_btree_ptr *a, const union xfs_btree_ptr *b);

/* Sibling pointer and key extraction from a block. */
void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
		struct xfs_btree_block *block, union xfs_btree_ptr *ptr,
		int lr);
void xfs_btree_get_keys(struct xfs_btree_cur *cur,
		struct xfs_btree_block *block, union xfs_btree_key *key);
union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
		union xfs_btree_key *key);
|
2023-04-11 19:00:10 -07:00
|
|
|
/* Callback type taking a cursor and two keys, returning bool. */
typedef bool (*xfs_btree_key_gap_fn)(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2);

/*
 * Takes a @low/@high in-core record pair and an optional key @mask; the
 * result is passed back through @outcome (enum xbtree_recpacking).
 */
int
xfs_btree_has_records(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_irec	*low,
	const union xfs_btree_irec	*high,
	const union xfs_btree_key	*mask,
	enum xbtree_recpacking		*outcome);

bool
xfs_btree_has_more_records(
	struct xfs_btree_cur	*cur);

struct xfs_ifork *
xfs_btree_ifork_ptr(
	struct xfs_btree_cur	*cur);
|
2017-06-16 11:00:07 -07:00
|
|
|
|
2023-04-11 19:00:10 -07:00
|
|
|
/* Key comparison helpers */
|
|
|
|
static inline bool
|
|
|
|
xfs_btree_keycmp_lt(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key1,
|
|
|
|
const union xfs_btree_key *key2)
|
|
|
|
{
|
2023-04-11 19:00:11 -07:00
|
|
|
return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0;
|
2023-04-11 19:00:10 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool
|
|
|
|
xfs_btree_keycmp_gt(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key1,
|
|
|
|
const union xfs_btree_key *key2)
|
|
|
|
{
|
2023-04-11 19:00:11 -07:00
|
|
|
return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0;
|
2023-04-11 19:00:10 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool
|
|
|
|
xfs_btree_keycmp_eq(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key1,
|
|
|
|
const union xfs_btree_key *key2)
|
|
|
|
{
|
2023-04-11 19:00:11 -07:00
|
|
|
return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0;
|
2023-04-11 19:00:10 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Is @key1 less than or equal to @key2, i.e. not greater than it? */
static inline bool
xfs_btree_keycmp_le(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2)
{
	if (xfs_btree_keycmp_gt(cur, key1, key2))
		return false;
	return true;
}
|
|
|
|
|
|
|
|
/* Is @key1 greater than or equal to @key2, i.e. not less than it? */
static inline bool
xfs_btree_keycmp_ge(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2)
{
	if (xfs_btree_keycmp_lt(cur, key1, key2))
		return false;
	return true;
}
|
|
|
|
|
|
|
|
/* Do @key1 and @key2 differ, i.e. are they not equal? */
static inline bool
xfs_btree_keycmp_ne(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2)
{
	if (xfs_btree_keycmp_eq(cur, key1, key2))
		return false;
	return true;
}
|
|
|
|
|
2023-04-11 19:00:11 -07:00
|
|
|
/* Masked key comparison helpers */
|
|
|
|
static inline bool
|
|
|
|
xfs_btree_masked_keycmp_lt(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key1,
|
|
|
|
const union xfs_btree_key *key2,
|
|
|
|
const union xfs_btree_key *mask)
|
|
|
|
{
|
|
|
|
return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool
|
|
|
|
xfs_btree_masked_keycmp_gt(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_key *key1,
|
|
|
|
const union xfs_btree_key *key2,
|
|
|
|
const union xfs_btree_key *mask)
|
|
|
|
{
|
|
|
|
return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Is @key1 greater than or equal to @key2 under @mask, i.e. not less than
 * it according to xfs_btree_masked_keycmp_lt()?
 */
static inline bool
xfs_btree_masked_keycmp_ge(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key1,
	const union xfs_btree_key	*key2,
	const union xfs_btree_key	*mask)
{
	if (xfs_btree_masked_keycmp_lt(cur, key1, key2, mask))
		return false;
	return true;
}
|
|
|
|
|
2019-11-06 08:47:09 -08:00
|
|
|
/* Does this cursor point to the last block in the given level? */
|
|
|
|
static inline bool
|
|
|
|
xfs_btree_islastblock(
|
2021-09-16 12:18:47 -07:00
|
|
|
struct xfs_btree_cur *cur,
|
2019-11-06 08:47:09 -08:00
|
|
|
int level)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
|
2024-02-22 12:35:36 -08:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
|
2019-11-06 08:47:09 -08:00
|
|
|
return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
|
|
|
|
return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
|
|
|
|
}
|
|
|
|
|
2020-03-11 10:51:50 -07:00
|
|
|
/* Pointer setup helpers. */
void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
		union xfs_btree_ptr *ptr);
void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur,
		union xfs_btree_ptr *ptr);

/* Get or read the buffer and block that @ptr refers to. */
int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
		const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
		struct xfs_buf **bpp);
int xfs_btree_read_buf_block(struct xfs_btree_cur *cur,
		const union xfs_btree_ptr *ptr, int flags,
		struct xfs_btree_block **block, struct xfs_buf **bpp);

/* Block header updates. */
void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
		struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
		int lr);
void xfs_btree_init_block_cur(struct xfs_btree_cur *cur,
		struct xfs_buf *bp, int level, int numrecs);

/* Bulk copies of pointers and keys. */
void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
		union xfs_btree_ptr *dst_ptr,
		const union xfs_btree_ptr *src_ptr, int numptrs);
void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
		union xfs_btree_key *dst_key,
		const union xfs_btree_key *src_key, int numkeys);
|
2020-03-11 10:51:50 -07:00
|
|
|
|
2021-09-16 12:25:32 -07:00
|
|
|
static inline struct xfs_btree_cur *
|
|
|
|
xfs_btree_alloc_cursor(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_trans *tp,
|
2024-02-22 12:33:18 -08:00
|
|
|
const struct xfs_btree_ops *ops,
|
2021-09-23 12:21:37 -07:00
|
|
|
uint8_t maxlevels,
|
2021-09-27 14:26:19 -07:00
|
|
|
struct kmem_cache *cache)
|
2021-09-16 12:25:32 -07:00
|
|
|
{
|
|
|
|
struct xfs_btree_cur *cur;
|
|
|
|
|
2024-02-22 12:35:36 -08:00
|
|
|
ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN ||
|
|
|
|
ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN);
|
|
|
|
|
2024-01-16 09:59:45 +11:00
|
|
|
/* BMBT allocations can come through from non-transactional context. */
|
|
|
|
cur = kmem_cache_zalloc(cache,
|
|
|
|
GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
|
2024-02-22 12:33:18 -08:00
|
|
|
cur->bc_ops = ops;
|
2021-09-16 12:25:32 -07:00
|
|
|
cur->bc_tp = tp;
|
|
|
|
cur->bc_mp = mp;
|
2021-09-16 12:27:24 -07:00
|
|
|
cur->bc_maxlevels = maxlevels;
|
2021-09-23 12:21:37 -07:00
|
|
|
cur->bc_cache = cache;
|
2021-09-16 12:25:32 -07:00
|
|
|
|
|
|
|
return cur;
|
|
|
|
}
|
|
|
|
|
2021-09-23 12:21:37 -07:00
|
|
|
/* Setup (marked __init) and teardown of the btree cursor caches. */
int __init
xfs_btree_init_cur_caches(void);

void
xfs_btree_destroy_cur_caches(void);

int
xfs_btree_goto_left_edge(
	struct xfs_btree_cur	*cur);
|
|
|
|
|
2024-02-22 12:37:24 -08:00
|
|
|
/* Does this level of the cursor point to the inode root (and not a block)? */
|
|
|
|
static inline bool
|
|
|
|
xfs_btree_at_iroot(
|
|
|
|
const struct xfs_btree_cur *cur,
|
|
|
|
int level)
|
|
|
|
{
|
|
|
|
return cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
|
|
|
|
level == cur->bc_nlevels - 1;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#endif /* __XFS_BTREE_H__ */
|