2018-06-06 02:42:14 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
2005-11-02 03:58:39 +00:00
|
|
|
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
|
|
|
#include "xfs.h"
|
2005-11-02 03:38:42 +00:00
|
|
|
#include "xfs_fs.h"
|
2013-10-22 23:36:05 +00:00
|
|
|
#include "xfs_shared.h"
|
2013-10-22 23:51:50 +00:00
|
|
|
#include "xfs_format.h"
|
2013-10-22 23:50:10 +00:00
|
|
|
#include "xfs_log_format.h"
|
|
|
|
#include "xfs_trans_resv.h"
|
2005-11-02 03:38:42 +00:00
|
|
|
#include "xfs_bit.h"
|
2005-04-16 22:20:36 +00:00
|
|
|
#include "xfs_mount.h"
|
|
|
|
#include "xfs_inode.h"
|
2013-10-22 23:50:10 +00:00
|
|
|
#include "xfs_trans.h"
|
2013-04-21 19:53:46 +00:00
|
|
|
#include "xfs_buf_item.h"
|
2005-11-02 03:38:42 +00:00
|
|
|
#include "xfs_btree.h"
|
2017-10-31 19:04:49 +00:00
|
|
|
#include "xfs_errortag.h"
|
2005-04-16 22:20:36 +00:00
|
|
|
#include "xfs_error.h"
|
2009-12-14 23:14:59 +00:00
|
|
|
#include "xfs_trace.h"
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served its
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distros won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performance overhead caused by switching stacks. The worst case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously been
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
#include "xfs_alloc.h"
|
2015-10-12 04:59:25 +00:00
|
|
|
#include "xfs_log.h"
|
2020-03-11 17:40:26 +00:00
|
|
|
#include "xfs_btree_staging.h"
|
2021-06-02 00:48:24 +00:00
|
|
|
#include "xfs_ag.h"
|
2021-09-23 19:21:37 +00:00
|
|
|
#include "xfs_alloc_btree.h"
|
|
|
|
#include "xfs_ialloc_btree.h"
|
|
|
|
#include "xfs_bmap_btree.h"
|
|
|
|
#include "xfs_rmap_btree.h"
|
|
|
|
#include "xfs_refcount_btree.h"
|
2024-02-22 20:32:09 +00:00
|
|
|
#include "xfs_health.h"
|
2024-02-22 20:43:35 +00:00
|
|
|
#include "xfs_buf_mem.h"
|
|
|
|
#include "xfs_btree_mem.h"
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Btree magic numbers.
|
|
|
|
*/
|
2017-06-16 18:00:05 +00:00
|
|
|
uint32_t
|
2017-01-28 07:16:38 +00:00
|
|
|
xfs_btree_magic(
|
2024-02-22 20:35:16 +00:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
const struct xfs_btree_ops *ops)
|
2017-01-28 07:16:38 +00:00
|
|
|
{
|
2024-02-22 20:35:16 +00:00
|
|
|
int idx = xfs_has_crc(mp) ? 1 : 0;
|
|
|
|
__be32 magic = ops->buf_ops->magic[idx];
|
2017-01-28 07:16:38 +00:00
|
|
|
|
|
|
|
/* Ensure we asked for crc for crc-only magics. */
|
|
|
|
ASSERT(magic != 0);
|
2024-02-22 20:35:16 +00:00
|
|
|
return be32_to_cpu(magic);
|
2017-01-28 07:16:38 +00:00
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2022-05-27 00:20:45 +00:00
|
|
|
/*
|
|
|
|
* These sibling pointer checks are optimised for null sibling pointers. This
|
|
|
|
* happens a lot, and we don't need to byte swap at runtime if the sibling
|
|
|
|
* pointer is NULL.
|
|
|
|
*
|
|
|
|
* These are explicitly marked at inline because the cost of calling them as
|
|
|
|
* functions instead of inlining them is about 36 bytes extra code per call site
|
|
|
|
* on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
|
|
|
|
* two sibling check functions reduces the compiled code size by over 300
|
|
|
|
* bytes.
|
|
|
|
*/
|
|
|
|
static inline xfs_failaddr_t
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_check_fsblock_siblings(
|
2022-05-04 02:13:35 +00:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
xfs_fsblock_t fsb,
|
2022-05-27 00:20:45 +00:00
|
|
|
__be64 dsibling)
|
2022-05-04 02:13:35 +00:00
|
|
|
{
|
2022-05-27 00:20:45 +00:00
|
|
|
xfs_fsblock_t sibling;
|
|
|
|
|
|
|
|
if (dsibling == cpu_to_be64(NULLFSBLOCK))
|
2022-05-04 02:13:35 +00:00
|
|
|
return NULL;
|
2022-05-27 00:20:45 +00:00
|
|
|
|
|
|
|
sibling = be64_to_cpu(dsibling);
|
2022-05-04 02:13:35 +00:00
|
|
|
if (sibling == fsb)
|
|
|
|
return __this_address;
|
2024-02-22 20:40:53 +00:00
|
|
|
if (!xfs_verify_fsbno(mp, sibling))
|
|
|
|
return __this_address;
|
2022-05-04 02:13:35 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
static inline xfs_failaddr_t
|
|
|
|
xfs_btree_check_memblock_siblings(
|
|
|
|
struct xfs_buftarg *btp,
|
|
|
|
xfbno_t bno,
|
|
|
|
__be64 dsibling)
|
|
|
|
{
|
|
|
|
xfbno_t sibling;
|
|
|
|
|
|
|
|
if (dsibling == cpu_to_be64(NULLFSBLOCK))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
sibling = be64_to_cpu(dsibling);
|
|
|
|
if (sibling == bno)
|
|
|
|
return __this_address;
|
|
|
|
if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
|
|
|
|
return __this_address;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2022-05-27 00:20:45 +00:00
|
|
|
static inline xfs_failaddr_t
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_check_agblock_siblings(
|
2022-07-07 09:13:02 +00:00
|
|
|
struct xfs_perag *pag,
|
2022-05-04 02:13:35 +00:00
|
|
|
xfs_agblock_t agbno,
|
2022-05-27 00:20:45 +00:00
|
|
|
__be32 dsibling)
|
2022-05-04 02:13:35 +00:00
|
|
|
{
|
2022-05-27 00:20:45 +00:00
|
|
|
xfs_agblock_t sibling;
|
|
|
|
|
|
|
|
if (dsibling == cpu_to_be32(NULLAGBLOCK))
|
2022-05-04 02:13:35 +00:00
|
|
|
return NULL;
|
2022-05-27 00:20:45 +00:00
|
|
|
|
|
|
|
sibling = be32_to_cpu(dsibling);
|
2022-05-04 02:13:35 +00:00
|
|
|
if (sibling == agbno)
|
|
|
|
return __this_address;
|
2024-02-22 20:40:52 +00:00
|
|
|
if (!xfs_verify_agbno(pag, sibling))
|
|
|
|
return __this_address;
|
2022-05-04 02:13:35 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:40:57 +00:00
|
|
|
static xfs_failaddr_t
|
2024-02-22 20:40:59 +00:00
|
|
|
__xfs_btree_check_lblock_hdr(
|
2017-10-18 04:37:33 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
int level,
|
|
|
|
struct xfs_buf *bp)
|
2008-10-30 05:54:53 +00:00
|
|
|
{
|
2017-10-18 04:37:33 +00:00
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
2013-04-21 19:53:46 +00:00
|
|
|
|
2024-02-22 20:40:56 +00:00
|
|
|
if (xfs_has_crc(mp)) {
|
2017-10-18 04:37:33 +00:00
|
|
|
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
|
|
|
|
return __this_address;
|
|
|
|
if (block->bb_u.l.bb_blkno !=
|
2021-08-19 01:47:05 +00:00
|
|
|
cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
|
2017-10-18 04:37:33 +00:00
|
|
|
return __this_address;
|
|
|
|
if (block->bb_u.l.bb_pad != cpu_to_be32(0))
|
|
|
|
return __this_address;
|
2013-04-21 19:53:46 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:16 +00:00
|
|
|
if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
|
2017-10-18 04:37:33 +00:00
|
|
|
return __this_address;
|
|
|
|
if (be16_to_cpu(block->bb_level) != level)
|
|
|
|
return __this_address;
|
|
|
|
if (be16_to_cpu(block->bb_numrecs) >
|
|
|
|
cur->bc_ops->get_maxrecs(cur, level))
|
|
|
|
return __this_address;
|
|
|
|
|
2024-02-22 20:40:59 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check a long btree block header. Return the address of the failing check,
|
|
|
|
* or NULL if everything is ok.
|
|
|
|
*/
|
|
|
|
static xfs_failaddr_t
|
|
|
|
__xfs_btree_check_fsblock(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
int level,
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
|
|
|
xfs_failaddr_t fa;
|
|
|
|
xfs_fsblock_t fsb;
|
|
|
|
|
|
|
|
fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
|
|
|
|
if (fa)
|
|
|
|
return fa;
|
|
|
|
|
2024-02-22 20:40:57 +00:00
|
|
|
/*
|
|
|
|
* For inode-rooted btrees, the root block sits in the inode fork. In
|
|
|
|
* that case bp is NULL, and the block must not have any siblings.
|
|
|
|
*/
|
|
|
|
if (!bp) {
|
|
|
|
if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
|
|
|
|
return __this_address;
|
|
|
|
if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
|
|
|
|
return __this_address;
|
|
|
|
return NULL;
|
|
|
|
}
|
2022-05-04 02:13:35 +00:00
|
|
|
|
2024-02-22 20:40:57 +00:00
|
|
|
fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_fsblock_siblings(mp, fsb,
|
|
|
|
block->bb_u.l.bb_leftsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
if (!fa)
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_fsblock_siblings(mp, fsb,
|
2022-05-27 00:20:45 +00:00
|
|
|
block->bb_u.l.bb_rightsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
return fa;
|
2017-10-18 04:37:33 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
/*
|
|
|
|
* Check an in-memory btree block header. Return the address of the failing
|
|
|
|
* check, or NULL if everything is ok.
|
|
|
|
*/
|
|
|
|
static xfs_failaddr_t
|
|
|
|
__xfs_btree_check_memblock(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
int level,
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
|
|
|
struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
|
|
|
|
xfs_failaddr_t fa;
|
|
|
|
xfbno_t bno;
|
|
|
|
|
|
|
|
fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
|
|
|
|
if (fa)
|
|
|
|
return fa;
|
|
|
|
|
|
|
|
bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
|
|
|
|
fa = xfs_btree_check_memblock_siblings(btp, bno,
|
|
|
|
block->bb_u.l.bb_leftsib);
|
|
|
|
if (!fa)
|
|
|
|
fa = xfs_btree_check_memblock_siblings(btp, bno,
|
|
|
|
block->bb_u.l.bb_rightsib);
|
|
|
|
return fa;
|
|
|
|
}
|
|
|
|
|
2017-10-18 04:37:33 +00:00
|
|
|
/*
|
|
|
|
* Check a short btree block header. Return the address of the failing check,
|
|
|
|
* or NULL if everything is ok.
|
|
|
|
*/
|
2024-02-22 20:40:57 +00:00
|
|
|
static xfs_failaddr_t
|
2024-02-22 20:40:58 +00:00
|
|
|
__xfs_btree_check_agblock(
|
2017-10-18 04:37:33 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
int level,
|
|
|
|
struct xfs_buf *bp)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2017-10-18 04:37:33 +00:00
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
2024-11-04 04:18:44 +00:00
|
|
|
struct xfs_perag *pag = to_perag(cur->bc_group);
|
2022-05-04 02:13:35 +00:00
|
|
|
xfs_failaddr_t fa;
|
2024-02-22 20:40:55 +00:00
|
|
|
xfs_agblock_t agbno;
|
2013-04-21 19:53:46 +00:00
|
|
|
|
2024-02-22 20:40:55 +00:00
|
|
|
if (xfs_has_crc(mp)) {
|
2017-10-18 04:37:33 +00:00
|
|
|
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
|
|
|
|
return __this_address;
|
2024-02-22 20:40:55 +00:00
|
|
|
if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
|
2017-10-18 04:37:33 +00:00
|
|
|
return __this_address;
|
2013-04-21 19:53:46 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:16 +00:00
|
|
|
if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
|
2017-10-18 04:37:33 +00:00
|
|
|
return __this_address;
|
|
|
|
if (be16_to_cpu(block->bb_level) != level)
|
|
|
|
return __this_address;
|
|
|
|
if (be16_to_cpu(block->bb_numrecs) >
|
|
|
|
cur->bc_ops->get_maxrecs(cur, level))
|
|
|
|
return __this_address;
|
|
|
|
|
2024-02-22 20:40:55 +00:00
|
|
|
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_agblock_siblings(pag, agbno,
|
2022-05-27 00:20:45 +00:00
|
|
|
block->bb_u.s.bb_leftsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
if (!fa)
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_agblock_siblings(pag, agbno,
|
2022-07-07 09:13:02 +00:00
|
|
|
block->bb_u.s.bb_rightsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
return fa;
|
2017-10-18 04:37:33 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:40:57 +00:00
|
|
|
/*
|
|
|
|
* Internal btree block check.
|
|
|
|
*
|
|
|
|
* Return NULL if the block is ok or the address of the failed check otherwise.
|
|
|
|
*/
|
|
|
|
xfs_failaddr_t
|
|
|
|
__xfs_btree_check_block(
|
2017-10-18 04:37:33 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
int level,
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2024-02-22 20:43:35 +00:00
|
|
|
switch (cur->bc_ops->type) {
|
|
|
|
case XFS_BTREE_TYPE_MEM:
|
|
|
|
return __xfs_btree_check_memblock(cur, block, level, bp);
|
|
|
|
case XFS_BTREE_TYPE_AG:
|
2024-02-22 20:40:58 +00:00
|
|
|
return __xfs_btree_check_agblock(cur, block, level, bp);
|
2024-02-22 20:43:35 +00:00
|
|
|
case XFS_BTREE_TYPE_INODE:
|
|
|
|
return __xfs_btree_check_fsblock(cur, block, level, bp);
|
|
|
|
default:
|
|
|
|
ASSERT(0);
|
|
|
|
return __this_address;
|
|
|
|
}
|
2024-02-22 20:40:57 +00:00
|
|
|
}
|
2017-10-18 04:37:33 +00:00
|
|
|
|
2024-02-22 20:40:57 +00:00
|
|
|
static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
|
|
|
|
{
|
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
|
|
|
|
return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
|
|
|
|
return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2008-10-30 05:54:53 +00:00
|
|
|
* Debug routine: check that block header is ok.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_btree_check_block(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
struct xfs_btree_block *block, /* generic btree block pointer */
|
|
|
|
int level, /* level of the btree block */
|
|
|
|
struct xfs_buf *bp) /* buffer containing block, if any */
|
|
|
|
{
|
2024-02-22 20:40:57 +00:00
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
|
|
|
xfs_failaddr_t fa;
|
|
|
|
|
|
|
|
fa = __xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (XFS_IS_CORRUPT(mp, fa != NULL) ||
|
|
|
|
XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
|
|
|
|
if (bp)
|
|
|
|
trace_xfs_btree_corrupt(bp, _RET_IP_);
|
|
|
|
xfs_btree_mark_sick(cur);
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
return 0;
|
2008-10-30 05:54:53 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:40:54 +00:00
|
|
|
int
|
|
|
|
__xfs_btree_check_ptr(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
int index,
|
|
|
|
int level)
|
2008-10-30 05:54:53 +00:00
|
|
|
{
|
2017-10-18 04:37:33 +00:00
|
|
|
if (level <= 0)
|
2024-02-22 20:40:54 +00:00
|
|
|
return -EFSCORRUPTED;
|
2008-10-30 05:54:53 +00:00
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
switch (cur->bc_ops->type) {
|
|
|
|
case XFS_BTREE_TYPE_MEM:
|
|
|
|
if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
|
|
|
|
be64_to_cpu((&ptr->l)[index])))
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_INODE:
|
2024-02-22 20:40:54 +00:00
|
|
|
if (!xfs_verify_fsbno(cur->bc_mp,
|
|
|
|
be64_to_cpu((&ptr->l)[index])))
|
|
|
|
return -EFSCORRUPTED;
|
2024-02-22 20:43:35 +00:00
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_AG:
|
2024-11-04 04:18:44 +00:00
|
|
|
if (!xfs_verify_agbno(to_perag(cur->bc_group),
|
2024-02-22 20:40:54 +00:00
|
|
|
be32_to_cpu((&ptr->s)[index])))
|
|
|
|
return -EFSCORRUPTED;
|
2024-02-22 20:43:35 +00:00
|
|
|
break;
|
2024-02-22 20:40:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 05:54:53 +00:00
|
|
|
/*
|
2017-10-18 04:37:33 +00:00
|
|
|
* Check that a given (indexed) btree pointer at a certain level of a
|
|
|
|
* btree is valid and doesn't point past where it should.
|
2008-10-30 05:54:53 +00:00
|
|
|
*/
|
2017-11-06 19:54:01 +00:00
|
|
|
static int
|
2008-10-30 05:54:53 +00:00
|
|
|
xfs_btree_check_ptr(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
int index,
|
|
|
|
int level)
|
2008-10-30 05:54:53 +00:00
|
|
|
{
|
2024-02-22 20:40:54 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
error = __xfs_btree_check_ptr(cur, ptr, index, level);
|
|
|
|
if (error) {
|
2024-02-22 20:43:35 +00:00
|
|
|
switch (cur->bc_ops->type) {
|
|
|
|
case XFS_BTREE_TYPE_MEM:
|
|
|
|
xfs_err(cur->bc_mp,
|
|
|
|
"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
|
|
|
|
cur->bc_ops->name, cur->bc_flags, level, index,
|
|
|
|
__this_address);
|
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_INODE:
|
2024-02-22 20:40:54 +00:00
|
|
|
xfs_err(cur->bc_mp,
|
2024-02-22 20:39:47 +00:00
|
|
|
"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
|
2020-03-11 00:52:53 +00:00
|
|
|
cur->bc_ino.ip->i_ino,
|
2024-02-22 20:39:47 +00:00
|
|
|
cur->bc_ino.whichfork, cur->bc_ops->name,
|
2018-06-03 23:10:12 +00:00
|
|
|
level, index);
|
2024-02-22 20:43:35 +00:00
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_AG:
|
2024-02-22 20:40:54 +00:00
|
|
|
xfs_err(cur->bc_mp,
|
2024-02-22 20:39:47 +00:00
|
|
|
"AG %u: Corrupt %sbt pointer at level %d index %d.",
|
2024-11-04 04:18:44 +00:00
|
|
|
cur->bc_group->xg_gno, cur->bc_ops->name,
|
2018-06-03 23:10:12 +00:00
|
|
|
level, index);
|
2024-02-22 20:43:35 +00:00
|
|
|
break;
|
2024-02-22 20:40:54 +00:00
|
|
|
}
|
|
|
|
xfs_btree_mark_sick(cur);
|
2008-10-30 05:54:53 +00:00
|
|
|
}
|
2017-10-18 04:37:33 +00:00
|
|
|
|
2024-02-22 20:40:54 +00:00
|
|
|
return error;
|
2008-10-30 05:54:53 +00:00
|
|
|
}
|
2018-06-04 04:10:48 +00:00
|
|
|
|
|
|
|
/*
 * Debug-only pointer checking: DEBUG builds map this onto
 * xfs_btree_check_ptr(); all other builds expand it to a constant 0.
 */
#if defined(DEBUG)
# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
#else
# define xfs_btree_debug_check_ptr(...)	(0)
#endif
|
2008-10-30 05:54:53 +00:00
|
|
|
|
2013-04-21 19:53:46 +00:00
|
|
|
/*
|
|
|
|
* Calculate CRC on the whole btree block and stuff it into the
|
|
|
|
* long-form btree header.
|
|
|
|
*
|
|
|
|
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
|
2015-10-12 05:02:32 +00:00
|
|
|
* it into the buffer so recovery knows what the last modification was that made
|
2013-04-21 19:53:46 +00:00
|
|
|
* it to disk.
|
|
|
|
*/
|
|
|
|
void
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_fsblock_calc_crc(
|
2013-04-21 19:53:46 +00:00
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
2018-01-24 21:38:48 +00:00
|
|
|
struct xfs_buf_log_item *bip = bp->b_log_item;
|
2013-04-21 19:53:46 +00:00
|
|
|
|
2021-08-19 01:46:55 +00:00
|
|
|
if (!xfs_has_crc(bp->b_mount))
|
2013-04-21 19:53:46 +00:00
|
|
|
return;
|
|
|
|
if (bip)
|
|
|
|
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
|
2014-02-27 04:18:23 +00:00
|
|
|
xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
|
2013-04-21 19:53:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_fsblock_verify_crc(
|
2013-04-21 19:53:46 +00:00
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2015-10-12 04:59:25 +00:00
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
2019-06-29 02:27:29 +00:00
|
|
|
struct xfs_mount *mp = bp->b_mount;
|
2015-10-12 04:59:25 +00:00
|
|
|
|
2021-08-19 01:46:37 +00:00
|
|
|
if (xfs_has_crc(mp)) {
|
2015-10-12 04:59:25 +00:00
|
|
|
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
|
|
|
|
return false;
|
2014-02-27 04:17:27 +00:00
|
|
|
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
|
2015-10-12 04:59:25 +00:00
|
|
|
}
|
2014-02-27 04:17:27 +00:00
|
|
|
|
2013-04-21 19:53:46 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate CRC on the whole btree block and stuff it into the
|
|
|
|
* short-form btree header.
|
|
|
|
*
|
|
|
|
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
|
2015-10-12 05:02:32 +00:00
|
|
|
* it into the buffer so recovery knows what the last modification was that made
|
2013-04-21 19:53:46 +00:00
|
|
|
* it to disk.
|
|
|
|
*/
|
|
|
|
void
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_agblock_calc_crc(
|
2013-04-21 19:53:46 +00:00
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
2018-01-24 21:38:48 +00:00
|
|
|
struct xfs_buf_log_item *bip = bp->b_log_item;
|
2013-04-21 19:53:46 +00:00
|
|
|
|
2021-08-19 01:46:55 +00:00
|
|
|
if (!xfs_has_crc(bp->b_mount))
|
2013-04-21 19:53:46 +00:00
|
|
|
return;
|
|
|
|
if (bip)
|
|
|
|
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
|
2014-02-27 04:18:23 +00:00
|
|
|
xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
|
2013-04-21 19:53:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_agblock_verify_crc(
|
2013-04-21 19:53:46 +00:00
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2015-10-12 04:59:25 +00:00
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
2019-06-29 02:27:29 +00:00
|
|
|
struct xfs_mount *mp = bp->b_mount;
|
2015-10-12 04:59:25 +00:00
|
|
|
|
2021-08-19 01:46:37 +00:00
|
|
|
if (xfs_has_crc(mp)) {
|
2015-10-12 04:59:25 +00:00
|
|
|
if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
|
2018-11-30 15:55:57 +00:00
|
|
|
return false;
|
2014-02-27 04:17:27 +00:00
|
|
|
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
|
2015-10-12 04:59:25 +00:00
|
|
|
}
|
2014-02-27 04:17:27 +00:00
|
|
|
|
2013-04-21 19:53:46 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-02-08 03:58:07 +00:00
|
|
|
static int
|
|
|
|
xfs_btree_free_block(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2024-02-22 20:33:06 +00:00
|
|
|
trace_xfs_btree_free_block(cur, bp);
|
|
|
|
|
2024-02-22 20:37:35 +00:00
|
|
|
/*
|
|
|
|
* Don't allow block freeing for a staging cursor, because staging
|
|
|
|
* cursors do not support regular btree modifications.
|
|
|
|
*/
|
|
|
|
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
|
|
|
|
ASSERT(0);
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
2016-02-08 03:58:07 +00:00
|
|
|
error = cur->bc_ops->free_block(cur, bp);
|
2016-02-08 03:58:07 +00:00
|
|
|
if (!error) {
|
|
|
|
xfs_trans_binval(cur->bc_tp, bp);
|
2016-02-08 03:58:07 +00:00
|
|
|
XFS_BTREE_STATS_INC(cur, free);
|
2016-02-08 03:58:07 +00:00
|
|
|
}
|
2016-02-08 03:58:07 +00:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Delete the btree cursor.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_btree_del_cursor(
|
2021-02-11 16:46:38 +00:00
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
int error) /* del because of error */
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2021-02-11 16:46:38 +00:00
|
|
|
int i; /* btree level */
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
/*
|
2021-02-11 16:46:38 +00:00
|
|
|
* Clear the buffer pointers and release the buffers. If we're doing
|
|
|
|
* this because of an error, inspect all of the entries in the bc_bufs
|
|
|
|
* array for buffers to be unlocked. This is because some of the btree
|
|
|
|
* code works from level n down to 0, and if we get an error along the
|
|
|
|
* way we won't have initialized all the entries down to 0.
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
|
|
|
for (i = 0; i < cur->bc_nlevels; i++) {
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[i].bp)
|
|
|
|
xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
|
2005-04-16 22:20:36 +00:00
|
|
|
else if (!error)
|
|
|
|
break;
|
|
|
|
}
|
2021-02-11 16:46:38 +00:00
|
|
|
|
xfs: assert in xfs_btree_del_cursor should take into account error
xfs/538 on a 1kB block filesystem failed with this assert:
XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || xfs_is_shutdown(cur->bc_mp), file: fs/xfs/libxfs/xfs_btree.c, line: 448
The problem was that an allocation failed unexpectedly in
xfs_bmbt_alloc_block() after roughly 150,000 minlen allocation error
injections, resulting in an EFSCORRUPTED error being returned to
xfs_bmapi_write(). The error occurred on extent-to-btree format
conversion allocating the new root block:
RIP: 0010:xfs_bmbt_alloc_block+0x177/0x210
Call Trace:
<TASK>
xfs_btree_new_iroot+0xdf/0x520
xfs_btree_make_block_unfull+0x10d/0x1c0
xfs_btree_insrec+0x364/0x790
xfs_btree_insert+0xaa/0x210
xfs_bmap_add_extent_hole_real+0x1fe/0x9a0
xfs_bmapi_allocate+0x34c/0x420
xfs_bmapi_write+0x53c/0x9c0
xfs_alloc_file_space+0xee/0x320
xfs_file_fallocate+0x36b/0x450
vfs_fallocate+0x148/0x340
__x64_sys_fallocate+0x3c/0x70
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa
Why the allocation failed at this point is unknown, but is likely
that we ran the transaction out of reserved space and filesystem out
of space with bmbt blocks because of all the minlen allocations
being done causing worst case fragmentation of a large allocation.
Regardless of the cause, we've then called xfs_bmapi_finish() which
calls xfs_btree_del_cursor(cur, error) to tear down the cursor.
So we have a failed operation, error != 0, cur->bc_ino.allocated > 0
and the filesystem is still up. The assert fails to take into
account that allocation can fail with an error and the transaction
teardown will shut the filesystem down if necessary. i.e. the
assert needs to check "|| error != 0" as well, because at this point
shutdown is pending because the current transaction is dirty....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-27 00:21:09 +00:00
|
|
|
/*
|
|
|
|
* If we are doing a BMBT update, the number of unaccounted blocks
|
|
|
|
* allocated during this cursor life time should be zero. If it's not
|
|
|
|
* zero, then we should be shut down or on our way to shutdown due to
|
|
|
|
* cancelling a dirty transaction on error.
|
|
|
|
*/
|
2024-02-22 20:40:51 +00:00
|
|
|
ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
|
xfs: assert in xfs_btree_del_cursor should take into account error
xfs/538 on a 1kB block filesystem failed with this assert:
XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || xfs_is_shutdown(cur->bc_mp), file: fs/xfs/libxfs/xfs_btree.c, line: 448
The problem was that an allocation failed unexpectedly in
xfs_bmbt_alloc_block() after roughly 150,000 minlen allocation error
injections, resulting in an EFSCORRUPTED error being returned to
xfs_bmapi_write(). The error occurred on extent-to-btree format
conversion allocating the new root block:
RIP: 0010:xfs_bmbt_alloc_block+0x177/0x210
Call Trace:
<TASK>
xfs_btree_new_iroot+0xdf/0x520
xfs_btree_make_block_unfull+0x10d/0x1c0
xfs_btree_insrec+0x364/0x790
xfs_btree_insert+0xaa/0x210
xfs_bmap_add_extent_hole_real+0x1fe/0x9a0
xfs_bmapi_allocate+0x34c/0x420
xfs_bmapi_write+0x53c/0x9c0
xfs_alloc_file_space+0xee/0x320
xfs_file_fallocate+0x36b/0x450
vfs_fallocate+0x148/0x340
__x64_sys_fallocate+0x3c/0x70
do_syscall_64+0x35/0x80
entry_SYSCALL_64_after_hwframe+0x44/0xa
Why the allocation failed at this point is unknown, but is likely
that we ran the transaction out of reserved space and filesystem out
of space with bmbt blocks because of all the minlen allocations
being done causing worst case fragmentation of a large allocation.
Regardless of the cause, we've then called xfs_bmapi_finish() which
calls xfs_btree_del_cursor(cur, error) to tear down the cursor.
So we have a failed operation, error != 0, cur->bc_ino.allocated > 0
and the filesystem is still up. The assert fails to take into
account that allocation can fail with an error and the transaction
teardown will shut the filesystem down if necessary. i.e. the
assert needs to check "|| error != 0" as well, because at this point
shutdown is pending because the current transaction is dirty....
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2022-05-27 00:21:09 +00:00
|
|
|
xfs_is_shutdown(cur->bc_mp) || error != 0);
|
2024-02-22 20:36:17 +00:00
|
|
|
|
2024-11-04 04:18:44 +00:00
|
|
|
if (cur->bc_group)
|
|
|
|
xfs_group_put(cur->bc_group);
|
2021-09-23 19:21:37 +00:00
|
|
|
kmem_cache_free(cur->bc_cache, cur);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
/* Return the buffer target for this btree's buffer. */
|
|
|
|
static inline struct xfs_buftarg *
|
|
|
|
xfs_btree_buftarg(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
|
|
|
if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
|
|
|
|
return cur->bc_mem.xfbtree->target;
|
|
|
|
return cur->bc_mp->m_ddev_targp;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the block size (in units of 512b sectors) for this btree. */
|
|
|
|
static inline unsigned int
|
|
|
|
xfs_btree_bbsize(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
|
|
|
if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
|
|
|
|
return XFBNO_BBSIZE;
|
|
|
|
return cur->bc_mp->m_bsize;
|
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Duplicate the btree cursor.
|
|
|
|
* Allocate a new one, copy the record, re-get the buffers.
|
|
|
|
*/
|
2024-02-22 20:37:35 +00:00
|
|
|
int /* error */
|
2005-04-16 22:20:36 +00:00
|
|
|
xfs_btree_dup_cursor(
|
2024-02-22 20:37:35 +00:00
|
|
|
struct xfs_btree_cur *cur, /* input cursor */
|
|
|
|
struct xfs_btree_cur **ncur) /* output cursor */
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2024-02-22 20:37:35 +00:00
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
|
|
|
struct xfs_trans *tp = cur->bc_tp;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
struct xfs_btree_cur *new;
|
|
|
|
int error;
|
|
|
|
int i;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2024-02-22 20:37:35 +00:00
|
|
|
/*
|
|
|
|
* Don't allow staging cursors to be duplicated because they're supposed
|
|
|
|
* to be kept private to a single thread.
|
|
|
|
*/
|
|
|
|
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
|
|
|
|
ASSERT(0);
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
2008-10-30 05:53:59 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Allocate a new cursor like the old one.
|
|
|
|
*/
|
2008-10-30 05:53:59 +00:00
|
|
|
new = cur->bc_ops->dup_cursor(cur);
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Copy the record currently in the cursor.
|
|
|
|
*/
|
|
|
|
new->bc_rec = cur->bc_rec;
|
2008-10-30 05:53:59 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* For each level current, re-get the buffer and copy the ptr value.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < new->bc_nlevels; i++) {
|
2021-09-16 19:24:04 +00:00
|
|
|
new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
|
|
|
|
new->bc_levels[i].ra = cur->bc_levels[i].ra;
|
|
|
|
bp = cur->bc_levels[i].bp;
|
2012-11-12 11:54:01 +00:00
|
|
|
if (bp) {
|
2024-02-22 20:43:35 +00:00
|
|
|
error = xfs_trans_read_buf(mp, tp,
|
|
|
|
xfs_btree_buftarg(cur),
|
|
|
|
xfs_buf_daddr(bp),
|
|
|
|
xfs_btree_bbsize(cur), 0, &bp,
|
|
|
|
cur->bc_ops->buf_ops);
|
2024-02-22 20:32:09 +00:00
|
|
|
if (xfs_metadata_is_sick(error))
|
|
|
|
xfs_btree_mark_sick(new);
|
2012-11-12 11:54:01 +00:00
|
|
|
if (error) {
|
2005-04-16 22:20:36 +00:00
|
|
|
xfs_btree_del_cursor(new, error);
|
|
|
|
*ncur = NULL;
|
|
|
|
return error;
|
|
|
|
}
|
2013-04-21 19:53:46 +00:00
|
|
|
}
|
2021-09-16 19:24:04 +00:00
|
|
|
new->bc_levels[i].bp = bp;
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
*ncur = new;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:55:34 +00:00
|
|
|
/*
|
|
|
|
* XFS btree block layout and addressing:
|
|
|
|
*
|
|
|
|
* There are two types of blocks in the btree: leaf and non-leaf blocks.
|
|
|
|
*
|
|
|
|
* A leaf block starts with a header, followed by records containing
|
|
|
|
* the values. A non-leaf block also starts with the same header, and
|
|
|
|
* then first contains lookup keys followed by an equal number of pointers
|
|
|
|
* to the btree blocks at the previous level.
|
|
|
|
*
|
|
|
|
* +--------+-------+-------+-------+-------+-------+-------+
|
|
|
|
* Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
|
|
|
|
* +--------+-------+-------+-------+-------+-------+-------+
|
|
|
|
*
|
|
|
|
* +--------+-------+-------+-------+-------+-------+-------+
|
|
|
|
* Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
|
|
|
|
* +--------+-------+-------+-------+-------+-------+-------+
|
|
|
|
*
|
|
|
|
* The header is called struct xfs_btree_block for reasons better left unknown
|
|
|
|
* and comes in different versions for short (32bit) and long (64bit) block
|
|
|
|
* pointers. The record and key structures are defined by the btree instances
|
|
|
|
* and opaque to the btree core. The block pointers are simple disk endian
|
|
|
|
* integers, available in a short (32bit) and long (64bit) variant.
|
|
|
|
*
|
|
|
|
* The helpers below calculate the offset of a given record, key or pointer
|
|
|
|
* into a btree block (xfs_btree_*_offset) or return a pointer to the given
|
|
|
|
* record, key or pointer (xfs_btree_*_addr). Note that all addressing
|
|
|
|
* inside the btree block is done using indices starting at one, not zero!
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
*
|
2024-02-22 20:34:29 +00:00
|
|
|
* If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
* overlapping intervals. In such a tree, records are still sorted lowest to
|
|
|
|
* highest and indexed by the smallest key value that refers to the record.
|
|
|
|
* However, nodes are different: each pointer has two associated keys -- one
|
|
|
|
* indexing the lowest key available in the block(s) below (the same behavior
|
|
|
|
* as the key in a regular btree) and another indexing the highest key
|
|
|
|
* available in the block(s) below. Because records are /not/ sorted by the
|
|
|
|
* highest key, all leaf block updates require us to compute the highest key
|
|
|
|
* that matches any record in the leaf and to recursively update the high keys
|
|
|
|
* in the nodes going further up in the tree, if necessary. Nodes look like
|
|
|
|
* this:
|
|
|
|
*
|
|
|
|
* +--------+-----+-----+-----+-----+-----+-------+-------+-----+
|
|
|
|
* Non-Leaf: | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
|
|
|
|
* +--------+-----+-----+-----+-----+-----+-------+-------+-----+
|
|
|
|
*
|
|
|
|
* To perform an interval query on an overlapped tree, perform the usual
|
|
|
|
* depth-first search and use the low and high keys to decide if we can skip
|
|
|
|
* that particular node. If a leaf node is reached, return the records that
|
|
|
|
* intersect the interval. Note that an interval query may return numerous
|
|
|
|
* entries. For a non-overlapped tree, simply search for the record associated
|
|
|
|
* with the lowest key and iterate forward until a non-matching record is
|
|
|
|
* found. Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
|
|
|
|
* Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discusses this in
|
|
|
|
* more detail.
|
|
|
|
*
|
|
|
|
* Why do we care about overlapping intervals? Let's say you have a bunch of
|
|
|
|
* reverse mapping records on a reflink filesystem:
|
|
|
|
*
|
|
|
|
* 1: +- file A startblock B offset C length D -----------+
|
|
|
|
* 2: +- file E startblock F offset G length H --------------+
|
|
|
|
* 3: +- file I startblock F offset J length K --+
|
|
|
|
* 4: +- file L... --+
|
|
|
|
*
|
|
|
|
* Now say we want to map block (B+D) into file A at offset (C+D). Ideally,
|
|
|
|
* we'd simply increment the length of record 1. But how do we find the record
|
|
|
|
* that ends at (B+D-1) (i.e. record 1)? A LE lookup of (B+D-1) would return
|
|
|
|
* record 3 because the keys are ordered first by startblock. An interval
|
|
|
|
* query would return records 1 and 2 because they both overlap (B+D-1), and
|
|
|
|
* from that we can pick out record 1 as the appropriate left neighbor.
|
|
|
|
*
|
|
|
|
* In the non-overlapped case you can do a LE lookup and decrement the cursor
|
|
|
|
* because a record's interval must end before the next record.
|
2008-10-30 05:55:34 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return size of the btree block header for this btree instance.
|
|
|
|
*/
|
|
|
|
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
|
|
|
|
{
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
|
2024-02-22 20:34:12 +00:00
|
|
|
if (xfs_has_crc(cur->bc_mp))
|
2013-04-21 19:53:46 +00:00
|
|
|
return XFS_BTREE_LBLOCK_CRC_LEN;
|
|
|
|
return XFS_BTREE_LBLOCK_LEN;
|
|
|
|
}
|
2024-02-22 20:34:12 +00:00
|
|
|
if (xfs_has_crc(cur->bc_mp))
|
2013-04-21 19:53:46 +00:00
|
|
|
return XFS_BTREE_SBLOCK_CRC_LEN;
|
|
|
|
return XFS_BTREE_SBLOCK_LEN;
|
2008-10-30 05:55:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate offset of the n-th record in a btree block.
|
|
|
|
*/
|
|
|
|
STATIC size_t
|
|
|
|
xfs_btree_rec_offset(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int n)
|
|
|
|
{
|
|
|
|
return xfs_btree_block_len(cur) +
|
|
|
|
(n - 1) * cur->bc_ops->rec_len;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate offset of the n-th key in a btree block.
|
|
|
|
*/
|
|
|
|
STATIC size_t
|
|
|
|
xfs_btree_key_offset(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int n)
|
|
|
|
{
|
|
|
|
return xfs_btree_block_len(cur) +
|
|
|
|
(n - 1) * cur->bc_ops->key_len;
|
|
|
|
}
|
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/*
|
|
|
|
* Calculate offset of the n-th high key in a btree block.
|
|
|
|
*/
|
|
|
|
STATIC size_t
|
|
|
|
xfs_btree_high_key_offset(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int n)
|
|
|
|
{
|
|
|
|
return xfs_btree_block_len(cur) +
|
|
|
|
(n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:55:34 +00:00
|
|
|
/*
|
|
|
|
* Calculate offset of the n-th block pointer in a btree block.
|
|
|
|
*/
|
|
|
|
STATIC size_t
|
|
|
|
xfs_btree_ptr_offset(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int n,
|
|
|
|
int level)
|
|
|
|
{
|
|
|
|
return xfs_btree_block_len(cur) +
|
|
|
|
cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
|
2024-02-22 20:35:36 +00:00
|
|
|
(n - 1) * cur->bc_ops->ptr_len;
|
2008-10-30 05:55:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Address of record number n (counting from one) inside the given btree
 * block.
 */
union xfs_btree_rec *
xfs_btree_rec_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	char			*base = (char *)block;

	return (union xfs_btree_rec *)(base + xfs_btree_rec_offset(cur, n));
}
|
|
|
|
|
|
|
|
/*
 * Address of key number n (counting from one) inside the given btree block.
 */
union xfs_btree_key *
xfs_btree_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	char			*base = (char *)block;

	return (union xfs_btree_key *)(base + xfs_btree_key_offset(cur, n));
}
|
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/*
 * Address of high key number n (counting from one) inside the given btree
 * block.
 */
union xfs_btree_key *
xfs_btree_high_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	char			*base = (char *)block;

	return (union xfs_btree_key *)
		(base + xfs_btree_high_key_offset(cur, n));
}
|
|
|
|
|
2008-10-30 05:55:34 +00:00
|
|
|
/*
|
|
|
|
* Return a pointer to the n-th block pointer in the btree block.
|
|
|
|
*/
|
2017-06-16 18:00:07 +00:00
|
|
|
union xfs_btree_ptr *
|
2008-10-30 05:55:34 +00:00
|
|
|
xfs_btree_ptr_addr(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int n,
|
|
|
|
struct xfs_btree_block *block)
|
|
|
|
{
|
|
|
|
int level = xfs_btree_get_level(block);
|
|
|
|
|
|
|
|
ASSERT(block->bb_level != 0);
|
|
|
|
|
|
|
|
return (union xfs_btree_ptr *)
|
|
|
|
((char *)block + xfs_btree_ptr_offset(cur, n, level));
|
|
|
|
}
|
|
|
|
|
2020-03-11 17:42:34 +00:00
|
|
|
struct xfs_ifork *
|
|
|
|
xfs_btree_ifork_ptr(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
2024-02-22 20:36:17 +00:00
|
|
|
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
|
2020-03-11 17:42:34 +00:00
|
|
|
|
|
|
|
if (cur->bc_flags & XFS_BTREE_STAGING)
|
|
|
|
return cur->bc_ino.ifake->if_fork;
|
2022-07-09 17:56:05 +00:00
|
|
|
return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
|
2020-03-11 17:42:34 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 05:54:22 +00:00
|
|
|
/*
|
2013-08-07 10:11:05 +00:00
|
|
|
* Get the root block which is stored in the inode.
|
2008-10-30 05:54:22 +00:00
|
|
|
*
|
|
|
|
* For now this btree implementation assumes the btree root is always
|
|
|
|
* stored in the if_broot field of an inode fork.
|
|
|
|
*/
|
|
|
|
STATIC struct xfs_btree_block *
|
|
|
|
xfs_btree_get_iroot(
|
2016-07-20 00:37:50 +00:00
|
|
|
struct xfs_btree_cur *cur)
|
2008-10-30 05:54:22 +00:00
|
|
|
{
|
2020-03-11 17:42:34 +00:00
|
|
|
struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
|
2008-10-30 05:54:22 +00:00
|
|
|
|
2016-07-20 00:37:50 +00:00
|
|
|
return (struct xfs_btree_block *)ifp->if_broot;
|
2008-10-30 05:54:22 +00:00
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Retrieve the block pointer from the cursor at the given level.
|
2008-10-30 05:54:22 +00:00
|
|
|
* This may be an inode btree root or from a buffer.
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
2017-06-16 18:00:07 +00:00
|
|
|
struct xfs_btree_block * /* generic btree block pointer */
|
2005-04-16 22:20:36 +00:00
|
|
|
xfs_btree_get_block(
|
2008-10-30 05:54:22 +00:00
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
2005-04-16 22:20:36 +00:00
|
|
|
int level, /* level in btree */
|
2008-10-30 05:54:22 +00:00
|
|
|
struct xfs_buf **bpp) /* buffer containing the block */
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2024-02-22 20:37:24 +00:00
|
|
|
if (xfs_btree_at_iroot(cur, level)) {
|
2008-10-30 05:54:22 +00:00
|
|
|
*bpp = NULL;
|
|
|
|
return xfs_btree_get_iroot(cur);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
2008-10-30 05:54:22 +00:00
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
*bpp = cur->bc_levels[level].bp;
|
2008-10-30 05:54:22 +00:00
|
|
|
return XFS_BUF_TO_BLOCK(*bpp);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2008-08-13 06:23:50 +00:00
|
|
|
/*
|
|
|
|
* Change the cursor to point to the first record at the given level.
|
|
|
|
* Other levels are unaffected.
|
|
|
|
*/
|
2008-10-30 05:58:41 +00:00
|
|
|
STATIC int /* success=1, failure=0 */
|
2008-08-13 06:23:50 +00:00
|
|
|
xfs_btree_firstrec(
|
2021-09-16 19:18:47 +00:00
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
2008-08-13 06:23:50 +00:00
|
|
|
int level) /* level to change */
|
|
|
|
{
|
2008-10-30 06:14:34 +00:00
|
|
|
struct xfs_btree_block *block; /* generic btree block pointer */
|
2020-12-17 00:07:34 +00:00
|
|
|
struct xfs_buf *bp; /* buffer containing block */
|
2008-08-13 06:23:50 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the block pointer for this level.
|
|
|
|
*/
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
2017-07-17 21:30:45 +00:00
|
|
|
if (xfs_btree_check_block(cur, block, level, bp))
|
|
|
|
return 0;
|
2008-08-13 06:23:50 +00:00
|
|
|
/*
|
|
|
|
* It's empty, there is no such record.
|
|
|
|
*/
|
2008-10-30 05:53:47 +00:00
|
|
|
if (!block->bb_numrecs)
|
2008-08-13 06:23:50 +00:00
|
|
|
return 0;
|
|
|
|
/*
|
|
|
|
* Set the ptr value to 1, that's the first record/key.
|
|
|
|
*/
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr = 1;
|
2008-08-13 06:23:50 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Change the cursor to point to the last record in the current block
|
|
|
|
* at the given level. Other levels are unaffected.
|
|
|
|
*/
|
2008-10-30 05:58:41 +00:00
|
|
|
STATIC int /* success=1, failure=0 */
|
2005-04-16 22:20:36 +00:00
|
|
|
xfs_btree_lastrec(
|
2021-09-16 19:18:47 +00:00
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
2005-04-16 22:20:36 +00:00
|
|
|
int level) /* level to change */
|
|
|
|
{
|
2008-10-30 06:14:34 +00:00
|
|
|
struct xfs_btree_block *block; /* generic btree block pointer */
|
2020-12-17 00:07:34 +00:00
|
|
|
struct xfs_buf *bp; /* buffer containing block */
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the block pointer for this level.
|
|
|
|
*/
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
2017-07-17 21:30:45 +00:00
|
|
|
if (xfs_btree_check_block(cur, block, level, bp))
|
|
|
|
return 0;
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* It's empty, there is no such record.
|
|
|
|
*/
|
2008-10-30 05:53:47 +00:00
|
|
|
if (!block->bb_numrecs)
|
2005-04-16 22:20:36 +00:00
|
|
|
return 0;
|
|
|
|
/*
|
|
|
|
* Set the ptr value to numrecs, that's the last record/key.
|
|
|
|
*/
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
|
2005-04-16 22:20:36 +00:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Translate a bitmask of fields into the byte range those fields cover.
 *
 * offsets[] holds the starting byte offset of each field, indexed by bit
 * number; the entry after a field's own entry gives the offset just past
 * that field.  *first receives the offset of the lowest set bit's field and
 * *last the final byte of the highest set bit's field, looking only at bits
 * below nbits.  The caller must pass a non-zero mask (asserted).
 */
void
xfs_btree_offsets(
	uint32_t	fields,		/* bitmask of fields */
	const short	*offsets,	/* table of field offsets */
	int		nbits,		/* number of bits to inspect */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	uint32_t	mask;		/* single-bit probe */
	int		bit;		/* bit number being probed */

	ASSERT(fields != 0);

	/* Scan upwards from bit 0 for the lowest set bit. */
	bit = 0;
	mask = 1u;
	while (!(fields & mask)) {
		bit++;
		mask <<= 1;
	}
	*first = offsets[bit];

	/* Scan downwards from the top inspected bit for the highest one. */
	bit = nbits - 1;
	mask = 1u << bit;
	while (!(fields & mask)) {
		bit--;
		mask >>= 1;
	}
	/* The next table entry starts right after this field ends. */
	*last = offsets[bit + 1] - 1;
}
|
|
|
|
|
2008-10-30 05:54:43 +00:00
|
|
|
STATIC int
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_readahead_fsblock(
|
2008-10-30 05:54:43 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int lr,
|
|
|
|
struct xfs_btree_block *block)
|
|
|
|
{
|
2024-02-22 20:41:00 +00:00
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
2014-07-29 23:12:05 +00:00
|
|
|
xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
|
|
|
|
xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
|
2024-02-22 20:41:00 +00:00
|
|
|
int rval = 0;
|
2008-10-30 05:54:43 +00:00
|
|
|
|
2014-07-29 23:12:05 +00:00
|
|
|
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
|
2024-02-22 20:41:00 +00:00
|
|
|
xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
|
|
|
|
mp->m_bsize, cur->bc_ops->buf_ops);
|
2008-10-30 05:54:43 +00:00
|
|
|
rval++;
|
|
|
|
}
|
|
|
|
|
2014-07-29 23:12:05 +00:00
|
|
|
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
|
2024-02-22 20:41:00 +00:00
|
|
|
xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
|
|
|
|
mp->m_bsize, cur->bc_ops->buf_ops);
|
2008-10-30 05:54:43 +00:00
|
|
|
rval++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return rval;
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
STATIC int
|
|
|
|
xfs_btree_readahead_memblock(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int lr,
|
|
|
|
struct xfs_btree_block *block)
|
|
|
|
{
|
|
|
|
struct xfs_buftarg *btp = cur->bc_mem.xfbtree->target;
|
|
|
|
xfbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
|
|
|
|
xfbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
|
|
|
|
int rval = 0;
|
|
|
|
|
|
|
|
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
|
|
|
|
xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
|
|
|
|
cur->bc_ops->buf_ops);
|
|
|
|
rval++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
|
|
|
|
xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
|
|
|
|
cur->bc_ops->buf_ops);
|
|
|
|
rval++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return rval;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:54:43 +00:00
|
|
|
STATIC int
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_readahead_agblock(
|
2008-10-30 05:54:43 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int lr,
|
2024-02-22 20:41:01 +00:00
|
|
|
struct xfs_btree_block *block)
|
2008-10-30 05:54:43 +00:00
|
|
|
{
|
2024-02-22 20:41:01 +00:00
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
2024-11-04 04:18:44 +00:00
|
|
|
struct xfs_perag *pag = to_perag(cur->bc_group);
|
2008-10-30 05:54:43 +00:00
|
|
|
xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib);
|
|
|
|
xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib);
|
2024-02-22 20:41:01 +00:00
|
|
|
int rval = 0;
|
2008-10-30 05:54:43 +00:00
|
|
|
|
|
|
|
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
|
2024-02-22 20:41:01 +00:00
|
|
|
xfs_buf_readahead(mp->m_ddev_targp,
|
2024-11-04 04:18:44 +00:00
|
|
|
xfs_agbno_to_daddr(pag, left), mp->m_bsize,
|
|
|
|
cur->bc_ops->buf_ops);
|
2008-10-30 05:54:43 +00:00
|
|
|
rval++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
|
2024-02-22 20:41:01 +00:00
|
|
|
xfs_buf_readahead(mp->m_ddev_targp,
|
2024-11-04 04:18:44 +00:00
|
|
|
xfs_agbno_to_daddr(pag, right), mp->m_bsize,
|
|
|
|
cur->bc_ops->buf_ops);
|
2008-10-30 05:54:43 +00:00
|
|
|
rval++;
|
|
|
|
}
|
|
|
|
|
|
|
|
return rval;
|
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Read-ahead btree blocks, at the given level.
|
|
|
|
* Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
|
|
|
|
*/
|
2008-10-30 05:58:41 +00:00
|
|
|
STATIC int
|
2008-10-30 05:54:43 +00:00
|
|
|
xfs_btree_readahead(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
2005-04-16 22:20:36 +00:00
|
|
|
int lev, /* level in btree */
|
|
|
|
int lr) /* left/right bits */
|
|
|
|
{
|
2008-10-30 05:54:43 +00:00
|
|
|
struct xfs_btree_block *block;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* No readahead needed if we are at the root level and the
|
|
|
|
* btree root is stored in the inode.
|
|
|
|
*/
|
2024-02-22 20:37:24 +00:00
|
|
|
if (xfs_btree_at_iroot(cur, lev))
|
2008-10-30 05:54:43 +00:00
|
|
|
return 0;
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
|
2008-10-30 05:54:43 +00:00
|
|
|
return 0;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[lev].ra |= lr;
|
|
|
|
block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
|
2008-10-30 05:54:43 +00:00
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
switch (cur->bc_ops->type) {
|
|
|
|
case XFS_BTREE_TYPE_AG:
|
|
|
|
return xfs_btree_readahead_agblock(cur, lr, block);
|
|
|
|
case XFS_BTREE_TYPE_INODE:
|
2024-02-22 20:40:58 +00:00
|
|
|
return xfs_btree_readahead_fsblock(cur, lr, block);
|
2024-02-22 20:43:35 +00:00
|
|
|
case XFS_BTREE_TYPE_MEM:
|
|
|
|
return xfs_btree_readahead_memblock(cur, lr, block);
|
|
|
|
default:
|
|
|
|
ASSERT(0);
|
|
|
|
return 0;
|
|
|
|
}
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2018-06-04 20:58:34 +00:00
|
|
|
STATIC int
|
2013-08-30 00:23:44 +00:00
|
|
|
xfs_btree_ptr_to_daddr(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
xfs_daddr_t *daddr)
|
2013-08-30 00:23:44 +00:00
|
|
|
{
|
2018-06-04 20:58:34 +00:00
|
|
|
int error;
|
2013-08-30 00:23:44 +00:00
|
|
|
|
2018-06-04 20:58:34 +00:00
|
|
|
error = xfs_btree_check_ptr(cur, ptr, 0, 1);
|
|
|
|
if (error)
|
|
|
|
return error;
|
2013-08-30 00:23:44 +00:00
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
switch (cur->bc_ops->type) {
|
|
|
|
case XFS_BTREE_TYPE_AG:
|
2024-11-04 04:18:44 +00:00
|
|
|
*daddr = xfs_agbno_to_daddr(to_perag(cur->bc_group),
|
2024-02-22 20:43:35 +00:00
|
|
|
be32_to_cpu(ptr->s));
|
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_INODE:
|
|
|
|
*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
|
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_MEM:
|
|
|
|
*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
|
|
|
|
break;
|
2013-08-30 00:23:44 +00:00
|
|
|
}
|
2018-06-04 20:58:34 +00:00
|
|
|
return 0;
|
2013-08-30 00:23:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Readahead @count btree blocks at the given @ptr location.
|
|
|
|
*
|
|
|
|
* We don't need to care about long or short form btrees here as we have a
|
|
|
|
* method of converting the ptr directly to a daddr available to us.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_readahead_ptr(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *ptr,
|
|
|
|
xfs_extlen_t count)
|
|
|
|
{
|
2018-06-04 20:58:34 +00:00
|
|
|
xfs_daddr_t daddr;
|
|
|
|
|
|
|
|
if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
|
|
|
|
return;
|
2024-02-22 20:43:35 +00:00
|
|
|
xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
|
|
|
|
xfs_btree_bbsize(cur) * count,
|
|
|
|
cur->bc_ops->buf_ops);
|
2013-08-30 00:23:44 +00:00
|
|
|
}
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* Set the buffer for level "lev" in the cursor to bp, releasing
|
|
|
|
* any previous buffer.
|
|
|
|
*/
|
2010-09-07 23:34:07 +00:00
|
|
|
STATIC void
|
2005-04-16 22:20:36 +00:00
|
|
|
xfs_btree_setbuf(
|
2021-09-16 19:18:47 +00:00
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
2005-04-16 22:20:36 +00:00
|
|
|
int lev, /* level in btree */
|
2020-12-17 00:07:34 +00:00
|
|
|
struct xfs_buf *bp) /* new buffer to set */
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2008-10-30 06:14:34 +00:00
|
|
|
struct xfs_btree_block *b; /* btree block */
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[lev].bp)
|
|
|
|
xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
|
|
|
|
cur->bc_levels[lev].bp = bp;
|
|
|
|
cur->bc_levels[lev].ra = 0;
|
2010-09-07 23:34:07 +00:00
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
b = XFS_BUF_TO_BLOCK(bp);
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
|
2014-07-29 23:12:05 +00:00
|
|
|
if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
|
2014-07-29 23:12:05 +00:00
|
|
|
if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
|
2005-04-16 22:20:36 +00:00
|
|
|
} else {
|
2011-07-08 12:36:05 +00:00
|
|
|
if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
|
2011-07-08 12:36:05 +00:00
|
|
|
if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
}
|
2008-10-30 05:55:45 +00:00
|
|
|
|
2017-10-18 04:37:37 +00:00
|
|
|
bool
|
2008-10-30 05:55:45 +00:00
|
|
|
xfs_btree_ptr_is_null(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr)
|
2008-10-30 05:55:45 +00:00
|
|
|
{
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
|
2014-07-29 23:12:05 +00:00
|
|
|
return ptr->l == cpu_to_be64(NULLFSBLOCK);
|
2008-10-30 05:55:45 +00:00
|
|
|
else
|
2011-07-08 12:36:05 +00:00
|
|
|
return ptr->s == cpu_to_be32(NULLAGBLOCK);
|
2008-10-30 05:55:45 +00:00
|
|
|
}
|
|
|
|
|
2020-03-11 17:51:50 +00:00
|
|
|
void
|
2008-10-30 05:57:40 +00:00
|
|
|
xfs_btree_set_ptr_null(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *ptr)
|
|
|
|
{
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
|
2014-07-29 23:12:05 +00:00
|
|
|
ptr->l = cpu_to_be64(NULLFSBLOCK);
|
2008-10-30 05:57:40 +00:00
|
|
|
else
|
|
|
|
ptr->s = cpu_to_be32(NULLAGBLOCK);
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:43:34 +00:00
|
|
|
static inline bool
|
|
|
|
xfs_btree_ptrs_equal(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *ptr1,
|
|
|
|
union xfs_btree_ptr *ptr2)
|
|
|
|
{
|
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
|
|
|
|
return ptr1->l == ptr2->l;
|
|
|
|
return ptr1->s == ptr2->s;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:55:45 +00:00
|
|
|
/*
|
|
|
|
* Get/set/init sibling pointers
|
|
|
|
*/
|
2017-10-18 04:37:37 +00:00
|
|
|
void
|
2008-10-30 05:55:45 +00:00
|
|
|
xfs_btree_get_sibling(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
union xfs_btree_ptr *ptr,
|
|
|
|
int lr)
|
|
|
|
{
|
|
|
|
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
|
|
|
|
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
|
2008-10-30 05:55:45 +00:00
|
|
|
if (lr == XFS_BB_RIGHTSIB)
|
|
|
|
ptr->l = block->bb_u.l.bb_rightsib;
|
|
|
|
else
|
|
|
|
ptr->l = block->bb_u.l.bb_leftsib;
|
|
|
|
} else {
|
|
|
|
if (lr == XFS_BB_RIGHTSIB)
|
|
|
|
ptr->s = block->bb_u.s.bb_rightsib;
|
|
|
|
else
|
|
|
|
ptr->s = block->bb_u.s.bb_leftsib;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-11 17:51:50 +00:00
|
|
|
void
|
2008-10-30 05:57:03 +00:00
|
|
|
xfs_btree_set_sibling(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
int lr)
|
2008-10-30 05:57:03 +00:00
|
|
|
{
|
|
|
|
ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
|
|
|
|
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
|
2008-10-30 05:57:03 +00:00
|
|
|
if (lr == XFS_BB_RIGHTSIB)
|
|
|
|
block->bb_u.l.bb_rightsib = ptr->l;
|
|
|
|
else
|
|
|
|
block->bb_u.l.bb_leftsib = ptr->l;
|
|
|
|
} else {
|
|
|
|
if (lr == XFS_BB_RIGHTSIB)
|
|
|
|
block->bb_u.s.bb_rightsib = ptr->s;
|
|
|
|
else
|
|
|
|
block->bb_u.s.bb_leftsib = ptr->s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:19 +00:00
|
|
|
static void
|
|
|
|
__xfs_btree_init_block(
|
2013-04-21 19:53:46 +00:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_btree_block *buf,
|
2024-02-22 20:35:16 +00:00
|
|
|
const struct xfs_btree_ops *ops,
|
2013-04-21 19:53:46 +00:00
|
|
|
xfs_daddr_t blkno,
|
|
|
|
__u16 level,
|
|
|
|
__u16 numrecs,
|
2024-02-22 20:35:16 +00:00
|
|
|
__u64 owner)
|
2013-04-21 19:53:46 +00:00
|
|
|
{
|
2024-02-22 20:34:29 +00:00
|
|
|
bool crc = xfs_has_crc(mp);
|
2024-02-22 20:35:16 +00:00
|
|
|
__u32 magic = xfs_btree_magic(mp, ops);
|
2017-01-28 07:16:37 +00:00
|
|
|
|
2013-04-21 19:53:46 +00:00
|
|
|
buf->bb_magic = cpu_to_be32(magic);
|
|
|
|
buf->bb_level = cpu_to_be16(level);
|
|
|
|
buf->bb_numrecs = cpu_to_be16(numrecs);
|
|
|
|
|
2024-02-22 20:35:36 +00:00
|
|
|
if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
|
2014-07-29 23:12:05 +00:00
|
|
|
buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
|
|
|
|
buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
|
2017-01-28 07:16:37 +00:00
|
|
|
if (crc) {
|
2013-04-21 19:53:46 +00:00
|
|
|
buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
|
|
|
|
buf->bb_u.l.bb_owner = cpu_to_be64(owner);
|
2015-07-29 01:53:31 +00:00
|
|
|
uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
|
2013-04-21 19:53:46 +00:00
|
|
|
buf->bb_u.l.bb_pad = 0;
|
2013-08-28 11:22:46 +00:00
|
|
|
buf->bb_u.l.bb_lsn = 0;
|
2013-04-21 19:53:46 +00:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
|
|
|
|
buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
|
2017-01-28 07:16:37 +00:00
|
|
|
if (crc) {
|
2013-04-21 19:53:46 +00:00
|
|
|
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
|
2024-02-22 20:36:17 +00:00
|
|
|
/* owner is a 32 bit value on short blocks */
|
|
|
|
buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
|
2015-07-29 01:53:31 +00:00
|
|
|
uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
|
2013-08-28 11:22:46 +00:00
|
|
|
buf->bb_u.s.bb_lsn = 0;
|
2013-04-21 19:53:46 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:19 +00:00
|
|
|
void
|
|
|
|
xfs_btree_init_block(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
const struct xfs_btree_ops *ops,
|
|
|
|
__u16 level,
|
|
|
|
__u16 numrecs,
|
|
|
|
__u64 owner)
|
|
|
|
{
|
|
|
|
__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
|
|
|
|
numrecs, owner);
|
|
|
|
}
|
|
|
|
|
2012-11-13 22:40:27 +00:00
|
|
|
void
|
2024-02-22 20:35:17 +00:00
|
|
|
xfs_btree_init_buf(
|
2024-02-22 20:35:16 +00:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_buf *bp,
|
|
|
|
const struct xfs_btree_ops *ops,
|
|
|
|
__u16 level,
|
|
|
|
__u16 numrecs,
|
|
|
|
__u64 owner)
|
2008-10-30 05:57:03 +00:00
|
|
|
{
|
2024-02-22 20:35:19 +00:00
|
|
|
__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
|
2024-02-22 20:35:16 +00:00
|
|
|
xfs_buf_daddr(bp), level, numrecs, owner);
|
2024-02-22 20:35:19 +00:00
|
|
|
bp->b_ops = ops->buf_ops;
|
2008-10-30 05:57:03 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:22 +00:00
|
|
|
static inline __u64
|
|
|
|
xfs_btree_owner(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
2024-02-22 20:43:35 +00:00
|
|
|
switch (cur->bc_ops->type) {
|
|
|
|
case XFS_BTREE_TYPE_MEM:
|
|
|
|
return cur->bc_mem.xfbtree->owner;
|
|
|
|
case XFS_BTREE_TYPE_INODE:
|
2024-02-22 20:35:22 +00:00
|
|
|
return cur->bc_ino.ip->i_ino;
|
2024-02-22 20:43:35 +00:00
|
|
|
case XFS_BTREE_TYPE_AG:
|
2024-11-04 04:18:44 +00:00
|
|
|
return cur->bc_group->xg_gno;
|
2024-02-22 20:43:35 +00:00
|
|
|
default:
|
|
|
|
ASSERT(0);
|
|
|
|
return 0;
|
|
|
|
}
|
2024-02-22 20:35:22 +00:00
|
|
|
}
|
|
|
|
|
2020-03-11 17:51:50 +00:00
|
|
|
void
|
2012-11-13 22:40:27 +00:00
|
|
|
xfs_btree_init_block_cur(
|
|
|
|
struct xfs_btree_cur *cur,
|
2013-04-21 19:53:46 +00:00
|
|
|
struct xfs_buf *bp,
|
2012-11-13 22:40:27 +00:00
|
|
|
int level,
|
2013-04-21 19:53:46 +00:00
|
|
|
int numrecs)
|
2012-11-13 22:40:27 +00:00
|
|
|
{
|
2024-02-22 20:35:22 +00:00
|
|
|
xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
|
|
|
|
xfs_btree_owner(cur));
|
2012-11-13 22:40:27 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 05:57:03 +00:00
|
|
|
STATIC void
|
|
|
|
xfs_btree_buf_to_ptr(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_buf *bp,
|
|
|
|
union xfs_btree_ptr *ptr)
|
|
|
|
{
|
2024-02-22 20:43:35 +00:00
|
|
|
switch (cur->bc_ops->type) {
|
|
|
|
case XFS_BTREE_TYPE_AG:
|
2009-01-15 05:22:07 +00:00
|
|
|
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
|
2021-08-19 01:46:57 +00:00
|
|
|
xfs_buf_daddr(bp)));
|
2024-02-22 20:43:35 +00:00
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_INODE:
|
|
|
|
ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
|
|
|
|
xfs_buf_daddr(bp)));
|
|
|
|
break;
|
|
|
|
case XFS_BTREE_TYPE_MEM:
|
|
|
|
ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
|
|
|
|
break;
|
2008-10-30 05:57:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:20 +00:00
|
|
|
static inline void
|
2008-10-30 05:55:45 +00:00
|
|
|
xfs_btree_set_refs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2024-02-22 20:35:20 +00:00
|
|
|
xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
|
2008-10-30 05:55:45 +00:00
|
|
|
}
|
|
|
|
|
2020-03-11 17:51:50 +00:00
|
|
|
int
|
2008-10-30 05:57:03 +00:00
|
|
|
xfs_btree_get_buf_block(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
struct xfs_btree_block **block,
|
|
|
|
struct xfs_buf **bpp)
|
2008-10-30 05:57:03 +00:00
|
|
|
{
|
2024-02-22 20:43:35 +00:00
|
|
|
xfs_daddr_t d;
|
|
|
|
int error;
|
2008-10-30 05:57:03 +00:00
|
|
|
|
2018-06-04 20:58:34 +00:00
|
|
|
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
|
|
|
|
if (error)
|
|
|
|
return error;
|
2024-02-22 20:43:35 +00:00
|
|
|
error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
|
|
|
|
xfs_btree_bbsize(cur), 0, bpp);
|
2020-01-24 01:01:18 +00:00
|
|
|
if (error)
|
|
|
|
return error;
|
2008-10-30 05:57:03 +00:00
|
|
|
|
2012-11-14 06:54:40 +00:00
|
|
|
(*bpp)->b_ops = cur->bc_ops->buf_ops;
|
2008-10-30 05:57:03 +00:00
|
|
|
*block = XFS_BUF_TO_BLOCK(*bpp);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:55:45 +00:00
|
|
|
/*
|
|
|
|
* Read in the buffer at the given ptr and return the buffer and
|
|
|
|
* the block pointer within the buffer.
|
|
|
|
*/
|
2023-12-15 18:03:28 +00:00
|
|
|
int
|
2008-10-30 05:55:45 +00:00
|
|
|
xfs_btree_read_buf_block(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
int flags,
|
|
|
|
struct xfs_btree_block **block,
|
|
|
|
struct xfs_buf **bpp)
|
2008-10-30 05:55:45 +00:00
|
|
|
{
|
|
|
|
struct xfs_mount *mp = cur->bc_mp;
|
|
|
|
xfs_daddr_t d;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* need to sort out how callers deal with failures first */
|
2010-01-19 09:56:44 +00:00
|
|
|
ASSERT(!(flags & XBF_TRYLOCK));
|
2008-10-30 05:55:45 +00:00
|
|
|
|
2018-06-04 20:58:34 +00:00
|
|
|
error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
|
|
|
|
if (error)
|
|
|
|
return error;
|
2024-02-22 20:43:35 +00:00
|
|
|
error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
|
|
|
|
xfs_btree_bbsize(cur), flags, bpp,
|
|
|
|
cur->bc_ops->buf_ops);
|
2024-02-22 20:32:09 +00:00
|
|
|
if (xfs_metadata_is_sick(error))
|
|
|
|
xfs_btree_mark_sick(cur);
|
2008-10-30 05:55:45 +00:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
xfs_btree_set_refs(cur, *bpp);
|
|
|
|
*block = XFS_BUF_TO_BLOCK(*bpp);
|
2012-11-12 11:54:08 +00:00
|
|
|
return 0;
|
2008-10-30 05:55:45 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:22 +00:00
|
|
|
/*
|
|
|
|
* Copy keys from one btree block to another.
|
|
|
|
*/
|
2020-03-11 17:51:50 +00:00
|
|
|
void
|
2008-10-30 05:56:22 +00:00
|
|
|
xfs_btree_copy_keys(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_key *dst_key,
|
|
|
|
const union xfs_btree_key *src_key,
|
|
|
|
int numkeys)
|
2008-10-30 05:56:22 +00:00
|
|
|
{
|
|
|
|
ASSERT(numkeys >= 0);
|
|
|
|
memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:32 +00:00
|
|
|
/*
|
|
|
|
* Copy records from one btree block to another.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_copy_recs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_rec *dst_rec,
|
|
|
|
union xfs_btree_rec *src_rec,
|
|
|
|
int numrecs)
|
|
|
|
{
|
|
|
|
ASSERT(numrecs >= 0);
|
|
|
|
memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:43 +00:00
|
|
|
/*
|
|
|
|
* Copy block pointers from one btree block to another.
|
|
|
|
*/
|
2020-03-11 17:51:50 +00:00
|
|
|
void
|
2008-10-30 05:56:43 +00:00
|
|
|
xfs_btree_copy_ptrs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *dst_ptr,
|
2020-03-11 17:51:50 +00:00
|
|
|
const union xfs_btree_ptr *src_ptr,
|
2008-10-30 05:56:43 +00:00
|
|
|
int numptrs)
|
|
|
|
{
|
|
|
|
ASSERT(numptrs >= 0);
|
2024-02-22 20:35:36 +00:00
|
|
|
memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
|
2008-10-30 05:56:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Shift keys one index left/right inside a single btree block.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_shift_keys(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_key *key,
|
|
|
|
int dir,
|
|
|
|
int numkeys)
|
|
|
|
{
|
|
|
|
char *dst_key;
|
|
|
|
|
|
|
|
ASSERT(numkeys >= 0);
|
|
|
|
ASSERT(dir == 1 || dir == -1);
|
|
|
|
|
|
|
|
dst_key = (char *)key + (dir * cur->bc_ops->key_len);
|
|
|
|
memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Shift records one index left/right inside a single btree block.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_shift_recs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_rec *rec,
|
|
|
|
int dir,
|
|
|
|
int numrecs)
|
|
|
|
{
|
|
|
|
char *dst_rec;
|
|
|
|
|
|
|
|
ASSERT(numrecs >= 0);
|
|
|
|
ASSERT(dir == 1 || dir == -1);
|
|
|
|
|
|
|
|
dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
|
|
|
|
memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Shift block pointers one index left/right inside a single btree block.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_shift_ptrs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *ptr,
|
|
|
|
int dir,
|
|
|
|
int numptrs)
|
|
|
|
{
|
|
|
|
char *dst_ptr;
|
|
|
|
|
|
|
|
ASSERT(numptrs >= 0);
|
|
|
|
ASSERT(dir == 1 || dir == -1);
|
|
|
|
|
2024-02-22 20:35:36 +00:00
|
|
|
dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
|
|
|
|
memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
|
2008-10-30 05:56:43 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:22 +00:00
|
|
|
/*
|
|
|
|
* Log key values from the btree block.
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_log_keys(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_buf *bp,
|
|
|
|
int first,
|
|
|
|
int last)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (bp) {
|
2013-04-03 05:11:30 +00:00
|
|
|
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
|
2008-10-30 05:56:22 +00:00
|
|
|
xfs_trans_log_buf(cur->bc_tp, bp,
|
|
|
|
xfs_btree_key_offset(cur, first),
|
|
|
|
xfs_btree_key_offset(cur, last + 1) - 1);
|
|
|
|
} else {
|
2020-03-11 00:52:53 +00:00
|
|
|
xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
|
|
|
|
xfs_ilog_fbroot(cur->bc_ino.whichfork));
|
2008-10-30 05:56:22 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:32 +00:00
|
|
|
/*
|
|
|
|
* Log record values from the btree block.
|
|
|
|
*/
|
2008-10-30 05:58:21 +00:00
|
|
|
void
|
2008-10-30 05:56:32 +00:00
|
|
|
xfs_btree_log_recs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_buf *bp,
|
|
|
|
int first,
|
|
|
|
int last)
|
|
|
|
{
|
|
|
|
|
2013-04-03 05:11:30 +00:00
|
|
|
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
|
2008-10-30 05:56:32 +00:00
|
|
|
xfs_trans_log_buf(cur->bc_tp, bp,
|
|
|
|
xfs_btree_rec_offset(cur, first),
|
|
|
|
xfs_btree_rec_offset(cur, last + 1) - 1);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:43 +00:00
|
|
|
/*
|
|
|
|
* Log block pointer fields from a btree block (nonleaf).
|
|
|
|
*/
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_log_ptrs(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
struct xfs_buf *bp, /* buffer containing btree block */
|
|
|
|
int first, /* index of first pointer to log */
|
|
|
|
int last) /* index of last pointer to log */
|
|
|
|
{
|
|
|
|
|
|
|
|
if (bp) {
|
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
|
|
|
int level = xfs_btree_get_level(block);
|
|
|
|
|
2013-04-03 05:11:30 +00:00
|
|
|
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
|
2008-10-30 05:56:43 +00:00
|
|
|
xfs_trans_log_buf(cur->bc_tp, bp,
|
|
|
|
xfs_btree_ptr_offset(cur, first, level),
|
|
|
|
xfs_btree_ptr_offset(cur, last + 1, level) - 1);
|
|
|
|
} else {
|
2020-03-11 00:52:53 +00:00
|
|
|
xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
|
|
|
|
xfs_ilog_fbroot(cur->bc_ino.whichfork));
|
2008-10-30 05:56:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Log fields from a btree block header.
|
|
|
|
*/
|
2008-10-30 05:58:21 +00:00
|
|
|
void
|
2008-10-30 05:56:43 +00:00
|
|
|
xfs_btree_log_block(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
struct xfs_buf *bp, /* buffer containing btree block */
|
2022-04-21 00:46:33 +00:00
|
|
|
uint32_t fields) /* mask of fields: XFS_BB_... */
|
2008-10-30 05:56:43 +00:00
|
|
|
{
|
|
|
|
int first; /* first byte offset logged */
|
|
|
|
int last; /* last byte offset logged */
|
|
|
|
static const short soffsets[] = { /* table of offsets (short) */
|
2008-10-30 06:14:34 +00:00
|
|
|
offsetof(struct xfs_btree_block, bb_magic),
|
|
|
|
offsetof(struct xfs_btree_block, bb_level),
|
|
|
|
offsetof(struct xfs_btree_block, bb_numrecs),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
|
2013-04-21 19:53:46 +00:00
|
|
|
offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
|
|
|
|
XFS_BTREE_SBLOCK_CRC_LEN
|
2008-10-30 05:56:43 +00:00
|
|
|
};
|
|
|
|
static const short loffsets[] = { /* table of offsets (long) */
|
2008-10-30 06:14:34 +00:00
|
|
|
offsetof(struct xfs_btree_block, bb_magic),
|
|
|
|
offsetof(struct xfs_btree_block, bb_level),
|
|
|
|
offsetof(struct xfs_btree_block, bb_numrecs),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
|
2013-04-21 19:53:46 +00:00
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
|
|
|
|
offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
|
|
|
|
XFS_BTREE_LBLOCK_CRC_LEN
|
2008-10-30 05:56:43 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
if (bp) {
|
2013-04-21 19:53:46 +00:00
|
|
|
int nbits;
|
|
|
|
|
2024-02-22 20:34:12 +00:00
|
|
|
if (xfs_has_crc(cur->bc_mp)) {
|
2013-04-21 19:53:46 +00:00
|
|
|
/*
|
|
|
|
* We don't log the CRC when updating a btree
|
|
|
|
* block but instead recreate it during log
|
|
|
|
* recovery. As the log buffers have checksums
|
|
|
|
* of their own this is safe and avoids logging a crc
|
|
|
|
* update in a lot of places.
|
|
|
|
*/
|
|
|
|
if (fields == XFS_BB_ALL_BITS)
|
|
|
|
fields = XFS_BB_ALL_BITS_CRC;
|
|
|
|
nbits = XFS_BB_NUM_BITS_CRC;
|
|
|
|
} else {
|
|
|
|
nbits = XFS_BB_NUM_BITS;
|
|
|
|
}
|
2008-10-30 05:56:43 +00:00
|
|
|
xfs_btree_offsets(fields,
|
2024-02-22 20:35:36 +00:00
|
|
|
(cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
|
2008-10-30 05:56:43 +00:00
|
|
|
loffsets : soffsets,
|
2013-04-21 19:53:46 +00:00
|
|
|
nbits, &first, &last);
|
2013-04-03 05:11:30 +00:00
|
|
|
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
|
2008-10-30 05:56:43 +00:00
|
|
|
xfs_trans_log_buf(cur->bc_tp, bp, first, last);
|
|
|
|
} else {
|
2020-03-11 00:52:53 +00:00
|
|
|
xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
|
|
|
|
xfs_ilog_fbroot(cur->bc_ino.whichfork));
|
2008-10-30 05:56:43 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:55:45 +00:00
|
|
|
/*
|
|
|
|
* Increment cursor by one record at the level.
|
|
|
|
* For nonzero levels the leaf-ward information is untouched.
|
|
|
|
*/
|
|
|
|
int /* error */
|
|
|
|
xfs_btree_increment(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
union xfs_btree_ptr ptr;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
int error; /* error return value */
|
|
|
|
int lev;
|
|
|
|
|
|
|
|
ASSERT(level < cur->bc_nlevels);
|
|
|
|
|
|
|
|
/* Read-ahead to the right at this level. */
|
|
|
|
xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
|
|
|
|
|
|
|
|
/* Get a pointer to the btree block. */
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* We're done if we remain in the block after the increment. */
|
2021-09-16 19:24:04 +00:00
|
|
|
if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
|
2008-10-30 05:55:45 +00:00
|
|
|
goto out1;
|
|
|
|
|
|
|
|
/* Fail if we just went off the right edge of the tree. */
|
|
|
|
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
|
|
|
|
if (xfs_btree_ptr_is_null(cur, &ptr))
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, increment);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* March up the tree incrementing pointers.
|
|
|
|
* Stop when we don't go off the right edge of a block.
|
|
|
|
*/
|
|
|
|
for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
|
|
|
|
block = xfs_btree_get_block(cur, lev, &bp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, lev, bp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
|
2008-10-30 05:55:45 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* Read-ahead the right block for the next loop. */
|
|
|
|
xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we went off the root then we are either seriously
|
|
|
|
* confused or have the tree root in an inode.
|
|
|
|
*/
|
|
|
|
if (lev == cur->bc_nlevels) {
|
2024-02-22 20:36:17 +00:00
|
|
|
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
|
2008-10-30 05:55:45 +00:00
|
|
|
goto out0;
|
|
|
|
ASSERT(0);
|
2024-02-22 20:32:09 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
2014-06-25 04:58:08 +00:00
|
|
|
error = -EFSCORRUPTED;
|
2008-10-30 05:55:45 +00:00
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
ASSERT(lev < cur->bc_nlevels);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now walk back down the tree, fixing up the cursor's buffer
|
|
|
|
* pointers and key numbers.
|
|
|
|
*/
|
|
|
|
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
|
|
|
|
union xfs_btree_ptr *ptrp;
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
|
2014-04-14 08:59:56 +00:00
|
|
|
--lev;
|
|
|
|
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
|
2008-10-30 05:55:45 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
xfs_btree_setbuf(cur, lev, bp);
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[lev].ptr = 1;
|
2008-10-30 05:55:45 +00:00
|
|
|
}
|
|
|
|
out1:
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out0:
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
2008-10-30 05:55:58 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Decrement cursor by one record at the level.
|
|
|
|
* For nonzero levels the leaf-ward information is untouched.
|
|
|
|
*/
|
|
|
|
int /* error */
|
|
|
|
xfs_btree_decrement(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block;
|
2020-12-17 00:07:34 +00:00
|
|
|
struct xfs_buf *bp;
|
2008-10-30 05:55:58 +00:00
|
|
|
int error; /* error return value */
|
|
|
|
int lev;
|
|
|
|
union xfs_btree_ptr ptr;
|
|
|
|
|
|
|
|
ASSERT(level < cur->bc_nlevels);
|
|
|
|
|
|
|
|
/* Read-ahead to the left at this level. */
|
|
|
|
xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
|
|
|
|
|
|
|
|
/* We're done if we remain in the block after the decrement. */
|
2021-09-16 19:24:04 +00:00
|
|
|
if (--cur->bc_levels[level].ptr > 0)
|
2008-10-30 05:55:58 +00:00
|
|
|
goto out1;
|
|
|
|
|
|
|
|
/* Get a pointer to the btree block. */
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Fail if we just went off the left edge of the tree. */
|
|
|
|
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
|
|
|
|
if (xfs_btree_ptr_is_null(cur, &ptr))
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, decrement);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* March up the tree decrementing pointers.
|
|
|
|
* Stop when we don't go off the left edge of a block.
|
|
|
|
*/
|
|
|
|
for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
|
2021-09-16 19:24:04 +00:00
|
|
|
if (--cur->bc_levels[lev].ptr > 0)
|
2008-10-30 05:55:58 +00:00
|
|
|
break;
|
|
|
|
/* Read-ahead the left block for the next loop. */
|
|
|
|
xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we went off the root then we are seriously confused.
|
|
|
|
* or the root of the tree is in an inode.
|
|
|
|
*/
|
|
|
|
if (lev == cur->bc_nlevels) {
|
2024-02-22 20:36:17 +00:00
|
|
|
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
|
2008-10-30 05:55:58 +00:00
|
|
|
goto out0;
|
|
|
|
ASSERT(0);
|
2024-02-22 20:32:09 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
2014-06-25 04:58:08 +00:00
|
|
|
error = -EFSCORRUPTED;
|
2008-10-30 05:55:58 +00:00
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
ASSERT(lev < cur->bc_nlevels);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now walk back down the tree, fixing up the cursor's buffer
|
|
|
|
* pointers and key numbers.
|
|
|
|
*/
|
|
|
|
for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
|
|
|
|
union xfs_btree_ptr *ptrp;
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
|
2014-04-14 08:59:56 +00:00
|
|
|
--lev;
|
|
|
|
error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
|
2008-10-30 05:55:58 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
xfs_btree_setbuf(cur, lev, bp);
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
|
2008-10-30 05:55:58 +00:00
|
|
|
}
|
|
|
|
out1:
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out0:
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:35:23 +00:00
|
|
|
/*
|
|
|
|
* Check the btree block owner now that we have the context to know who the
|
|
|
|
* real owner is.
|
|
|
|
*/
|
|
|
|
static inline xfs_failaddr_t
|
|
|
|
xfs_btree_check_block_owner(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block)
|
|
|
|
{
|
|
|
|
__u64 owner;
|
|
|
|
|
|
|
|
if (!xfs_has_crc(cur->bc_mp) ||
|
|
|
|
(cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
owner = xfs_btree_owner(cur);
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
|
2024-02-22 20:35:23 +00:00
|
|
|
if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
|
|
|
|
return __this_address;
|
|
|
|
} else {
|
|
|
|
if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
|
|
|
|
return __this_address;
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2017-06-16 18:00:07 +00:00
|
|
|
int
|
2008-10-30 05:56:09 +00:00
|
|
|
xfs_btree_lookup_get_block(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
int level, /* level in the btree */
|
|
|
|
const union xfs_btree_ptr *pp, /* ptr to btree block */
|
|
|
|
struct xfs_btree_block **blkp) /* return btree block */
|
2008-10-30 05:56:09 +00:00
|
|
|
{
|
|
|
|
struct xfs_buf *bp; /* buffer pointer for btree block */
|
2018-06-04 20:58:34 +00:00
|
|
|
xfs_daddr_t daddr;
|
2008-10-30 05:56:09 +00:00
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
/* special case the root block if in an inode */
|
2024-02-22 20:37:24 +00:00
|
|
|
if (xfs_btree_at_iroot(cur, level)) {
|
2008-10-30 05:56:09 +00:00
|
|
|
*blkp = xfs_btree_get_iroot(cur);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the old buffer at this level for the disk address we are
|
|
|
|
* looking for re-use it.
|
|
|
|
*
|
|
|
|
* Otherwise throw it away and get a new one.
|
|
|
|
*/
|
2021-09-16 19:24:04 +00:00
|
|
|
bp = cur->bc_levels[level].bp;
|
2018-06-04 20:58:34 +00:00
|
|
|
error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
|
|
|
|
if (error)
|
|
|
|
return error;
|
2021-08-19 01:46:57 +00:00
|
|
|
if (bp && xfs_buf_daddr(bp) == daddr) {
|
2008-10-30 05:56:09 +00:00
|
|
|
*blkp = XFS_BUF_TO_BLOCK(bp);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
|
2008-10-30 05:56:09 +00:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
2016-12-05 01:33:54 +00:00
|
|
|
/* Check the inode owner since the verifiers don't. */
|
2024-02-22 20:35:23 +00:00
|
|
|
if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
|
2016-12-05 01:33:54 +00:00
|
|
|
goto out_bad;
|
|
|
|
|
|
|
|
/* Did we get the level we were looking for? */
|
|
|
|
if (be16_to_cpu((*blkp)->bb_level) != level)
|
|
|
|
goto out_bad;
|
|
|
|
|
|
|
|
/* Check that internal nodes have at least one record. */
|
|
|
|
if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
|
|
|
|
goto out_bad;
|
|
|
|
|
2008-10-30 05:56:09 +00:00
|
|
|
xfs_btree_setbuf(cur, level, bp);
|
|
|
|
return 0;
|
2016-12-05 01:33:54 +00:00
|
|
|
|
|
|
|
out_bad:
|
|
|
|
*blkp = NULL;
|
2020-03-11 17:37:54 +00:00
|
|
|
xfs_buf_mark_corrupt(bp);
|
2016-12-05 01:33:54 +00:00
|
|
|
xfs_trans_brelse(cur->bc_tp, bp);
|
2024-02-22 20:32:09 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
2016-12-05 01:33:54 +00:00
|
|
|
return -EFSCORRUPTED;
|
2008-10-30 05:56:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get current search key. For level 0 we don't actually have a key
|
|
|
|
* structure so we make one up from the record. For all other levels
|
|
|
|
* we just return the right key.
|
|
|
|
*/
|
|
|
|
STATIC union xfs_btree_key *
|
|
|
|
xfs_lookup_get_search_key(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
int keyno,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
union xfs_btree_key *kp)
|
|
|
|
{
|
|
|
|
if (level == 0) {
|
|
|
|
cur->bc_ops->init_key_from_rec(kp,
|
|
|
|
xfs_btree_rec_addr(cur, keyno, block));
|
|
|
|
return kp;
|
|
|
|
}
|
|
|
|
|
|
|
|
return xfs_btree_key_addr(cur, keyno, block);
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:37:26 +00:00
|
|
|
/*
|
|
|
|
* Initialize a pointer to the root block.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_btree_init_ptr_from_cur(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_ptr *ptr)
|
|
|
|
{
|
|
|
|
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
|
|
|
|
/*
|
|
|
|
* Inode-rooted btrees call xfs_btree_get_iroot to find the root
|
|
|
|
* in xfs_btree_lookup_get_block and don't need a pointer here.
|
|
|
|
*/
|
|
|
|
ptr->l = 0;
|
2024-02-22 20:37:35 +00:00
|
|
|
} else if (cur->bc_flags & XFS_BTREE_STAGING) {
|
|
|
|
ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
|
2024-02-22 20:37:26 +00:00
|
|
|
} else {
|
|
|
|
cur->bc_ops->init_ptr_from_cur(cur, ptr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:09 +00:00
|
|
|
/*
|
|
|
|
* Lookup the record. The cursor is made to point to it, based on dir.
|
2013-08-07 10:11:00 +00:00
|
|
|
* stat is set to 0 if can't find any such record, 1 for success.
|
2008-10-30 05:56:09 +00:00
|
|
|
*/
|
|
|
|
int /* error */
|
|
|
|
xfs_btree_lookup(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
xfs_lookup_t dir, /* <=, ==, or >= */
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block; /* current btree block */
|
2017-06-16 18:00:05 +00:00
|
|
|
int64_t diff; /* difference for the current key */
|
2008-10-30 05:56:09 +00:00
|
|
|
int error; /* error return value */
|
|
|
|
int keyno; /* current key number */
|
|
|
|
int level; /* level in the btree */
|
|
|
|
union xfs_btree_ptr *pp; /* ptr to btree block */
|
|
|
|
union xfs_btree_ptr ptr; /* ptr to btree block */
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, lookup);
|
|
|
|
|
2016-08-26 05:58:40 +00:00
|
|
|
/* No such thing as a zero-level tree. */
|
2024-02-22 20:32:09 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
|
|
|
|
xfs_btree_mark_sick(cur);
|
2016-08-26 05:58:40 +00:00
|
|
|
return -EFSCORRUPTED;
|
2024-02-22 20:32:09 +00:00
|
|
|
}
|
2016-08-26 05:58:40 +00:00
|
|
|
|
2008-10-30 05:56:09 +00:00
|
|
|
block = NULL;
|
|
|
|
keyno = 0;
|
|
|
|
|
|
|
|
/* initialise start pointer from cursor */
|
2024-02-22 20:37:26 +00:00
|
|
|
xfs_btree_init_ptr_from_cur(cur, &ptr);
|
2008-10-30 05:56:09 +00:00
|
|
|
pp = &ptr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Iterate over each level in the btree, starting at the root.
|
|
|
|
* For each level above the leaves, find the key we need, based
|
|
|
|
* on the lookup record, then follow the corresponding block
|
|
|
|
* pointer down to the next level.
|
|
|
|
*/
|
|
|
|
for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
|
|
|
|
/* Get the block we need to do the lookup on. */
|
|
|
|
error = xfs_btree_lookup_get_block(cur, level, pp, &block);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
if (diff == 0) {
|
|
|
|
/*
|
|
|
|
* If we already had a key match at a higher level, we
|
|
|
|
* know we need to use the first entry in this block.
|
|
|
|
*/
|
|
|
|
keyno = 1;
|
|
|
|
} else {
|
|
|
|
/* Otherwise search this block. Do a binary search. */
|
|
|
|
|
|
|
|
int high; /* high entry number */
|
|
|
|
int low; /* low entry number */
|
|
|
|
|
|
|
|
/* Set low and high entry numbers, 1-based. */
|
|
|
|
low = 1;
|
|
|
|
high = xfs_btree_get_numrecs(block);
|
|
|
|
if (!high) {
|
|
|
|
/* Block is empty, must be an empty leaf. */
|
2018-06-03 23:10:14 +00:00
|
|
|
if (level != 0 || cur->bc_nlevels != 1) {
|
|
|
|
XFS_CORRUPTION_ERROR(__func__,
|
|
|
|
XFS_ERRLEVEL_LOW,
|
2018-06-04 17:23:54 +00:00
|
|
|
cur->bc_mp, block,
|
|
|
|
sizeof(*block));
|
2024-02-22 20:32:09 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
2018-06-03 23:10:14 +00:00
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
2008-10-30 05:56:09 +00:00
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
|
2008-10-30 05:56:09 +00:00
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Binary search the block. */
|
|
|
|
while (low <= high) {
|
|
|
|
union xfs_btree_key key;
|
|
|
|
union xfs_btree_key *kp;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, compare);
|
|
|
|
|
|
|
|
/* keyno is average of low and high. */
|
|
|
|
keyno = (low + high) >> 1;
|
|
|
|
|
|
|
|
/* Get current search key */
|
|
|
|
kp = xfs_lookup_get_search_key(cur, level,
|
|
|
|
keyno, block, &key);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Compute difference to get next direction:
|
|
|
|
* - less than, move right
|
|
|
|
* - greater than, move left
|
|
|
|
* - equal, we're done
|
|
|
|
*/
|
|
|
|
diff = cur->bc_ops->key_diff(cur, kp);
|
|
|
|
if (diff < 0)
|
|
|
|
low = keyno + 1;
|
|
|
|
else if (diff > 0)
|
|
|
|
high = keyno - 1;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there are more levels, set up for the next level
|
|
|
|
* by getting the block number and filling in the cursor.
|
|
|
|
*/
|
|
|
|
if (level > 0) {
|
|
|
|
/*
|
|
|
|
* If we moved left, need the previous key number,
|
|
|
|
* unless there isn't one.
|
|
|
|
*/
|
|
|
|
if (diff > 0 && --keyno < 1)
|
|
|
|
keyno = 1;
|
|
|
|
pp = xfs_btree_ptr_addr(cur, keyno, block);
|
|
|
|
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
|
2008-10-30 05:56:09 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr = keyno;
|
2008-10-30 05:56:09 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Done with the search. See if we need to adjust the results. */
|
|
|
|
if (dir != XFS_LOOKUP_LE && diff < 0) {
|
|
|
|
keyno++;
|
|
|
|
/*
|
|
|
|
* If ge search and we went off the end of the block, but it's
|
|
|
|
* not the last block, we're in the wrong block.
|
|
|
|
*/
|
|
|
|
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
|
|
|
|
if (dir == XFS_LOOKUP_GE &&
|
|
|
|
keyno > xfs_btree_get_numrecs(block) &&
|
|
|
|
!xfs_btree_ptr_is_null(cur, &ptr)) {
|
|
|
|
int i;
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[0].ptr = keyno;
|
2008-10-30 05:56:09 +00:00
|
|
|
error = xfs_btree_increment(cur, 0, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
2024-02-22 20:32:55 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
return -EFSCORRUPTED;
|
2024-02-22 20:32:55 +00:00
|
|
|
}
|
2008-10-30 05:56:09 +00:00
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
} else if (dir == XFS_LOOKUP_LE && diff > 0)
|
|
|
|
keyno--;
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[0].ptr = keyno;
|
2008-10-30 05:56:09 +00:00
|
|
|
|
|
|
|
/* Return if we succeeded or not. */
|
|
|
|
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
|
|
|
|
*stat = 0;
|
|
|
|
else if (dir != XFS_LOOKUP_EQ || diff == 0)
|
|
|
|
*stat = 1;
|
|
|
|
else
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
2008-10-30 05:56:22 +00:00
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/* Find the high key storage area from a regular key. */
|
2017-10-25 22:03:46 +00:00
|
|
|
union xfs_btree_key *
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
xfs_btree_high_key_from_key(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_key *key)
|
|
|
|
{
|
2024-02-22 20:34:29 +00:00
|
|
|
ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
return (union xfs_btree_key *)((char *)key +
|
|
|
|
(cur->bc_ops->key_len / 2));
|
|
|
|
}
|
|
|
|
|
2016-08-03 02:22:12 +00:00
|
|
|
/* Determine the low (and high if overlapped) keys of a leaf block */
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_get_leaf_keys(
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
union xfs_btree_key *key)
|
|
|
|
{
|
|
|
|
union xfs_btree_key max_hkey;
|
|
|
|
union xfs_btree_key hkey;
|
2016-08-03 02:22:12 +00:00
|
|
|
union xfs_btree_rec *rec;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
union xfs_btree_key *high;
|
2016-08-03 02:22:12 +00:00
|
|
|
int n;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
|
|
|
rec = xfs_btree_rec_addr(cur, 1, block);
|
|
|
|
cur->bc_ops->init_key_from_rec(key, rec);
|
|
|
|
|
2024-02-22 20:34:29 +00:00
|
|
|
if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
|
2016-08-03 02:22:12 +00:00
|
|
|
|
|
|
|
cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
|
|
|
|
for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
|
|
|
|
rec = xfs_btree_rec_addr(cur, n, block);
|
|
|
|
cur->bc_ops->init_high_key_from_rec(&hkey, rec);
|
2023-04-12 02:00:10 +00:00
|
|
|
if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey))
|
2016-08-03 02:22:12 +00:00
|
|
|
max_hkey = hkey;
|
|
|
|
}
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
2016-08-03 02:22:12 +00:00
|
|
|
high = xfs_btree_high_key_from_key(cur, key);
|
|
|
|
memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
|
|
|
|
}
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
}
|
|
|
|
|
2016-08-03 02:22:12 +00:00
|
|
|
/* Determine the low (and high if overlapped) keys of a node block */
|
|
|
|
STATIC void
|
|
|
|
xfs_btree_get_node_keys(
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
union xfs_btree_key *key)
|
|
|
|
{
|
|
|
|
union xfs_btree_key *hkey;
|
|
|
|
union xfs_btree_key *max_hkey;
|
|
|
|
union xfs_btree_key *high;
|
2016-08-03 02:22:12 +00:00
|
|
|
int n;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
2024-02-22 20:34:29 +00:00
|
|
|
if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
|
2016-08-03 02:22:12 +00:00
|
|
|
memcpy(key, xfs_btree_key_addr(cur, 1, block),
|
|
|
|
cur->bc_ops->key_len / 2);
|
|
|
|
|
|
|
|
max_hkey = xfs_btree_high_key_addr(cur, 1, block);
|
|
|
|
for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
|
|
|
|
hkey = xfs_btree_high_key_addr(cur, n, block);
|
2023-04-12 02:00:10 +00:00
|
|
|
if (xfs_btree_keycmp_gt(cur, hkey, max_hkey))
|
2016-08-03 02:22:12 +00:00
|
|
|
max_hkey = hkey;
|
|
|
|
}
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
2016-08-03 02:22:12 +00:00
|
|
|
high = xfs_btree_high_key_from_key(cur, key);
|
|
|
|
memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
|
|
|
|
} else {
|
|
|
|
memcpy(key, xfs_btree_key_addr(cur, 1, block),
|
|
|
|
cur->bc_ops->key_len);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:03:38 +00:00
|
|
|
/* Derive the keys for any btree block. */
|
2017-10-25 22:03:46 +00:00
|
|
|
void
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_get_keys(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
union xfs_btree_key *key)
|
|
|
|
{
|
|
|
|
if (be16_to_cpu(block->bb_level) == 0)
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_leaf_keys(cur, block, key);
|
2016-08-03 01:03:38 +00:00
|
|
|
else
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_node_keys(cur, block, key);
|
2016-08-03 01:03:38 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:22 +00:00
|
|
|
/*
|
2016-08-03 01:03:38 +00:00
|
|
|
* Decide if we need to update the parent keys of a btree block. For
|
|
|
|
* a standard btree this is only necessary if we're updating the first
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
* record/key. For an overlapping btree, we must always update the
|
|
|
|
* keys because the highest key can be in any of the records or keys
|
|
|
|
* in the block.
|
2008-10-30 05:56:22 +00:00
|
|
|
*/
|
2016-08-03 01:03:38 +00:00
|
|
|
static inline bool
|
|
|
|
xfs_btree_needs_key_update(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int ptr)
|
|
|
|
{
|
2024-02-22 20:34:29 +00:00
|
|
|
return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the low and high parent keys of the given level, progressing
|
|
|
|
* towards the root. If force_all is false, stop if the keys for a given
|
|
|
|
* level do not need updating.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
__xfs_btree_updkeys(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
struct xfs_btree_block *block,
|
|
|
|
struct xfs_buf *bp0,
|
|
|
|
bool force_all)
|
|
|
|
{
|
2016-09-19 00:24:36 +00:00
|
|
|
union xfs_btree_key key; /* keys from current level */
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
union xfs_btree_key *lkey; /* keys from the next level up */
|
|
|
|
union xfs_btree_key *hkey;
|
|
|
|
union xfs_btree_key *nlkey; /* keys from the next level up */
|
|
|
|
union xfs_btree_key *nhkey;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
int ptr;
|
|
|
|
|
2024-02-22 20:34:29 +00:00
|
|
|
ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
|
|
|
/* Exit if there aren't any parent levels to update. */
|
|
|
|
if (level + 1 >= cur->bc_nlevels)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
trace_xfs_btree_updkeys(cur, level, bp0);
|
|
|
|
|
2016-09-19 00:24:36 +00:00
|
|
|
lkey = &key;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
hkey = xfs_btree_high_key_from_key(cur, lkey);
|
|
|
|
xfs_btree_get_keys(cur, block, lkey);
|
|
|
|
for (level++; level < cur->bc_nlevels; level++) {
|
|
|
|
#ifdef DEBUG
|
|
|
|
int error;
|
|
|
|
#endif
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
trace_xfs_btree_updkeys(cur, level, bp);
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
2018-03-07 01:03:30 +00:00
|
|
|
if (error)
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
return error;
|
|
|
|
#endif
|
2021-09-16 19:24:04 +00:00
|
|
|
ptr = cur->bc_levels[level].ptr;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
nlkey = xfs_btree_key_addr(cur, ptr, block);
|
|
|
|
nhkey = xfs_btree_high_key_addr(cur, ptr, block);
|
|
|
|
if (!force_all &&
|
2023-04-12 02:00:10 +00:00
|
|
|
xfs_btree_keycmp_eq(cur, nlkey, lkey) &&
|
|
|
|
xfs_btree_keycmp_eq(cur, nhkey, hkey))
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
break;
|
|
|
|
xfs_btree_copy_keys(cur, nlkey, lkey, 1);
|
|
|
|
xfs_btree_log_keys(cur, bp, ptr, ptr);
|
|
|
|
if (level + 1 >= cur->bc_nlevels)
|
|
|
|
break;
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_node_keys(cur, block, lkey);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Update all the keys from some level in cursor back to the root. */
|
|
|
|
STATIC int
|
|
|
|
xfs_btree_updkeys_force(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level)
|
|
|
|
{
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
return __xfs_btree_updkeys(cur, level, block, bp, true);
|
2016-08-03 01:03:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the parent keys of the given level, progressing towards the root.
|
|
|
|
*/
|
2016-08-03 02:22:12 +00:00
|
|
|
STATIC int
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_update_keys(
|
2008-10-30 05:56:22 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
union xfs_btree_key *kp;
|
2016-08-03 01:03:38 +00:00
|
|
|
union xfs_btree_key key;
|
2008-10-30 05:56:22 +00:00
|
|
|
int ptr;
|
|
|
|
|
2016-08-03 02:22:12 +00:00
|
|
|
ASSERT(level >= 0);
|
|
|
|
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
2024-02-22 20:34:29 +00:00
|
|
|
if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
|
2016-08-03 02:22:12 +00:00
|
|
|
return __xfs_btree_updkeys(cur, level, block, bp, false);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
2008-10-30 05:56:22 +00:00
|
|
|
/*
|
|
|
|
* Go up the tree from this level toward the root.
|
|
|
|
* At each level, update the key value to the value input.
|
|
|
|
* Stop when we reach a level where the cursor isn't pointing
|
|
|
|
* at the first entry in the block.
|
|
|
|
*/
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_get_keys(cur, block, &key);
|
|
|
|
for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
|
2008-10-30 05:56:22 +00:00
|
|
|
#ifdef DEBUG
|
|
|
|
int error;
|
|
|
|
#endif
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
2018-03-07 01:03:30 +00:00
|
|
|
if (error)
|
2008-10-30 05:56:22 +00:00
|
|
|
return error;
|
|
|
|
#endif
|
2021-09-16 19:24:04 +00:00
|
|
|
ptr = cur->bc_levels[level].ptr;
|
2008-10-30 05:56:22 +00:00
|
|
|
kp = xfs_btree_key_addr(cur, ptr, block);
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_copy_keys(cur, kp, &key, 1);
|
2008-10-30 05:56:22 +00:00
|
|
|
xfs_btree_log_keys(cur, bp, ptr, ptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2008-10-30 05:56:32 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the record referred to by cur to the value in the
|
|
|
|
* given record. This either works (return 0) or gets an
|
|
|
|
* EFSCORRUPTED error.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_btree_update(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_rec *rec)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
int error;
|
|
|
|
int ptr;
|
|
|
|
union xfs_btree_rec *rp;
|
|
|
|
|
|
|
|
/* Pick up the current block. */
|
|
|
|
block = xfs_btree_get_block(cur, 0, &bp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, 0, bp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
/* Get the address of the rec to be updated. */
|
2021-09-16 19:24:04 +00:00
|
|
|
ptr = cur->bc_levels[0].ptr;
|
2008-10-30 05:56:32 +00:00
|
|
|
rp = xfs_btree_rec_addr(cur, ptr, block);
|
|
|
|
|
|
|
|
/* Fill in the new contents and log them. */
|
|
|
|
xfs_btree_copy_recs(cur, rp, rec, 1);
|
|
|
|
xfs_btree_log_recs(cur, bp, ptr, ptr);
|
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/* Pass new key value up to our parent. */
|
2016-08-03 01:03:38 +00:00
|
|
|
if (xfs_btree_needs_key_update(cur, ptr)) {
|
2016-08-03 02:22:12 +00:00
|
|
|
error = xfs_btree_update_keys(cur, 0);
|
2008-10-30 05:56:32 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:53 +00:00
|
|
|
/*
|
|
|
|
* Move 1 record left from cur/level if possible.
|
|
|
|
* Update cur to reflect the new path.
|
|
|
|
*/
|
2008-10-30 05:58:41 +00:00
|
|
|
STATIC int /* error */
|
2008-10-30 05:56:53 +00:00
|
|
|
xfs_btree_lshift(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_buf *lbp; /* left buffer pointer */
|
|
|
|
struct xfs_btree_block *left; /* left btree block */
|
|
|
|
int lrecs; /* left record count */
|
|
|
|
struct xfs_buf *rbp; /* right buffer pointer */
|
|
|
|
struct xfs_btree_block *right; /* right btree block */
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
struct xfs_btree_cur *tcur; /* temporary btree cursor */
|
2008-10-30 05:56:53 +00:00
|
|
|
int rrecs; /* right record count */
|
|
|
|
union xfs_btree_ptr lptr; /* left btree pointer */
|
|
|
|
union xfs_btree_key *rkp = NULL; /* right btree key */
|
|
|
|
union xfs_btree_ptr *rpp = NULL; /* right address pointer */
|
|
|
|
union xfs_btree_rec *rrp = NULL; /* right record pointer */
|
|
|
|
int error; /* error return value */
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) has finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
int i;
|
2008-10-30 05:56:53 +00:00
|
|
|
|
2024-02-22 20:37:24 +00:00
|
|
|
if (xfs_btree_at_iroot(cur, level))
|
2008-10-30 05:56:53 +00:00
|
|
|
goto out0;
|
|
|
|
|
|
|
|
/* Set up variables for this block as "right". */
|
|
|
|
right = xfs_btree_get_block(cur, level, &rbp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, right, level, rbp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* If we've got no left sibling then we can't shift an entry left. */
|
|
|
|
xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
|
|
|
|
if (xfs_btree_ptr_is_null(cur, &lptr))
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the cursor entry is the one that would be moved, don't
|
|
|
|
* do it... it's too complicated.
|
|
|
|
*/
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[level].ptr <= 1)
|
2008-10-30 05:56:53 +00:00
|
|
|
goto out0;
|
|
|
|
|
|
|
|
/* Set up the left neighbor as "left". */
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
|
2008-10-30 05:56:53 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/* If it's full, it can't take another entry. */
|
|
|
|
lrecs = xfs_btree_get_numrecs(left);
|
|
|
|
if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
rrecs = xfs_btree_get_numrecs(right);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We add one entry to the left side and remove one for the right side.
|
2009-03-29 07:55:42 +00:00
|
|
|
* Account for it here, the changes will be updated on disk and logged
|
2008-10-30 05:56:53 +00:00
|
|
|
* later.
|
|
|
|
*/
|
|
|
|
lrecs++;
|
|
|
|
rrecs--;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, lshift);
|
|
|
|
XFS_BTREE_STATS_ADD(cur, moves, 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If non-leaf, copy a key and a ptr to the left block.
|
|
|
|
* Log the changes to the left block.
|
|
|
|
*/
|
|
|
|
if (level > 0) {
|
|
|
|
/* It's a non-leaf. Move keys and pointers. */
|
|
|
|
union xfs_btree_key *lkp; /* left btree key */
|
|
|
|
union xfs_btree_ptr *lpp; /* left address pointer */
|
|
|
|
|
|
|
|
lkp = xfs_btree_key_addr(cur, lrecs, left);
|
|
|
|
rkp = xfs_btree_key_addr(cur, 1, right);
|
|
|
|
|
|
|
|
lpp = xfs_btree_ptr_addr(cur, lrecs, left);
|
|
|
|
rpp = xfs_btree_ptr_addr(cur, 1, right);
|
2018-06-04 04:10:48 +00:00
|
|
|
|
|
|
|
error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
|
2008-10-30 05:56:53 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:56:53 +00:00
|
|
|
xfs_btree_copy_keys(cur, lkp, rkp, 1);
|
|
|
|
xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
|
|
|
|
|
|
|
|
xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
|
|
|
|
xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
|
|
|
|
|
2008-10-30 05:58:32 +00:00
|
|
|
ASSERT(cur->bc_ops->keys_inorder(cur,
|
|
|
|
xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
|
2008-10-30 05:56:53 +00:00
|
|
|
} else {
|
|
|
|
/* It's a leaf. Move records. */
|
|
|
|
union xfs_btree_rec *lrp; /* left record pointer */
|
|
|
|
|
|
|
|
lrp = xfs_btree_rec_addr(cur, lrecs, left);
|
|
|
|
rrp = xfs_btree_rec_addr(cur, 1, right);
|
|
|
|
|
|
|
|
xfs_btree_copy_recs(cur, lrp, rrp, 1);
|
|
|
|
xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
|
|
|
|
|
2008-10-30 05:58:32 +00:00
|
|
|
ASSERT(cur->bc_ops->recs_inorder(cur,
|
|
|
|
xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
|
2008-10-30 05:56:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
xfs_btree_set_numrecs(left, lrecs);
|
|
|
|
xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
|
|
|
|
|
|
|
|
xfs_btree_set_numrecs(right, rrecs);
|
|
|
|
xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Slide the contents of right down one entry.
|
|
|
|
*/
|
|
|
|
XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
|
|
|
|
if (level > 0) {
|
|
|
|
/* It's a nonleaf. operate on keys and ptrs */
|
|
|
|
for (i = 0; i < rrecs; i++) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
|
2008-10-30 05:56:53 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:56:53 +00:00
|
|
|
xfs_btree_shift_keys(cur,
|
|
|
|
xfs_btree_key_addr(cur, 2, right),
|
|
|
|
-1, rrecs);
|
|
|
|
xfs_btree_shift_ptrs(cur,
|
|
|
|
xfs_btree_ptr_addr(cur, 2, right),
|
|
|
|
-1, rrecs);
|
|
|
|
|
|
|
|
xfs_btree_log_keys(cur, rbp, 1, rrecs);
|
|
|
|
xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
|
|
|
|
} else {
|
|
|
|
/* It's a leaf. operate on records */
|
|
|
|
xfs_btree_shift_recs(cur,
|
|
|
|
xfs_btree_rec_addr(cur, 2, right),
|
|
|
|
-1, rrecs);
|
|
|
|
xfs_btree_log_recs(cur, rbp, 1, rrecs);
|
|
|
|
}
|
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) has finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/*
|
|
|
|
* Using a temporary cursor, update the parent key values of the
|
|
|
|
* block on the left.
|
|
|
|
*/
|
2024-02-22 20:34:29 +00:00
|
|
|
if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
|
2016-08-03 02:26:22 +00:00
|
|
|
error = xfs_btree_dup_cursor(cur, &tcur);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
i = xfs_btree_firstrec(tcur, level);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) has finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
2016-08-03 02:26:22 +00:00
|
|
|
error = xfs_btree_decrement(tcur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error1;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) has finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
2016-08-03 02:26:22 +00:00
|
|
|
/* Update the parent high keys of the left block, if needed. */
|
2016-08-03 02:22:12 +00:00
|
|
|
error = xfs_btree_update_keys(tcur, level);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) has finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
if (error)
|
|
|
|
goto error1;
|
2016-08-03 02:26:22 +00:00
|
|
|
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) has finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
}
|
|
|
|
|
2016-08-03 02:26:22 +00:00
|
|
|
/* Update the parent keys of the right block. */
|
|
|
|
error = xfs_btree_update_keys(cur, level);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
2008-10-30 05:56:53 +00:00
|
|
|
|
|
|
|
/* Slide the cursor value left one. */
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr--;
|
2008-10-30 05:56:53 +00:00
|
|
|
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out0:
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
return error;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) has finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
|
|
|
error1:
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
|
|
|
|
return error;
|
2008-10-30 05:56:53 +00:00
|
|
|
}
|
|
|
|
|
2008-10-30 05:56:43 +00:00
|
|
|
/*
|
|
|
|
* Move 1 record right from cur/level if possible.
|
|
|
|
* Update cur to reflect the new path.
|
|
|
|
*/
|
2008-10-30 05:58:41 +00:00
|
|
|
STATIC int /* error */
|
2008-10-30 05:56:43 +00:00
|
|
|
xfs_btree_rshift(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_buf *lbp; /* left buffer pointer */
|
|
|
|
struct xfs_btree_block *left; /* left btree block */
|
|
|
|
struct xfs_buf *rbp; /* right buffer pointer */
|
|
|
|
struct xfs_btree_block *right; /* right btree block */
|
|
|
|
struct xfs_btree_cur *tcur; /* temporary btree cursor */
|
|
|
|
union xfs_btree_ptr rptr; /* right block pointer */
|
|
|
|
union xfs_btree_key *rkp; /* right btree key */
|
|
|
|
int rrecs; /* right record count */
|
|
|
|
int lrecs; /* left record count */
|
|
|
|
int error; /* error return value */
|
|
|
|
int i; /* loop counter */
|
|
|
|
|
2024-02-22 20:37:24 +00:00
|
|
|
if (xfs_btree_at_iroot(cur, level))
|
2008-10-30 05:56:43 +00:00
|
|
|
goto out0;
|
|
|
|
|
|
|
|
/* Set up variables for this block as "left". */
|
|
|
|
left = xfs_btree_get_block(cur, level, &lbp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, left, level, lbp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* If we've got no right sibling then we can't shift an entry right. */
|
|
|
|
xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
|
|
|
|
if (xfs_btree_ptr_is_null(cur, &rptr))
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the cursor entry is the one that would be moved, don't
|
|
|
|
* do it... it's too complicated.
|
|
|
|
*/
|
|
|
|
lrecs = xfs_btree_get_numrecs(left);
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[level].ptr >= lrecs)
|
2008-10-30 05:56:43 +00:00
|
|
|
goto out0;
|
|
|
|
|
|
|
|
/* Set up the right neighbor as "right". */
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
|
2008-10-30 05:56:43 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/* If it's full, it can't take another entry. */
|
|
|
|
rrecs = xfs_btree_get_numrecs(right);
|
|
|
|
if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, rshift);
|
|
|
|
XFS_BTREE_STATS_ADD(cur, moves, rrecs);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make a hole at the start of the right neighbor block, then
|
|
|
|
* copy the last left block entry to the hole.
|
|
|
|
*/
|
|
|
|
if (level > 0) {
|
|
|
|
/* It's a nonleaf. make a hole in the keys and ptrs */
|
|
|
|
union xfs_btree_key *lkp;
|
|
|
|
union xfs_btree_ptr *lpp;
|
|
|
|
union xfs_btree_ptr *rpp;
|
|
|
|
|
|
|
|
lkp = xfs_btree_key_addr(cur, lrecs, left);
|
|
|
|
lpp = xfs_btree_ptr_addr(cur, lrecs, left);
|
|
|
|
rkp = xfs_btree_key_addr(cur, 1, right);
|
|
|
|
rpp = xfs_btree_ptr_addr(cur, 1, right);
|
|
|
|
|
|
|
|
for (i = rrecs - 1; i >= 0; i--) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
|
2008-10-30 05:56:43 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
|
|
|
xfs_btree_shift_keys(cur, rkp, 1, rrecs);
|
|
|
|
xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
|
|
|
|
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
|
2008-10-30 05:56:43 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/* Now put the new data in, and log it. */
|
|
|
|
xfs_btree_copy_keys(cur, rkp, lkp, 1);
|
|
|
|
xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
|
|
|
|
|
|
|
|
xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
|
|
|
|
xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
|
|
|
|
|
2008-10-30 05:58:32 +00:00
|
|
|
ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
|
|
|
|
xfs_btree_key_addr(cur, 2, right)));
|
2008-10-30 05:56:43 +00:00
|
|
|
} else {
|
|
|
|
/* It's a leaf. make a hole in the records */
|
|
|
|
union xfs_btree_rec *lrp;
|
|
|
|
union xfs_btree_rec *rrp;
|
|
|
|
|
|
|
|
lrp = xfs_btree_rec_addr(cur, lrecs, left);
|
|
|
|
rrp = xfs_btree_rec_addr(cur, 1, right);
|
|
|
|
|
|
|
|
xfs_btree_shift_recs(cur, rrp, 1, rrecs);
|
|
|
|
|
|
|
|
/* Now put the new data in, and log it. */
|
|
|
|
xfs_btree_copy_recs(cur, rrp, lrp, 1);
|
|
|
|
xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Decrement and log left's numrecs, bump and log right's numrecs.
|
|
|
|
*/
|
|
|
|
xfs_btree_set_numrecs(left, --lrecs);
|
|
|
|
xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
|
|
|
|
|
|
|
|
xfs_btree_set_numrecs(right, ++rrecs);
|
|
|
|
xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Using a temporary cursor, update the parent key values of the
|
|
|
|
* block on the right.
|
|
|
|
*/
|
|
|
|
error = xfs_btree_dup_cursor(cur, &tcur);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
i = xfs_btree_lastrec(tcur, level);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:56:43 +00:00
|
|
|
|
|
|
|
error = xfs_btree_increment(tcur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error1;
|
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/* Update the parent high keys of the left block, if needed. */
|
2024-02-22 20:34:29 +00:00
|
|
|
if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
|
2016-08-03 02:22:12 +00:00
|
|
|
error = xfs_btree_update_keys(cur, level);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
if (error)
|
|
|
|
goto error1;
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:03:38 +00:00
|
|
|
/* Update the parent keys of the right block. */
|
2016-08-03 02:22:12 +00:00
|
|
|
error = xfs_btree_update_keys(tcur, level);
|
2008-10-30 05:56:43 +00:00
|
|
|
if (error)
|
|
|
|
goto error1;
|
|
|
|
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
|
|
|
|
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out0:
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
|
|
|
|
error1:
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
|
|
|
|
return error;
|
|
|
|
}
|
2008-10-30 05:57:03 +00:00
|
|
|
|
2024-02-22 20:33:07 +00:00
|
|
|
static inline int
|
|
|
|
xfs_btree_alloc_block(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *hint_block,
|
|
|
|
union xfs_btree_ptr *new_block,
|
|
|
|
int *stat)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2024-02-22 20:37:35 +00:00
|
|
|
/*
|
|
|
|
* Don't allow block allocation for a staging cursor, because staging
|
|
|
|
* cursors do not support regular btree modifications.
|
|
|
|
*
|
|
|
|
* Bulk loading uses a separate callback to obtain new blocks from a
|
|
|
|
* preallocated list, which prevents ENOSPC failures during loading.
|
|
|
|
*/
|
|
|
|
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
|
|
|
|
ASSERT(0);
|
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:33:07 +00:00
|
|
|
error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
|
|
|
|
trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:57:03 +00:00
|
|
|
/*
|
|
|
|
* Split cur/level block in half.
|
|
|
|
* Return new block number and the key to its first
|
|
|
|
* record (to be inserted into parent).
|
|
|
|
*/
|
2008-10-30 05:58:41 +00:00
|
|
|
STATIC int /* error */
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served it's
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performacne overhead caused by switching stacks. The worse case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously be
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
__xfs_btree_split(
|
2008-10-30 05:57:03 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
union xfs_btree_ptr *ptrp,
|
|
|
|
union xfs_btree_key *key,
|
|
|
|
struct xfs_btree_cur **curp,
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
union xfs_btree_ptr lptr; /* left sibling block ptr */
|
|
|
|
struct xfs_buf *lbp; /* left buffer pointer */
|
|
|
|
struct xfs_btree_block *left; /* left btree block */
|
|
|
|
union xfs_btree_ptr rptr; /* right sibling block ptr */
|
|
|
|
struct xfs_buf *rbp; /* right buffer pointer */
|
|
|
|
struct xfs_btree_block *right; /* right btree block */
|
|
|
|
union xfs_btree_ptr rrptr; /* right-right sibling ptr */
|
|
|
|
struct xfs_buf *rrbp; /* right-right buffer pointer */
|
|
|
|
struct xfs_btree_block *rrblock; /* right-right btree block */
|
|
|
|
int lrecs;
|
|
|
|
int rrecs;
|
|
|
|
int src_index;
|
|
|
|
int error; /* error return value */
|
|
|
|
int i;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, split);
|
|
|
|
|
|
|
|
/* Set up left block (current one). */
|
|
|
|
left = xfs_btree_get_block(cur, level, &lbp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, left, level, lbp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
|
|
|
|
|
|
|
|
/* Allocate the new block. If we can't do it, we're toast. Give up. */
|
2024-02-22 20:33:07 +00:00
|
|
|
error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
|
2008-10-30 05:57:03 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
if (*stat == 0)
|
|
|
|
goto out0;
|
|
|
|
XFS_BTREE_STATS_INC(cur, alloc);
|
|
|
|
|
|
|
|
/* Set up the new block as "right". */
|
2019-06-12 16:00:00 +00:00
|
|
|
error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
|
2008-10-30 05:57:03 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/* Fill in the btree header for the new right block. */
|
2013-04-21 19:53:46 +00:00
|
|
|
xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
|
2008-10-30 05:57:03 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Split the entries between the old and the new block evenly.
|
|
|
|
* Make sure that if there's an odd number of entries now, that
|
|
|
|
* each new block will have the same number of entries.
|
|
|
|
*/
|
|
|
|
lrecs = xfs_btree_get_numrecs(left);
|
|
|
|
rrecs = lrecs / 2;
|
2021-09-16 19:24:04 +00:00
|
|
|
if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
|
2008-10-30 05:57:03 +00:00
|
|
|
rrecs++;
|
|
|
|
src_index = (lrecs - rrecs + 1);
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_ADD(cur, moves, rrecs);
|
|
|
|
|
2016-08-03 01:03:38 +00:00
|
|
|
/* Adjust numrecs for the later get_*_keys() calls. */
|
|
|
|
lrecs -= rrecs;
|
|
|
|
xfs_btree_set_numrecs(left, lrecs);
|
|
|
|
xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
|
|
|
|
|
2008-10-30 05:57:03 +00:00
|
|
|
/*
|
|
|
|
* Copy btree block entries from the left block over to the
|
|
|
|
* new block, the right. Update the right block and log the
|
|
|
|
* changes.
|
|
|
|
*/
|
|
|
|
if (level > 0) {
|
|
|
|
/* It's a non-leaf. Move keys and pointers. */
|
|
|
|
union xfs_btree_key *lkp; /* left btree key */
|
|
|
|
union xfs_btree_ptr *lpp; /* left address pointer */
|
|
|
|
union xfs_btree_key *rkp; /* right btree key */
|
|
|
|
union xfs_btree_ptr *rpp; /* right address pointer */
|
|
|
|
|
|
|
|
lkp = xfs_btree_key_addr(cur, src_index, left);
|
|
|
|
lpp = xfs_btree_ptr_addr(cur, src_index, left);
|
|
|
|
rkp = xfs_btree_key_addr(cur, 1, right);
|
|
|
|
rpp = xfs_btree_ptr_addr(cur, 1, right);
|
|
|
|
|
|
|
|
for (i = src_index; i < rrecs; i++) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
|
2008-10-30 05:57:03 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:03:38 +00:00
|
|
|
/* Copy the keys & pointers to the new block. */
|
2008-10-30 05:57:03 +00:00
|
|
|
xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
|
|
|
|
xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
|
|
|
|
|
|
|
|
xfs_btree_log_keys(cur, rbp, 1, rrecs);
|
|
|
|
xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
|
|
|
|
|
2016-08-03 01:03:38 +00:00
|
|
|
/* Stash the keys of the new block for later insertion. */
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_node_keys(cur, right, key);
|
2008-10-30 05:57:03 +00:00
|
|
|
} else {
|
|
|
|
/* It's a leaf. Move records. */
|
|
|
|
union xfs_btree_rec *lrp; /* left record pointer */
|
|
|
|
union xfs_btree_rec *rrp; /* right record pointer */
|
|
|
|
|
|
|
|
lrp = xfs_btree_rec_addr(cur, src_index, left);
|
|
|
|
rrp = xfs_btree_rec_addr(cur, 1, right);
|
|
|
|
|
2016-08-03 01:03:38 +00:00
|
|
|
/* Copy records to the new block. */
|
2008-10-30 05:57:03 +00:00
|
|
|
xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
|
|
|
|
xfs_btree_log_recs(cur, rbp, 1, rrecs);
|
|
|
|
|
2016-08-03 01:03:38 +00:00
|
|
|
/* Stash the keys of the new block for later insertion. */
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_leaf_keys(cur, right, key);
|
2008-10-30 05:57:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the left block number by looking in the buffer.
|
2016-08-03 01:03:38 +00:00
|
|
|
* Adjust sibling pointers.
|
2008-10-30 05:57:03 +00:00
|
|
|
*/
|
|
|
|
xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
|
|
|
|
xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
|
|
|
|
xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
|
|
|
|
xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
|
|
|
|
|
|
|
|
xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
|
|
|
|
xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there's a block to the new block's right, make that block
|
|
|
|
* point back to right instead of to left.
|
|
|
|
*/
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &rrptr,
|
2008-10-30 05:57:03 +00:00
|
|
|
0, &rrblock, &rrbp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
|
|
|
|
xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
|
|
|
|
}
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
|
|
|
|
/* Update the parent high keys of the left block, if needed. */
|
2024-02-22 20:34:29 +00:00
|
|
|
if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
|
2016-08-03 02:22:12 +00:00
|
|
|
error = xfs_btree_update_keys(cur, level);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:57:03 +00:00
|
|
|
/*
|
|
|
|
* If the cursor is really in the right block, move it there.
|
|
|
|
* If it's just pointing past the last entry in left, then we'll
|
|
|
|
* insert there, so don't change anything in that case.
|
|
|
|
*/
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[level].ptr > lrecs + 1) {
|
2008-10-30 05:57:03 +00:00
|
|
|
xfs_btree_setbuf(cur, level, rbp);
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr -= lrecs;
|
2008-10-30 05:57:03 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If there are more levels, we'll need another cursor which refers
|
|
|
|
* the right block, no matter where this cursor was.
|
|
|
|
*/
|
|
|
|
if (level + 1 < cur->bc_nlevels) {
|
|
|
|
error = xfs_btree_dup_cursor(cur, curp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
2021-09-16 19:24:04 +00:00
|
|
|
(*curp)->bc_levels[level + 1].ptr++;
|
2008-10-30 05:57:03 +00:00
|
|
|
}
|
|
|
|
*ptrp = rptr;
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
out0:
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
2008-10-30 05:57:16 +00:00
|
|
|
|
2021-11-10 02:32:17 +00:00
|
|
|
#ifdef __KERNEL__
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served its
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distros won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performance overhead caused by switching stacks. The worst case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously been
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
struct xfs_btree_split_args {
|
|
|
|
struct xfs_btree_cur *cur;
|
|
|
|
int level;
|
|
|
|
union xfs_btree_ptr *ptrp;
|
|
|
|
union xfs_btree_key *key;
|
|
|
|
struct xfs_btree_cur **curp;
|
|
|
|
int *stat; /* success/failure */
|
|
|
|
int result;
|
|
|
|
bool kswapd; /* allocation in kswapd context */
|
|
|
|
struct completion *done;
|
|
|
|
struct work_struct work;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Stack switching interfaces for allocation
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
xfs_btree_split_worker(
|
|
|
|
struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct xfs_btree_split_args *args = container_of(work,
|
|
|
|
struct xfs_btree_split_args, work);
|
|
|
|
unsigned long pflags;
|
2021-02-23 18:26:06 +00:00
|
|
|
unsigned long new_pflags = 0;
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served it's
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performacne overhead caused by switching stacks. The worse case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously be
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* we are in a transaction context here, but may also be doing work
|
|
|
|
* in kswapd context, and hence we may need to inherit that state
|
|
|
|
* temporarily to ensure that we don't block waiting for memory reclaim
|
|
|
|
* in any way.
|
|
|
|
*/
|
|
|
|
if (args->kswapd)
|
2022-03-22 21:45:38 +00:00
|
|
|
new_pflags |= PF_MEMALLOC | PF_KSWAPD;
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served it's
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performacne overhead caused by switching stacks. The worse case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously be
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
|
|
|
|
current_set_flags_nested(&pflags, new_pflags);
|
2021-02-23 18:26:06 +00:00
|
|
|
xfs_trans_set_context(args->cur->bc_tp);
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served it's
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performacne overhead caused by switching stacks. The worse case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously be
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
|
|
|
|
args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
|
|
|
|
args->key, args->curp, args->stat);
|
|
|
|
|
2021-02-23 18:26:06 +00:00
|
|
|
xfs_trans_clear_context(args->cur->bc_tp);
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served it's
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performacne overhead caused by switching stacks. The worse case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously be
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
current_restore_flags_nested(&pflags, new_pflags);
|
2021-02-23 18:26:06 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Do not access args after complete() has run here. We don't own args
|
|
|
|
* and the owner may run and free args before we return here.
|
|
|
|
*/
|
|
|
|
complete(args->done);
|
|
|
|
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served it's
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performacne overhead caused by switching stacks. The worse case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously be
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
xfs: don't use BMBT btree split workers for IO completion
When we split a BMBT due to record insertion, we offload it to a
worker thread because we can be deep in the stack when we try to
allocate a new block for the BMBT. Allocation can use several
kilobytes of stack (full memory reclaim, swap and/or IO path can
end up on the stack during allocation) and we can already be several
kilobytes deep in the stack when we need to split the BMBT.
A recent workload demonstrated a deadlock in this BMBT split
offload. It requires several things to happen at once:
1. two inodes need a BMBT split at the same time, one must be
unwritten extent conversion from IO completion, the other must be
from extent allocation.
2. there must be no xfs_alloc_wq worker threads
available in the worker pool.
3. There must be sustained severe memory shortages such that new
kworker threads cannot be allocated to the xfs_alloc_wq pool for
both threads that need split work to be run
4. The split work from the unwritten extent conversion must run
first.
5. when the BMBT block allocation runs from the split work, it must
loop over all AGs and not be able to either trylock an AGF
successfully, or each AGF it is able to lock has no space available
for a single block allocation.
6. The BMBT allocation must then attempt to lock the AGF that the
second task queued to the rescuer thread already has locked before
it finds an AGF it can allocate from.
At this point, we have an ABBA deadlock between tasks queued on the
xfs_alloc_wq rescuer thread and a locked AGF. i.e. The queued task
holding the AGF lock can't be run by the rescuer thread until the
task the rescuer thread is running gets the AGF lock....
This is a highly improbable series of events, but there it is.
There's a couple of ways to fix this, but the easiest way is to ensure
that we only punt tasks with a locked AGF that holds enough space
for the BMBT block allocations to the worker thread.
This works for unwritten extent conversion in IO completion (which
doesn't have a locked AGF and space reservations) because we have
tight control over the IO completion stack. It is typically only 6
functions deep when xfs_btree_split() is called because we've
already offloaded the IO completion work to a worker thread and
hence we don't need to worry about stack overruns here.
The other place we can be called for a BMBT split without a
preceding allocation is __xfs_bunmapi() when punching out the
center of an existing extent. We don't remove extents in the IO
path, so these operations don't tend to be called with a lot of
stack consumed. Hence we don't really need to ship the split off to
a worker thread in these cases, either.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2023-02-05 16:48:24 +00:00
|
|
|
* BMBT split requests often come in with little stack to work on so we push
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served its
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distros won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performance overhead caused by switching stacks. The worst case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously been
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
* them off to a worker thread so there is lots of stack to use. For the other
|
|
|
|
* btree types, just call directly to avoid the context switch overhead here.
|
xfs: don't use BMBT btree split workers for IO completion
When we split a BMBT due to record insertion, we offload it to a
worker thread because we can be deep in the stack when we try to
allocate a new block for the BMBT. Allocation can use several
kilobytes of stack (full memory reclaim, swap and/or IO path can
end up on the stack during allocation) and we can already be several
kilobytes deep in the stack when we need to split the BMBT.
A recent workload demonstrated a deadlock in this BMBT split
offload. It requires several things to happen at once:
1. two inodes need a BMBT split at the same time, one must be
unwritten extent conversion from IO completion, the other must be
from extent allocation.
2. there must be no xfs_alloc_wq worker threads available in the
worker pool.
3. There must be sustained severe memory shortages such that new
kworker threads cannot be allocated to the xfs_alloc_wq pool for
both threads that need split work to be run
4. The split work from the unwritten extent conversion must run
first.
5. when the BMBT block allocation runs from the split work, it must
loop over all AGs and not be able to either trylock an AGF
successfully, or each AGF it is able to lock has no space available
for a single block allocation.
6. The BMBT allocation must then attempt to lock the AGF that the
second task queued to the rescuer thread already has locked before
it finds an AGF it can allocate from.
At this point, we have an ABBA deadlock between tasks queued on the
xfs_alloc_wq rescuer thread and a locked AGF. i.e. The queued task
holding the AGF lock can't be run by the rescuer thread until the
task the rescuer thread is running gets the AGF lock....
This is a highly improbable series of events, but there it is.
There's a couple of ways to fix this, but the easiest way is to ensure
that we only punt tasks with a locked AGF that holds enough space
for the BMBT block allocations to the worker thread.
This works for unwritten extent conversion in IO completion (which
doesn't have a locked AGF and space reservations) because we have
tight control over the IO completion stack. It is typically only 6
functions deep when xfs_btree_split() is called because we've
already offloaded the IO completion work to a worker thread and
hence we don't need to worry about stack overruns here.
The other place we can be called for a BMBT split without a
preceeding allocation is __xfs_bunmapi() when punching out the
center of an existing extent. We don't remove extents in the IO
path, so these operations don't tend to be called with a lot of
stack consumed. Hence we don't really need to ship the split off to
a worker thread in these cases, either.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2023-02-05 16:48:24 +00:00
|
|
|
*
|
|
|
|
* Care must be taken here - the work queue rescuer thread introduces potential
|
|
|
|
* AGF <> worker queue deadlocks if the BMBT block allocation has to lock new
|
|
|
|
* AGFs to allocate blocks. A task being run by the rescuer could attempt to
|
|
|
|
* lock an AGF that is already locked by a task queued to run by the rescuer,
|
|
|
|
* resulting in an ABBA deadlock as the rescuer cannot run the lock holder to
|
|
|
|
* release it until the current thread it is running gains the lock.
|
|
|
|
*
|
|
|
|
* To avoid this issue, we only ever queue BMBT splits that already have an AGF
|
|
|
|
* locked to allocate from. The only place that doesn't hold an AGF
|
|
|
|
* locked is unwritten extent conversion at IO completion, but that has already
|
|
|
|
* been offloaded to a worker thread and hence has no stack consumption issues
|
|
|
|
* we have to worry about.
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served its
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performance overhead caused by switching stacks. The worst case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously been
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
*/
|
|
|
|
STATIC int /* error */
|
|
|
|
xfs_btree_split(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
union xfs_btree_ptr *ptrp,
|
|
|
|
union xfs_btree_key *key,
|
|
|
|
struct xfs_btree_cur **curp,
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_btree_split_args args;
|
|
|
|
DECLARE_COMPLETION_ONSTACK(done);
|
|
|
|
|
2024-02-22 20:40:51 +00:00
|
|
|
if (!xfs_btree_is_bmap(cur->bc_ops) ||
|
2023-02-10 17:11:06 +00:00
|
|
|
cur->bc_tp->t_highest_agno == NULLAGNUMBER)
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served it's
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performacne overhead caused by switching stacks. The worse case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously be
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
|
|
|
|
|
|
|
|
args.cur = cur;
|
|
|
|
args.level = level;
|
|
|
|
args.ptrp = ptrp;
|
|
|
|
args.key = key;
|
|
|
|
args.curp = curp;
|
|
|
|
args.stat = stat;
|
|
|
|
args.done = &done;
|
|
|
|
args.kswapd = current_is_kswapd();
|
|
|
|
INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
|
|
|
|
queue_work(xfs_alloc_wq, &args.work);
|
|
|
|
wait_for_completion(&done);
|
|
|
|
destroy_work_on_stack(&args.work);
|
|
|
|
return args.result;
|
|
|
|
}
|
2021-11-10 02:32:17 +00:00
|
|
|
#else
|
|
|
|
#define xfs_btree_split __xfs_btree_split
|
|
|
|
#endif /* __KERNEL__ */
|
xfs: refine the allocation stack switch
The allocation stack switch at xfs_bmapi_allocate() has served its
purpose, but is no longer a sufficient solution to the stack usage
problem we have in the XFS allocation path.
Whilst the kernel stack size is now 16k, that is not a valid reason
for undoing all our "keep stack usage down" modifications. What it
does allow us to do is have the freedom to refine and perfect the
modifications knowing that if we get it wrong it won't blow up in
our faces - we have a safety net now.
This is important because we still have the issue of older kernels
having smaller stacks and that they are still supported and are
demonstrating a wide range of different stack overflows. Red Hat
has several open bugs for allocation based stack overflows from
directory modifications and direct IO block allocation and these
problems still need to be solved. If we can solve them upstream,
then distro's won't need to bake their own unique solutions.
To that end, I've observed that every allocation based stack
overflow report has had a specific characteristic - it has happened
during or directly after a bmap btree block split. That event
requires a new block to be allocated to the tree, and so we
effectively stack one allocation stack on top of another, and that's
when we get into trouble.
A further observation is that bmap btree block splits are much rarer
than writeback allocation - over a range of different workloads I've
observed the ratio of bmap btree inserts to splits ranges from 100:1
(xfstests run) to 10000:1 (local VM image server with sparse files
that range in the hundreds of thousands to millions of extents).
Either way, bmap btree split events are much, much rarer than
allocation events.
Finally, we have to move the kswapd state to the allocation workqueue
work when allocation is done on behalf of kswapd. This is proving to
cause significant perturbation in performance under memory pressure
and appears to be generating allocation deadlock warnings under some
workloads, so avoiding the use of a workqueue for the majority of
kswapd writeback allocation will minimise the impact of such
behaviour.
Hence it makes sense to move the stack switch to xfs_btree_split()
and only do it for bmap btree splits. Stack switches during
allocation will be much rarer, so there won't be significant
performance overhead caused by switching stacks. The worst case
stack from all allocation paths will be split, not just writeback.
And the majority of memory allocations will be done in the correct
context (e.g. kswapd) without causing additional latency, and so we
simplify the memory reclaim interactions between processes,
workqueues and kswapd.
The worst stack I've been able to generate with this patch in place
is 5600 bytes deep. It's very revealing because we exit XFS at:
37) 1768 64 kmem_cache_alloc+0x13b/0x170
about 1800 bytes of stack consumed, and the remaining 3800 bytes
(and 36 functions) is memory reclaim, swap and the IO stack. And
this occurs in the inode allocation from an open(O_CREAT) syscall,
not writeback.
The amount of stack being used is much less than I've previously been
able to generate - fs_mark testing has been able to generate stack
usage of around 7k without too much trouble; with this patch it's
only just getting to 5.5k. This is primarily because the metadata
allocation paths (e.g. directory blocks) are no longer causing
double splits on the same stack, and hence now stack tracing is
showing swapping being the worst stack consumer rather than XFS.
Performance of fs_mark inode create workloads is unchanged.
Performance of fs_mark async fsync workloads is consistently good
with context switches reduced by around 150,000/s (30%).
Performance of dbench, streaming IO and postmark is unchanged.
Allocation deadlock warnings have not been seen on the workloads
that generated them since adding this patch.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-07-14 21:08:24 +00:00
|
|
|
|
2008-10-30 05:57:28 +00:00
|
|
|
/*
|
|
|
|
* Copy the old inode root contents into a real block and make the
|
|
|
|
* broot point to it.
|
|
|
|
*/
|
|
|
|
int /* error */
|
|
|
|
xfs_btree_new_iroot(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
int *logflags, /* logging flags for inode */
|
|
|
|
int *stat) /* return status - 0 fail */
|
|
|
|
{
|
|
|
|
struct xfs_buf *cbp; /* buffer for cblock */
|
|
|
|
struct xfs_btree_block *block; /* btree block */
|
|
|
|
struct xfs_btree_block *cblock; /* child btree block */
|
|
|
|
union xfs_btree_key *ckp; /* child key pointer */
|
|
|
|
union xfs_btree_ptr *cpp; /* child ptr pointer */
|
|
|
|
union xfs_btree_key *kp; /* pointer to btree key */
|
|
|
|
union xfs_btree_ptr *pp; /* pointer to block addr */
|
|
|
|
union xfs_btree_ptr nptr; /* new block addr */
|
|
|
|
int level; /* btree level */
|
|
|
|
int error; /* error return code */
|
|
|
|
int i; /* loop counter */
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, newroot);
|
|
|
|
|
2024-02-22 20:36:17 +00:00
|
|
|
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
|
2008-10-30 05:57:28 +00:00
|
|
|
|
|
|
|
level = cur->bc_nlevels - 1;
|
|
|
|
|
|
|
|
block = xfs_btree_get_iroot(cur);
|
|
|
|
pp = xfs_btree_ptr_addr(cur, 1, block);
|
|
|
|
|
|
|
|
/* Allocate the new block. If we can't do it, we're toast. Give up. */
|
2024-02-22 20:33:07 +00:00
|
|
|
error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
|
2008-10-30 05:57:28 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
2018-03-07 01:03:30 +00:00
|
|
|
if (*stat == 0)
|
2008-10-30 05:57:28 +00:00
|
|
|
return 0;
|
2018-03-07 01:03:30 +00:00
|
|
|
|
2008-10-30 05:57:28 +00:00
|
|
|
XFS_BTREE_STATS_INC(cur, alloc);
|
|
|
|
|
|
|
|
/* Copy the root into a real block. */
|
2019-06-12 16:00:00 +00:00
|
|
|
error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
|
2008-10-30 05:57:28 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
xfs: ensure btree root split sets blkno correctly
For CRC enabled filesystems, the BMBT is rooted in an inode, so it
passes through a different code path on root splits than the
freespace and inode btrees. This is much less traversed by xfstests
than the other trees. When testing on a 1k block size filesystem,
I've been seeing ASSERT failures in generic/234 like:
XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_private.b.allocated == 0, file: fs/xfs/xfs_btree.c, line: 317
which are generally preceded by a lblock check failure. I noticed
this in the bmbt stats:
$ pminfo -f xfs.btree.block_map
xfs.btree.block_map.lookup
value 39135
xfs.btree.block_map.compare
value 268432
xfs.btree.block_map.insrec
value 15786
xfs.btree.block_map.delrec
value 13884
xfs.btree.block_map.newroot
value 2
xfs.btree.block_map.killroot
value 0
.....
Very little coverage of root splits and merges. Indeed, on a 4k
filesystem, block_map.newroot and block_map.killroot are both zero.
i.e. the code is not exercised at all, and it's the only generic
btree infrastructure operation that is not exercised by a default run
of xfstests.
Turns out that on a 1k filesystem, generic/234 accounts for one of
those two root splits, and that is somewhat of a smoking gun. In
fact, it's the same problem we saw in the directory/attr code where
headers are memcpy()d from one block to another without updating the
self describing metadata.
Simple fix - when copying the header out of the root block, make
sure the block number is updated correctly.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
(cherry picked from commit ade1335afef556df6538eb02e8c0dc91fbd9cc37)
2013-06-12 02:19:08 +00:00
|
|
|
/*
|
|
|
|
* we can't just memcpy() the root in for CRC enabled btree blocks.
|
|
|
|
* In that case have to also ensure the blkno remains correct
|
|
|
|
*/
|
2008-10-30 05:57:28 +00:00
|
|
|
memcpy(cblock, block, xfs_btree_block_len(cur));
|
2024-02-22 20:34:12 +00:00
|
|
|
if (xfs_has_crc(cur->bc_mp)) {
|
2021-08-19 01:47:05 +00:00
|
|
|
__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
|
2021-08-19 01:47:05 +00:00
|
|
|
cblock->bb_u.l.bb_blkno = bno;
|
xfs: ensure btree root split sets blkno correctly
For CRC enabled filesystems, the BMBT is rooted in an inode, so it
passes through a different code path on root splits than the
freespace and inode btrees. This is much less traversed by xfstests
than the other trees. When testing on a 1k block size filesystem,
I've been seeing ASSERT failures in generic/234 like:
XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_private.b.allocated == 0, file: fs/xfs/xfs_btree.c, line: 317
which are generally preceded by a lblock check failure. I noticed
this in the bmbt stats:
$ pminfo -f xfs.btree.block_map
xfs.btree.block_map.lookup
value 39135
xfs.btree.block_map.compare
value 268432
xfs.btree.block_map.insrec
value 15786
xfs.btree.block_map.delrec
value 13884
xfs.btree.block_map.newroot
value 2
xfs.btree.block_map.killroot
value 0
.....
Very little coverage of root splits and merges. Indeed, on a 4k
filesystem, block_map.newroot and block_map.killroot are both zero.
i.e. the code is not exercised at all, and it's the only generic
btree infrastructure operation that is not exercised by a default run
of xfstests.
Turns out that on a 1k filesystem, generic/234 accounts for one of
those two root splits, and that is somewhat of a smoking gun. In
fact, it's the same problem we saw in the directory/attr code where
headers are memcpy()d from one block to another without updating the
self describing metadata.
Simple fix - when copying the header out of the root block, make
sure the block number is updated correctly.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
(cherry picked from commit ade1335afef556df6538eb02e8c0dc91fbd9cc37)
2013-06-12 02:19:08 +00:00
|
|
|
else
|
2021-08-19 01:47:05 +00:00
|
|
|
cblock->bb_u.s.bb_blkno = bno;
|
xfs: ensure btree root split sets blkno correctly
For CRC enabled filesystems, the BMBT is rooted in an inode, so it
passes through a different code path on root splits than the
freespace and inode btrees. This is much less traversed by xfstests
than the other trees. When testing on a 1k block size filesystem,
I've been seeing ASSERT failures in generic/234 like:
XFS: Assertion failed: cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_private.b.allocated == 0, file: fs/xfs/xfs_btree.c, line: 317
which are generally preceded by a lblock check failure. I noticed
this in the bmbt stats:
$ pminfo -f xfs.btree.block_map
xfs.btree.block_map.lookup
value 39135
xfs.btree.block_map.compare
value 268432
xfs.btree.block_map.insrec
value 15786
xfs.btree.block_map.delrec
value 13884
xfs.btree.block_map.newroot
value 2
xfs.btree.block_map.killroot
value 0
.....
Very little coverage of root splits and merges. Indeed, on a 4k
filesystem, block_map.newroot and block_map.killroot are both zero.
i.e. the code is not exercised at all, and it's the only generic
btree infrastructure operation that is not exercised by a default run
of xfstests.
Turns out that on a 1k filesystem, generic/234 accounts for one of
those two root splits, and that is somewhat of a smoking gun. In
fact, it's the same problem we saw in the directory/attr code where
headers are memcpy()d from one block to another without updating the
self describing metadata.
Simple fix - when copying the header out of the root block, make
sure the block number is updated correctly.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
(cherry picked from commit ade1335afef556df6538eb02e8c0dc91fbd9cc37)
2013-06-12 02:19:08 +00:00
|
|
|
}
|
2008-10-30 05:57:28 +00:00
|
|
|
|
|
|
|
be16_add_cpu(&block->bb_level, 1);
|
|
|
|
xfs_btree_set_numrecs(block, 1);
|
|
|
|
cur->bc_nlevels++;
|
2021-09-16 19:26:56 +00:00
|
|
|
ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level + 1].ptr = 1;
|
2008-10-30 05:57:28 +00:00
|
|
|
|
|
|
|
kp = xfs_btree_key_addr(cur, 1, block);
|
|
|
|
ckp = xfs_btree_key_addr(cur, 1, cblock);
|
|
|
|
xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
|
|
|
|
|
|
|
|
cpp = xfs_btree_ptr_addr(cur, 1, cblock);
|
|
|
|
for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, pp, i, level);
|
2008-10-30 05:57:28 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:57:28 +00:00
|
|
|
xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
|
|
|
|
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
|
2008-10-30 05:57:28 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:57:28 +00:00
|
|
|
xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
|
|
|
|
|
2020-03-11 00:52:53 +00:00
|
|
|
xfs_iroot_realloc(cur->bc_ino.ip,
|
2008-10-30 05:57:28 +00:00
|
|
|
1 - xfs_btree_get_numrecs(cblock),
|
2020-03-11 00:52:53 +00:00
|
|
|
cur->bc_ino.whichfork);
|
2008-10-30 05:57:28 +00:00
|
|
|
|
|
|
|
xfs_btree_setbuf(cur, level, cbp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do all this logging at the end so that
|
|
|
|
* the root is at the right level.
|
|
|
|
*/
|
|
|
|
xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
|
|
|
|
xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
|
|
|
|
xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
|
|
|
|
|
|
|
|
*logflags |=
|
2020-03-11 00:52:53 +00:00
|
|
|
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
|
2008-10-30 05:57:28 +00:00
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2024-02-22 20:37:35 +00:00
|
|
|
static void
|
|
|
|
xfs_btree_set_root(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *ptr,
|
|
|
|
int inc)
|
|
|
|
{
|
|
|
|
if (cur->bc_flags & XFS_BTREE_STAGING) {
|
|
|
|
/* Update the btree root information for a per-AG fake root. */
|
|
|
|
cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
|
|
|
|
cur->bc_ag.afake->af_levels += inc;
|
|
|
|
} else {
|
|
|
|
cur->bc_ops->set_root(cur, ptr, inc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:57:16 +00:00
|
|
|
/*
|
|
|
|
* Allocate a new root block, fill it in.
|
|
|
|
*/
|
2008-10-30 05:58:41 +00:00
|
|
|
STATIC int /* error */
|
2008-10-30 05:57:16 +00:00
|
|
|
xfs_btree_new_root(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block; /* one half of the old root block */
|
|
|
|
struct xfs_buf *bp; /* buffer containing block */
|
|
|
|
int error; /* error return value */
|
|
|
|
struct xfs_buf *lbp; /* left buffer pointer */
|
|
|
|
struct xfs_btree_block *left; /* left btree block */
|
|
|
|
struct xfs_buf *nbp; /* new (root) buffer */
|
|
|
|
struct xfs_btree_block *new; /* new (root) btree block */
|
|
|
|
int nptr; /* new value for key index, 1 or 2 */
|
|
|
|
struct xfs_buf *rbp; /* right buffer pointer */
|
|
|
|
struct xfs_btree_block *right; /* right btree block */
|
|
|
|
union xfs_btree_ptr rptr;
|
|
|
|
union xfs_btree_ptr lptr;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, newroot);
|
|
|
|
|
|
|
|
/* initialise our start point from the cursor */
|
2024-02-22 20:37:26 +00:00
|
|
|
xfs_btree_init_ptr_from_cur(cur, &rptr);
|
2008-10-30 05:57:16 +00:00
|
|
|
|
|
|
|
/* Allocate the new block. If we can't do it, we're toast. Give up. */
|
2024-02-22 20:33:07 +00:00
|
|
|
error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
|
2008-10-30 05:57:16 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
if (*stat == 0)
|
|
|
|
goto out0;
|
|
|
|
XFS_BTREE_STATS_INC(cur, alloc);
|
|
|
|
|
|
|
|
/* Set up the new block. */
|
2019-06-12 16:00:00 +00:00
|
|
|
error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
|
2008-10-30 05:57:16 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/* Set the root in the holding structure increasing the level by 1. */
|
2024-02-22 20:37:35 +00:00
|
|
|
xfs_btree_set_root(cur, &lptr, 1);
|
2008-10-30 05:57:16 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* At the previous root level there are now two blocks: the old root,
|
|
|
|
* and the new block generated when it was split. We don't know which
|
|
|
|
* one the cursor is pointing at, so we set up variables "left" and
|
|
|
|
* "right" for each case.
|
|
|
|
*/
|
|
|
|
block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &rptr)) {
|
|
|
|
/* Our block is left, pick up the right block. */
|
|
|
|
lbp = bp;
|
|
|
|
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
|
|
|
|
left = block;
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
|
2008-10-30 05:57:16 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
bp = rbp;
|
|
|
|
nptr = 1;
|
|
|
|
} else {
|
|
|
|
/* Our block is right, pick up the left block. */
|
|
|
|
rbp = bp;
|
|
|
|
xfs_btree_buf_to_ptr(cur, rbp, &rptr);
|
|
|
|
right = block;
|
|
|
|
xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
|
2008-10-30 05:57:16 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
bp = lbp;
|
|
|
|
nptr = 2;
|
|
|
|
}
|
2016-08-03 01:03:38 +00:00
|
|
|
|
2008-10-30 05:57:16 +00:00
|
|
|
/* Fill in the new block's btree header and log it. */
|
2013-04-21 19:53:46 +00:00
|
|
|
xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
|
2008-10-30 05:57:16 +00:00
|
|
|
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
|
|
|
|
ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
|
|
|
|
!xfs_btree_ptr_is_null(cur, &rptr));
|
|
|
|
|
|
|
|
/* Fill in the key data in the new root. */
|
|
|
|
if (xfs_btree_get_level(left) > 0) {
|
2016-08-03 01:03:38 +00:00
|
|
|
/*
|
|
|
|
* Get the keys for the left block's keys and put them directly
|
|
|
|
* in the parent block. Do the same for the right block.
|
|
|
|
*/
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_node_keys(cur, left,
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_key_addr(cur, 1, new));
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_node_keys(cur, right,
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_key_addr(cur, 2, new));
|
2008-10-30 05:57:16 +00:00
|
|
|
} else {
|
2016-08-03 01:03:38 +00:00
|
|
|
/*
|
|
|
|
* Get the keys for the left block's records and put them
|
|
|
|
* directly in the parent block. Do the same for the right
|
|
|
|
* block.
|
|
|
|
*/
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_leaf_keys(cur, left,
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_key_addr(cur, 1, new));
|
2016-08-03 02:22:12 +00:00
|
|
|
xfs_btree_get_leaf_keys(cur, right,
|
2016-08-03 01:03:38 +00:00
|
|
|
xfs_btree_key_addr(cur, 2, new));
|
2008-10-30 05:57:16 +00:00
|
|
|
}
|
|
|
|
xfs_btree_log_keys(cur, nbp, 1, 2);
|
|
|
|
|
|
|
|
/* Fill in the pointer data in the new root. */
|
|
|
|
xfs_btree_copy_ptrs(cur,
|
|
|
|
xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
|
|
|
|
xfs_btree_copy_ptrs(cur,
|
|
|
|
xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
|
|
|
|
xfs_btree_log_ptrs(cur, nbp, 1, 2);
|
|
|
|
|
|
|
|
/* Fix up the cursor. */
|
|
|
|
xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[cur->bc_nlevels].ptr = nptr;
|
2008-10-30 05:57:16 +00:00
|
|
|
cur->bc_nlevels++;
|
2021-09-16 19:26:56 +00:00
|
|
|
ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
|
2008-10-30 05:57:16 +00:00
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
out0:
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
2008-10-30 05:57:40 +00:00
|
|
|
|
|
|
|
STATIC int
|
|
|
|
xfs_btree_make_block_unfull(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
int level, /* btree level */
|
|
|
|
int numrecs,/* # of recs in block */
|
|
|
|
int *oindex,/* old tree index */
|
|
|
|
int *index, /* new tree index */
|
|
|
|
union xfs_btree_ptr *nptr, /* new btree ptr */
|
|
|
|
struct xfs_btree_cur **ncur, /* new btree cursor */
|
2016-08-03 01:03:38 +00:00
|
|
|
union xfs_btree_key *key, /* key of new block */
|
2008-10-30 05:57:40 +00:00
|
|
|
int *stat)
|
|
|
|
{
|
|
|
|
int error = 0;
|
|
|
|
|
2024-02-22 20:37:24 +00:00
|
|
|
if (xfs_btree_at_iroot(cur, level)) {
|
2020-03-11 00:52:53 +00:00
|
|
|
struct xfs_inode *ip = cur->bc_ino.ip;
|
2008-10-30 05:57:40 +00:00
|
|
|
|
|
|
|
if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
|
|
|
|
/* A root block that can be made bigger. */
|
2020-03-11 00:52:53 +00:00
|
|
|
xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
|
2016-08-03 01:01:25 +00:00
|
|
|
*stat = 1;
|
2008-10-30 05:57:40 +00:00
|
|
|
} else {
|
|
|
|
/* A root block that needs replacing */
|
|
|
|
int logflags = 0;
|
|
|
|
|
|
|
|
error = xfs_btree_new_iroot(cur, &logflags, stat);
|
|
|
|
if (error || *stat == 0)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
xfs_trans_log_inode(cur->bc_tp, ip, logflags);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* First, try shifting an entry to the right neighbor. */
|
|
|
|
error = xfs_btree_rshift(cur, level, stat);
|
|
|
|
if (error || *stat)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
/* Next, try shifting an entry to the left neighbor. */
|
|
|
|
error = xfs_btree_lshift(cur, level, stat);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
if (*stat) {
|
2021-09-16 19:24:04 +00:00
|
|
|
*oindex = *index = cur->bc_levels[level].ptr;
|
2008-10-30 05:57:40 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Next, try splitting the current block in half.
|
|
|
|
*
|
|
|
|
* If this works we have to re-set our variables because we
|
|
|
|
* could be in a different block now.
|
|
|
|
*/
|
2016-08-03 01:02:39 +00:00
|
|
|
error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
|
2008-10-30 05:57:40 +00:00
|
|
|
if (error || *stat == 0)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
*index = cur->bc_levels[level].ptr;
|
2008-10-30 05:57:40 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert one record/level. Return information to the caller
|
|
|
|
* allowing the next level up to proceed if necessary.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_btree_insrec(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
int level, /* level to insert record at */
|
|
|
|
union xfs_btree_ptr *ptrp, /* i/o: block number inserted */
|
2016-08-03 01:02:39 +00:00
|
|
|
union xfs_btree_rec *rec, /* record to insert */
|
|
|
|
union xfs_btree_key *key, /* i/o: block key for ptrp */
|
2008-10-30 05:57:40 +00:00
|
|
|
struct xfs_btree_cur **curp, /* output: new cursor replacing cur */
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block; /* btree block */
|
|
|
|
struct xfs_buf *bp; /* buffer for block */
|
|
|
|
union xfs_btree_ptr nptr; /* new block ptr */
|
2022-05-27 00:22:56 +00:00
|
|
|
struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
|
2016-09-19 00:24:36 +00:00
|
|
|
union xfs_btree_key nkey; /* new block key */
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
union xfs_btree_key *lkey;
|
2008-10-30 05:57:40 +00:00
|
|
|
int optr; /* old key/record index */
|
|
|
|
int ptr; /* key/record index */
|
|
|
|
int numrecs;/* number of records */
|
|
|
|
int error; /* error return value */
|
|
|
|
int i;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
xfs_daddr_t old_bn;
|
2008-10-30 05:57:40 +00:00
|
|
|
|
|
|
|
ncur = NULL;
|
2016-09-19 00:24:36 +00:00
|
|
|
lkey = &nkey;
|
2008-10-30 05:57:40 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we have an external root pointer, and we've made it to the
|
|
|
|
* root level, allocate a new root block and we're done.
|
|
|
|
*/
|
2024-02-22 20:36:17 +00:00
|
|
|
if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
|
|
|
|
level >= cur->bc_nlevels) {
|
2008-10-30 05:57:40 +00:00
|
|
|
error = xfs_btree_new_root(cur, stat);
|
|
|
|
xfs_btree_set_ptr_null(cur, ptrp);
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If we're off the left edge, return failure. */
|
2021-09-16 19:24:04 +00:00
|
|
|
ptr = cur->bc_levels[level].ptr;
|
2008-10-30 05:57:40 +00:00
|
|
|
if (ptr == 0) {
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
optr = ptr;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, insrec);
|
|
|
|
|
|
|
|
/* Get pointers to the btree buffer and block. */
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
2021-08-19 01:47:05 +00:00
|
|
|
old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
|
2008-10-30 05:57:40 +00:00
|
|
|
numrecs = xfs_btree_get_numrecs(block);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/* Check that the new entry is being inserted in the right place. */
|
|
|
|
if (ptr <= numrecs) {
|
|
|
|
if (level == 0) {
|
2016-08-03 01:02:39 +00:00
|
|
|
ASSERT(cur->bc_ops->recs_inorder(cur, rec,
|
2008-10-30 05:58:32 +00:00
|
|
|
xfs_btree_rec_addr(cur, ptr, block)));
|
2008-10-30 05:57:40 +00:00
|
|
|
} else {
|
2016-08-03 01:02:39 +00:00
|
|
|
ASSERT(cur->bc_ops->keys_inorder(cur, key,
|
2008-10-30 05:58:32 +00:00
|
|
|
xfs_btree_key_addr(cur, ptr, block)));
|
2008-10-30 05:57:40 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the block is full, we can't insert the new entry until we
|
|
|
|
* make the block un-full.
|
|
|
|
*/
|
|
|
|
xfs_btree_set_ptr_null(cur, &nptr);
|
|
|
|
if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
|
|
|
|
error = xfs_btree_make_block_unfull(cur, level, numrecs,
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
&optr, &ptr, &nptr, &ncur, lkey, stat);
|
2008-10-30 05:57:40 +00:00
|
|
|
if (error || *stat == 0)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The current block may have changed if the block was
|
|
|
|
* previously full and we have just made space in it.
|
|
|
|
*/
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
numrecs = xfs_btree_get_numrecs(block);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (error)
|
2022-05-27 00:22:56 +00:00
|
|
|
goto error0;
|
2008-10-30 05:57:40 +00:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* At this point we know there's room for our new entry in the block
|
|
|
|
* we're pointing at.
|
|
|
|
*/
|
|
|
|
XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
|
|
|
|
|
|
|
|
if (level > 0) {
|
|
|
|
/* It's a nonleaf. make a hole in the keys and ptrs */
|
|
|
|
union xfs_btree_key *kp;
|
|
|
|
union xfs_btree_ptr *pp;
|
|
|
|
|
|
|
|
kp = xfs_btree_key_addr(cur, ptr, block);
|
|
|
|
pp = xfs_btree_ptr_addr(cur, ptr, block);
|
|
|
|
|
|
|
|
for (i = numrecs - ptr; i >= 0; i--) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, pp, i, level);
|
2008-10-30 05:57:40 +00:00
|
|
|
if (error)
|
2022-05-27 00:22:56 +00:00
|
|
|
goto error0;
|
2008-10-30 05:57:40 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
|
|
|
|
xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
|
|
|
|
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
|
2008-10-30 05:57:40 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/* Now put the new data in, bump numrecs and log it. */
|
2016-08-03 01:02:39 +00:00
|
|
|
xfs_btree_copy_keys(cur, kp, key, 1);
|
2008-10-30 05:57:40 +00:00
|
|
|
xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
|
|
|
|
numrecs++;
|
|
|
|
xfs_btree_set_numrecs(block, numrecs);
|
|
|
|
xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
|
|
|
|
xfs_btree_log_keys(cur, bp, ptr, numrecs);
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (ptr < numrecs) {
|
2008-10-30 05:58:32 +00:00
|
|
|
ASSERT(cur->bc_ops->keys_inorder(cur, kp,
|
|
|
|
xfs_btree_key_addr(cur, ptr + 1, block)));
|
2008-10-30 05:57:40 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
/* It's a leaf. make a hole in the records */
|
|
|
|
union xfs_btree_rec *rp;
|
|
|
|
|
|
|
|
rp = xfs_btree_rec_addr(cur, ptr, block);
|
|
|
|
|
|
|
|
xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
|
|
|
|
|
|
|
|
/* Now put the new data in, bump numrecs and log it. */
|
2016-08-03 01:02:39 +00:00
|
|
|
xfs_btree_copy_recs(cur, rp, rec, 1);
|
2008-10-30 05:57:40 +00:00
|
|
|
xfs_btree_set_numrecs(block, ++numrecs);
|
|
|
|
xfs_btree_log_recs(cur, bp, ptr, numrecs);
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (ptr < numrecs) {
|
2008-10-30 05:58:32 +00:00
|
|
|
ASSERT(cur->bc_ops->recs_inorder(cur, rp,
|
|
|
|
xfs_btree_rec_addr(cur, ptr + 1, block)));
|
2008-10-30 05:57:40 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Log the new number of records in the btree header. */
|
|
|
|
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
|
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/*
|
|
|
|
* If we just inserted into a new tree block, we have to
|
|
|
|
* recalculate nkey here because nkey is out of date.
|
|
|
|
*
|
|
|
|
* Otherwise we're just updating an existing block (having shoved
|
|
|
|
* some records into the new tree block), so use the regular key
|
|
|
|
* update mechanism.
|
|
|
|
*/
|
2021-08-19 01:47:05 +00:00
|
|
|
if (bp && xfs_buf_daddr(bp) != old_bn) {
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
xfs_btree_get_keys(cur, block, lkey);
|
|
|
|
} else if (xfs_btree_needs_key_update(cur, optr)) {
|
2016-08-03 02:22:12 +00:00
|
|
|
error = xfs_btree_update_keys(cur, level);
|
2008-10-30 05:57:40 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the new block number, if any.
|
|
|
|
* If there is one, give back a record value and a cursor too.
|
|
|
|
*/
|
|
|
|
*ptrp = nptr;
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &nptr)) {
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
xfs_btree_copy_keys(cur, key, lkey, 1);
|
2008-10-30 05:57:40 +00:00
|
|
|
*curp = ncur;
|
|
|
|
}
|
|
|
|
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
2022-05-27 00:22:56 +00:00
|
|
|
if (ncur)
|
|
|
|
xfs_btree_del_cursor(ncur, error);
|
2008-10-30 05:57:40 +00:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Insert the record at the point referenced by cur.
|
|
|
|
*
|
|
|
|
* A multi-level split of the tree on insert will invalidate the original
|
|
|
|
* cursor. All callers of this function should assume that the cursor is
|
|
|
|
* no longer valid and revalidate it.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_btree_insert(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int *stat)
|
|
|
|
{
|
|
|
|
int error; /* error return value */
|
|
|
|
int i; /* result value, 0 for failure */
|
|
|
|
int level; /* current level number in btree */
|
|
|
|
union xfs_btree_ptr nptr; /* new block number (split result) */
|
|
|
|
struct xfs_btree_cur *ncur; /* new cursor (split result) */
|
|
|
|
struct xfs_btree_cur *pcur; /* previous level's cursor */
|
2016-09-19 00:24:36 +00:00
|
|
|
union xfs_btree_key bkey; /* key of block to insert */
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
union xfs_btree_key *key;
|
2008-10-30 05:57:40 +00:00
|
|
|
union xfs_btree_rec rec; /* record to insert */
|
|
|
|
|
|
|
|
level = 0;
|
|
|
|
ncur = NULL;
|
|
|
|
pcur = cur;
|
2016-09-19 00:24:36 +00:00
|
|
|
key = &bkey;
|
2008-10-30 05:57:40 +00:00
|
|
|
|
|
|
|
xfs_btree_set_ptr_null(cur, &nptr);
|
2016-08-03 01:02:39 +00:00
|
|
|
|
|
|
|
/* Make a key out of the record data to be inserted, and save it. */
|
2008-10-30 05:57:40 +00:00
|
|
|
cur->bc_ops->init_rec_from_cur(cur, &rec);
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
cur->bc_ops->init_key_from_rec(key, &rec);
|
2008-10-30 05:57:40 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop going up the tree, starting at the leaf level.
|
|
|
|
* Stop when we don't get a split block, that must mean that
|
|
|
|
* the insert is finished with this level.
|
|
|
|
*/
|
|
|
|
do {
|
|
|
|
/*
|
|
|
|
* Insert nrec/nptr into this level of the tree.
|
|
|
|
* Note if we fail, nptr will be null.
|
|
|
|
*/
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
|
2016-08-03 01:02:39 +00:00
|
|
|
&ncur, &i);
|
2008-10-30 05:57:40 +00:00
|
|
|
if (error) {
|
|
|
|
if (pcur != cur)
|
|
|
|
xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:57:40 +00:00
|
|
|
level++;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* See if the cursor we just used is trash.
|
|
|
|
* Can't trash the caller's cursor, but otherwise we should
|
|
|
|
* if ncur is a new cursor or we're about to be done.
|
|
|
|
*/
|
|
|
|
if (pcur != cur &&
|
|
|
|
(ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
|
|
|
|
/* Save the state from the cursor before we trash it */
|
2024-02-22 20:37:35 +00:00
|
|
|
if (cur->bc_ops->update_cursor &&
|
|
|
|
!(cur->bc_flags & XFS_BTREE_STAGING))
|
2008-10-30 05:57:40 +00:00
|
|
|
cur->bc_ops->update_cursor(pcur, cur);
|
|
|
|
cur->bc_nlevels = pcur->bc_nlevels;
|
|
|
|
xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
|
|
|
|
}
|
|
|
|
/* If we got a new cursor, switch to it. */
|
|
|
|
if (ncur) {
|
|
|
|
pcur = ncur;
|
|
|
|
ncur = NULL;
|
|
|
|
}
|
|
|
|
} while (!xfs_btree_ptr_is_null(cur, &nptr));
|
|
|
|
|
|
|
|
*stat = i;
|
|
|
|
return 0;
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
2008-10-30 05:57:51 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Try to merge a non-leaf block back into the inode root.
|
|
|
|
*
|
|
|
|
* Note: the killroot names comes from the fact that we're effectively
|
|
|
|
* killing the old root block. But because we can't just delete the
|
|
|
|
* inode we have to copy the single block it was pointing to into the
|
|
|
|
* inode.
|
|
|
|
*/
|
2009-07-02 05:09:33 +00:00
|
|
|
STATIC int
|
2008-10-30 05:57:51 +00:00
|
|
|
xfs_btree_kill_iroot(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
2020-03-11 00:52:53 +00:00
|
|
|
int whichfork = cur->bc_ino.whichfork;
|
|
|
|
struct xfs_inode *ip = cur->bc_ino.ip;
|
2022-07-09 17:56:05 +00:00
|
|
|
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
|
2008-10-30 05:57:51 +00:00
|
|
|
struct xfs_btree_block *block;
|
|
|
|
struct xfs_btree_block *cblock;
|
|
|
|
union xfs_btree_key *kp;
|
|
|
|
union xfs_btree_key *ckp;
|
|
|
|
union xfs_btree_ptr *pp;
|
|
|
|
union xfs_btree_ptr *cpp;
|
|
|
|
struct xfs_buf *cbp;
|
|
|
|
int level;
|
|
|
|
int index;
|
|
|
|
int numrecs;
|
2016-02-08 03:58:07 +00:00
|
|
|
int error;
|
2008-10-30 05:57:51 +00:00
|
|
|
#ifdef DEBUG
|
|
|
|
union xfs_btree_ptr ptr;
|
|
|
|
#endif
|
2018-06-04 04:10:48 +00:00
|
|
|
int i;
|
2008-10-30 05:57:51 +00:00
|
|
|
|
2024-02-22 20:36:17 +00:00
|
|
|
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
|
2008-10-30 05:57:51 +00:00
|
|
|
ASSERT(cur->bc_nlevels > 1);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't deal with the root block needs to be a leaf case.
|
|
|
|
* We're just going to turn the thing back into extents anyway.
|
|
|
|
*/
|
|
|
|
level = cur->bc_nlevels - 1;
|
|
|
|
if (level == 1)
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Give up if the root has multiple children.
|
|
|
|
*/
|
|
|
|
block = xfs_btree_get_iroot(cur);
|
|
|
|
if (xfs_btree_get_numrecs(block) != 1)
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
cblock = xfs_btree_get_block(cur, level - 1, &cbp);
|
|
|
|
numrecs = xfs_btree_get_numrecs(cblock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only do this if the next level will fit.
|
|
|
|
* Then the data must be copied up to the inode,
|
|
|
|
* instead of freeing the root you free the next level.
|
|
|
|
*/
|
|
|
|
if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
|
|
|
|
goto out0;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, killroot);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
|
|
|
|
ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
|
|
|
|
xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
|
|
|
|
ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
|
|
|
|
#endif
|
|
|
|
|
|
|
|
index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
|
|
|
|
if (index) {
|
2020-03-11 00:52:53 +00:00
|
|
|
xfs_iroot_realloc(cur->bc_ino.ip, index,
|
|
|
|
cur->bc_ino.whichfork);
|
2008-10-30 06:14:34 +00:00
|
|
|
block = ifp->if_broot;
|
2008-10-30 05:57:51 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
be16_add_cpu(&block->bb_numrecs, index);
|
|
|
|
ASSERT(block->bb_numrecs == cblock->bb_numrecs);
|
|
|
|
|
|
|
|
kp = xfs_btree_key_addr(cur, 1, block);
|
|
|
|
ckp = xfs_btree_key_addr(cur, 1, cblock);
|
|
|
|
xfs_btree_copy_keys(cur, kp, ckp, numrecs);
|
|
|
|
|
|
|
|
pp = xfs_btree_ptr_addr(cur, 1, block);
|
|
|
|
cpp = xfs_btree_ptr_addr(cur, 1, cblock);
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:57:51 +00:00
|
|
|
for (i = 0; i < numrecs; i++) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
|
2018-03-07 01:03:30 +00:00
|
|
|
if (error)
|
2008-10-30 05:57:51 +00:00
|
|
|
return error;
|
|
|
|
}
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:57:51 +00:00
|
|
|
xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
|
|
|
|
|
2016-02-08 03:58:07 +00:00
|
|
|
error = xfs_btree_free_block(cur, cbp);
|
2018-03-07 01:03:30 +00:00
|
|
|
if (error)
|
2016-02-08 03:58:07 +00:00
|
|
|
return error;
|
2008-10-30 05:57:51 +00:00
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level - 1].bp = NULL;
|
2008-10-30 05:57:51 +00:00
|
|
|
be16_add_cpu(&block->bb_level, -1);
|
|
|
|
xfs_trans_log_inode(cur->bc_tp, ip,
|
2020-03-11 00:52:53 +00:00
|
|
|
XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
|
2008-10-30 05:57:51 +00:00
|
|
|
cur->bc_nlevels--;
|
|
|
|
out0:
|
|
|
|
return 0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
2010-09-07 23:34:07 +00:00
|
|
|
/*
|
|
|
|
* Kill the current root node, and replace it with it's only child node.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_btree_kill_root(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
struct xfs_buf *bp,
|
|
|
|
int level,
|
|
|
|
union xfs_btree_ptr *newroot)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, killroot);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the root pointer, decreasing the level by 1 and then
|
|
|
|
* free the old root.
|
|
|
|
*/
|
2024-02-22 20:37:35 +00:00
|
|
|
xfs_btree_set_root(cur, newroot, -1);
|
2010-09-07 23:34:07 +00:00
|
|
|
|
2016-02-08 03:58:07 +00:00
|
|
|
error = xfs_btree_free_block(cur, bp);
|
2018-03-07 01:03:30 +00:00
|
|
|
if (error)
|
2010-09-07 23:34:07 +00:00
|
|
|
return error;
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].bp = NULL;
|
|
|
|
cur->bc_levels[level].ra = 0;
|
2010-09-07 23:34:07 +00:00
|
|
|
cur->bc_nlevels--;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2008-10-30 05:58:01 +00:00
|
|
|
STATIC int
|
|
|
|
xfs_btree_dec_cursor(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
int *stat)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (level > 0) {
|
|
|
|
error = xfs_btree_decrement(cur, level, &i);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Single level of the btree record deletion routine.
|
|
|
|
* Delete record pointed to by cur/level.
|
|
|
|
* Remove the record from its block then rebalance the tree.
|
|
|
|
* Return 0 for error, 1 for done, 2 to go on to the next level.
|
|
|
|
*/
|
|
|
|
STATIC int /* error */
|
|
|
|
xfs_btree_delrec(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
int level, /* level removing record from */
|
|
|
|
int *stat) /* fail/done/go-on */
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block; /* btree block */
|
|
|
|
union xfs_btree_ptr cptr; /* current block ptr */
|
|
|
|
struct xfs_buf *bp; /* buffer for block */
|
|
|
|
int error; /* error return value */
|
|
|
|
int i; /* loop counter */
|
|
|
|
union xfs_btree_ptr lptr; /* left sibling block ptr */
|
|
|
|
struct xfs_buf *lbp; /* left buffer pointer */
|
|
|
|
struct xfs_btree_block *left; /* left btree block */
|
|
|
|
int lrecs = 0; /* left record count */
|
|
|
|
int ptr; /* key/record index */
|
|
|
|
union xfs_btree_ptr rptr; /* right sibling block ptr */
|
|
|
|
struct xfs_buf *rbp; /* right buffer pointer */
|
|
|
|
struct xfs_btree_block *right; /* right btree block */
|
|
|
|
struct xfs_btree_block *rrblock; /* right-right btree block */
|
|
|
|
struct xfs_buf *rrbp; /* right-right buffer pointer */
|
|
|
|
int rrecs = 0; /* right record count */
|
|
|
|
struct xfs_btree_cur *tcur; /* temporary btree cursor */
|
|
|
|
int numrecs; /* temporary numrec count */
|
|
|
|
|
|
|
|
tcur = NULL;
|
|
|
|
|
|
|
|
/* Get the index of the entry being deleted, check for nothing there. */
|
2021-09-16 19:24:04 +00:00
|
|
|
ptr = cur->bc_levels[level].ptr;
|
2008-10-30 05:58:01 +00:00
|
|
|
if (ptr == 0) {
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Get the buffer & block containing the record or key/ptr. */
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
numrecs = xfs_btree_get_numrecs(block);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Fail if we're off the end of the block. */
|
|
|
|
if (ptr > numrecs) {
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, delrec);
|
|
|
|
XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
|
|
|
|
|
|
|
|
/* Excise the entries being deleted. */
|
|
|
|
if (level > 0) {
|
|
|
|
/* It's a nonleaf. operate on keys and ptrs */
|
|
|
|
union xfs_btree_key *lkp;
|
|
|
|
union xfs_btree_ptr *lpp;
|
|
|
|
|
|
|
|
lkp = xfs_btree_key_addr(cur, ptr + 1, block);
|
|
|
|
lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
|
|
|
|
|
|
|
|
for (i = 0; i < numrecs - ptr; i++) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ptr < numrecs) {
|
|
|
|
xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
|
|
|
|
xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
|
|
|
|
xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
|
|
|
|
xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/* It's a leaf. operate on records */
|
|
|
|
if (ptr < numrecs) {
|
|
|
|
xfs_btree_shift_recs(cur,
|
|
|
|
xfs_btree_rec_addr(cur, ptr + 1, block),
|
|
|
|
-1, numrecs - ptr);
|
|
|
|
xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Decrement and log the number of entries in the block.
|
|
|
|
*/
|
|
|
|
xfs_btree_set_numrecs(block, --numrecs);
|
|
|
|
xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're at the root level. First, shrink the root block in-memory.
|
|
|
|
* Try to get rid of the next level down. If we can't then there's
|
|
|
|
* nothing left to do.
|
|
|
|
*/
|
2024-02-22 20:37:24 +00:00
|
|
|
if (xfs_btree_at_iroot(cur, level)) {
|
|
|
|
xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
|
2008-10-30 05:58:01 +00:00
|
|
|
|
2024-02-22 20:37:24 +00:00
|
|
|
error = xfs_btree_kill_iroot(cur);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
2008-10-30 05:58:01 +00:00
|
|
|
|
2024-02-22 20:37:24 +00:00
|
|
|
error = xfs_btree_dec_cursor(cur, level, stat);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
2024-02-22 20:37:24 +00:00
|
|
|
/*
|
|
|
|
* If this is the root level, and there's only one entry left, and it's
|
|
|
|
* NOT the leaf level, then we can get rid of this level.
|
|
|
|
*/
|
|
|
|
if (level == cur->bc_nlevels - 1) {
|
2008-10-30 05:58:01 +00:00
|
|
|
if (numrecs == 1 && level > 0) {
|
|
|
|
union xfs_btree_ptr *pp;
|
|
|
|
/*
|
|
|
|
* pp is still set to the first pointer in the block.
|
|
|
|
* Make it the new root of the btree.
|
|
|
|
*/
|
|
|
|
pp = xfs_btree_ptr_addr(cur, 1, block);
|
2010-09-07 23:34:07 +00:00
|
|
|
error = xfs_btree_kill_root(cur, bp, level, pp);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
} else if (level > 0) {
|
|
|
|
error = xfs_btree_dec_cursor(cur, level, stat);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we deleted the leftmost entry in the block, update the
|
|
|
|
* key values above us in the tree.
|
|
|
|
*/
|
2016-08-03 01:03:38 +00:00
|
|
|
if (xfs_btree_needs_key_update(cur, ptr)) {
|
2016-08-03 02:22:12 +00:00
|
|
|
error = xfs_btree_update_keys(cur, level);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the number of records remaining in the block is at least
|
|
|
|
* the minimum, we're done.
|
|
|
|
*/
|
|
|
|
if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
|
|
|
|
error = xfs_btree_dec_cursor(cur, level, stat);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise, we have to move some records around to keep the
|
|
|
|
* tree balanced. Look at the left and right sibling blocks to
|
|
|
|
* see if we can re-balance by moving only one record.
|
|
|
|
*/
|
|
|
|
xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
|
|
|
|
xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
|
|
|
|
|
2024-02-22 20:36:17 +00:00
|
|
|
if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
|
2008-10-30 05:58:01 +00:00
|
|
|
/*
|
|
|
|
* One child of root, need to get a chance to copy its contents
|
|
|
|
* into the root and delete it. Can't go up to next level,
|
|
|
|
* there's nothing to delete there.
|
|
|
|
*/
|
|
|
|
if (xfs_btree_ptr_is_null(cur, &rptr) &&
|
|
|
|
xfs_btree_ptr_is_null(cur, &lptr) &&
|
|
|
|
level == cur->bc_nlevels - 2) {
|
|
|
|
error = xfs_btree_kill_iroot(cur);
|
|
|
|
if (!error)
|
|
|
|
error = xfs_btree_dec_cursor(cur, level, stat);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
|
|
|
|
!xfs_btree_ptr_is_null(cur, &lptr));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Duplicate the cursor so our btree manipulations here won't
|
|
|
|
* disrupt the next level up.
|
|
|
|
*/
|
|
|
|
error = xfs_btree_dup_cursor(cur, &tcur);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there's a right sibling, see if it's ok to shift an entry
|
|
|
|
* out of it.
|
|
|
|
*/
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &rptr)) {
|
|
|
|
/*
|
|
|
|
* Move the temp cursor to the last entry in the next block.
|
|
|
|
* Actually any entry but the first would suffice.
|
|
|
|
*/
|
|
|
|
i = xfs_btree_lastrec(tcur, level);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
|
|
|
error = xfs_btree_increment(tcur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
|
|
|
i = xfs_btree_lastrec(tcur, level);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
|
|
|
/* Grab a pointer to the block. */
|
|
|
|
right = xfs_btree_get_block(tcur, level, &rbp);
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(tcur, right, level, rbp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
/* Grab the current block number, for future use. */
|
|
|
|
xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If right block is full enough so that removing one entry
|
|
|
|
* won't make it too empty, and left-shifting an entry out
|
|
|
|
* of right to us works, we're done.
|
|
|
|
*/
|
|
|
|
if (xfs_btree_get_numrecs(right) - 1 >=
|
|
|
|
cur->bc_ops->get_minrecs(tcur, level)) {
|
|
|
|
error = xfs_btree_lshift(tcur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
if (i) {
|
|
|
|
ASSERT(xfs_btree_get_numrecs(block) >=
|
|
|
|
cur->bc_ops->get_minrecs(tcur, level));
|
|
|
|
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
|
|
|
|
tcur = NULL;
|
|
|
|
|
|
|
|
error = xfs_btree_dec_cursor(cur, level, stat);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise, grab the number of records in right for
|
|
|
|
* future reference, and fix up the temp cursor to point
|
|
|
|
* to our block again (last record).
|
|
|
|
*/
|
|
|
|
rrecs = xfs_btree_get_numrecs(right);
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
|
|
|
|
i = xfs_btree_firstrec(tcur, level);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
|
|
|
error = xfs_btree_decrement(tcur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there's a left sibling, see if it's ok to shift an entry
|
|
|
|
* out of it.
|
|
|
|
*/
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &lptr)) {
|
|
|
|
/*
|
|
|
|
* Move the temp cursor to the first entry in the
|
|
|
|
* previous block.
|
|
|
|
*/
|
|
|
|
i = xfs_btree_firstrec(tcur, level);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
|
|
|
error = xfs_btree_decrement(tcur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
i = xfs_btree_firstrec(tcur, level);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
|
2024-02-22 20:32:55 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
xfs: kill the XFS_WANT_CORRUPT_* macros
The XFS_WANT_CORRUPT_* macros conceal subtle side effects such as the
creation of local variables and redirections of the code flow. This is
pretty ugly, so replace them with explicit XFS_IS_CORRUPT tests that
remove both of those ugly points. The change was performed with the
following coccinelle script:
@@
expression mp, test;
identifier label;
@@
- XFS_WANT_CORRUPTED_GOTO(mp, test, label);
+ if (XFS_IS_CORRUPT(mp, !test)) { error = -EFSCORRUPTED; goto label; }
@@
expression mp, test;
@@
- XFS_WANT_CORRUPTED_RETURN(mp, test);
+ if (XFS_IS_CORRUPT(mp, !test)) return -EFSCORRUPTED;
@@
expression mp, lval, rval;
@@
- XFS_IS_CORRUPT(mp, !(lval == rval))
+ XFS_IS_CORRUPT(mp, lval != rval)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 && e2))
+ XFS_IS_CORRUPT(mp, !e1 || !e2)
@@
expression e1, e2;
@@
- !(e1 == e2)
+ e1 != e2
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 && e3 == e4) || e5 != e6
+ e1 != e2 || e3 != e4 || e5 != e6
@@
expression e1, e2, e3, e4, e5, e6;
@@
- !(e1 == e2 || (e3 <= e4 && e5 <= e6))
+ e1 != e2 && (e3 > e4 || e5 > e6)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2))
+ XFS_IS_CORRUPT(mp, e1 > e2)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 < e2))
+ XFS_IS_CORRUPT(mp, e1 >= e2)
@@
expression mp, e1;
@@
- XFS_IS_CORRUPT(mp, !!e1)
+ XFS_IS_CORRUPT(mp, e1)
@@
expression mp, e1, e2;
@@
- XFS_IS_CORRUPT(mp, !(e1 || e2))
+ XFS_IS_CORRUPT(mp, !e1 && !e2)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 == e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 != e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 <= e2) || !(e3 >= e4))
+ XFS_IS_CORRUPT(mp, e1 > e2 || e3 < e4)
@@
expression mp, e1, e2, e3, e4;
@@
- XFS_IS_CORRUPT(mp, !(e1 == e2) && !(e3 <= e4))
+ XFS_IS_CORRUPT(mp, e1 != e2 && e3 > e4)
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2019-11-11 20:52:18 +00:00
|
|
|
error = -EFSCORRUPTED;
|
|
|
|
goto error0;
|
|
|
|
}
|
2008-10-30 05:58:01 +00:00
|
|
|
|
|
|
|
/* Grab a pointer to the block. */
|
|
|
|
left = xfs_btree_get_block(tcur, level, &lbp);
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, left, level, lbp);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
#endif
|
|
|
|
/* Grab the current block number, for future use. */
|
|
|
|
xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If left block is full enough so that removing one entry
|
|
|
|
* won't make it too empty, and right-shifting an entry out
|
|
|
|
* of left to us works, we're done.
|
|
|
|
*/
|
|
|
|
if (xfs_btree_get_numrecs(left) - 1 >=
|
|
|
|
cur->bc_ops->get_minrecs(tcur, level)) {
|
|
|
|
error = xfs_btree_rshift(tcur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
if (i) {
|
|
|
|
ASSERT(xfs_btree_get_numrecs(block) >=
|
|
|
|
cur->bc_ops->get_minrecs(tcur, level));
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
|
|
|
|
tcur = NULL;
|
|
|
|
if (level == 0)
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[0].ptr++;
|
2018-03-07 01:03:30 +00:00
|
|
|
|
2008-10-30 05:58:01 +00:00
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise, grab the number of records in left for
|
|
|
|
* future reference.
|
|
|
|
*/
|
|
|
|
lrecs = xfs_btree_get_numrecs(left);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Delete the temp cursor, we're done with it. */
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
|
|
|
|
tcur = NULL;
|
|
|
|
|
|
|
|
/* If here, we need to do a join to keep the tree balanced. */
|
|
|
|
ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
|
|
|
|
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &lptr) &&
|
|
|
|
lrecs + xfs_btree_get_numrecs(block) <=
|
|
|
|
cur->bc_ops->get_maxrecs(cur, level)) {
|
|
|
|
/*
|
|
|
|
* Set "right" to be the starting block,
|
|
|
|
* "left" to be the left neighbor.
|
|
|
|
*/
|
|
|
|
rptr = cptr;
|
|
|
|
right = block;
|
|
|
|
rbp = bp;
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If that won't work, see if we can join with the right neighbor block.
|
|
|
|
*/
|
|
|
|
} else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
|
|
|
|
rrecs + xfs_btree_get_numrecs(block) <=
|
|
|
|
cur->bc_ops->get_maxrecs(cur, level)) {
|
|
|
|
/*
|
|
|
|
* Set "left" to be the starting block,
|
|
|
|
* "right" to be the right neighbor.
|
|
|
|
*/
|
|
|
|
lptr = cptr;
|
|
|
|
left = block;
|
|
|
|
lbp = bp;
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Otherwise, we can't fix the imbalance.
|
|
|
|
* Just return. This is probably a logic error, but it's not fatal.
|
|
|
|
*/
|
|
|
|
} else {
|
|
|
|
error = xfs_btree_dec_cursor(cur, level, stat);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
rrecs = xfs_btree_get_numrecs(right);
|
|
|
|
lrecs = xfs_btree_get_numrecs(left);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We're now going to join "left" and "right" by moving all the stuff
|
|
|
|
* in "right" to "left" and deleting "right".
|
|
|
|
*/
|
|
|
|
XFS_BTREE_STATS_ADD(cur, moves, rrecs);
|
|
|
|
if (level > 0) {
|
|
|
|
/* It's a non-leaf. Move keys and pointers. */
|
|
|
|
union xfs_btree_key *lkp; /* left btree key */
|
|
|
|
union xfs_btree_ptr *lpp; /* left address pointer */
|
|
|
|
union xfs_btree_key *rkp; /* right btree key */
|
|
|
|
union xfs_btree_ptr *rpp; /* right address pointer */
|
|
|
|
|
|
|
|
lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
|
|
|
|
lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
|
|
|
|
rkp = xfs_btree_key_addr(cur, 1, right);
|
|
|
|
rpp = xfs_btree_ptr_addr(cur, 1, right);
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:58:01 +00:00
|
|
|
for (i = 1; i < rrecs; i++) {
|
2018-06-04 04:10:48 +00:00
|
|
|
error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
2018-06-04 04:10:48 +00:00
|
|
|
|
2008-10-30 05:58:01 +00:00
|
|
|
xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
|
|
|
|
xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
|
|
|
|
|
|
|
|
xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
|
|
|
|
xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
|
|
|
|
} else {
|
|
|
|
/* It's a leaf. Move records. */
|
|
|
|
union xfs_btree_rec *lrp; /* left record pointer */
|
|
|
|
union xfs_btree_rec *rrp; /* right record pointer */
|
|
|
|
|
|
|
|
lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
|
|
|
|
rrp = xfs_btree_rec_addr(cur, 1, right);
|
|
|
|
|
|
|
|
xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
|
|
|
|
xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
|
|
|
|
}
|
|
|
|
|
|
|
|
XFS_BTREE_STATS_INC(cur, join);
|
|
|
|
|
|
|
|
/*
|
2009-03-29 07:55:42 +00:00
|
|
|
* Fix up the number of records and right block pointer in the
|
2008-10-30 05:58:01 +00:00
|
|
|
* surviving block, and log it.
|
|
|
|
*/
|
|
|
|
xfs_btree_set_numrecs(left, lrecs + rrecs);
|
2020-12-11 19:36:23 +00:00
|
|
|
xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
|
2008-10-30 05:58:01 +00:00
|
|
|
xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
|
|
|
|
xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
|
|
|
|
|
|
|
|
/* If there is a right sibling, point it to the remaining block. */
|
|
|
|
xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
|
|
|
|
if (!xfs_btree_ptr_is_null(cur, &cptr)) {
|
2014-04-14 08:59:56 +00:00
|
|
|
error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
|
|
|
|
xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Free the deleted block. */
|
2016-02-08 03:58:07 +00:00
|
|
|
error = xfs_btree_free_block(cur, rbp);
|
2008-10-30 05:58:01 +00:00
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we joined with the left neighbor, set the buffer in the
|
|
|
|
* cursor to the left block, and fix up the index.
|
|
|
|
*/
|
|
|
|
if (bp != lbp) {
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].bp = lbp;
|
|
|
|
cur->bc_levels[level].ptr += lrecs;
|
|
|
|
cur->bc_levels[level].ra = 0;
|
2008-10-30 05:58:01 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we joined with the right neighbor and there's a level above
|
|
|
|
* us, increment the cursor at that level.
|
|
|
|
*/
|
2024-02-22 20:36:17 +00:00
|
|
|
else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
|
|
|
|
level + 1 < cur->bc_nlevels) {
|
2008-10-30 05:58:01 +00:00
|
|
|
error = xfs_btree_increment(cur, level + 1, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Readjust the ptr at this level if it's not a leaf, since it's
|
|
|
|
* still pointing at the deletion point, which makes the cursor
|
|
|
|
* inconsistent. If this makes the ptr 0, the caller fixes it up.
|
|
|
|
* We can't use decrement because it would change the next level up.
|
|
|
|
*/
|
|
|
|
if (level > 0)
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr--;
|
2008-10-30 05:58:01 +00:00
|
|
|
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
/*
|
|
|
|
* We combined blocks, so we have to update the parent keys if the
|
2021-09-16 19:24:04 +00:00
|
|
|
* btree supports overlapped intervals. However,
|
|
|
|
* bc_levels[level + 1].ptr points to the old block so that the caller
|
|
|
|
* knows which record to delete. Therefore, the caller must be savvy
|
|
|
|
* enough to call updkeys for us if we return stat == 2. The other
|
|
|
|
* exit points from this function don't require deletions further up
|
|
|
|
* the tree, so they can call updkeys directly.
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
*/
|
|
|
|
|
2008-10-30 05:58:01 +00:00
|
|
|
/* Return value means the next level up has something to do. */
|
|
|
|
*stat = 2;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error0:
|
|
|
|
if (tcur)
|
|
|
|
xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Delete the record pointed to by cur.
|
|
|
|
* The cursor refers to the place where the record was (could be inserted)
|
|
|
|
* when the operation returns.
|
|
|
|
*/
|
|
|
|
int /* error */
|
|
|
|
xfs_btree_delete(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int *stat) /* success/failure */
|
|
|
|
{
|
|
|
|
int error; /* error return value */
|
|
|
|
int level;
|
|
|
|
int i;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
bool joined = false;
|
2008-10-30 05:58:01 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Go up the tree, starting at leaf level.
|
|
|
|
*
|
|
|
|
* If 2 is returned then a join was done; go to the next level.
|
|
|
|
* Otherwise we are done.
|
|
|
|
*/
|
|
|
|
for (level = 0, i = 2; i == 2; level++) {
|
|
|
|
error = xfs_btree_delrec(cur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
if (i == 2)
|
|
|
|
joined = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we combined blocks as part of deleting the record, delrec won't
|
|
|
|
* have updated the parent high keys so we have to do that here.
|
|
|
|
*/
|
2024-02-22 20:34:29 +00:00
|
|
|
if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
|
xfs: support btrees with overlapping intervals for keys
On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk. When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval. This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.
(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)
Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute. This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.
This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.
When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joining (at higher levels in the tree) have finished. At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.
When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating. This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.
The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively. This is consistent
with the rest of the kernel and the C library.
In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update. This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-08-03 01:08:36 +00:00
|
|
|
error = xfs_btree_updkeys_force(cur, 0);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
2008-10-30 05:58:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (i == 0) {
|
|
|
|
for (level = 1; level < cur->bc_nlevels; level++) {
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[level].ptr == 0) {
|
2008-10-30 05:58:01 +00:00
|
|
|
error = xfs_btree_decrement(cur, level, &i);
|
|
|
|
if (error)
|
|
|
|
goto error0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*stat = i;
|
|
|
|
return 0;
|
|
|
|
error0:
|
|
|
|
return error;
|
|
|
|
}
|
2008-10-30 05:58:11 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the data from the pointed-to record.
|
|
|
|
*/
|
|
|
|
int /* error */
|
|
|
|
xfs_btree_get_rec(
|
|
|
|
struct xfs_btree_cur *cur, /* btree cursor */
|
|
|
|
union xfs_btree_rec **recp, /* output: btree record */
|
|
|
|
int *stat) /* output: success/failure */
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block; /* btree block */
|
|
|
|
struct xfs_buf *bp; /* buffer pointer */
|
|
|
|
int ptr; /* record number */
|
|
|
|
#ifdef DEBUG
|
|
|
|
int error; /* error return value */
|
|
|
|
#endif
|
|
|
|
|
2021-09-16 19:24:04 +00:00
|
|
|
ptr = cur->bc_levels[0].ptr;
|
2008-10-30 05:58:11 +00:00
|
|
|
block = xfs_btree_get_block(cur, 0, &bp);
|
|
|
|
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, 0, bp);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Off the right end or left end, return failure.
|
|
|
|
*/
|
|
|
|
if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
|
|
|
|
*stat = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Point to the record and extract its data.
|
|
|
|
*/
|
|
|
|
*recp = xfs_btree_rec_addr(cur, ptr, block);
|
|
|
|
*stat = 1;
|
|
|
|
return 0;
|
|
|
|
}
|
2013-08-30 00:23:44 +00:00
|
|
|
|
2016-08-03 01:10:55 +00:00
|
|
|
/* Visit a block in a btree. */
|
|
|
|
STATIC int
|
|
|
|
xfs_btree_visit_block(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
xfs_btree_visit_blocks_fn fn,
|
|
|
|
void *data)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
struct xfs_buf *bp;
|
2024-02-22 20:43:34 +00:00
|
|
|
union xfs_btree_ptr rptr, bufptr;
|
2016-08-03 01:10:55 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
/* do right sibling readahead */
|
|
|
|
xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
|
|
|
|
/* process the block */
|
|
|
|
error = fn(cur, level, data);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
/* now read rh sibling block for next iteration */
|
|
|
|
xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
|
|
|
|
if (xfs_btree_ptr_is_null(cur, &rptr))
|
|
|
|
return -ENOENT;
|
|
|
|
|
2022-05-04 02:13:35 +00:00
|
|
|
/*
|
|
|
|
* We only visit blocks once in this walk, so we have to avoid the
|
|
|
|
* internal xfs_btree_lookup_get_block() optimisation where it will
|
|
|
|
* return the same block without checking if the right sibling points
|
|
|
|
* back to us and creates a cyclic reference in the btree.
|
|
|
|
*/
|
2024-02-22 20:43:34 +00:00
|
|
|
xfs_btree_buf_to_ptr(cur, bp, &bufptr);
|
|
|
|
if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
|
|
|
|
xfs_btree_mark_sick(cur);
|
|
|
|
return -EFSCORRUPTED;
|
2022-05-04 02:13:35 +00:00
|
|
|
}
|
2024-02-22 20:43:34 +00:00
|
|
|
|
2016-08-03 01:10:55 +00:00
|
|
|
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Visit every block in a btree. */
|
|
|
|
int
|
|
|
|
xfs_btree_visit_blocks(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
xfs_btree_visit_blocks_fn fn,
|
2019-10-28 23:12:35 +00:00
|
|
|
unsigned int flags,
|
2016-08-03 01:10:55 +00:00
|
|
|
void *data)
|
|
|
|
{
|
|
|
|
union xfs_btree_ptr lptr;
|
|
|
|
int level;
|
|
|
|
struct xfs_btree_block *block = NULL;
|
|
|
|
int error = 0;
|
|
|
|
|
2024-02-22 20:37:26 +00:00
|
|
|
xfs_btree_init_ptr_from_cur(cur, &lptr);
|
2016-08-03 01:10:55 +00:00
|
|
|
|
|
|
|
/* for each level */
|
|
|
|
for (level = cur->bc_nlevels - 1; level >= 0; level--) {
|
|
|
|
/* grab the left hand block */
|
|
|
|
error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
/* readahead the left most block for the next level down */
|
|
|
|
if (level > 0) {
|
|
|
|
union xfs_btree_ptr *ptr;
|
|
|
|
|
|
|
|
ptr = xfs_btree_ptr_addr(cur, 1, block);
|
|
|
|
xfs_btree_readahead_ptr(cur, ptr, 1);
|
|
|
|
|
|
|
|
/* save for the next iteration of the loop */
|
2017-05-23 02:54:10 +00:00
|
|
|
xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
|
2019-10-28 23:12:35 +00:00
|
|
|
|
|
|
|
if (!(flags & XFS_BTREE_VISIT_LEAVES))
|
|
|
|
continue;
|
|
|
|
} else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
|
|
|
|
continue;
|
2016-08-03 01:10:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* for each buffer in the level */
|
|
|
|
do {
|
|
|
|
error = xfs_btree_visit_block(cur, level, fn, data);
|
|
|
|
} while (!error);
|
|
|
|
|
|
|
|
if (error != -ENOENT)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-08-30 00:23:44 +00:00
|
|
|
/*
|
|
|
|
* Change the owner of a btree.
|
|
|
|
*
|
|
|
|
* The mechanism we use here is ordered buffer logging. Because we don't know
|
|
|
|
* how many buffers we are going to need to modify, we don't really want to
|
|
|
|
* have to make transaction reservations for the worst case of every buffer in a
|
|
|
|
* full size btree as that may be more space than we can fit in the log....
|
|
|
|
*
|
|
|
|
* We do the btree walk in the most optimal manner possible - we have sibling
|
|
|
|
* pointers so we can just walk all the blocks on each level from left to right
|
|
|
|
* in a single pass, and then move to the next level and do the same. We can
|
|
|
|
* also do readahead on the sibling pointers to get IO moving more quickly,
|
|
|
|
* though for slow disks this is unlikely to make much difference to performance
|
|
|
|
* as the amount of CPU work we have to do before moving to the next block is
|
|
|
|
* relatively small.
|
|
|
|
*
|
|
|
|
* For each btree block that we load, modify the owner appropriately, set the
|
|
|
|
* buffer as an ordered buffer and log it appropriately. We need to ensure that
|
|
|
|
* we mark the region we change dirty so that if the buffer is relogged in
|
|
|
|
* a subsequent transaction the changes we make here as an ordered buffer are
|
xfs: recovery of swap extents operations for CRC filesystems
This is the recovery side of the btree block owner change operation
performed by swapext on CRC enabled filesystems. We detect that an
owner change is needed by the flag that has been placed on the inode
log format flag field. Because the inode recovery is being replayed
after the buffers that make up the BMBT in the given checkpoint, we
can walk all the buffers and directly modify them when we see the
flag set on an inode.
Because the inode can be relogged and hence present in multiple
chekpoints with the "change owner" flag set, we could do multiple
passes across the inode to do this change. While this isn't optimal,
we can't directly ignore the flag as there may be multiple
independent swap extent operations being replayed on the same inode
in different checkpoints so we can't ignore them.
Further, because the owner change operation uses ordered buffers, we
might have buffers that are newer on disk than the current
checkpoint and so already have the owner changed in them. Hence we
cannot just peek at a buffer in the tree and check that it has the
correct owner and assume that the change was completed.
So, for the moment just brute force the owner change every time we
see an inode with the flag set. Note that we have to be careful here
because the owner of the buffers may point to either the old owner
or the new owner. Currently the verifier can't verify the owner
directly, so there is no failure case here right now. If we verify
the owner exactly in future, then we'll have to take this into
account.
This was tested in terms of normal operation via xfstests - all of
the fsr tests now pass without failure. however, we really need to
modify xfs/227 to stress v3 inodes correctly to ensure we fully
cover this case for v5 filesystems.
In terms of recovery testing, I used a hacked version of xfs_fsr
that held the temp inode open for a few seconds before exiting so
that the filesystem could be shut down with an open owner change
recovery flags set on at least the temp inode. fsr leaves the temp
inode unlinked and in btree format, so this was necessary for the
owner change to be reliably replayed.
logprint confirmed the tmp inode in the log had the correct flag set:
INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88
INODE: #regs:3 ino:0x44 flags:0x209 dsize:88
^^^^^
0x200 is set, indicating a data fork owner change needed to be
replayed on inode 0x44. A printk in the revoery code confirmed that
the inode change was recovered:
XFS (vdc): Mounting Filesystem
XFS (vdc): Starting recovery (logdev: internal)
recovering owner change ino 0x44
XFS (vdc): Version 5 superblock detected. This kernel L support enabled!
Use of these features in this kernel is at your own risk!
XFS (vdc): Ending recovery (logdev: internal)
The script used to test this was:
$ cat ./recovery-fsr.sh
#!/bin/bash
dev=/dev/vdc
mntpt=/mnt/scratch
testfile=$mntpt/testfile
umount $mntpt
mkfs.xfs -f -m crc=1 $dev
mount $dev $mntpt
chmod 777 $mntpt
for i in `seq 10000 -1 0`; do
xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1
done
xfs_bmap -vp $testfile |head -20
xfs_fsr -d -v $testfile &
sleep 10
/home/dave/src/xfstests-dev/src/godown -f $mntpt
wait
umount $mntpt
xfs_logprint -t $dev |tail -20
time mount $dev $mntpt
xfs_bmap -vp $testfile
umount $mntpt
$
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 00:23:45 +00:00
|
|
|
* correctly relogged in that transaction. If we are in recovery context, then
|
|
|
|
* just queue the modified buffer as delayed write buffer so the transaction
|
|
|
|
* recovery completion writes the changes to disk.
|
2013-08-30 00:23:44 +00:00
|
|
|
*/
|
2016-08-03 01:10:55 +00:00
|
|
|
struct xfs_btree_block_change_owner_info {
|
2017-06-16 18:00:05 +00:00
|
|
|
uint64_t new_owner;
|
2016-08-03 01:10:55 +00:00
|
|
|
struct list_head *buffer_list;
|
|
|
|
};
|
|
|
|
|
2013-08-30 00:23:44 +00:00
|
|
|
static int
|
|
|
|
xfs_btree_block_change_owner(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
2016-08-03 01:10:55 +00:00
|
|
|
void *data)
|
2013-08-30 00:23:44 +00:00
|
|
|
{
|
2016-08-03 01:10:55 +00:00
|
|
|
struct xfs_btree_block_change_owner_info *bbcoi = data;
|
2013-08-30 00:23:44 +00:00
|
|
|
struct xfs_btree_block *block;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
|
|
|
|
/* modify the owner */
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
|
2017-08-29 17:08:40 +00:00
|
|
|
if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
|
|
|
|
return 0;
|
2016-08-03 01:10:55 +00:00
|
|
|
block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
|
2017-08-29 17:08:40 +00:00
|
|
|
} else {
|
|
|
|
if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
|
|
|
|
return 0;
|
2016-08-03 01:10:55 +00:00
|
|
|
block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
|
2017-08-29 17:08:40 +00:00
|
|
|
}
|
2013-08-30 00:23:44 +00:00
|
|
|
|
|
|
|
/*
|
xfs: recovery of swap extents operations for CRC filesystems
This is the recovery side of the btree block owner change operation
performed by swapext on CRC enabled filesystems. We detect that an
owner change is needed by the flag that has been placed on the inode
log format flag field. Because the inode recovery is being replayed
after the buffers that make up the BMBT in the given checkpoint, we
can walk all the buffers and directly modify them when we see the
flag set on an inode.
Because the inode can be relogged and hence present in multiple
chekpoints with the "change owner" flag set, we could do multiple
passes across the inode to do this change. While this isn't optimal,
we can't directly ignore the flag as there may be multiple
independent swap extent operations being replayed on the same inode
in different checkpoints so we can't ignore them.
Further, because the owner change operation uses ordered buffers, we
might have buffers that are newer on disk than the current
checkpoint and so already have the owner changed in them. Hence we
cannot just peek at a buffer in the tree and check that it has the
correct owner and assume that the change was completed.
So, for the moment just brute force the owner change every time we
see an inode with the flag set. Note that we have to be careful here
because the owner of the buffers may point to either the old owner
or the new owner. Currently the verifier can't verify the owner
directly, so there is no failure case here right now. If we verify
the owner exactly in future, then we'll have to take this into
account.
This was tested in terms of normal operation via xfstests - all of
the fsr tests now pass without failure. however, we really need to
modify xfs/227 to stress v3 inodes correctly to ensure we fully
cover this case for v5 filesystems.
In terms of recovery testing, I used a hacked version of xfs_fsr
that held the temp inode open for a few seconds before exiting so
that the filesystem could be shut down with an open owner change
recovery flags set on at least the temp inode. fsr leaves the temp
inode unlinked and in btree format, so this was necessary for the
owner change to be reliably replayed.
logprint confirmed the tmp inode in the log had the correct flag set:
INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88
INODE: #regs:3 ino:0x44 flags:0x209 dsize:88
^^^^^
0x200 is set, indicating a data fork owner change needed to be
replayed on inode 0x44. A printk in the revoery code confirmed that
the inode change was recovered:
XFS (vdc): Mounting Filesystem
XFS (vdc): Starting recovery (logdev: internal)
recovering owner change ino 0x44
XFS (vdc): Version 5 superblock detected. This kernel L support enabled!
Use of these features in this kernel is at your own risk!
XFS (vdc): Ending recovery (logdev: internal)
The script used to test this was:
$ cat ./recovery-fsr.sh
#!/bin/bash
dev=/dev/vdc
mntpt=/mnt/scratch
testfile=$mntpt/testfile
umount $mntpt
mkfs.xfs -f -m crc=1 $dev
mount $dev $mntpt
chmod 777 $mntpt
for i in `seq 10000 -1 0`; do
xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1
done
xfs_bmap -vp $testfile |head -20
xfs_fsr -d -v $testfile &
sleep 10
/home/dave/src/xfstests-dev/src/godown -f $mntpt
wait
umount $mntpt
xfs_logprint -t $dev |tail -20
time mount $dev $mntpt
xfs_bmap -vp $testfile
umount $mntpt
$
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 00:23:45 +00:00
|
|
|
* If the block is a root block hosted in an inode, we might not have a
|
|
|
|
* buffer pointer here and we shouldn't attempt to log the change as the
|
|
|
|
* information is already held in the inode and discarded when the root
|
|
|
|
* block is formatted into the on-disk inode fork. We still change it,
|
|
|
|
* though, so everything is consistent in memory.
|
2013-08-30 00:23:44 +00:00
|
|
|
*/
|
2017-08-29 17:08:40 +00:00
|
|
|
if (!bp) {
|
2024-02-22 20:36:17 +00:00
|
|
|
ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
|
2013-08-30 00:23:44 +00:00
|
|
|
ASSERT(level == cur->bc_nlevels - 1);
|
2017-08-29 17:08:40 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cur->bc_tp) {
|
|
|
|
if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
|
|
|
|
xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
|
|
|
|
return -EAGAIN;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
|
2013-08-30 00:23:44 +00:00
|
|
|
}
|
|
|
|
|
2016-08-03 01:10:55 +00:00
|
|
|
return 0;
|
2013-08-30 00:23:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
xfs_btree_change_owner(
|
|
|
|
struct xfs_btree_cur *cur,
|
2017-06-16 18:00:05 +00:00
|
|
|
uint64_t new_owner,
|
xfs: recovery of swap extents operations for CRC filesystems
This is the recovery side of the btree block owner change operation
performed by swapext on CRC enabled filesystems. We detect that an
owner change is needed by the flag that has been placed on the inode
log format flag field. Because the inode recovery is being replayed
after the buffers that make up the BMBT in the given checkpoint, we
can walk all the buffers and directly modify them when we see the
flag set on an inode.
Because the inode can be relogged and hence present in multiple
chekpoints with the "change owner" flag set, we could do multiple
passes across the inode to do this change. While this isn't optimal,
we can't directly ignore the flag as there may be multiple
independent swap extent operations being replayed on the same inode
in different checkpoints so we can't ignore them.
Further, because the owner change operation uses ordered buffers, we
might have buffers that are newer on disk than the current
checkpoint and so already have the owner changed in them. Hence we
cannot just peek at a buffer in the tree and check that it has the
correct owner and assume that the change was completed.
So, for the moment just brute force the owner change every time we
see an inode with the flag set. Note that we have to be careful here
because the owner of the buffers may point to either the old owner
or the new owner. Currently the verifier can't verify the owner
directly, so there is no failure case here right now. If we verify
the owner exactly in future, then we'll have to take this into
account.
This was tested in terms of normal operation via xfstests - all of
the fsr tests now pass without failure. however, we really need to
modify xfs/227 to stress v3 inodes correctly to ensure we fully
cover this case for v5 filesystems.
In terms of recovery testing, I used a hacked version of xfs_fsr
that held the temp inode open for a few seconds before exiting so
that the filesystem could be shut down with an open owner change
recovery flags set on at least the temp inode. fsr leaves the temp
inode unlinked and in btree format, so this was necessary for the
owner change to be reliably replayed.
logprint confirmed the tmp inode in the log had the correct flag set:
INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88
INODE: #regs:3 ino:0x44 flags:0x209 dsize:88
^^^^^
0x200 is set, indicating a data fork owner change needed to be
replayed on inode 0x44. A printk in the revoery code confirmed that
the inode change was recovered:
XFS (vdc): Mounting Filesystem
XFS (vdc): Starting recovery (logdev: internal)
recovering owner change ino 0x44
XFS (vdc): Version 5 superblock detected. This kernel L support enabled!
Use of these features in this kernel is at your own risk!
XFS (vdc): Ending recovery (logdev: internal)
The script used to test this was:
$ cat ./recovery-fsr.sh
#!/bin/bash
dev=/dev/vdc
mntpt=/mnt/scratch
testfile=$mntpt/testfile
umount $mntpt
mkfs.xfs -f -m crc=1 $dev
mount $dev $mntpt
chmod 777 $mntpt
for i in `seq 10000 -1 0`; do
xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1
done
xfs_bmap -vp $testfile |head -20
xfs_fsr -d -v $testfile &
sleep 10
/home/dave/src/xfstests-dev/src/godown -f $mntpt
wait
umount $mntpt
xfs_logprint -t $dev |tail -20
time mount $dev $mntpt
xfs_bmap -vp $testfile
umount $mntpt
$
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 00:23:45 +00:00
|
|
|
struct list_head *buffer_list)
|
2013-08-30 00:23:44 +00:00
|
|
|
{
|
2016-08-03 01:10:55 +00:00
|
|
|
struct xfs_btree_block_change_owner_info bbcoi;
|
2013-08-30 00:23:44 +00:00
|
|
|
|
2016-08-03 01:10:55 +00:00
|
|
|
bbcoi.new_owner = new_owner;
|
|
|
|
bbcoi.buffer_list = buffer_list;
|
2013-08-30 00:23:44 +00:00
|
|
|
|
2016-08-03 01:10:55 +00:00
|
|
|
return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
|
2019-10-28 23:12:35 +00:00
|
|
|
XFS_BTREE_VISIT_ALL, &bbcoi);
|
2013-08-30 00:23:44 +00:00
|
|
|
}
|
2016-01-04 05:13:21 +00:00
|
|
|
|
2018-01-08 18:51:00 +00:00
|
|
|
/* Verify the v5 fields of a long-format btree block. */
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_failaddr_t
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_fsblock_v5hdr_verify(
|
2018-01-08 18:51:00 +00:00
|
|
|
struct xfs_buf *bp,
|
|
|
|
uint64_t owner)
|
|
|
|
{
|
2019-06-29 02:27:29 +00:00
|
|
|
struct xfs_mount *mp = bp->b_mount;
|
2018-01-08 18:51:00 +00:00
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
|
|
|
|
2021-08-19 01:46:37 +00:00
|
|
|
if (!xfs_has_crc(mp))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2018-01-08 18:51:00 +00:00
|
|
|
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2021-08-19 01:47:05 +00:00
|
|
|
if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2018-01-08 18:51:00 +00:00
|
|
|
if (owner != XFS_RMAP_OWN_UNKNOWN &&
|
|
|
|
be64_to_cpu(block->bb_u.l.bb_owner) != owner)
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
|
|
|
return NULL;
|
2018-01-08 18:51:00 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Verify a long-format btree block. */
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_failaddr_t
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_fsblock_verify(
|
2018-01-08 18:51:00 +00:00
|
|
|
struct xfs_buf *bp,
|
|
|
|
unsigned int max_recs)
|
|
|
|
{
|
2019-06-29 02:27:29 +00:00
|
|
|
struct xfs_mount *mp = bp->b_mount;
|
2018-01-08 18:51:00 +00:00
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
2022-05-04 02:13:35 +00:00
|
|
|
xfs_fsblock_t fsb;
|
|
|
|
xfs_failaddr_t fa;
|
2018-01-08 18:51:00 +00:00
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
ASSERT(!xfs_buftarg_is_mem(bp->b_target));
|
|
|
|
|
2018-01-08 18:51:00 +00:00
|
|
|
/* numrecs verification */
|
|
|
|
if (be16_to_cpu(block->bb_numrecs) > max_recs)
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2018-01-08 18:51:00 +00:00
|
|
|
|
|
|
|
/* sibling pointer verification */
|
2022-05-04 02:13:35 +00:00
|
|
|
fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_fsblock_siblings(mp, fsb,
|
|
|
|
block->bb_u.l.bb_leftsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
if (!fa)
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_fsblock_siblings(mp, fsb,
|
2022-05-27 00:20:45 +00:00
|
|
|
block->bb_u.l.bb_rightsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
return fa;
|
2018-01-08 18:51:00 +00:00
|
|
|
}
|
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
/* Verify an in-memory btree block. */
|
|
|
|
xfs_failaddr_t
|
|
|
|
xfs_btree_memblock_verify(
|
|
|
|
struct xfs_buf *bp,
|
|
|
|
unsigned int max_recs)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
|
|
|
struct xfs_buftarg *btp = bp->b_target;
|
|
|
|
xfs_failaddr_t fa;
|
|
|
|
xfbno_t bno;
|
|
|
|
|
|
|
|
ASSERT(xfs_buftarg_is_mem(bp->b_target));
|
|
|
|
|
|
|
|
/* numrecs verification */
|
|
|
|
if (be16_to_cpu(block->bb_numrecs) > max_recs)
|
|
|
|
return __this_address;
|
|
|
|
|
|
|
|
/* sibling pointer verification */
|
|
|
|
bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
|
|
|
|
fa = xfs_btree_check_memblock_siblings(btp, bno,
|
|
|
|
block->bb_u.l.bb_leftsib);
|
|
|
|
if (fa)
|
|
|
|
return fa;
|
|
|
|
fa = xfs_btree_check_memblock_siblings(btp, bno,
|
|
|
|
block->bb_u.l.bb_rightsib);
|
|
|
|
if (fa)
|
|
|
|
return fa;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
2016-01-04 05:13:21 +00:00
|
|
|
/**
|
2024-02-22 20:40:58 +00:00
|
|
|
* xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
|
2016-01-04 05:13:21 +00:00
|
|
|
* btree block
|
|
|
|
*
|
|
|
|
* @bp: buffer containing the btree block
|
|
|
|
*/
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_failaddr_t
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_agblock_v5hdr_verify(
|
2016-01-04 05:13:21 +00:00
|
|
|
struct xfs_buf *bp)
|
|
|
|
{
|
2019-06-29 02:27:29 +00:00
|
|
|
struct xfs_mount *mp = bp->b_mount;
|
2016-01-04 05:13:21 +00:00
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
|
|
|
struct xfs_perag *pag = bp->b_pag;
|
|
|
|
|
2021-08-19 01:46:37 +00:00
|
|
|
if (!xfs_has_crc(mp))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2016-01-04 05:13:21 +00:00
|
|
|
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2021-08-19 01:47:05 +00:00
|
|
|
if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2024-11-04 04:18:38 +00:00
|
|
|
if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag_agno(pag))
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
|
|
|
return NULL;
|
2016-01-04 05:13:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2024-02-22 20:40:58 +00:00
|
|
|
* xfs_btree_agblock_verify() -- verify a short-format btree block
|
2016-01-04 05:13:21 +00:00
|
|
|
*
|
|
|
|
* @bp: buffer containing the btree block
|
|
|
|
* @max_recs: maximum records allowed in this btree node
|
|
|
|
*/
|
2018-01-08 18:51:03 +00:00
|
|
|
xfs_failaddr_t
|
2024-02-22 20:40:58 +00:00
|
|
|
xfs_btree_agblock_verify(
|
2016-01-04 05:13:21 +00:00
|
|
|
struct xfs_buf *bp,
|
|
|
|
unsigned int max_recs)
|
|
|
|
{
|
2019-06-29 02:27:29 +00:00
|
|
|
struct xfs_mount *mp = bp->b_mount;
|
2016-01-04 05:13:21 +00:00
|
|
|
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
|
2022-05-04 02:13:35 +00:00
|
|
|
xfs_agblock_t agbno;
|
|
|
|
xfs_failaddr_t fa;
|
2016-01-04 05:13:21 +00:00
|
|
|
|
2024-02-22 20:43:35 +00:00
|
|
|
ASSERT(!xfs_buftarg_is_mem(bp->b_target));
|
|
|
|
|
2016-01-04 05:13:21 +00:00
|
|
|
/* numrecs verification */
|
|
|
|
if (be16_to_cpu(block->bb_numrecs) > max_recs)
|
2018-01-08 18:51:03 +00:00
|
|
|
return __this_address;
|
2016-01-04 05:13:21 +00:00
|
|
|
|
|
|
|
/* sibling pointer verification */
|
2022-05-04 02:13:35 +00:00
|
|
|
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
|
2022-05-27 00:20:45 +00:00
|
|
|
block->bb_u.s.bb_leftsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
if (!fa)
|
2024-02-22 20:40:58 +00:00
|
|
|
fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
|
2022-05-27 00:20:45 +00:00
|
|
|
block->bb_u.s.bb_rightsib);
|
2022-05-04 02:13:35 +00:00
|
|
|
return fa;
|
2016-01-04 05:13:21 +00:00
|
|
|
}
|
2016-06-21 01:53:28 +00:00
|
|
|
|
|
|
|
/*
 * Work out how tall a btree has to be to index @records leaf records.
 *
 * @limits[0] is the per-block limit applied to leaf records and @limits[1]
 * the per-block limit applied to keyptr records.  The leaf level is sized
 * with limits[0]; each level above it is sized with limits[1] until a level
 * needs no more than one block.  The result is always at least 1.
 */
unsigned int
xfs_btree_compute_maxlevels(
	const unsigned int	*limits,
	unsigned long long	records)
{
	unsigned long long	nr_blocks;
	unsigned int		levels;

	nr_blocks = howmany_64(records, limits[0]);
	for (levels = 1; nr_blocks > 1; levels++)
		nr_blocks = howmany_64(nr_blocks, limits[1]);

	return levels;
}
|
|
|
|
|
|
|
|
/*
 * Work out how many blocks a btree needs to index @records leaf records.
 *
 * @limits[0] is the per-block limit applied to leaf records and @limits[1]
 * the per-block limit applied to keyptr records.  The block counts of the
 * leaf level and of every level above it are summed, stopping once a level
 * needs no more than one block.
 */
unsigned long long
xfs_btree_calc_size(
	const unsigned int	*limits,
	unsigned long long	records)
{
	unsigned long long	nr_level = howmany_64(records, limits[0]);
	unsigned long long	total;

	for (total = nr_level; nr_level > 1; total += nr_level)
		nr_level = howmany_64(nr_level, limits[1]);

	return total;
}
|
2016-08-03 01:10:21 +00:00
|
|
|
|
2021-09-16 19:27:43 +00:00
|
|
|
/*
 * Given the number of blocks available to a btree, work out the tallest tree
 * that could be stored in that space, based on how many pointers each
 * interior node holds (@limits[1]).
 *
 * One block is set aside for a single-level tree.  Each further level is
 * charged the number of blocks it would occupy, and the walk stops as soon as
 * the next level no longer fits into what is left.  At that point there is no
 * free space to split the tree any further, so the level count reached is the
 * maximum height.  Zero blocks give a height of zero.
 */
unsigned int
xfs_btree_space_to_height(
	const unsigned int	*limits,
	unsigned long long	leaf_blocks)
{
	unsigned long long	level_blocks;
	unsigned long long	remaining;
	unsigned int		levels;

	if (leaf_blocks == 0)
		return 0;

	/*
	 * The root block may hold fewer than minrecs pointers because the
	 * tree might not be big enough to need that much fanout, so the first
	 * level below it is charged 2 blocks rather than limits[1].
	 */
	level_blocks = 2;
	remaining = leaf_blocks - 1;

	for (levels = 1; level_blocks < remaining; levels++) {
		remaining -= level_blocks;
		level_blocks *= limits[1];
	}

	return levels;
}
|
|
|
|
|
2016-08-03 01:10:21 +00:00
|
|
|
/*
|
|
|
|
* Query a regular btree for all records overlapping a given interval.
|
|
|
|
* Start with a LE lookup of the key of low_rec and return all records
|
|
|
|
* until we find a record with a key greater than the key of high_rec.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_btree_simple_query_range(
|
|
|
|
struct xfs_btree_cur *cur,
|
2021-08-11 00:02:15 +00:00
|
|
|
const union xfs_btree_key *low_key,
|
|
|
|
const union xfs_btree_key *high_key,
|
2016-08-03 01:10:21 +00:00
|
|
|
xfs_btree_query_range_fn fn,
|
|
|
|
void *priv)
|
|
|
|
{
|
|
|
|
union xfs_btree_rec *recp;
|
|
|
|
union xfs_btree_key rec_key;
|
|
|
|
int stat;
|
|
|
|
bool firstrec = true;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
ASSERT(cur->bc_ops->init_high_key_from_rec);
|
|
|
|
ASSERT(cur->bc_ops->diff_two_keys);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Find the leftmost record. The btree cursor must be set
|
|
|
|
* to the low record used to generate low_key.
|
|
|
|
*/
|
|
|
|
stat = 0;
|
|
|
|
error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
|
2016-08-26 06:00:10 +00:00
|
|
|
/* Nothing? See if there's anything to the right. */
|
|
|
|
if (!stat) {
|
|
|
|
error = xfs_btree_increment(cur, 0, &stat);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:10:21 +00:00
|
|
|
while (stat) {
|
|
|
|
/* Find the record. */
|
|
|
|
error = xfs_btree_get_rec(cur, &recp, &stat);
|
|
|
|
if (error || !stat)
|
|
|
|
break;
|
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
/* Skip if low_key > high_key(rec). */
|
2016-08-03 01:10:21 +00:00
|
|
|
if (firstrec) {
|
2016-08-26 05:59:50 +00:00
|
|
|
cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
|
2016-08-03 01:10:21 +00:00
|
|
|
firstrec = false;
|
2023-04-12 02:00:10 +00:00
|
|
|
if (xfs_btree_keycmp_gt(cur, low_key, &rec_key))
|
2016-08-03 01:10:21 +00:00
|
|
|
goto advloop;
|
|
|
|
}
|
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
/* Stop if low_key(rec) > high_key. */
|
2016-08-26 05:59:50 +00:00
|
|
|
cur->bc_ops->init_key_from_rec(&rec_key, recp);
|
2023-04-12 02:00:10 +00:00
|
|
|
if (xfs_btree_keycmp_gt(cur, &rec_key, high_key))
|
2016-08-03 01:10:21 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
/* Callback */
|
|
|
|
error = fn(cur, recp, priv);
|
2019-08-28 21:37:57 +00:00
|
|
|
if (error)
|
2016-08-03 01:10:21 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
advloop:
|
|
|
|
/* Move on to the next record. */
|
|
|
|
error = xfs_btree_increment(cur, 0, &stat);
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Query an overlapped interval btree for all records overlapping a given
|
|
|
|
* interval. This function roughly follows the algorithm given in
|
|
|
|
* "Interval Trees" of _Introduction to Algorithms_, which is section
|
|
|
|
* 14.3 in the 2nd and 3rd editions.
|
|
|
|
*
|
|
|
|
* First, generate keys for the low and high records passed in.
|
|
|
|
*
|
|
|
|
* For any leaf node, generate the high and low keys for the record.
|
|
|
|
* If the record keys overlap with the query low/high keys, pass the
|
|
|
|
* record to the function iterator.
|
|
|
|
*
|
|
|
|
* For any internal node, compare the low and high keys of each
|
|
|
|
* pointer against the query low/high keys. If there's an overlap,
|
|
|
|
* follow the pointer.
|
|
|
|
*
|
|
|
|
* As an optimization, we stop scanning a block when we find a low key
|
|
|
|
* that is greater than the query's high key.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_btree_overlapped_query_range(
|
|
|
|
struct xfs_btree_cur *cur,
|
2021-08-11 00:02:15 +00:00
|
|
|
const union xfs_btree_key *low_key,
|
|
|
|
const union xfs_btree_key *high_key,
|
2016-08-03 01:10:21 +00:00
|
|
|
xfs_btree_query_range_fn fn,
|
|
|
|
void *priv)
|
|
|
|
{
|
|
|
|
union xfs_btree_ptr ptr;
|
|
|
|
union xfs_btree_ptr *pp;
|
|
|
|
union xfs_btree_key rec_key;
|
|
|
|
union xfs_btree_key rec_hkey;
|
|
|
|
union xfs_btree_key *lkp;
|
|
|
|
union xfs_btree_key *hkp;
|
|
|
|
union xfs_btree_rec *recp;
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
int level;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
int i;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* Load the root of the btree. */
|
|
|
|
level = cur->bc_nlevels - 1;
|
2024-02-22 20:37:26 +00:00
|
|
|
xfs_btree_init_ptr_from_cur(cur, &ptr);
|
2016-08-03 01:10:21 +00:00
|
|
|
error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
xfs_btree_get_block(cur, level, &bp);
|
|
|
|
trace_xfs_btree_overlapped_query_range(cur, level, bp);
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
#endif
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr = 1;
|
2016-08-03 01:10:21 +00:00
|
|
|
|
|
|
|
while (level < cur->bc_nlevels) {
|
|
|
|
block = xfs_btree_get_block(cur, level, &bp);
|
|
|
|
|
|
|
|
/* End of node, pop back towards the root. */
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[level].ptr >
|
|
|
|
be16_to_cpu(block->bb_numrecs)) {
|
2016-08-03 01:10:21 +00:00
|
|
|
pop_up:
|
|
|
|
if (level < cur->bc_nlevels - 1)
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level + 1].ptr++;
|
2016-08-03 01:10:21 +00:00
|
|
|
level++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (level == 0) {
|
|
|
|
/* Handle a leaf node. */
|
2021-09-16 19:24:04 +00:00
|
|
|
recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
|
|
|
|
block);
|
2016-08-03 01:10:21 +00:00
|
|
|
|
|
|
|
cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
|
|
|
|
cur->bc_ops->init_key_from_rec(&rec_key, recp);
|
|
|
|
|
|
|
|
/*
|
2023-04-12 02:00:10 +00:00
|
|
|
* If (query's high key < record's low key), then there
|
|
|
|
* are no more interesting records in this block. Pop
|
|
|
|
* up to the leaf level to find more record blocks.
|
|
|
|
*
|
2016-08-03 01:10:21 +00:00
|
|
|
* If (record's high key >= query's low key) and
|
|
|
|
* (query's high key >= record's low key), then
|
|
|
|
* this record overlaps the query range; callback.
|
|
|
|
*/
|
2023-04-12 02:00:10 +00:00
|
|
|
if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
|
|
|
|
goto pop_up;
|
|
|
|
if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
|
2016-08-03 01:10:21 +00:00
|
|
|
error = fn(cur, recp, priv);
|
2019-08-28 21:37:57 +00:00
|
|
|
if (error)
|
2016-08-03 01:10:21 +00:00
|
|
|
break;
|
|
|
|
}
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr++;
|
2016-08-03 01:10:21 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Handle an internal node. */
|
2021-09-16 19:24:04 +00:00
|
|
|
lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
|
|
|
|
hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
|
|
|
|
block);
|
|
|
|
pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
|
2016-08-03 01:10:21 +00:00
|
|
|
|
|
|
|
/*
|
2023-04-12 02:00:10 +00:00
|
|
|
* If (query's high key < pointer's low key), then there are no
|
|
|
|
* more interesting keys in this block. Pop up one leaf level
|
|
|
|
* to continue looking for records.
|
|
|
|
*
|
2016-08-03 01:10:21 +00:00
|
|
|
* If (pointer's high key >= query's low key) and
|
|
|
|
* (query's high key >= pointer's low key), then
|
|
|
|
* this record overlaps the query range; follow pointer.
|
|
|
|
*/
|
2023-04-12 02:00:10 +00:00
|
|
|
if (xfs_btree_keycmp_lt(cur, high_key, lkp))
|
|
|
|
goto pop_up;
|
|
|
|
if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
|
2016-08-03 01:10:21 +00:00
|
|
|
level--;
|
|
|
|
error = xfs_btree_lookup_get_block(cur, level, pp,
|
|
|
|
&block);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
xfs_btree_get_block(cur, level, &bp);
|
|
|
|
trace_xfs_btree_overlapped_query_range(cur, level, bp);
|
|
|
|
#ifdef DEBUG
|
|
|
|
error = xfs_btree_check_block(cur, block, level, bp);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
#endif
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr = 1;
|
2016-08-03 01:10:21 +00:00
|
|
|
continue;
|
|
|
|
}
|
2021-09-16 19:24:04 +00:00
|
|
|
cur->bc_levels[level].ptr++;
|
2016-08-03 01:10:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
/*
|
|
|
|
* If we don't end this function with the cursor pointing at a record
|
|
|
|
* block, a subsequent non-error cursor deletion will not release
|
|
|
|
* node-level buffers, causing a buffer leak. This is quite possible
|
|
|
|
* with a zero-results range query, so release the buffers if we
|
|
|
|
* failed to return any results.
|
|
|
|
*/
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[0].bp == NULL) {
|
2016-08-03 01:10:21 +00:00
|
|
|
for (i = 0; i < cur->bc_nlevels; i++) {
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[i].bp) {
|
|
|
|
xfs_trans_brelse(cur->bc_tp,
|
|
|
|
cur->bc_levels[i].bp);
|
|
|
|
cur->bc_levels[i].bp = NULL;
|
|
|
|
cur->bc_levels[i].ptr = 0;
|
|
|
|
cur->bc_levels[i].ra = 0;
|
2016-08-03 01:10:21 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2023-04-12 02:00:09 +00:00
|
|
|
static inline void
|
|
|
|
xfs_btree_key_from_irec(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
union xfs_btree_key *key,
|
|
|
|
const union xfs_btree_irec *irec)
|
|
|
|
{
|
|
|
|
union xfs_btree_rec rec;
|
|
|
|
|
|
|
|
cur->bc_rec = *irec;
|
|
|
|
cur->bc_ops->init_rec_from_cur(cur, &rec);
|
|
|
|
cur->bc_ops->init_key_from_rec(key, &rec);
|
|
|
|
}
|
|
|
|
|
2016-08-03 01:10:21 +00:00
|
|
|
/*
|
|
|
|
* Query a btree for all records overlapping a given interval of keys. The
|
|
|
|
* supplied function will be called with each record found; return one of the
|
|
|
|
* XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
|
2019-08-28 21:37:57 +00:00
|
|
|
* code. This function returns -ECANCELED, zero, or a negative error code.
|
2016-08-03 01:10:21 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_btree_query_range(
|
|
|
|
struct xfs_btree_cur *cur,
|
2021-08-11 00:02:15 +00:00
|
|
|
const union xfs_btree_irec *low_rec,
|
|
|
|
const union xfs_btree_irec *high_rec,
|
2016-08-03 01:10:21 +00:00
|
|
|
xfs_btree_query_range_fn fn,
|
|
|
|
void *priv)
|
|
|
|
{
|
|
|
|
union xfs_btree_key low_key;
|
|
|
|
union xfs_btree_key high_key;
|
|
|
|
|
|
|
|
/* Find the keys of both ends of the interval. */
|
2023-04-12 02:00:09 +00:00
|
|
|
xfs_btree_key_from_irec(cur, &high_key, high_rec);
|
|
|
|
xfs_btree_key_from_irec(cur, &low_key, low_rec);
|
2016-08-03 01:10:21 +00:00
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
/* Enforce low key <= high key. */
|
|
|
|
if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
|
2016-08-03 01:10:21 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
2024-02-22 20:34:29 +00:00
|
|
|
if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
|
2016-08-03 01:10:21 +00:00
|
|
|
return xfs_btree_simple_query_range(cur, &low_key,
|
|
|
|
&high_key, fn, priv);
|
|
|
|
return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
|
|
|
|
fn, priv);
|
|
|
|
}
|
2016-09-19 00:25:03 +00:00
|
|
|
|
2017-03-28 21:56:35 +00:00
|
|
|
/* Query a btree for all records. */
|
|
|
|
int
|
|
|
|
xfs_btree_query_all(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
xfs_btree_query_range_fn fn,
|
|
|
|
void *priv)
|
|
|
|
{
|
2017-06-16 18:00:04 +00:00
|
|
|
union xfs_btree_key low_key;
|
|
|
|
union xfs_btree_key high_key;
|
|
|
|
|
|
|
|
memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
|
|
|
|
memset(&low_key, 0, sizeof(low_key));
|
|
|
|
memset(&high_key, 0xFF, sizeof(high_key));
|
2017-03-28 21:56:35 +00:00
|
|
|
|
2017-06-16 18:00:04 +00:00
|
|
|
return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
|
2017-03-28 21:56:35 +00:00
|
|
|
}
|
|
|
|
|
2016-10-20 04:42:30 +00:00
|
|
|
static int
|
2016-09-19 00:25:20 +00:00
|
|
|
xfs_btree_count_blocks_helper(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
int level,
|
|
|
|
void *data)
|
|
|
|
{
|
|
|
|
xfs_extlen_t *blocks = data;
|
|
|
|
(*blocks)++;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Count the blocks in a btree and return the result in *blocks. */
|
|
|
|
int
|
|
|
|
xfs_btree_count_blocks(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
xfs_extlen_t *blocks)
|
|
|
|
{
|
|
|
|
*blocks = 0;
|
|
|
|
return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
|
2019-10-28 23:12:35 +00:00
|
|
|
XFS_BTREE_VISIT_ALL, blocks);
|
2016-09-19 00:25:20 +00:00
|
|
|
}
|
2017-10-18 04:37:37 +00:00
|
|
|
|
|
|
|
/* Compare two btree pointers. */
|
|
|
|
int64_t
|
|
|
|
xfs_btree_diff_two_ptrs(
|
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_ptr *a,
|
|
|
|
const union xfs_btree_ptr *b)
|
|
|
|
{
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
|
2017-10-18 04:37:37 +00:00
|
|
|
return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
|
|
|
|
return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
|
|
|
|
}
|
2018-01-17 02:52:12 +00:00
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
struct xfs_btree_has_records {
|
|
|
|
/* Keys for the start and end of the range we want to know about. */
|
|
|
|
union xfs_btree_key start_key;
|
|
|
|
union xfs_btree_key end_key;
|
|
|
|
|
2023-04-12 02:00:11 +00:00
|
|
|
/* Mask for key comparisons, if desired. */
|
|
|
|
const union xfs_btree_key *key_mask;
|
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
/* Highest record key we've seen so far. */
|
|
|
|
union xfs_btree_key high_key;
|
|
|
|
|
|
|
|
enum xbtree_recpacking outcome;
|
|
|
|
};
|
|
|
|
|
2018-01-17 02:52:12 +00:00
|
|
|
STATIC int
|
2023-04-12 02:00:10 +00:00
|
|
|
xfs_btree_has_records_helper(
|
2018-01-17 02:52:12 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
2021-08-11 00:02:16 +00:00
|
|
|
const union xfs_btree_rec *rec,
|
2018-01-17 02:52:12 +00:00
|
|
|
void *priv)
|
|
|
|
{
|
2023-04-12 02:00:10 +00:00
|
|
|
union xfs_btree_key rec_key;
|
|
|
|
union xfs_btree_key rec_high_key;
|
|
|
|
struct xfs_btree_has_records *info = priv;
|
|
|
|
enum xbtree_key_contig key_contig;
|
|
|
|
|
|
|
|
cur->bc_ops->init_key_from_rec(&rec_key, rec);
|
|
|
|
|
|
|
|
if (info->outcome == XBTREE_RECPACKING_EMPTY) {
|
|
|
|
info->outcome = XBTREE_RECPACKING_SPARSE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the first record we find does not overlap the start key,
|
|
|
|
* then there is a hole at the start of the search range.
|
|
|
|
* Classify this as sparse and stop immediately.
|
|
|
|
*/
|
2023-04-12 02:00:11 +00:00
|
|
|
if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
|
|
|
|
info->key_mask))
|
2023-04-12 02:00:10 +00:00
|
|
|
return -ECANCELED;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* If a subsequent record does not overlap with the any record
|
|
|
|
* we've seen so far, there is a hole in the middle of the
|
|
|
|
* search range. Classify this as sparse and stop.
|
|
|
|
* If the keys overlap and this btree does not allow overlap,
|
|
|
|
* signal corruption.
|
|
|
|
*/
|
|
|
|
key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
|
2023-04-12 02:00:11 +00:00
|
|
|
&rec_key, info->key_mask);
|
2023-04-12 02:00:10 +00:00
|
|
|
if (key_contig == XBTREE_KEY_OVERLAP &&
|
2024-02-22 20:34:29 +00:00
|
|
|
!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
|
2023-04-12 02:00:10 +00:00
|
|
|
return -EFSCORRUPTED;
|
|
|
|
if (key_contig == XBTREE_KEY_GAP)
|
|
|
|
return -ECANCELED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If high_key(rec) is larger than any other high key we've seen,
|
|
|
|
* remember it for later.
|
|
|
|
*/
|
|
|
|
cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec);
|
2023-04-12 02:00:11 +00:00
|
|
|
if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key,
|
|
|
|
info->key_mask))
|
2023-04-12 02:00:10 +00:00
|
|
|
info->high_key = rec_high_key; /* struct copy */
|
|
|
|
|
|
|
|
return 0;
|
2018-01-17 02:52:12 +00:00
|
|
|
}
|
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
/*
|
|
|
|
* Scan part of the keyspace of a btree and tell us if that keyspace does not
|
|
|
|
* map to any records; is fully mapped to records; or is partially mapped to
|
|
|
|
* records. This is the btree record equivalent to determining if a file is
|
|
|
|
* sparse.
|
2023-04-12 02:00:11 +00:00
|
|
|
*
|
|
|
|
* For most btree types, the record scan should use all available btree key
|
|
|
|
* fields to compare the keys encountered. These callers should pass NULL for
|
|
|
|
* @mask. However, some callers (e.g. scanning physical space in the rmapbt)
|
|
|
|
* want to ignore some part of the btree record keyspace when performing the
|
|
|
|
* comparison. These callers should pass in a union xfs_btree_key object with
|
|
|
|
* the fields that *should* be a part of the comparison set to any nonzero
|
|
|
|
* value, and the rest zeroed.
|
2023-04-12 02:00:10 +00:00
|
|
|
*/
|
2018-01-17 02:52:12 +00:00
|
|
|
int
|
2023-04-12 02:00:10 +00:00
|
|
|
xfs_btree_has_records(
|
2021-08-12 17:10:44 +00:00
|
|
|
struct xfs_btree_cur *cur,
|
|
|
|
const union xfs_btree_irec *low,
|
|
|
|
const union xfs_btree_irec *high,
|
2023-04-12 02:00:11 +00:00
|
|
|
const union xfs_btree_key *mask,
|
2023-04-12 02:00:10 +00:00
|
|
|
enum xbtree_recpacking *outcome)
|
2018-01-17 02:52:12 +00:00
|
|
|
{
|
2023-04-12 02:00:10 +00:00
|
|
|
struct xfs_btree_has_records info = {
|
|
|
|
.outcome = XBTREE_RECPACKING_EMPTY,
|
2023-04-12 02:00:11 +00:00
|
|
|
.key_mask = mask,
|
2023-04-12 02:00:10 +00:00
|
|
|
};
|
2021-08-12 17:10:44 +00:00
|
|
|
int error;
|
2018-01-17 02:52:12 +00:00
|
|
|
|
2023-04-12 02:00:10 +00:00
|
|
|
/* Not all btrees support this operation. */
|
|
|
|
if (!cur->bc_ops->keys_contiguous) {
|
|
|
|
ASSERT(0);
|
|
|
|
return -EOPNOTSUPP;
|
2018-01-17 02:52:12 +00:00
|
|
|
}
|
2023-04-12 02:00:10 +00:00
|
|
|
|
|
|
|
xfs_btree_key_from_irec(cur, &info.start_key, low);
|
|
|
|
xfs_btree_key_from_irec(cur, &info.end_key, high);
|
|
|
|
|
|
|
|
error = xfs_btree_query_range(cur, low, high,
|
|
|
|
xfs_btree_has_records_helper, &info);
|
|
|
|
if (error == -ECANCELED)
|
|
|
|
goto out;
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
if (info.outcome == XBTREE_RECPACKING_EMPTY)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the largest high_key(rec) we saw during the walk is greater than
|
|
|
|
* the end of the search range, classify this as full. Otherwise,
|
|
|
|
* there is a hole at the end of the search range.
|
|
|
|
*/
|
2023-04-12 02:00:11 +00:00
|
|
|
if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key,
|
|
|
|
mask))
|
2023-04-12 02:00:10 +00:00
|
|
|
info.outcome = XBTREE_RECPACKING_FULL;
|
|
|
|
|
|
|
|
out:
|
|
|
|
*outcome = info.outcome;
|
|
|
|
return 0;
|
2018-01-17 02:52:12 +00:00
|
|
|
}
|
2018-05-09 17:02:03 +00:00
|
|
|
|
|
|
|
/* Are there more records in this btree? */
|
|
|
|
bool
|
|
|
|
xfs_btree_has_more_records(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
|
|
|
struct xfs_btree_block *block;
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
|
|
|
|
block = xfs_btree_get_block(cur, 0, &bp);
|
|
|
|
|
|
|
|
/* There are still records in this block. */
|
2021-09-16 19:24:04 +00:00
|
|
|
if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
|
2018-05-09 17:02:03 +00:00
|
|
|
return true;
|
|
|
|
|
|
|
|
/* There are more record blocks. */
|
2024-02-22 20:35:36 +00:00
|
|
|
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
|
2018-05-09 17:02:03 +00:00
|
|
|
return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
|
|
|
|
else
|
|
|
|
return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
|
|
|
|
}
|
2021-09-23 19:21:37 +00:00
|
|
|
|
|
|
|
/* Set up all the btree cursor caches. */
|
|
|
|
int __init
|
|
|
|
xfs_btree_init_cur_caches(void)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = xfs_allocbt_init_cur_cache();
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
error = xfs_inobt_init_cur_cache();
|
|
|
|
if (error)
|
|
|
|
goto err;
|
|
|
|
error = xfs_bmbt_init_cur_cache();
|
|
|
|
if (error)
|
|
|
|
goto err;
|
|
|
|
error = xfs_rmapbt_init_cur_cache();
|
|
|
|
if (error)
|
|
|
|
goto err;
|
|
|
|
error = xfs_refcountbt_init_cur_cache();
|
|
|
|
if (error)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
err:
|
|
|
|
xfs_btree_destroy_cur_caches();
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Destroy all the btree cursor caches, if they've been allocated.
 *
 * Calls the destroy function of every btree type, in the same order that
 * xfs_btree_init_cur_caches() sets them up.
 */
void
xfs_btree_destroy_cur_caches(void)
{
	/* Free space btree cursors. */
	xfs_allocbt_destroy_cur_cache();

	/* Inode btree cursors. */
	xfs_inobt_destroy_cur_cache();

	/* Block mapping btree cursors. */
	xfs_bmbt_destroy_cur_cache();

	/* Reverse mapping btree cursors. */
	xfs_rmapbt_destroy_cur_cache();

	/* Reference count btree cursors. */
	xfs_refcountbt_destroy_cur_cache();
}
|
2023-12-15 18:03:33 +00:00
|
|
|
|
|
|
|
/* Move the btree cursor before the first record. */
|
|
|
|
int
|
|
|
|
xfs_btree_goto_left_edge(
|
|
|
|
struct xfs_btree_cur *cur)
|
|
|
|
{
|
|
|
|
int stat = 0;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
|
|
|
|
error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
if (!stat)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error = xfs_btree_decrement(cur, 0, &stat);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
if (stat != 0) {
|
|
|
|
ASSERT(0);
|
2024-02-22 20:32:09 +00:00
|
|
|
xfs_btree_mark_sick(cur);
|
2023-12-15 18:03:33 +00:00
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|