bcachefs: Improved topology repair checks

Consolidate bch2_gc_check_topology() and btree_node_interior_verify(),
and replace them with an improved version,
bch2_btree_node_check_topology().

This checks that children of an interior node correctly span the full
range of the parent node with no overlaps.

Also, ensure that topology repairs at runtime are always a fatal error;
in particular, this adds a check in btree_iter_down() - if we don't find
a key while walking down the btree that's indicative of a topology error
and should be flagged as such, not a null ptr deref.

Some checks in btree_update_interior.c remaining BUG_ONS(), because we
already checked the node for topology errors when starting the update,
and the assertions indicate that we _just_ corrupted the btree node -
i.e. the problem can't be that existing on disk corruption, they
indicate an actual algorithmic bug.

In the future, we'll be annotating the fsck errors list with which
recovery pass corrects them; the open coded "run explicit recovery pass
or fatal error" in bch2_btree_node_check_topology() will in the future
be done for every fsck_err() call.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-03-23 19:29:19 -04:00
parent 40cb26233a
commit 79032b0781
8 changed files with 139 additions and 167 deletions

View File

@ -808,7 +808,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
prt_printf(&buf, "\nmax ");
bch2_bpos_to_text(&buf, b->data->max_key);
bch2_fs_inconsistent(c, "%s", buf.buf);
bch2_fs_topology_error(c, "%s", buf.buf);
printbuf_exit(&buf);
}

View File

@ -70,90 +70,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
__gc_pos_set(c, new_pos);
}
/*
* Missing: if an interior btree node is empty, we need to do something -
* perhaps just kill it
*/
static int bch2_gc_check_topology(struct bch_fs *c,
struct btree *b,
struct bkey_buf *prev,
struct bkey_buf cur,
bool is_last)
{
struct bpos node_start = b->data->min_key;
struct bpos node_end = b->data->max_key;
struct bpos expected_start = bkey_deleted(&prev->k->k)
? node_start
: bpos_successor(prev->k->k.p);
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
int ret = 0;
if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
if (!bpos_eq(expected_start, bp->v.min_key)) {
bch2_topology_error(c);
if (bkey_deleted(&prev->k->k)) {
prt_printf(&buf1, "start of node: ");
bch2_bpos_to_text(&buf1, node_start);
} else {
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
}
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
FSCK_NO_RATELIMIT,
btree_node_topology_bad_min_key,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" cur %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
}
if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
bch2_topology_error(c);
printbuf_reset(&buf1);
printbuf_reset(&buf2);
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
bch2_bpos_to_text(&buf2, node_end);
if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf) &&
should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
bch2_bkey_buf_copy(prev, c, cur.k);
err:
fsck_err:
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;
}
static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
{
switch (b->key.k.type) {
@ -445,6 +361,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
prev = NULL;
if (ret == DROP_PREV_NODE) {
bch_info(c, "dropped prev node");
bch2_btree_node_evict(trans, prev_k.k);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, prev_k.k->k.p);
@ -841,42 +758,30 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
{
struct bch_fs *c = trans->c;
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
struct bkey_buf prev, cur;
int ret = 0;
ret = bch2_btree_node_check_topology(trans, b);
if (ret)
return ret;
if (!btree_node_type_needs_gc(btree_node_type(b)))
return 0;
bch2_btree_node_iter_init_from_start(&iter, b);
bch2_bkey_buf_init(&prev);
bch2_bkey_buf_init(&cur);
bkey_init(&prev.k->k);
while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
&k, initial);
if (ret)
break;
return ret;
bch2_btree_node_iter_advance(&iter, b);
if (b->c.level) {
bch2_bkey_buf_reassemble(&cur, c, k);
ret = bch2_gc_check_topology(c, b, &prev, cur,
bch2_btree_node_iter_end(&iter));
if (ret)
break;
}
}
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
return ret;
return 0;
}
static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
@ -925,14 +830,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
struct bch_fs *c = trans->c;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
struct bkey_buf cur;
struct printbuf buf = PRINTBUF;
int ret = 0;
ret = bch2_btree_node_check_topology(trans, b);
if (ret)
return ret;
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
bch2_bkey_buf_init(&prev);
bch2_bkey_buf_init(&cur);
bkey_init(&prev.k->k);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@ -943,20 +850,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
if (ret)
goto fsck_err;
if (b->c.level) {
bch2_bkey_buf_reassemble(&cur, c, k);
k = bkey_i_to_s_c(cur.k);
bch2_btree_and_journal_iter_advance(&iter);
ret = bch2_gc_check_topology(c, b,
&prev, cur,
!bch2_btree_and_journal_iter_peek(&iter).k);
if (ret)
goto fsck_err;
} else {
bch2_btree_and_journal_iter_advance(&iter);
}
bch2_btree_and_journal_iter_advance(&iter);
}
if (b->c.level > target_depth) {
@ -1015,7 +909,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
}
fsck_err:
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
bch2_btree_and_journal_iter_exit(&iter);
printbuf_exit(&buf);
return ret;

View File

@ -1657,7 +1657,7 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
bch_err(c, "%s", buf.buf);
bch_err_ratelimited(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)

View File

@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
if (ret)
goto err;
} else {
bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
if (!k) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "node not found at pos ");
bch2_bpos_to_text(&buf, path->pos);
prt_str(&buf, " within parent node ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
bch2_fs_fatal_error(c, "%s", buf.buf);
printbuf_exit(&buf);
ret = -BCH_ERR_btree_need_topology_repair;
goto err;
}
bch2_bkey_buf_unpack(&tmp, c, l->b, k);
if ((flags & BTREE_ITER_PREFETCH) &&
c->opts.btree_node_prefetch) {
@ -962,7 +976,6 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
return ret;
}
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
@ -18,6 +19,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"
@ -44,56 +46,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
return path_idx;
}
/* Debug code: */
/*
* Verify that child nodes correctly span parent node's range:
*/
static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
struct bpos next_node = b->data->min_key;
struct btree_node_iter iter;
struct bch_fs *c = trans->c;
struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
: b->data->min_key;
struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bkey_s_c_btree_ptr_v2 bp;
struct bkey unpacked;
struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
struct printbuf buf = PRINTBUF;
struct bkey_buf prev;
int ret = 0;
BUG_ON(!b->c.level);
BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
!bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
b->data->min_key));
if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
return;
if (!b->c.level)
return 0;
bch2_btree_node_iter_init_from_start(&iter, b);
bch2_bkey_buf_init(&prev);
bkey_init(&prev.k->k);
bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
while (1) {
k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
if (k.k->type != KEY_TYPE_btree_ptr_v2)
break;
bp = bkey_s_c_to_btree_ptr_v2(k);
goto out;
if (!bpos_eq(next_node, bp.v->min_key)) {
bch2_dump_btree_node(c, b);
bch2_bpos_to_text(&buf1, next_node);
bch2_bpos_to_text(&buf2, bp.v->min_key);
panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
struct bpos expected_min = bkey_deleted(&prev.k->k)
? node_min
: bpos_successor(prev.k->k.p);
if (!bpos_eq(expected_min, bp.v->min_key)) {
bch2_topology_error(c);
printbuf_reset(&buf);
prt_str(&buf, "end of prev node doesn't match start of next node\n"),
prt_printf(&buf, " in btree %s level %u node ",
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, "\n prev ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
prt_str(&buf, "\n next ");
bch2_bkey_val_to_text(&buf, c, k);
need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
goto topology_repair;
}
bch2_btree_node_iter_advance(&iter, b);
if (bch2_btree_node_iter_end(&iter)) {
if (!bpos_eq(k.k->p, b->key.k.p)) {
bch2_dump_btree_node(c, b);
bch2_bpos_to_text(&buf1, b->key.k.p);
bch2_bpos_to_text(&buf2, k.k->p);
panic("expected end %s got %s\n", buf1.buf, buf2.buf);
}
break;
}
next_node = bpos_successor(k.k->p);
bch2_bkey_buf_reassemble(&prev, c, k);
bch2_btree_and_journal_iter_advance(&iter);
}
#endif
if (bkey_deleted(&prev.k->k)) {
bch2_topology_error(c);
printbuf_reset(&buf);
prt_str(&buf, "empty interior node\n");
prt_printf(&buf, " in btree %s level %u node ",
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
goto topology_repair;
} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
bch2_topology_error(c);
printbuf_reset(&buf);
prt_str(&buf, "last child node doesn't end at end of parent node\n");
prt_printf(&buf, " in btree %s level %u node ",
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_str(&buf, "\n last key ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
goto topology_repair;
}
out:
fsck_err:
bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev, c);
printbuf_exit(&buf);
return ret;
topology_repair:
if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
bch2_inconsistent_error(c);
ret = -BCH_ERR_btree_need_topology_repair;
} else {
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
}
goto out;
}
/* Calculate ideal packed bkey format for new btree nodes: */
@ -1448,8 +1497,7 @@ static void __btree_split_node(struct btree_update *as,
bch2_verify_btree_nr_keys(n[i]);
if (b->c.level)
btree_node_interior_verify(as->c, n[i]);
BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
}
}
@ -1480,7 +1528,7 @@ static void btree_split_insert_keys(struct btree_update *as,
__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
btree_node_interior_verify(as->c, b);
BUG_ON(bch2_btree_node_check_topology(trans, b));
}
}
@ -1498,6 +1546,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
BUG_ON(!parent && (b != btree_node_root(c, b)));
BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
ret = bch2_btree_node_check_topology(trans, b);
if (ret)
return ret;
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
@ -1717,7 +1769,11 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
goto split;
}
btree_node_interior_verify(c, b);
ret = bch2_btree_node_check_topology(trans, b);
if (ret) {
bch2_btree_node_unlock_write(trans, path, b);
return ret;
}
bch2_btree_insert_keys_interior(as, trans, path, b, keys);
@ -1735,7 +1791,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_unlock_write(trans, path, b);
btree_node_interior_verify(c, b);
BUG_ON(bch2_btree_node_check_topology(trans, b));
return 0;
split:
/*

View File

@ -10,6 +10,8 @@
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:

View File

@ -32,6 +32,12 @@ bool bch2_inconsistent_error(struct bch_fs *);
int bch2_topology_error(struct bch_fs *);
#define bch2_fs_topology_error(c, ...) \
({ \
bch_err(c, "btree topology error: " __VA_ARGS__); \
bch2_topology_error(c); \
})
#define bch2_fs_inconsistent(c, ...) \
({ \
bch_err(c, __VA_ARGS__); \

View File

@ -265,7 +265,8 @@
x(subvol_children_bad, 257) \
x(subvol_loop, 258) \
x(subvol_unreachable, 259) \
x(btree_node_bkey_bad_u64s, 260)
x(btree_node_bkey_bad_u64s, 260) \
x(btree_node_topology_empty_interior_node, 261)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,