bcachefs: Implement bch2_btree_iter_prev_min()

A user contributed a filesystem dump, where the dump was actually
corrupted (due to being taken while the filesystem was online), but
which exposed an interesting bug in fsck - reconstruct_inode().

When iterating in BTREE_ITER_filter_snapshots mode, it's required to
give an end position for the iteration and it can't span inode numbers;
continuing into the next inode might mean we start seeing keys from a
different snapshot tree, which the is_ancestor() checks always filter out;
thus we're never able to return a key and stop iterating.

Backwards iteration never implemented the end position because nothing
else needed it - except for reconstruct_inode().

Additionally, backwards iteration is now able to overlay keys from the
journal, which will be useful if we ever decide to start doing journal
replay in the background.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2024-10-24 22:12:37 -04:00
parent a7df326af0
commit 632bcf3865
7 changed files with 244 additions and 95 deletions

View File

@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
iter->pos.snapshot != iter->snapshot);
BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
bkey_gt(iter->pos, iter->k.p));
BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
!(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
bkey_gt(iter->pos, iter->k.p)));
}
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
@ -2152,6 +2154,37 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
return k;
}
/*
 * Look up, in the journal keys overlay, the last key at or before the
 * iterator's current btree path position, bounded below by @end_pos.
 * Returns NULL if no such journal key exists.
 */
static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
						   struct btree_iter *iter,
						   struct bpos end_pos)
{
	struct btree_path *p = btree_iter_path(trans, iter);

	return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
					       p->level, p->pos, end_pos,
					       &iter->journal_idx);
}
static noinline
struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek_prev(trans, iter,
k.k ? k.k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
k = bkey_i_to_s_c(next_journal);
}
return k;
}
/*
* Checks btree key cache for key at iter->pos and returns it if present, or
* bkey_s_c_null:
@ -2457,111 +2490,66 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
return bch2_btree_iter_peek(iter);
}
/**
* bch2_btree_iter_peek_prev() - returns first key less than or equal to
* iterator's current position
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
struct bkey_s_c k;
struct bkey saved_k;
const struct bch_val *saved_v;
btree_path_idx_t saved_path = 0;
int ret;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
EBUG_ON(btree_iter_path(trans, iter)->cached ||
btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_with_journal)
return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
struct bkey_s_c k, k2;
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
if (iter->flags & BTREE_ITER_filter_snapshots)
search_key.snapshot = U32_MAX;
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
bch2_btree_iter_set_pos(iter, iter->pos);
k = bkey_s_c_err(ret);
goto out_no_locked;
break;
}
struct btree_path *path = btree_iter_path(trans, iter);
struct btree_path_level *l = path_l(path);
k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_is_extents)
? bpos_ge(bkey_start_pos(k.k), search_key)
: bpos_gt(k.k->p, search_key)))
k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
bch2_btree_iter_set_pos(iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
btree_path_set_should_be_locked(trans, path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
if (!k.k || bpos_gt(k.k->p, search_key)) {
k = btree_path_level_prev(trans, path, l, &iter->k);
BUG_ON(k.k && bpos_gt(k.k->p, search_key));
}
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
k = k2;
if (bkey_err(k2)) {
bch2_btree_iter_set_pos(iter, iter->pos);
break;
}
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
k = btree_trans_peek_prev_journal(trans, iter, k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
bch2_btree_trans_peek_prev_updates(trans, iter, &k);
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_filter_snapshots) {
if (k.k->p.snapshot == iter->snapshot)
goto got_key;
/*
* If we have a saved candidate, and we're no
* longer at the same _key_ (not pos), return
* that candidate
*/
if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
bch2_path_put_nokeep(trans, iter->path,
iter->flags & BTREE_ITER_intent);
iter->path = saved_path;
saved_path = 0;
iter->k = saved_k;
k.v = saved_v;
goto got_key;
}
if (bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot)) {
if (saved_path)
bch2_path_put_nokeep(trans, saved_path,
iter->flags & BTREE_ITER_intent);
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
path = btree_iter_path(trans, iter);
trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
saved_k = *k.k;
saved_v = k.v;
}
search_key = bpos_predecessor(k.k->p);
continue;
}
got_key:
if (bkey_whiteout(k.k) &&
!(iter->flags & BTREE_ITER_all_snapshots)) {
search_key = bkey_predecessor(iter, k.k->p);
if (iter->flags & BTREE_ITER_filter_snapshots)
search_key.snapshot = U32_MAX;
continue;
}
btree_path_set_should_be_locked(trans, path);
if (likely(k.k && !bkey_deleted(k.k))) {
break;
} else if (k.k) {
search_key = bpos_predecessor(k.k->p);
} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
search_key = bpos_predecessor(path->l[0].b->data->min_key);
@ -2569,15 +2557,120 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
/* Start of btree: */
bch2_btree_iter_set_pos(iter, POS_MIN);
k = bkey_s_c_null;
goto out_no_locked;
break;
}
}
EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
bch2_btree_iter_verify(iter);
return k;
}
/**
* bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
* iterator's current position
* @iter: iterator to peek from
* @end: search limit: returns keys greater than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
struct bkey_s_c k;
btree_path_idx_t saved_path = 0;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
if (iter->flags & BTREE_ITER_filter_snapshots)
search_key.snapshot = U32_MAX;
while (1) {
k = __bch2_btree_iter_peek_prev(iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
goto out_no_locked;
if (iter->flags & BTREE_ITER_filter_snapshots) {
struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
/*
* If we have a saved candidate, and we're past
* the last possible snapshot overwrite, return
* it:
*/
bch2_path_put_nokeep(trans, iter->path,
iter->flags & BTREE_ITER_intent);
iter->path = saved_path;
saved_path = 0;
k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
break;
}
/*
* We need to check against @end before FILTER_SNAPSHOTS because
* if we get to a different inode that requested we might be
* seeing keys for a different snapshot tree that will all be
* filtered out.
*/
if (unlikely(bkey_lt(k.k->p, end)))
goto end;
if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
search_key = bpos_predecessor(k.k->p);
continue;
}
if (k.k->p.snapshot != iter->snapshot) {
/*
* Have a key visible in iter->snapshot, but
* might have overwrites: - save it and keep
* searching. Unless it's a whiteout - then drop
* our previous saved candidate:
*/
if (saved_path) {
bch2_path_put_nokeep(trans, saved_path,
iter->flags & BTREE_ITER_intent);
saved_path = 0;
}
if (!bkey_whiteout(k.k)) {
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_intent,
_THIS_IP_);
trace_btree_path_save_pos(trans,
trans->paths + iter->path,
trans->paths + saved_path);
}
search_key = bpos_predecessor(k.k->p);
continue;
}
if (bkey_whiteout(k.k)) {
search_key = bkey_predecessor(iter, k.k->p);
search_key.snapshot = U32_MAX;
continue;
}
}
EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
bkey_gt(k.k->p, iter->pos));
if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
bkey_lt(k.k->p, end)))
goto end;
break;
}
/* Extents can straddle iter->pos: */
if (bkey_lt(k.k->p, iter->pos))
iter->pos = k.k->p;
iter->pos = bpos_min(iter->pos, k.k->p);;
if (iter->flags & BTREE_ITER_filter_snapshots)
iter->pos.snapshot = iter->snapshot;
@ -2587,8 +2680,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
return k;
end:
bch2_btree_iter_set_pos(iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
/**

View File

@ -389,7 +389,13 @@ static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
return bch2_btree_iter_peek_max(iter, SPOS_MAX);
}
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
/*
 * Backwards iteration with no lower bound: equivalent to
 * bch2_btree_iter_peek_prev_min() with an end position of POS_MIN.
 */
static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
	return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
}
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);

View File

@ -107,6 +107,52 @@ struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_
return NULL;
}
/*
 * Search backwards through the journal keys for the last key at or before
 * @pos, bounded below by @end_pos, skipping keys marked overwritten.
 *
 * @idx is a cached starting index into the journal keys array; 0 means
 * "not cached, redo the binary search".  It is updated as we go so repeated
 * calls resume near the previous position.
 *
 * NOTE(review): the iters == 10 restart presumably guards against a stale
 * cached *idx while the journal keys array is being modified concurrently
 * (e.g. by journal replay) - confirm against the array's locking rules.
 */
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
					       unsigned level, struct bpos pos,
					       struct bpos end_pos, size_t *idx)
{
	struct journal_keys *keys = &c->journal_keys;
	unsigned iters = 0;
	struct journal_key *k;

	BUG_ON(*idx > keys->nr);
search:
	/* No cached position: binary search for the starting index */
	if (!*idx)
		*idx = __bch2_journal_key_search(keys, btree_id, level, pos);

	/*
	 * NOTE(review): this loop advances *idx forwards while testing the key
	 * at *idx - 1 against end_pos - verify the direction is intended for a
	 * backwards scan; it reads as if it should be walking *idx back (or
	 * forward past keys <= pos, as the forward variant does).
	 */
	while (*idx &&
	       __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
		(*idx)++;
		iters++;
		if (iters == 10) {
			/* possible livelock against concurrent modification: restart */
			*idx = 0;
			goto search;
		}
	}

	/* Walk backwards until we find a usable key or cross end_pos */
	while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
		/* Past the lower search bound: nothing to return */
		if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
			return NULL;

		/* Key was overwritten by a later journal entry: skip it */
		if (k->overwritten) {
			--(*idx);
			continue;
		}

		/* First key at or before @pos: found it */
		if (__journal_key_cmp(btree_id, level, pos, k) >= 0)
			return k->k;
		--(*idx);
		iters++;
		if (iters == 10) {
			/* possible livelock against concurrent modification: restart */
			*idx = 0;
			goto search;
		}
	}

	return NULL;
}
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos)
{

View File

@ -45,6 +45,8 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour
struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
unsigned, struct bpos, struct bpos, size_t *);
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
unsigned, struct bpos);

View File

@ -193,7 +193,6 @@
x(EINVAL, opt_parse_error) \
x(EINVAL, remove_with_metadata_missing_unimplemented)\
x(EINVAL, remove_would_lose_data) \
x(EINVAL, btree_iter_with_journal_not_supported) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \

View File

@ -620,7 +620,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
struct btree_iter iter = {};
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
bch2_trans_iter_exit(trans, &iter);
int ret = bkey_err(k);
if (ret)
@ -1649,7 +1649,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
w->last_pos.inode, i->snapshot, i->count, count2);
return -BCH_ERR_internal_fsck_err;
i->count = count2;
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),

View File

@ -426,7 +426,7 @@ case LOGGED_OP_FINSERT_shift_extents:
bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
k = insert
? bch2_btree_iter_peek_prev(&iter)
? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
: bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
if ((ret = bkey_err(k)))
goto btree_err;