mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-01 10:42:11 +00:00
bcachefs: Implement bch2_btree_iter_prev_min()
A user contributed a filessytem dump, where the dump was actually corrupted (due to being taken while the filesystem was online), but which exposed an interesting bug in fsck - reconstruct_inode(). When itearting in BTREE_ITER_filter_snapshots mode, it's required to give an end position for the iteration and it can't span inode numbers; continuing into the next inode might mean we start seeing keys from a different snapshot tree, that the is_ancestor() checks always filter, thus we're never able to return a key and stop iterating. Backwards iteration never implemented the end position because nothing else needed it - except for reconstuct_inode(). Additionally, backwards iteration is now able to overlay keys from the journal, which will be useful if we ever decide to start doing journal replay in the background. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
a7df326af0
commit
632bcf3865
@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
|
||||
BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) &&
|
||||
iter->pos.snapshot != iter->snapshot);
|
||||
|
||||
BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
|
||||
bkey_gt(iter->pos, iter->k.p));
|
||||
BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) :
|
||||
!(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) :
|
||||
(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) ||
|
||||
bkey_gt(iter->pos, iter->k.p)));
|
||||
}
|
||||
|
||||
static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
|
||||
@ -2152,6 +2154,37 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
|
||||
return k;
|
||||
}
|
||||
|
||||
static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bpos end_pos)
|
||||
{
|
||||
struct btree_path *path = btree_iter_path(trans, iter);
|
||||
|
||||
return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
|
||||
path->level,
|
||||
path->pos,
|
||||
end_pos,
|
||||
&iter->journal_idx);
|
||||
}
|
||||
|
||||
static noinline
|
||||
struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans,
|
||||
struct btree_iter *iter,
|
||||
struct bkey_s_c k)
|
||||
{
|
||||
struct btree_path *path = btree_iter_path(trans, iter);
|
||||
struct bkey_i *next_journal =
|
||||
bch2_btree_journal_peek_prev(trans, iter,
|
||||
k.k ? k.k->p : path_l(path)->b->key.k.p);
|
||||
|
||||
if (next_journal) {
|
||||
iter->k = next_journal->k;
|
||||
k = bkey_i_to_s_c(next_journal);
|
||||
}
|
||||
|
||||
return k;
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks btree key cache for key at iter->pos and returns it if present, or
|
||||
* bkey_s_c_null:
|
||||
@ -2457,111 +2490,66 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
|
||||
return bch2_btree_iter_peek(iter);
|
||||
}
|
||||
|
||||
/**
|
||||
* bch2_btree_iter_peek_prev() - returns first key less than or equal to
|
||||
* iterator's current position
|
||||
* @iter: iterator to peek from
|
||||
*
|
||||
* Returns: key if found, or an error extractable with bkey_err().
|
||||
*/
|
||||
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
|
||||
static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
|
||||
{
|
||||
struct btree_trans *trans = iter->trans;
|
||||
struct bpos search_key = iter->pos;
|
||||
struct bkey_s_c k;
|
||||
struct bkey saved_k;
|
||||
const struct bch_val *saved_v;
|
||||
btree_path_idx_t saved_path = 0;
|
||||
int ret;
|
||||
|
||||
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
||||
EBUG_ON(btree_iter_path(trans, iter)->cached ||
|
||||
btree_iter_path(trans, iter)->level);
|
||||
|
||||
if (iter->flags & BTREE_ITER_with_journal)
|
||||
return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
|
||||
struct bkey_s_c k, k2;
|
||||
|
||||
bch2_btree_iter_verify(iter);
|
||||
bch2_btree_iter_verify_entry_exit(iter);
|
||||
|
||||
if (iter->flags & BTREE_ITER_filter_snapshots)
|
||||
search_key.snapshot = U32_MAX;
|
||||
|
||||
while (1) {
|
||||
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
|
||||
iter->flags & BTREE_ITER_intent,
|
||||
btree_iter_ip_allocated(iter));
|
||||
iter->flags & BTREE_ITER_intent,
|
||||
btree_iter_ip_allocated(iter));
|
||||
|
||||
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
||||
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
|
||||
if (unlikely(ret)) {
|
||||
/* ensure that iter->k is consistent with iter->pos: */
|
||||
bch2_btree_iter_set_pos(iter, iter->pos);
|
||||
k = bkey_s_c_err(ret);
|
||||
goto out_no_locked;
|
||||
break;
|
||||
}
|
||||
|
||||
struct btree_path *path = btree_iter_path(trans, iter);
|
||||
struct btree_path_level *l = path_l(path);
|
||||
|
||||
k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
|
||||
if (!k.k ||
|
||||
((iter->flags & BTREE_ITER_is_extents)
|
||||
? bpos_ge(bkey_start_pos(k.k), search_key)
|
||||
: bpos_gt(k.k->p, search_key)))
|
||||
k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
|
||||
if (unlikely(!l->b)) {
|
||||
/* No btree nodes at requested level: */
|
||||
bch2_btree_iter_set_pos(iter, SPOS_MAX);
|
||||
k = bkey_s_c_null;
|
||||
break;
|
||||
}
|
||||
|
||||
btree_path_set_should_be_locked(trans, path);
|
||||
|
||||
k = btree_path_level_peek_all(trans->c, l, &iter->k);
|
||||
if (!k.k || bpos_gt(k.k->p, search_key)) {
|
||||
k = btree_path_level_prev(trans, path, l, &iter->k);
|
||||
|
||||
BUG_ON(k.k && bpos_gt(k.k->p, search_key));
|
||||
}
|
||||
|
||||
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
|
||||
k.k &&
|
||||
(k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
|
||||
k = k2;
|
||||
if (bkey_err(k2)) {
|
||||
bch2_btree_iter_set_pos(iter, iter->pos);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(iter->flags & BTREE_ITER_with_journal))
|
||||
k = btree_trans_peek_prev_journal(trans, iter, k);
|
||||
|
||||
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
|
||||
trans->nr_updates))
|
||||
bch2_btree_trans_peek_prev_updates(trans, iter, &k);
|
||||
|
||||
if (likely(k.k)) {
|
||||
if (iter->flags & BTREE_ITER_filter_snapshots) {
|
||||
if (k.k->p.snapshot == iter->snapshot)
|
||||
goto got_key;
|
||||
|
||||
/*
|
||||
* If we have a saved candidate, and we're no
|
||||
* longer at the same _key_ (not pos), return
|
||||
* that candidate
|
||||
*/
|
||||
if (saved_path && !bkey_eq(k.k->p, saved_k.p)) {
|
||||
bch2_path_put_nokeep(trans, iter->path,
|
||||
iter->flags & BTREE_ITER_intent);
|
||||
iter->path = saved_path;
|
||||
saved_path = 0;
|
||||
iter->k = saved_k;
|
||||
k.v = saved_v;
|
||||
goto got_key;
|
||||
}
|
||||
|
||||
if (bch2_snapshot_is_ancestor(trans->c,
|
||||
iter->snapshot,
|
||||
k.k->p.snapshot)) {
|
||||
if (saved_path)
|
||||
bch2_path_put_nokeep(trans, saved_path,
|
||||
iter->flags & BTREE_ITER_intent);
|
||||
saved_path = btree_path_clone(trans, iter->path,
|
||||
iter->flags & BTREE_ITER_intent,
|
||||
_THIS_IP_);
|
||||
path = btree_iter_path(trans, iter);
|
||||
trace_btree_path_save_pos(trans, path, trans->paths + saved_path);
|
||||
saved_k = *k.k;
|
||||
saved_v = k.v;
|
||||
}
|
||||
|
||||
search_key = bpos_predecessor(k.k->p);
|
||||
continue;
|
||||
}
|
||||
got_key:
|
||||
if (bkey_whiteout(k.k) &&
|
||||
!(iter->flags & BTREE_ITER_all_snapshots)) {
|
||||
search_key = bkey_predecessor(iter, k.k->p);
|
||||
if (iter->flags & BTREE_ITER_filter_snapshots)
|
||||
search_key.snapshot = U32_MAX;
|
||||
continue;
|
||||
}
|
||||
|
||||
btree_path_set_should_be_locked(trans, path);
|
||||
if (likely(k.k && !bkey_deleted(k.k))) {
|
||||
break;
|
||||
} else if (k.k) {
|
||||
search_key = bpos_predecessor(k.k->p);
|
||||
} else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
|
||||
/* Advance to previous leaf node: */
|
||||
search_key = bpos_predecessor(path->l[0].b->data->min_key);
|
||||
@ -2569,15 +2557,120 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
|
||||
/* Start of btree: */
|
||||
bch2_btree_iter_set_pos(iter, POS_MIN);
|
||||
k = bkey_s_c_null;
|
||||
goto out_no_locked;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos));
|
||||
bch2_btree_iter_verify(iter);
|
||||
return k;
|
||||
}
|
||||
|
||||
/**
|
||||
* bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
|
||||
* iterator's current position
|
||||
* @iter: iterator to peek from
|
||||
* @end: search limit: returns keys greater than or equal to @end
|
||||
*
|
||||
* Returns: key if found, or an error extractable with bkey_err().
|
||||
*/
|
||||
struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
|
||||
{
|
||||
struct btree_trans *trans = iter->trans;
|
||||
struct bpos search_key = iter->pos;
|
||||
struct bkey_s_c k;
|
||||
btree_path_idx_t saved_path = 0;
|
||||
|
||||
bch2_trans_verify_not_unlocked_or_in_restart(trans);
|
||||
bch2_btree_iter_verify_entry_exit(iter);
|
||||
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
|
||||
|
||||
if (iter->flags & BTREE_ITER_filter_snapshots)
|
||||
search_key.snapshot = U32_MAX;
|
||||
|
||||
while (1) {
|
||||
k = __bch2_btree_iter_peek_prev(iter, search_key);
|
||||
if (unlikely(!k.k))
|
||||
goto end;
|
||||
if (unlikely(bkey_err(k)))
|
||||
goto out_no_locked;
|
||||
|
||||
if (iter->flags & BTREE_ITER_filter_snapshots) {
|
||||
struct btree_path *s = saved_path ? trans->paths + saved_path : NULL;
|
||||
if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) {
|
||||
/*
|
||||
* If we have a saved candidate, and we're past
|
||||
* the last possible snapshot overwrite, return
|
||||
* it:
|
||||
*/
|
||||
bch2_path_put_nokeep(trans, iter->path,
|
||||
iter->flags & BTREE_ITER_intent);
|
||||
iter->path = saved_path;
|
||||
saved_path = 0;
|
||||
k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to check against @end before FILTER_SNAPSHOTS because
|
||||
* if we get to a different inode that requested we might be
|
||||
* seeing keys for a different snapshot tree that will all be
|
||||
* filtered out.
|
||||
*/
|
||||
if (unlikely(bkey_lt(k.k->p, end)))
|
||||
goto end;
|
||||
|
||||
if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) {
|
||||
search_key = bpos_predecessor(k.k->p);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (k.k->p.snapshot != iter->snapshot) {
|
||||
/*
|
||||
* Have a key visible in iter->snapshot, but
|
||||
* might have overwrites: - save it and keep
|
||||
* searching. Unless it's a whiteout - then drop
|
||||
* our previous saved candidate:
|
||||
*/
|
||||
if (saved_path) {
|
||||
bch2_path_put_nokeep(trans, saved_path,
|
||||
iter->flags & BTREE_ITER_intent);
|
||||
saved_path = 0;
|
||||
}
|
||||
|
||||
if (!bkey_whiteout(k.k)) {
|
||||
saved_path = btree_path_clone(trans, iter->path,
|
||||
iter->flags & BTREE_ITER_intent,
|
||||
_THIS_IP_);
|
||||
trace_btree_path_save_pos(trans,
|
||||
trans->paths + iter->path,
|
||||
trans->paths + saved_path);
|
||||
}
|
||||
|
||||
search_key = bpos_predecessor(k.k->p);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (bkey_whiteout(k.k)) {
|
||||
search_key = bkey_predecessor(iter, k.k->p);
|
||||
search_key.snapshot = U32_MAX;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) :
|
||||
iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) :
|
||||
bkey_gt(k.k->p, iter->pos));
|
||||
|
||||
if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) :
|
||||
iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) :
|
||||
bkey_lt(k.k->p, end)))
|
||||
goto end;
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
/* Extents can straddle iter->pos: */
|
||||
if (bkey_lt(k.k->p, iter->pos))
|
||||
iter->pos = k.k->p;
|
||||
iter->pos = bpos_min(iter->pos, k.k->p);;
|
||||
|
||||
if (iter->flags & BTREE_ITER_filter_snapshots)
|
||||
iter->pos.snapshot = iter->snapshot;
|
||||
@ -2587,8 +2680,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
|
||||
|
||||
bch2_btree_iter_verify_entry_exit(iter);
|
||||
bch2_btree_iter_verify(iter);
|
||||
|
||||
return k;
|
||||
end:
|
||||
bch2_btree_iter_set_pos(iter, end);
|
||||
k = bkey_s_c_null;
|
||||
goto out_no_locked;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -389,7 +389,13 @@ static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
|
||||
return bch2_btree_iter_peek_max(iter, SPOS_MAX);
|
||||
}
|
||||
|
||||
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
|
||||
struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
|
||||
|
||||
static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
|
||||
{
|
||||
return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
|
||||
}
|
||||
|
||||
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
|
||||
|
||||
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
|
||||
|
@ -107,6 +107,52 @@ struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id,
|
||||
unsigned level, struct bpos pos,
|
||||
struct bpos end_pos, size_t *idx)
|
||||
{
|
||||
struct journal_keys *keys = &c->journal_keys;
|
||||
unsigned iters = 0;
|
||||
struct journal_key *k;
|
||||
|
||||
BUG_ON(*idx > keys->nr);
|
||||
search:
|
||||
if (!*idx)
|
||||
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
|
||||
|
||||
while (*idx &&
|
||||
__journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
|
||||
(*idx)++;
|
||||
iters++;
|
||||
if (iters == 10) {
|
||||
*idx = 0;
|
||||
goto search;
|
||||
}
|
||||
}
|
||||
|
||||
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
|
||||
if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
|
||||
return NULL;
|
||||
|
||||
if (k->overwritten) {
|
||||
--(*idx);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (__journal_key_cmp(btree_id, level, pos, k) >= 0)
|
||||
return k->k;
|
||||
|
||||
--(*idx);
|
||||
iters++;
|
||||
if (iters == 10) {
|
||||
*idx = 0;
|
||||
goto search;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
|
||||
unsigned level, struct bpos pos)
|
||||
{
|
||||
|
@ -45,6 +45,8 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour
|
||||
|
||||
struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bpos, struct bpos, size_t *);
|
||||
struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bpos, struct bpos, size_t *);
|
||||
struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
|
||||
unsigned, struct bpos);
|
||||
|
||||
|
@ -193,7 +193,6 @@
|
||||
x(EINVAL, opt_parse_error) \
|
||||
x(EINVAL, remove_with_metadata_missing_unimplemented)\
|
||||
x(EINVAL, remove_would_lose_data) \
|
||||
x(EINVAL, btree_iter_with_journal_not_supported) \
|
||||
x(EROFS, erofs_trans_commit) \
|
||||
x(EROFS, erofs_no_writes) \
|
||||
x(EROFS, erofs_journal_err) \
|
||||
|
@ -620,7 +620,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
|
||||
struct btree_iter iter = {};
|
||||
|
||||
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
|
||||
struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
|
||||
bch2_trans_iter_exit(trans, &iter);
|
||||
int ret = bkey_err(k);
|
||||
if (ret)
|
||||
@ -1649,7 +1649,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
|
||||
if (i->count != count2) {
|
||||
bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
|
||||
w->last_pos.inode, i->snapshot, i->count, count2);
|
||||
return -BCH_ERR_internal_fsck_err;
|
||||
i->count = count2;
|
||||
}
|
||||
|
||||
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
|
||||
|
@ -426,7 +426,7 @@ case LOGGED_OP_FINSERT_shift_extents:
|
||||
bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
|
||||
|
||||
k = insert
|
||||
? bch2_btree_iter_peek_prev(&iter)
|
||||
? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
|
||||
: bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
|
||||
if ((ret = bkey_err(k)))
|
||||
goto btree_err;
|
||||
|
Loading…
Reference in New Issue
Block a user