bcachefs: Extents may now cross btree node boundaries

When snapshots arrive, we won't necessarily be able to arbitrarily split
existing extents - when we need to split an existing extent, we'll have
to check if the extent was overwritten in child snapshots and if so emit
a whiteout for the split in the child snapshot.

Because extents couldn't span btree nodes previously, journal replay
would sometimes have to split existing extents. That's no good anymore,
but fortunately since extent handling has already been lifted above most
of the btree code there's no real need for that rule anymore.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2021-02-10 16:13:57 -05:00 committed by Kent Overstreet
parent 7e1a3aa9df
commit 8042b5b715
7 changed files with 87 additions and 237 deletions

View File

@ -1346,13 +1346,19 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
x(reflink_inline_data, 14) \
x(new_varint, 15) \
x(journal_no_flush, 16) \
x(alloc_v2, 17)
x(alloc_v2, 17) \
x(extents_across_btree_nodes, 18)
#define BCH_SB_FEATURES_ALWAYS \
((1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_btree_updates_journalled)|\
(1ULL << BCH_FEATURE_extents_across_btree_nodes))
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(BCH_SB_FEATURES_ALWAYS| \
(1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush)| \
(1ULL << BCH_FEATURE_alloc_v2))

View File

@ -1814,11 +1814,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter;
struct bkey_s_c k;
struct bkey n;
int ret;
struct bpos pos, next_start;
/* keys & holes can't span inode numbers: */
if (iter->pos.offset == KEY_OFFSET_MAX) {
@ -1826,50 +1823,31 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
return bkey_s_c_null;
bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
}
/*
* iterator is now at the correct position for inserting at iter->pos,
* but we need to keep iterating until we find the first non whiteout so
* we know how big a hole we have, if any:
*/
pos = iter->pos;
k = bch2_btree_iter_peek(iter);
iter->pos = pos;
node_iter = l->iter;
k = __btree_iter_unpack(iter, l, &iter->k,
bch2_btree_node_iter_peek(&node_iter, l->b));
if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
/*
* We're not setting iter->uptodate because the node iterator
* doesn't necessarily point at the key we're returning:
*/
EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
bch2_btree_iter_verify(iter);
if (bkey_err(k))
return k;
}
/* hole */
if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
return k;
if (!k.k)
k.k = &l->b->key.k;
next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
bkey_init(&n);
n.p = iter->pos;
bch2_key_resize(&n,
bkey_init(&iter->k);
iter->k.p = iter->pos;
bch2_key_resize(&iter->k,
min_t(u64, KEY_SIZE_MAX,
(k.k->p.inode == n.p.inode
? bkey_start_offset(k.k)
(next_start.inode == iter->pos.inode
? next_start.offset
: KEY_OFFSET_MAX) -
n.p.offset));
iter->pos.offset));
EBUG_ON(!n.size);
EBUG_ON(!iter->k.size);
iter->k = n;
iter->uptodate = BTREE_ITER_UPTODATE;
bch2_btree_iter_verify_entry_exit(iter);
@ -1893,13 +1871,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->uptodate == BTREE_ITER_UPTODATE)
return btree_iter_peek_uptodate(iter);
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return __bch2_btree_iter_peek_slot_extents(iter);
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return __bch2_btree_iter_peek_slot_extents(iter);
k = __btree_iter_peek_all(iter, l, &iter->k);
EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);

View File

@ -62,9 +62,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
bkey_cmp(bkey_start_pos(&insert->k),
bkey_predecessor(b->data->min_key)) < 0);
EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
@ -705,26 +702,31 @@ static inline int btree_iter_pos_cmp(const struct btree_iter *l,
bkey_cmp(l->pos, r->pos);
}
static void bch2_trans_update2(struct btree_trans *trans,
static int bch2_trans_update2(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_insert_entry *i, n = (struct btree_insert_entry) {
.iter = iter, .k = insert
};
int ret;
btree_insert_entry_checks(trans, n.iter, n.k);
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return ret;
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans_for_each_update2(trans, i) {
if (btree_iter_pos_cmp(n.iter, i->iter) == 0) {
*i = n;
return;
return 0;
}
if (btree_iter_pos_cmp(n.iter, i->iter) <= 0)
@ -733,6 +735,7 @@ static void bch2_trans_update2(struct btree_trans *trans,
array_insert_item(trans->updates2, trans->nr_updates2,
i - trans->updates2, n);
return 0;
}
static int extent_update_to_keys(struct btree_trans *trans,
@ -753,9 +756,9 @@ static int extent_update_to_keys(struct btree_trans *trans,
iter->flags |= BTREE_ITER_INTENT;
__bch2_btree_iter_set_pos(iter, insert->k.p, false);
bch2_trans_update2(trans, iter, insert);
ret = bch2_trans_update2(trans, iter, insert);
bch2_trans_iter_put(trans, iter);
return 0;
return ret;
}
static int extent_handle_overwrites(struct btree_trans *trans,
@ -785,8 +788,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
bch2_cut_back(start, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
if (ret)
goto err;
}
if (bkey_cmp(k.k->p, end) > 0) {
@ -800,8 +805,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
bch2_cut_front(end, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
if (ret)
goto err;
} else {
update_iter = bch2_trans_copy_iter(trans, iter);
@ -815,8 +822,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
update->k.size = 0;
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
if (ret)
goto err;
}
k = bch2_btree_iter_next_with_updates(iter);
@ -921,11 +930,11 @@ int __bch2_trans_commit(struct btree_trans *trans)
trans_for_each_update(trans, i) {
if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
ret = extent_update_to_keys(trans, i->iter, i->k);
} else {
ret = bch2_trans_update2(trans, i->iter, i->k);
}
if (ret)
goto out;
} else {
bch2_trans_update2(trans, i->iter, i->k);
}
}
trans_for_each_update2(trans, i) {

View File

@ -1321,9 +1321,6 @@ int bch2_mark_update(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree *b = iter_l(iter)->b;
struct btree_node_iter node_iter = iter_l(iter)->iter;
struct bkey_packed *_old;
struct bkey_s_c old;
struct bkey unpacked;
int ret = 0;
@ -1363,23 +1360,24 @@ int bch2_mark_update(struct btree_trans *trans,
BTREE_TRIGGER_OVERWRITE|flags);
}
} else {
struct btree_iter *copy;
BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
0, new->k.size,
fs_usage, trans->journal_res.seq,
BTREE_TRIGGER_INSERT|flags);
while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
unsigned offset = 0;
s64 sectors;
copy = bch2_trans_copy_iter(trans, iter);
old = bkey_disassemble(b, _old, &unpacked);
sectors = -((s64) old.k->size);
for_each_btree_key_continue(copy, 0, old, ret) {
unsigned offset = 0;
s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
return 0;
break;
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
@ -1412,9 +1410,8 @@ int bch2_mark_update(struct btree_trans *trans,
trans->journal_res.seq, flags) ?: 1;
if (ret <= 0)
break;
bch2_btree_node_iter_advance(&node_iter, b);
}
bch2_trans_iter_put(trans, copy);
}
return ret;
@ -1445,27 +1442,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
pr_err("overlapping with");
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
struct btree *b = iter_l(i->iter)->b;
struct btree_node_iter node_iter = iter_l(i->iter)->iter;
struct bkey_packed *_k;
while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
struct bkey_s_c k;
int ret;
pr_info("_k %px format %u", _k, _k->format);
k = bkey_disassemble(b, _k, &unpacked);
if (btree_node_is_extents(b)
for_each_btree_key_continue(copy, 0, k, ret) {
if (btree_node_type_is_extents(i->iter->btree_id)
? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
: bkey_cmp(i->k->k.p, k.k->p))
break;
bch2_bkey_val_to_text(&PBUF(buf), c, k);
pr_err("%s", buf);
bch2_btree_node_iter_advance(&node_iter, b);
}
bch2_trans_iter_put(trans, copy);
} else {
struct bkey_cached *ck = (void *) i->iter->l[0].b;
@ -1860,8 +1850,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
}
bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
bch2_trans_update(trans, iter, n, 0);
out:
ret = sectors;
@ -1987,15 +1975,13 @@ int bch2_trans_mark_update(struct btree_trans *trans,
BTREE_TRIGGER_OVERWRITE|flags);
}
} else {
struct btree *b = iter_l(iter)->b;
struct btree_node_iter node_iter = iter_l(iter)->iter;
struct bkey_packed *_old;
struct bkey unpacked;
struct btree_iter *copy;
struct bkey _old;
EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
bkey_init(&unpacked);
old = (struct bkey_s_c) { &unpacked, NULL };
bkey_init(&_old);
old = (struct bkey_s_c) { &_old, NULL };
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
0, new->k.size,
@ -2003,18 +1989,16 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (ret)
return ret;
while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
unsigned flags = BTREE_TRIGGER_OVERWRITE;
unsigned offset = 0;
s64 sectors;
copy = bch2_trans_copy_iter(trans, iter);
old = bkey_disassemble(b, _old, &unpacked);
sectors = -((s64) old.k->size);
for_each_btree_key_continue(copy, 0, old, ret) {
unsigned offset = 0;
s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
return 0;
break;
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
@ -2045,10 +2029,9 @@ int bch2_trans_mark_update(struct btree_trans *trans,
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
offset, sectors, flags);
if (ret)
return ret;
bch2_btree_node_iter_advance(&node_iter, b);
break;
}
bch2_trans_iter_put(trans, copy);
}
return ret;

View File

@ -99,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
struct bpos *end)
{
struct btree_trans *trans = iter->trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey_packed *_k;
struct btree_iter *copy;
struct bkey_s_c k;
unsigned nr_iters = 0;
int ret;
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
b = iter->l[0].b;
node_iter = iter->l[0].iter;
BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
bkey_cmp(bkey_start_pos(&insert->k),
bkey_predecessor(b->data->min_key)) < 0);
*end = bpos_min(insert->k.p, b->key.k.p);
*end = insert->k.p;
/* extent_update_to_keys(): */
nr_iters += 1;
@ -126,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
if (ret < 0)
return ret;
while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
copy = bch2_trans_copy_iter(trans, iter);
for_each_btree_key_continue(copy, 0, k, ret) {
unsigned offset = 0;
if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
@ -155,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
&nr_iters, EXTENT_ITERS_MAX);
if (ret)
break;
bch2_btree_node_iter_advance(&node_iter, b);
}
bch2_trans_iter_put(trans, copy);
return ret < 0 ? ret : 0;
}

View File

@ -506,115 +506,6 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
struct bkey_i *k)
{
struct btree_trans trans;
struct btree_iter *iter, *split_iter;
/*
* We might cause compressed extents to be split, so we need to pass in
* a disk_reservation:
*/
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i *split;
struct bpos atomic_end;
/*
* Some extents aren't equivalent - w.r.t. what the triggers do
* - if they're split:
*/
bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) ||
k->k.type == KEY_TYPE_reflink_p;
bool remark = false;
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
retry:
bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, btree_id,
bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
do {
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto err;
atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
goto err;
if (!remark &&
remark_if_split &&
bkey_cmp(atomic_end, k->k.p) < 0) {
ret = bch2_disk_reservation_add(c, &disk_res,
k->k.size *
bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
remark = true;
}
bkey_copy(split, k);
bch2_cut_front(iter->pos, split);
bch2_cut_back(atomic_end, split);
split_iter = bch2_trans_copy_iter(&trans, iter);
/*
* It's important that we don't go through the
* extent_handle_overwrites() and extent_update_to_keys() path
* here: journal replay is supposed to treat extents like
* regular keys
*/
__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split,
BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(&trans, split_iter);
bch2_btree_iter_set_pos(iter, split->k.p);
if (remark) {
ret = bch2_trans_mark_key(&trans,
bkey_s_c_null,
bkey_i_to_s_c(split),
0, split->k.size,
BTREE_TRIGGER_INSERT);
if (ret)
goto err;
}
} while (bkey_cmp(iter->pos, k->k.p) < 0);
if (remark) {
ret = bch2_trans_mark_key(&trans,
bkey_i_to_s_c(k),
bkey_s_c_null,
0, -((s64) k->k.size),
BTREE_TRIGGER_OVERWRITE);
if (ret)
goto err;
}
ret = bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY);
err:
bch2_trans_iter_put(&trans, iter);
if (ret == -EINTR)
goto retry;
bch2_disk_reservation_put(c, &disk_res);
return bch2_trans_exit(&trans) ?: ret;
}
static int __bch2_journal_replay_key(struct btree_trans *trans,
enum btree_id id, unsigned level,
struct bkey_i *k)
@ -753,9 +644,7 @@ static int bch2_journal_replay(struct bch_fs *c,
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
ret = i->k->k.size
? bch2_extent_replay_key(c, i->btree_id, i->k)
: bch2_journal_replay_key(c, i);
ret = bch2_journal_replay_key(c, i);
if (ret)
goto err;
}

View File

@ -956,9 +956,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);