From 3d1ea1c0aeaf7baaf0c0a3d073a49671dfd3771a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2024 16:21:14 -0400 Subject: [PATCH 001/233] bcachefs: kill retry_estale() in bch2_ioctl_subvolume_create() this was likely originally cribbed, and has been dead code, and Al is working on removing it from the tree. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 405cf08bda34..15725b4ce393 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -406,7 +406,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, sync_inodes_sb(c->vfs_sb); up_read(&c->vfs_sb->s_umount); } -retry: + if (arg.src_ptr) { error = user_path_at(arg.dirfd, (const char __user *)(unsigned long)arg.src_ptr, @@ -486,11 +486,6 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, err2: if (arg.src_ptr) path_put(&src_path); - - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } err1: return error; } From 40cfa4d5b8dcf25ae12c8fca492212e0a2b1d2cc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 16:53:59 -0400 Subject: [PATCH 002/233] bcachefs: Fix racy use of jiffies Calculate the timeout, then check if it's positive before calling schedule_timeout(). Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ace291f175dd..3d8fc2642425 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -758,10 +758,12 @@ static int bch2_journal_reclaim_thread(void *arg) journal_empty = fifo_empty(&j->pin); spin_unlock(&j->lock); + long timeout = j->next_reclaim - jiffies; + if (journal_empty) schedule(); - else if (time_after(j->next_reclaim, jiffies)) - schedule_timeout(j->next_reclaim - jiffies); + else if (timeout > 0) + schedule_timeout(timeout); else break; } From 61b2134ccc24a8843a1c9bc8bfd28bdfe88a0aab Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 7 Oct 2024 09:11:21 +0100 Subject: [PATCH 003/233] bcachefs: remove superfluous ; after statements There are a several statements with two following semicolons, replace these with just one semicolon. Signed-off-by: Colin Ian King Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/ec.c | 2 +- fs/bcachefs/super.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 5d809e8bd170..79a274dcd17b 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 749dcf368841..d489a9e28702 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -909,7 +909,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, bch2_bkey_val_to_text(&msgbuf, c, orig_k); bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); - printbuf_exit(&msgbuf);; + printbuf_exit(&msgbuf); ret = -BCH_ERR_stripe_reconstruct; goto out; } diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a6ed9a0bf1c7..17442df7326d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1120,12 +1120,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_bdevname(&buf, fs->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); prt_newline(&buf); prt_bdevname(&buf, sb->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); prt_newline(&buf); if (!opts->no_splitbrain_check) From cb9d3414d0b89b5b9803ea0531cbffe74925d54b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Sep 2024 14:27:24 -0400 Subject: [PATCH 004/233] bcachefs: bch2_inode_should_have_bp -> bch2_inode_should_have_single_bp Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 2 +- fs/bcachefs/inode.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a41d0d8a2f7b..646b74494a3f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -628,7 +628,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, goto err; /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && + if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), c, "dirent points to inode that does not point back:\n %s", diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 75c8a97a6954..285de12436dd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2156,7 +2156,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } - if (bch2_inode_should_have_bp(target) && + if (bch2_inode_should_have_single_bp(target) && !fsck_err(trans, inode_wrong_backpointer, "dirent points to inode that does not point back:\n %s", (bch2_bkey_val_to_text(&buf, c, d.s_c), diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index eab82b5eb897..bdeb6be76038 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -249,7 +249,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) +static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) { bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; From 61bf384a85f4ab4845a41762ca6aa91a18c67cca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Oct 2024 17:45:58 -0400 Subject: [PATCH 005/233] bcachefs: remove_backpointer() now uses dirent_get_by_pos() Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 285de12436dd..6b2ddbabe3e7 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -482,6 +482,13 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * return ret; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { @@ -490,13 +497,11 @@ static int remove_backpointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c_dirent d = - bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, - dirent); - int ret = bkey_err(d) ?: - dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); + int ret = bkey_err(d) ?: + dirent_points_to_inode(c, d, inode) ?: + __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1166,13 +1171,6 @@ static int hash_check_key(struct btree_trans *trans, goto out; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, From be40edadb0b715809f25bade2827af050ae6fbaa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 30 Sep 2024 00:14:09 -0400 Subject: [PATCH 006/233] bcachefs: __bch2_key_has_snapshot_overwrites uses for_each_btree_key_reverse_norestart() Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ae57638506c3..feaf2aa0d900 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1735,18 +1735,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - + for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots, + k, ret) { if (!bkey_eq(pos, k.k->p)) break; From 1ef7af68e376ab89a6b8e49387f7a4bad4fc6657 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2024 01:11:36 -0400 Subject: [PATCH 007/233] bcachefs: rcu_pending: don't invoke __call_rcu() under lock In userspace we don't (yet) have an SRCU implementation, so call_srcu() recurses. But we don't want to be invoking it under the lock anyways. Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index 40a20192eee8..67522aa344a7 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -478,7 +478,9 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, */ if (!p->cb_armed) { p->cb_armed = true; + spin_unlock_irqrestore(&p->lock, flags); __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); + goto free_node; } else { __start_poll_synchronize_rcu(pending->srcu); } From bdb3bdcbc2ebcb2fc50be2c094184103b7ff5d30 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Sep 2024 23:10:48 -0400 Subject: [PATCH 008/233] bcachefs: bch_verbose_ratelimited ratelimit "deleting unlinked inode" messages Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 8 ++++++++ fs/bcachefs/inode.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e94a83b8113e..7db81e182c3c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -293,6 +293,8 @@ do { \ #define bch_info(c, fmt, ...) \ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_info_ratelimited(c, fmt, ...) \ + bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ @@ -352,6 +354,12 @@ do { \ bch_info(c, fmt, ##__VA_ARGS__); \ } while (0) +#define bch_verbose_ratelimited(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose) \ + bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ +} while (0) + #define pr_verbose_init(opts, fmt, ...) \ do { \ if (opt_get(opts, verbose)) \ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 039cb7a22244..43653cf050e9 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1380,7 +1380,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c) NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); if (ret > 0) { - bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); + bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", + k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); /* From 2aa08c451ebf753ed0170e1d8d05ac4b51221392 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 1 Oct 2024 16:59:08 -0400 Subject: [PATCH 009/233] bcachefs: Pull disk accounting hooks out of trans_commit.c Also, fix a minor bug in the revert path, where we weren't checking the journal entry type correctly. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 35 +++++------------------------ fs/bcachefs/disk_accounting.h | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9bf471fa4361..3d951846a1be 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -609,14 +609,6 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) return 0; } -static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) -{ - return (struct bversion) { - .hi = res->seq >> 32, - .lo = (res->seq << 32) | (res->offset + offset), - }; -} - static inline int bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, @@ -701,25 +693,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct jset_entry *entry = trans->journal_entries; percpu_down_read(&c->mark_lock); - for (entry = trans->journal_entries; entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); entry = vstruct_next(entry)) if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && entry->start->k.type == KEY_TYPE_accounting) { - BUG_ON(!trans->journal_res.ref); - - struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); - - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) entry - (u64 *) trans->journal_entries); - BUG_ON(bversion_zero(a->k.bversion)); - - if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); - if (ret) - goto revert_fs_usage; - } + ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); + if (ret) + goto revert_fs_usage; } percpu_up_read(&c->mark_lock); @@ -833,13 +814,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, entry2 != entry; entry2 = vstruct_next(entry2)) if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && - entry2->start->k.type == KEY_TYPE_accounting) { - struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); - - bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); - bch2_accounting_neg(a); - } + entry2->start->k.type == KEY_TYPE_accounting) + bch2_accounting_trans_commit_revert(trans, + bkey_i_to_accounting(entry2->start), flags); percpu_up_read(&c->mark_lock); return ret; } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 4ea6c8a092bc..6639535dc91c 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_DISK_ACCOUNTING_H #define _BCACHEFS_DISK_ACCOUNTING_H +#include "btree_update.h" #include "eytzinger.h" #include "sb-members.h" @@ -204,6 +205,43 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, bch2_accounting_mem_read_counters(acc, idx, v, nr, false); } +static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +{ + EBUG_ON(!res->ref); + + return (struct bversion) { + .hi = res->seq >> 32, + .lo = (res->seq << 32) | (res->offset + offset), + }; +} + +static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, + struct bkey_i_accounting *a, + unsigned commit_flags) +{ + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, + (u64 *) a - (u64 *) trans->journal_entries); + + EBUG_ON(bversion_zero(a->k.bversion)); + + return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) + ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) + : 0; +} + +static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, + struct bkey_i_accounting *a_i, + unsigned commit_flags) +{ + if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { + struct bkey_s_accounting a = accounting_i_to_s(a_i); + + bch2_accounting_neg(a); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); + bch2_accounting_neg(a); + } +} + int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); From 6952c5b0d70b70638a070f50668a614235a11175 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Fri, 27 Sep 2024 22:26:53 +0800 Subject: [PATCH 010/233] bcachefs: Delete dead code lock_fail_root_changed has not been used since commit 0d7009d7ca99 ("bcachefs: Delete old deadlock avoidance code") Remove it. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 -- fs/bcachefs/errcode.h | 1 - 2 files changed, 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index eef9b89c561d..01152fd5ac57 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -748,8 +748,6 @@ static inline int btree_path_lock_root(struct btree_trans *trans, ret = btree_node_lock(trans, path, &b->c, path->level, lock_type, trace_ip); if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) - continue; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; BUG(); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 9c4fe5cdbfb7..e3b0ec7a0f73 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -164,7 +164,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ x(0, backpointer_to_overwritten_btree_node) \ - x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ From cf3d513801562174506425a79a9e71050f1d5d77 Mon Sep 17 00:00:00 2001 From: Thomas Bertschinger Date: Fri, 13 Sep 2024 18:11:22 -0600 Subject: [PATCH 011/233] bcachefs: move bch2_xattr_handlers to .rodata A series posted previously moved all of the `struct xattr_handler` tables to .rodata for each filesystem [1]. However, this appears to have been done shortly before bcachefs was merged, so bcachefs was missed at that time. Link: https://lkml.kernel.org/r/20230930050033.41174-1-wedsonaf@gmail.com [1] Cc: Wedson Almeida Filho Signed-off-by: Thomas Bertschinger Signed-off-by: Kent Overstreet --- fs/bcachefs/xattr.c | 2 +- fs/bcachefs/xattr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 952aca400faf..bf3c6bb50495 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -609,7 +609,7 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { #endif /* NO_BCACHEFS_FS */ -const struct xattr_handler *bch2_xattr_handlers[] = { +const struct xattr_handler * const bch2_xattr_handlers[] = { &bch_xattr_user_handler, &bch_xattr_trusted_handler, &bch_xattr_security_handler, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index c188a5ad64ce..2c96de051f3e 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -44,6 +44,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum, ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -extern const struct xattr_handler *bch2_xattr_handlers[]; +extern const struct xattr_handler * const bch2_xattr_handlers[]; #endif /* _BCACHEFS_XATTR_H */ From 8ed4dcbbc3242c5c8004bb1ca5c1d47d0e8250f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 24 Sep 2024 05:08:39 -0400 Subject: [PATCH 012/233] bcachefs: Remove unnecessary peek_slot() hash_lookup() used to return an errorcode, and a peek_slot() call was required to get the key it looked up. But we're adding fault injection for transaction restarts, so fix this old unconverted code. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6b2ddbabe3e7..c96025b8b65d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -170,7 +170,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, if (ret) return ret; - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; bch2_trans_iter_exit(trans, &iter); From 250087e69e9c123ea58fba31cf301355ee6cb49a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:11:41 -0400 Subject: [PATCH 013/233] bcachefs: kill btree_trans_restart_nounlock() Redundant, the normal btree_trans_restart() doesn't unlock. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 7 +++---- fs/bcachefs/btree_trans_commit.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0bda054f80d7..24406f723283 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -341,21 +341,20 @@ static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) } __always_inline -static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; - trans->last_restarted_ip = _THIS_IP_; + trans->last_restarted_ip = ip; return -err; } __always_inline static int btree_trans_restart(struct btree_trans *trans, int err) { - btree_trans_restart_nounlock(trans, err); - return -err; + return btree_trans_restart_ip(trans, err, _THIS_IP_); } bool bch2_btree_node_upgrade(struct btree_trans *, diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 3d951846a1be..b47f11881fe4 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -624,7 +624,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } /* From 43bf715a17c1be337b686fb4b5297739a704126e Mon Sep 17 00:00:00 2001 From: Dennis Lam Date: Wed, 11 Sep 2024 21:16:28 -0400 Subject: [PATCH 014/233] docs: filesystems: bcachefs: fixed some spelling mistakes in the bcachefs coding style page Specifically, fixed spelling of "commit" and pluralization of last sentence. Signed-off-by: Dennis Lam Signed-off-by: Kent Overstreet --- Documentation/filesystems/bcachefs/CodingStyle.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst index 01de555e21d8..b29562a6bf55 100644 --- a/Documentation/filesystems/bcachefs/CodingStyle.rst +++ b/Documentation/filesystems/bcachefs/CodingStyle.rst @@ -183,4 +183,4 @@ even better as a code comment. A good code comment is wonderful, but even better is the comment that didn't need to exist because the code was so straightforward as to be obvious; organized into small clean and tidy modules, with clear and descriptive names -for functions and variable, where every line of code has a clear purpose. +for functions and variables, where every line of code has a clear purpose. From 0892d51393106dcb8c7d88cc2ee2f976d4a56c92 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 23 Sep 2024 16:20:29 +0200 Subject: [PATCH 015/233] bcachefs: Remove duplicate included headers The header files dirent_format.h and disk_groups_format.h are included twice. Remove the redundant includes and the following warnings reported by make includecheck: disk_groups_format.h is included more than once dirent_format.h is included more than once Reviewed-by: Hongbo Li Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5004f6ba997c..6a67df2a2fcd 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -499,8 +499,6 @@ struct bch_sb_field { #include "disk_groups_format.h" #include "extents_format.h" #include "ec_format.h" -#include "dirent_format.h" -#include "disk_groups_format.h" #include "inode_format.h" #include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" From e4753128a6cfda251b1dcb95320735c0a2e036c8 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 23 Sep 2024 16:44:53 +0200 Subject: [PATCH 016/233] bcachefs: Use FOREACH_ACL_ENTRY() macro to iterate over acl entries Use the existing FOREACH_ACL_ENTRY() macro to iterate over POSIX acl entries and remove the custom acl_for_each_entry() macro. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 87f1be9d4db4..99487727ae64 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -184,11 +184,6 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, return ERR_PTR(-EINVAL); } -#define acl_for_each_entry(acl, acl_e) \ - for (acl_e = acl->a_entries; \ - acl_e < acl->a_entries + acl->a_count; \ - acl_e++) - /* * Convert from in-memory to filesystem representation. */ @@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans, { struct bkey_i_xattr *xattr; bch_acl_header *acl_header; - const struct posix_acl_entry *acl_e; + const struct posix_acl_entry *acl_e, *pe; void *outptr; unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { switch (acl_e->e_tag) { case ACL_USER: case ACL_GROUP: @@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, outptr = (void *) acl_header + sizeof(*acl_header); - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { bch_acl_entry *entry = outptr; entry->e_tag = cpu_to_le16(acl_e->e_tag); From aab94e92a9b24c17443295df539631c0bf2306bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 18:11:07 -0400 Subject: [PATCH 017/233] bcachefs: add more path idx debug asserts Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 24406f723283..550db3654f2c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -23,6 +23,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path { unsigned idx = path - trans->paths; + EBUG_ON(idx >= trans->nr_paths); EBUG_ON(!test_bit(idx, trans->paths_allocated)); if (unlikely(path->ref == U8_MAX)) { bch2_dump_trans_paths_updates(trans); @@ -36,6 +37,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) { + EBUG_ON(path - trans->paths >= trans->nr_paths); EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); EBUG_ON(!path->ref); EBUG_ON(!path->intent_ref && intent); From 31308cdd120cb7df3efd2c90f62c4fc735d7cc43 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 20:21:18 -0400 Subject: [PATCH 018/233] bcachefs: bch2_run_explicit_recovery_pass() returns different error when not in recovery if we're not in recovery then there's no way to rewind recovery - give this a different errcode so that any error messages will give us a better idea of what happened. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 4 +++- fs/bcachefs/recovery_passes.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index e3b0ec7a0f73..40bf1e5775a9 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -172,7 +172,9 @@ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(0, restart_recovery) \ + x(EINVAL, restart_recovery) \ + x(EINVAL, not_in_recovery) \ + x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index dff589ddc984..1cc010bf1695 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -106,6 +106,9 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, if (c->opts.recovery_passes & BIT_ULL(pass)) return 0; + if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) + return -BCH_ERR_not_in_recovery; + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); From f5037ae0441bc26678836db41693086b6eddd2ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 23:22:48 -0400 Subject: [PATCH 019/233] bcachefs: lru, accounting are alloc btrees They can be regenerated by fsck and don't require a btree node scan, like other alloc btrees. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 6a67df2a2fcd..79a80a78c2d8 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1359,6 +1359,8 @@ static inline bool btree_id_is_alloc(enum btree_id id) case BTREE_ID_need_discard: case BTREE_ID_freespace: case BTREE_ID_bucket_gens: + case BTREE_ID_lru: + case BTREE_ID_accounting: return true; default: return false; From f5e8d0269ca9ef941bda37f57d0af1dc2ede1546 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 23:27:59 -0400 Subject: [PATCH 020/233] bcachefs: Add locking for bch_fs.curr_recovery_pass Recovery can rewind in certain situations - when we discover we need to run a pass that doesn't normally run. This can happen from another thread for btree node read errors, so we need a bit of locking. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery_passes.c | 78 ++++++++++++++++++++++++++--------- fs/bcachefs/super.c | 1 + 3 files changed, 60 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7db81e182c3c..fbd89f91625d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1060,6 +1060,7 @@ struct bch_fs { u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ enum bch_recovery_pass recovery_pass_done; + spinlock_t recovery_pass_lock; struct semaphore online_fsck_mutex; /* DEBUG JUNK */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 1cc010bf1695..5e7722cc0879 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -100,8 +100,8 @@ u64 bch2_recovery_passes_from_stable(u64 v) /* * For when we need to rewind recovery passes and run a pass we skipped: */ -int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) +static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) { if (c->opts.recovery_passes & BIT_ULL(pass)) return 0; @@ -109,6 +109,13 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && + c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); + return -BCH_ERR_cannot_rewind_recovery; + } + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", bch2_recovery_passes[pass], pass, bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); @@ -124,6 +131,16 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + unsigned long flags; + spin_lock_irqsave(&c->recovery_pass_lock, flags); + int ret = __bch2_run_explicit_recovery_pass(c, pass); + spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + return ret; +} + int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { @@ -237,30 +254,51 @@ int bch2_run_recovery_passes(struct bch_fs *c) c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + spin_lock_irq(&c->recovery_pass_lock); + unsigned pass = c->curr_recovery_pass; + if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) + c->curr_recovery_pass > c->opts.recovery_pass_last) { + spin_unlock_irq(&c->recovery_pass_lock); break; - - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; - - ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?: - bch2_journal_flush(&c->journal); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); } - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); + if (!should_run_recovery_pass(c, pass)) { + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, pass); + spin_unlock_irq(&c->recovery_pass_lock); + continue; + } + spin_unlock_irq(&c->recovery_pass_lock); + + ret = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); + + spin_lock_irq(&c->recovery_pass_lock); + if (c->curr_recovery_pass < pass) { + /* + * bch2_run_explicit_recovery_pass() was called: we + * can't always catch -BCH_ERR_restart_recovery because + * it may have been called from another thread (btree + * node read completion) + */ + spin_unlock_irq(&c->recovery_pass_lock); + continue; + } else if (c->curr_recovery_pass == pass) { + c->curr_recovery_pass++; + } else { + BUG(); + } + spin_unlock_irq(&c->recovery_pass_lock); + + if (ret) + break; + + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); if (!test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); - - c->curr_recovery_pass++; + bch2_clear_recovery_pass_required(c, pass); } return ret; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 17442df7326d..d6411324cd3f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -766,6 +766,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); + spin_lock_init(&c->recovery_pass_lock); sema_init(&c->online_fsck_mutex, 1); init_rwsem(&c->gc_lock); From 771bf65862db8911c5368bf47410c629928562e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Sep 2024 23:40:01 -0400 Subject: [PATCH 021/233] bcachefs: bch2_btree_lost_data() now uses run_explicit_rceovery_pass_persistent() Also get a bit more fine grained about which passes to run for which btrees. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 63 +++++++++++++++++++++++------------ fs/bcachefs/recovery.h | 2 +- fs/bcachefs/recovery_passes.c | 11 ++++++ fs/bcachefs/recovery_passes.h | 1 + 4 files changed, 54 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3c7f941dde39..b1c83e72c0d8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -34,21 +34,52 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { - if (btree >= BTREE_ID_NR_MAX) - return; - u64 b = BIT_ULL(btree); + int ret = 0; + + mutex_lock(&c->sb_lock); if (!(c->sb.btrees_lost_data & b)) { bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); - - mutex_lock(&c->sb_lock); bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); } + + switch (btree) { + case BTREE_ID_alloc: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_backpointers: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + goto out; + case BTREE_ID_need_discard: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_freespace: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_bucket_gens: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_lru: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_accounting: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + goto out; + default: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + goto out; + } +out: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; } /* for -o reconstruct_alloc: */ @@ -524,22 +555,10 @@ static int read_btree_roots(struct bch_fs *c) c, btree_root_read_error, "error reading btree root %s l=%u: %s", bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { - if (btree_id_is_alloc(i)) { - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + if (btree_id_is_alloc(i)) r->error = 0; - } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { - bch_info(c, "will run btree node scan"); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); - } - ret = 0; - bch2_btree_lost_data(c, i); + ret = bch2_btree_lost_data(c, i); } } diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 4bf818de1f2f..b0d55754b21b 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -void bch2_btree_lost_data(struct bch_fs *, enum btree_id); +int bch2_btree_lost_data(struct bch_fs *, enum btree_id); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 5e7722cc0879..1240c5c19fea 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -141,6 +141,17 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, return ret; } +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + + return bch2_run_explicit_recovery_pass(c, pass); +} + int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 99b464e127b8..7d7339c8fa29 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); From 703b8d61ec2cef306f5379847adc089069333897 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 21:26:05 -0400 Subject: [PATCH 022/233] bcachefs: improved bkey_val_copy() Factor out some common code, add typechecking. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 550db3654f2c..dda07a320488 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -594,13 +594,18 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) +{ + unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); + memcpy(dst_v, src_k.v, b); + if (unlikely(b < dst_size)) + memset(dst_v + b, 0, dst_size - b); +} + #define bkey_val_copy(_dst_v, _src_k) \ do { \ - unsigned b = min_t(unsigned, sizeof(*_dst_v), \ - bkey_val_bytes(_src_k.k)); \ - memcpy(_dst_v, _src_k.v, b); \ - if (b < sizeof(*_dst_v)) \ - memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ + BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ + __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ } while (0) static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, @@ -609,17 +614,10 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned val_size, void *val) { struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); - ret = bkey_err(k); + struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + int ret = bkey_err(k); if (!ret) { - unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); - - memcpy(val, k.v, b); - if (unlikely(b < sizeof(*val))) - memset((void *) val + b, 0, sizeof(*val) - b); + __bkey_val_copy(val, val_size, k); bch2_trans_iter_exit(trans, &iter); } From 7d6273caeac4a7389272be2c870562308753656a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 21:51:05 -0400 Subject: [PATCH 023/233] bcachefs: Factor out jset_entry_log_msg_bytes() Needed for improved userspace cmd_list_journal Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 9 +++++++++ fs/bcachefs/journal_io.c | 3 +-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 79a80a78c2d8..c5e3824d5771 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1219,6 +1219,15 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l) +{ + unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d); + + while (b && !l->d[b - 1]) + --b; + return b; +} + struct jset_entry_datetime { struct jset_entry entry; __le64 seconds; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index fb35dd336331..7c7595e5369b 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -738,9 +738,8 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - prt_printf(out, "%.*s", bytes, l->d); + prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); } static int journal_entry_overwrite_validate(struct bch_fs *c, From a1fbdad42fc52ff038183644e39785525553e667 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 21:27:11 -0400 Subject: [PATCH 024/233] bcachefs: better error message in check_snapshot_tree() If we find a snapshot node and it didn't match the snapshot tree, we should print it. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index feaf2aa0d900..34e01bd8127f 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -506,7 +506,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, break; } } - bch2_trans_iter_exit(trans, &iter); if (!ret && !found) { @@ -536,6 +535,7 @@ static int check_snapshot_tree(struct btree_trans *trans, struct bch_snapshot s; struct bch_subvolume subvol; struct printbuf buf = PRINTBUF; + struct btree_iter snapshot_iter = {}; u32 root_id; int ret; @@ -545,16 +545,27 @@ static int check_snapshot_tree(struct btree_trans *trans, st = bkey_s_c_to_snapshot_tree(k); root_id = le32_to_cpu(st.v->root_snapshot); - ret = bch2_snapshot_lookup(trans, root_id, &s); + struct bkey_s_c_snapshot snapshot_k = + bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, + POS(0, root_id), 0, snapshot); + ret = bkey_err(snapshot_k); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (!ret) + bkey_val_copy(&s, snapshot_k); + if (fsck_err_on(ret || root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), trans, snapshot_tree_to_missing_snapshot, "snapshot tree points to missing/incorrect snapshot:\n %s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, st.s_c), + prt_newline(&buf), + ret + ? prt_printf(&buf, "(%s)", bch2_err_str(ret)) + : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), + buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); goto err; } @@ -605,6 +616,7 @@ static int check_snapshot_tree(struct btree_trans *trans, } err: fsck_err: + bch2_trans_iter_exit(trans, &snapshot_iter); printbuf_exit(&buf); return ret; } From 78cf5d12ae82115f913292e8e4fa35e73161504a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 9 Oct 2024 23:02:04 -0400 Subject: [PATCH 025/233] bcachefs: Avoid bch2_btree_id_str() Prefer bch2_btree_id_to_text() - it prints out the integer ID when unknown. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 24 ++++++++------- fs/bcachefs/bbpos.h | 2 +- fs/bcachefs/btree_cache.c | 37 +++++++++++++++--------- fs/bcachefs/btree_cache.h | 3 +- fs/bcachefs/btree_gc.c | 45 +++++++++++++++++------------ fs/bcachefs/btree_io.c | 13 +++++---- fs/bcachefs/btree_iter.c | 32 ++++++++++---------- fs/bcachefs/btree_journal_iter.c | 5 +++- fs/bcachefs/btree_node_scan.c | 10 ++++--- fs/bcachefs/btree_update_interior.c | 23 ++++++++------- fs/bcachefs/debug.c | 4 ++- fs/bcachefs/disk_accounting.c | 3 +- fs/bcachefs/journal_io.c | 3 +- fs/bcachefs/recovery.c | 25 +++++++++++----- fs/bcachefs/sysfs.c | 3 +- 15 files changed, 140 insertions(+), 92 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 654a58132a4d..f323ce4b0b33 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -81,12 +81,11 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) { - prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", - bch2_btree_id_str(bp->btree_id), - bp->level, - (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp->bucket_len); + bch2_btree_id_level_to_text(out, bp->btree_id, bp->level); + prt_printf(out, " offset=%llu:%u len=%u pos=", + (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp->bucket_len); bch2_bpos_to_text(out, bp->pos); } @@ -501,9 +500,13 @@ static int check_extent_checksum(struct btree_trans *trans, goto err; prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); - prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, btree); + prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent); - prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, o_btree); + prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent2); struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); @@ -638,8 +641,9 @@ static int check_bp_exists(struct btree_trans *trans, goto err; missing: printbuf_reset(&buf); - prt_printf(&buf, "missing backpointer for btree=%s l=%u ", - bch2_btree_id_str(bp.btree_id), bp.level); + prt_str(&buf, "missing backpointer for btree="); + bch2_btree_id_to_text(&buf, bp.btree_id); + prt_printf(&buf, " l=%u ", bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h index be2edced5213..63abe17f35ea 100644 --- a/fs/bcachefs/bbpos.h +++ b/fs/bcachefs/bbpos.h @@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) { - prt_str(out, bch2_btree_id_str(pos.btree)); + bch2_btree_id_to_text(out, pos.btree); prt_char(out, ':'); bch2_bpos_to_text(out, pos.pos); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 7123019ab3bc..a0a406b0c7bc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1004,16 +1004,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) return; prt_printf(&buf, - "btree node header doesn't match ptr\n" - "btree %s level %u\n" - "ptr: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + "btree node header doesn't match ptr: "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, "\nptr: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_printf(&buf, "\nheader: btree %s level %llu\n" - "min ", - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nheader: "); + bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nmin "); bch2_bpos_to_text(&buf, b->data->min_key); prt_printf(&buf, "\nmax "); @@ -1398,12 +1396,19 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) prt_printf(out, "(unknown btree %u)", btree); } +void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsigned level) +{ + prt_str(out, "btree="); + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level=%u", level); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - prt_printf(out, "%s level %u/%u\n ", - bch2_btree_id_str(b->c.btree_id), - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_btree_id_to_text(out, b->c.btree_id); + prt_printf(out, " level %u/%u\n ", + b->c.level, + bch2_btree_id_root(c, b->c.btree_id)->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } @@ -1478,8 +1483,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); prt_newline(out); - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) - prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { + bch2_btree_id_to_text(out, i); + prt_printf(out, "\t"); + prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); + prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); + } prt_newline(out); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 66e86d1a178d..6cfacacb6769 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -138,8 +138,9 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) return bch2_btree_id_root(c, b->c.btree_id)->b; } -const char *bch2_btree_id_str(enum btree_id); +const char *bch2_btree_id_str(enum btree_id); /* avoid */ void bch2_btree_id_to_text(struct printbuf *, enum btree_id); +void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 81dcf9e512c0..3c4e66da1ca4 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -56,8 +56,8 @@ void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) { prt_str(out, bch2_gc_phase_strs[p->phase]); prt_char(out, ' '); - bch2_btree_id_to_text(out, p->btree); - prt_printf(out, " l=%u ", p->level); + bch2_btree_id_level_to_text(out, p->btree, p->level); + prt_char(out, ' '); bch2_bpos_to_text(out, p->pos); } @@ -209,8 +209,9 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_eq(expected_start, cur->data->min_key)) return 0; - prt_printf(&buf, " at btree %s level %u:\n parent: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (prev) { @@ -277,8 +278,9 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, if (bpos_eq(child->key.k.p, b->key.k.p)) return 0; - prt_printf(&buf, "at btree %s level %u:\n parent: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n child: "); @@ -341,14 +343,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ret = PTR_ERR_OR_ZERO(cur); printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); + prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), trans, btree_node_unreadable, - "Topology repair: unreadable btree node at btree %s level %u:\n" + "Topology repair: unreadable btree node at\n" " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; @@ -370,7 +372,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct break; if (bch2_btree_node_is_stale(c, cur)) { - bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); + bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); six_unlock_read(&cur->c.lock); bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, @@ -478,14 +480,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct } printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (mustfix_fsck_err_on(!have_child, trans, btree_node_topology_interior_node_empty, - "empty interior btree node at btree %s level %u\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level, buf.buf)) + "empty interior btree node at %s", buf.buf)) ret = DROP_THIS_NODE; err: fsck_err: @@ -511,6 +512,7 @@ int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bpos pulled_from_scan = POS_MIN; + struct printbuf buf = PRINTBUF; int ret = 0; bch2_trans_srcu_unlock(trans); @@ -519,19 +521,21 @@ int bch2_check_topology(struct bch_fs *c) struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; + bch2_btree_id_to_text(&buf, i); + if (r->error) { ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); if (ret) break; reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); r->alive = false; r->error = 0; if (!bch2_btree_has_scanned_nodes(c, i)) { mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); + "no nodes found for btree %s, continue?", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { bch2_btree_root_alloc_fake_trans(trans, i, 1); @@ -560,13 +564,14 @@ int bch2_check_topology(struct bch_fs *c) if (!reconstructed_root) goto reconstruct_root; - bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); + bch_err(c, "empty btree root %s", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } } fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); return ret; } @@ -713,6 +718,7 @@ static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; + struct printbuf buf = PRINTBUF; unsigned i; int ret = 0; @@ -731,10 +737,13 @@ static int bch2_gc_btrees(struct bch_fs *c) if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), trans, btree_node_read_error, "btree node read error for %s", - bch2_btree_id_str(btree))) + (printbuf_reset(&buf), + bch2_btree_id_to_text(&buf, btree), + buf.buf))) ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); } fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 839d68802e42..89a42ee81e5c 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -25,9 +25,8 @@ static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) { - prt_printf(out, "btree=%s l=%u seq %llux\n", - bch2_btree_id_str(BTREE_NODE_ID(bn)), - (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); + bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); + prt_printf(out, " seq %llux\n", bn->keys.seq); prt_str(out, "min: "); bch2_bpos_to_text(out, bn->min_key); prt_newline(out); @@ -1343,9 +1342,11 @@ static void btree_node_read_work(struct work_struct *work) !btree_node_read_error(b) && c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->key.k.p); - bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", - __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", + __func__, buf.buf); bch2_btree_node_rewrite_async(c, b); } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 01152fd5ac57..07bce85dafaf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1448,10 +1448,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - prt_printf(buf, "update: btree=%s cached=%u %pS\n", - bch2_btree_id_str(i->btree_id), - i->cached, - (void *) i->ip_allocated); + prt_str(buf, "update: btree="); + bch2_btree_id_to_text(buf, i->btree_id); + prt_printf(buf, " cached=%u %pS\n", + i->cached, + (void *) i->ip_allocated); prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); @@ -1484,13 +1485,13 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', - path->cached ? 'C' : 'B', - bch2_btree_id_str(path->btree_id), - path->level); + path->cached ? 'C' : 'B'); + bch2_btree_id_level_to_text(out, path->btree_id, path->level); + prt_str(out, " pos "); bch2_bpos_to_text(out, path->pos); if (!path->cached && btree_node_locked(path, path->level)) { @@ -3336,8 +3337,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? owner->pid : 0; rcu_read_unlock(); - prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', - b->level, bch2_btree_id_str(b->btree_id)); + prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, b->btree_id); + prt_printf(out, " l=%u:", b->level); bch2_bpos_to_text(out, btree_node_pos(b)); prt_printf(out, "\t locks %u:%u:%u held by pid %u", @@ -3376,11 +3378,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) if (!path->nodes_locked) continue; - prt_printf(out, " path %u %c l=%u %s:", - idx, - path->cached ? 'c' : 'b', - path->level, - bch2_btree_id_str(path->btree_id)); + prt_printf(out, " path %u %c ", + idx, + path->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, path->btree_id); + prt_printf(out, " l=%u:", path->level); bch2_bpos_to_text(out, path->pos); prt_newline(out); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index c1657182c275..924b5e3a4390 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -628,8 +628,11 @@ void bch2_journal_keys_dump(struct bch_fs *c) darray_for_each(*keys, i) { printbuf_reset(&buf); + prt_printf(&buf, "btree="); + bch2_btree_id_to_text(&buf, i->btree_id); + prt_printf(&buf, " l=%u ", i->level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); + pr_err("%s", buf.buf); } printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 30131c3bdd97..4b4df31d4b95 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -22,9 +22,9 @@ struct find_btree_nodes_worker { static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) { - prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", - bch2_btree_id_str(n->btree_id), n->level, n->seq, - n->journal_seq, n->cookie); + bch2_btree_id_level_to_text(out, n->btree_id, n->level); + prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", + n->seq, n->journal_seq, n->cookie); bch2_bpos_to_text(out, n->min_key); prt_str(out, "-"); bch2_bpos_to_text(out, n->max_key); @@ -499,7 +499,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, if (c->opts.verbose) { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); + prt_str(&buf, "recovery "); + bch2_btree_id_level_to_text(&buf, btree, level); + prt_str(&buf, " "); bch2_bpos_to_text(&buf, node_min); prt_str(&buf, " - "); bch2_bpos_to_text(&buf, node_max); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d596ef93239f..d62de3f79b29 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -97,9 +97,9 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "end of prev node doesn't match start of next node\n"), - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "end of prev node doesn't match start of next node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n prev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); @@ -118,9 +118,9 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "empty interior node\n"); - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "empty interior node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); @@ -129,9 +129,9 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "last child node doesn't end at end of parent node\n"); - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "last child node doesn't end at end of parent node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n last key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); @@ -2575,8 +2575,9 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_printf(out, "%ps: ", (void *) as->ip_started); bch2_trans_commit_flags_to_text(out, as->flags); - prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - bch2_btree_id_str(as->btree_id), + prt_str(out, " "); + bch2_btree_id_to_text(out, as->btree_id); + prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", as->update_level_start, as->update_level_end, bch2_btree_update_modes[as->mode], diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 45aec1afdb0e..b5de52a50d10 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -472,7 +472,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(out, "%px ", b); + bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); + prt_printf(out, "\n"); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 07eb8fa1b026..38b563113cfb 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -217,7 +217,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_printf(out, "id=%u", k->snapshot.id); break; case BCH_DISK_ACCOUNTING_btree: - prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); + prt_str(out, "btree="); + bch2_btree_id_to_text(out, k->btree.id); break; } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7c7595e5369b..9bc0caa9d5e4 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -421,7 +421,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bch2_prt_jset_entry_type(out, entry->type); prt_str(out, ": "); } - prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); + bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); + prt_char(out, ' '); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index b1c83e72c0d8..0e5a53541ce4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -42,7 +42,10 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) mutex_lock(&c->sb_lock); if (!(c->sb.btrees_lost_data & b)) { - bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); + struct printbuf buf = PRINTBUF; + bch2_btree_id_to_text(&buf, btree); + bch_err(c, "flagging btree %s lost data", buf.buf); + printbuf_exit(&buf); bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); } @@ -385,10 +388,13 @@ int bch2_journal_replay(struct bch_fs *c) ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim : 0), bch2_journal_replay_key(trans, k)); - bch_err_msg(c, ret, "while replaying key at btree %s level %u:", - bch2_btree_id_str(k->btree_id), k->level); - if (ret) + if (ret) { + struct printbuf buf = PRINTBUF; + bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); + bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); + printbuf_exit(&buf); goto err; + } BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); } @@ -536,6 +542,7 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { + struct printbuf buf = PRINTBUF; int ret = 0; for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { @@ -547,14 +554,17 @@ static int read_btree_roots(struct bch_fs *c) if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, i, r->level); + if (mustfix_fsck_err_on((ret = r->error), c, btree_root_bkey_invalid, "invalid btree root %s", - bch2_btree_id_str(i)) || + buf.buf) || mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), c, btree_root_read_error, - "error reading btree root %s l=%u: %s", - bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { + "error reading btree root %s: %s", + buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; @@ -572,6 +582,7 @@ static int read_btree_roots(struct bch_fs *c) } } fsck_err: + printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 03e59f86f360..3270bfab9466 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -302,7 +302,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); + bch2_btree_id_to_text(out, c->gc_gens_btree); + prt_printf(out, ": "); bch2_bpos_to_text(out, c->gc_gens_pos); prt_printf(out, "\n"); } From 09115483e7432d20c72e382662c0dffd603cc6b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Sep 2024 14:57:26 -0400 Subject: [PATCH 026/233] bcachefs: Refactor new stripe path to reduce dependencies on ec_stripe_head We need to add a path for reshaping existing stripes (for e.g. device removal), and this new path won't necessarily use ec_stripe_head. Refactor the code to avoid unnecessary references to it for clarity. Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 204 +++++++++++++++++++++++++---------------------- 1 file changed, 108 insertions(+), 96 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d489a9e28702..6e855fe888c2 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1716,7 +1716,7 @@ static void ec_stripe_key_init(struct bch_fs *c, set_bkey_val_u64s(&s->k, u64s); } -static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; @@ -1724,7 +1724,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) - return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + return NULL; mutex_init(&s->lock); closure_init(&s->iodone, NULL); @@ -1739,10 +1739,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize, h->disk_label); - - h->s = s; - h->nr_created++; - return 0; + return s; } static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) @@ -1887,25 +1884,26 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, return h; } -static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, +static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct ec_stripe_head *h, struct ec_stripe_new *s, enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; - struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; int ret = 0; - BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); - BUG_ON(v->nr_redundant != h->s->nr_parity); + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); + BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* * Note: we don't yet repair invalid blocks (failed/removed * devices) when reusing stripes - we still need a codepath to @@ -1915,21 +1913,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) __clear_bit(v->ptrs[i].dev, devs.d); - if (i < h->s->nr_data) + if (i < s->nr_data) nr_have_data++; else nr_have_parity++; } - BUG_ON(nr_have_data > h->s->nr_data); - BUG_ON(nr_have_parity > h->s->nr_parity); + BUG_ON(nr_have_data > s->nr_data); + BUG_ON(nr_have_parity > s->nr_parity); buckets.nr = 0; - if (nr_have_parity < h->s->nr_parity) { + if (nr_have_parity < s->nr_parity) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->parity_stripe, &devs, - h->s->nr_parity, + s->nr_parity, &nr_have_parity, &have_cache, 0, BCH_DATA_parity, @@ -1937,14 +1935,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data + h->s->nr_parity, - h->s->nr_data); - BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data + s->nr_parity, + s->nr_data); + BUG_ON(j >= s->nr_data + s->nr_parity); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -1952,11 +1950,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ } buckets.nr = 0; - if (nr_have_data < h->s->nr_data) { + if (nr_have_data < s->nr_data) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->block_stripe, &devs, - h->s->nr_data, + s->nr_data, &nr_have_data, &have_cache, 0, BCH_DATA_user, @@ -1964,13 +1962,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data, 0); - BUG_ON(j >= h->s->nr_data); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data, 0); + BUG_ON(j >= s->nr_data); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -2016,12 +2014,54 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } -static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) +{ + struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; + unsigned i; + + BUG_ON(existing_v->nr_redundant != s->nr_parity); + s->nr_data = existing_v->nr_blocks - + existing_v->nr_redundant; + + int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); + if (ret) { + bch2_stripe_close(c, s); + return ret; + } + + BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ + for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); + s->blocks[i] = 0; + } + memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); + memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); + + for (unsigned i = 0; i < existing_v->nr_blocks; i++) { + if (stripe_blockcount_get(existing_v, i)) { + __set_bit(i, s->blocks_gotten); + __set_bit(i, s->blocks_allocated); + } + + ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); + } + + bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); + s->have_existing_stripe = true; + + return 0; +} + +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; - struct bch_stripe *existing_v; - unsigned i; s64 idx; int ret; @@ -2033,56 +2073,19 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri if (idx < 0) return -BCH_ERR_stripe_alloc_blocked; - ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); + ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, "reading stripe key: %s", bch2_err_str(ret)); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); return ret; } - existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; - - BUG_ON(existing_v->nr_redundant != h->s->nr_parity); - h->s->nr_data = existing_v->nr_blocks - - existing_v->nr_redundant; - - ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); - if (ret) { - bch2_stripe_close(c, h->s); - return ret; - } - - BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); - - /* - * Free buckets we initially allocated - they might conflict with - * blocks from the stripe we're reusing: - */ - for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { - bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); - h->s->blocks[i] = 0; - } - memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); - memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); - - for (i = 0; i < existing_v->nr_blocks; i++) { - if (stripe_blockcount_get(existing_v, i)) { - __set_bit(i, h->s->blocks_gotten); - __set_bit(i, h->s->blocks_allocated); - } - - ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); - } - - bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); - h->s->have_existing_stripe = true; - - return 0; + return init_new_stripe_from_existing(c, s); } -static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -2091,15 +2094,19 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - if (!h->s->res.sectors) { - ret = bch2_disk_reservation_get(c, &h->s->res, + if (!s->res.sectors) { + ret = bch2_disk_reservation_get(c, &s->res, h->blocksize, - h->s->nr_parity, + s->nr_parity, BCH_DISK_RESERVATION_NOFAIL); if (ret) return ret; } + /* + * Allocate stripe slot + * XXX: we're going to need a bitrange btree of free stripes + */ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { @@ -2114,7 +2121,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } if (bkey_deleted(k.k) && - bch2_try_open_stripe(c, h->s, k.k->p.offset)) + bch2_try_open_stripe(c, s, k.k->p.offset)) break; } @@ -2125,16 +2132,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st ret = ec_stripe_mem_alloc(trans, &iter); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); goto err; } - h->s->new_stripe.key.k.p = iter.pos; + s->new_stripe.key.k.p = iter.pos; out: bch2_trans_iter_exit(trans, &iter); return ret; err: - bch2_disk_reservation_put(c, &h->s->res); + bch2_disk_reservation_put(c, &s->res); goto out; } @@ -2165,22 +2172,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, return h; if (!h->s) { - ret = ec_new_stripe_alloc(c, h); - if (ret) { + h->s = ec_new_stripe_alloc(c, h); + if (!h->s) { + ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; bch_err(c, "failed to allocate new stripe"); goto err; } + + h->nr_created++; } - if (h->s->allocated) + struct ec_stripe_new *s = h->s; + + if (s->allocated) goto allocated; - if (h->s->have_existing_stripe) + if (s->have_existing_stripe) goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2192,15 +2204,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * existing stripe: */ while (1) { - ret = __bch2_ec_stripe_head_reuse(trans, h); + ret = __bch2_ec_stripe_head_reuse(trans, h, s); if (!ret) break; if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; goto allocate_buf; @@ -2218,19 +2230,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, watermark, cl); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); if (ret) goto err; allocate_buf: - ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); if (ret) goto err; - h->s->allocated = true; + s->allocated = true; allocated: - BUG_ON(!h->s->idx); - BUG_ON(!h->s->new_stripe.data[0]); + BUG_ON(!s->idx); + BUG_ON(!s->new_stripe.data[0]); BUG_ON(trans->restarted); return h; err: From cca1dff8fea3f4ebe8a7f39a109d14a0f136d319 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2024 22:53:09 -0400 Subject: [PATCH 027/233] bcachefs: -o norecovery now bails out of recovery earlier -o norecovery (used by the dump tool) should be doing the absolute minimum amount of work to get the filesystem up and readable; we shouldn't be running check and repair code, or going read-write. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 0e5a53541ce4..bc2fd174bb32 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -690,8 +690,13 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.norecovery) - c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; + if (c->opts.norecovery) { + c->opts.recovery_pass_last = c->opts.recovery_pass_last + ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) + : BCH_RECOVERY_PASS_snapshots_read; + c->opts.nochanges = true; + c->opts.read_only = true; + } mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); From a99869f0b74e8ced83ece54c3f1645363fe8214c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 11 Oct 2024 22:50:48 -0400 Subject: [PATCH 028/233] bcachefs: bch2_journal_meta() takes ref on c->writes This part of addressing https://github.com/koverstreet/bcachefs/issues/656 where we're getting stuck in bch2_journal_meta() in the dump tool. We shouldn't be invoking the journal without a ref on c->writes (if we're not RW), and there's no reason for the dump tool to be going read-write. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/journal.c | 27 +++++++++++++++++---------- fs/bcachefs/recovery.c | 4 +--- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index fbd89f91625d..d4d95ef6791f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -688,6 +688,7 @@ struct btree_trans_buf { ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) #define BCH_WRITE_REFS() \ + x(journal) \ x(trans) \ x(write) \ x(promote) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2dc0d60c1745..2cf8f24d50cc 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -831,19 +831,14 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) return ret; } -int bch2_journal_meta(struct journal *j) +static int __bch2_journal_meta(struct journal *j) { - struct journal_buf *buf; - struct journal_res res; - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + struct journal_res res = {}; + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); if (ret) return ret; - buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); buf->must_flush = true; if (!buf->flush_time) { @@ -856,6 +851,18 @@ int bch2_journal_meta(struct journal *j) return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); } +int bch2_journal_meta(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) + return -EROFS; + + int ret = __bch2_journal_meta(j); + bch2_write_ref_put(c, BCH_WRITE_REF_journal); + return ret; +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) @@ -1193,7 +1200,7 @@ void bch2_fs_journal_stop(struct journal *j) * Always write a new journal entry, to make sure the clock hands are up * to date (and match the superblock) */ - bch2_journal_meta(j); + __bch2_journal_meta(j); journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bc2fd174bb32..431698189090 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -910,11 +910,9 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_accounting_replay_done, &c->flags); /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); } /* If we fixed errors, verify that fs is actually clean now: */ From c4bfe7049c62651c7e03210760529e2fab9a7706 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 12 Oct 2024 14:07:44 -0400 Subject: [PATCH 029/233] bcachefs: Fix warning about passing flex array member by value this showed up when building in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 38b563113cfb..55a00018dc8b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -244,10 +244,10 @@ void bch2_accounting_swab(struct bkey_s k) } static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, - struct disk_accounting_pos acc) + struct disk_accounting_pos *acc) { - unsafe_memcpy(r, &acc.replicas, - replicas_entry_bytes(&acc.replicas), + unsafe_memcpy(r, &acc->replicas, + replicas_entry_bytes(&acc->replicas), "variable length struct"); } @@ -258,7 +258,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - __accounting_to_replicas(r, acc_k); + __accounting_to_replicas(r, &acc_k); return true; default: return false; @@ -626,7 +626,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, switch (acc.type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, acc); + __accounting_to_replicas(&r.e, &acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && From b6a562e6d87918faaacea4999d47ae4e0da2f5f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Oct 2024 21:35:44 -0400 Subject: [PATCH 030/233] bcachefs: Add block plugging to read paths This will help with some of the btree_trans srcu lock hold time warnings that are still turning up; submit_bio() can block for awhile if the device is sufficiently congested. It's not a perfect solution since blk_plug bios are submitted when scheduling; we might want a way to disable the "submit on context switch" behaviour, or switch to our own plugging in the future. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 19 ++++++++++++++++++- fs/bcachefs/fs-io-direct.c | 5 +++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 95972809e76d..0923f38a2fcd 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -248,6 +248,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_io_opts opts; struct folio *folio; struct readpages_iter readpages_iter; + struct blk_plug plug; bch2_inode_opts_get(&opts, c, &inode->ei_inode); @@ -255,6 +256,16 @@ void bch2_readahead(struct readahead_control *ractl) if (ret) return; + /* + * Besides being a general performance optimization, plugging helps with + * avoiding btree transaction srcu warnings - submitting a bio can + * block, and we don't want todo that with the transaction locked. + * + * However, plugged bios are submitted when we schedule; we ideally + * would have our own scheduler hook to call unlock_long() before + * scheduling. + */ + blk_start_plug(&plug); bch2_pagecache_add_get(inode); struct btree_trans *trans = bch2_trans_get(c); @@ -281,7 +292,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_put(trans); bch2_pagecache_add_put(inode); - + blk_finish_plug(&plug); darray_exit(&readpages_iter.folios); } @@ -296,9 +307,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; struct bch_io_opts opts; + struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); + BUG_ON(folio_test_uptodate(folio)); + BUG_ON(folio_test_dirty(folio)); + if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; @@ -313,7 +328,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); + blk_finish_plug(&plug); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 6d3a05ae5da8..2089c36b5866 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; + struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); size_t shorten; @@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) */ dio->should_dirty = iter_is_iovec(iter); + blk_start_plug(&plug); + goto start; while (iter->count) { bio = bio_alloc_bioset(NULL, @@ -160,6 +163,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } + blk_finish_plug(&plug); + iter->count += shorten; if (sync) { From ce612d0d48ce2143fc0394c7fbf5eb1f5944a25f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 19:02:44 -0400 Subject: [PATCH 031/233] bcachefs: Add version check for bch_btree_ptr_v2.sectors_written validate A user popped up with a very old (0.11) filesystem that needed repair and wasn't recently backed up. Reported-by: Manoa Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 37e3d69bec06..85b98c782e1b 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -203,7 +203,8 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_v2_min_key_bad, "min_key > key"); - if (flags & BCH_VALIDATE_write) + if ((flags & BCH_VALIDATE_write) && + c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) bkey_fsck_err_on(!bp.v->sectors_written, c, btree_ptr_v2_written_0, "sectors_written == 0"); From d953b409bc8685f21315799bc3b32448945cc27f Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 19 Oct 2024 14:25:27 +0200 Subject: [PATCH 032/233] bcachefs: Use str_write_read() helper function Remove hard-coded strings by using the helper function str_write_read(). Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9bc0caa9d5e4..768a3b950997 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,8 @@ #include "sb-clean.h" #include "trace.h" +#include + void bch2_journal_pos_from_member_info_set(struct bch_fs *c) { lockdep_assert_held(&c->sb_lock); @@ -666,7 +668,7 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); + prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); } static int journal_entry_dev_usage_validate(struct bch_fs *c, From dfce49250859cf87c267cf3952dadf8d702ff674 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 20 Oct 2024 13:20:46 +0200 Subject: [PATCH 033/233] bcachefs: Use str_write_read() helper in ec_block_endio() Remove hard-coded strings by using the helper function str_write_read(). Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 6e855fe888c2..015107e241cc 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -26,6 +26,7 @@ #include "util.h" #include +#include #ifdef __KERNEL__ @@ -732,7 +733,7 @@ static void ec_block_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "erasure coding %s error: %s", - bio_data_dir(bio) ? "write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); From 342a485f4e29ecfdcf979af0b2e609cbdc52701d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 26 Oct 2024 12:47:23 +0200 Subject: [PATCH 034/233] bcachefs: Use str_write_read() helper in write_super_endio() Remove hard-coded strings by using the str_write_read() helper function. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7c71594f6a8b..c83bd3dedb1b 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -23,6 +23,7 @@ #include #include +#include static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { }; @@ -878,7 +879,7 @@ static void write_super_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "superblock %s error: %s", - bio_data_dir(bio) ? "write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; From caaf27392849abbc6499196fcd114da3f11e4e0f Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 26 Oct 2024 17:47:04 +0200 Subject: [PATCH 035/233] bcachefs: Annotate struct bucket_gens with __counted_by() Add the __counted_by compiler attribute to the flexible array member b to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Use struct_size() to calculate the number of bytes to be allocated. Update bucket_gens->nbuckets and bucket_gens->nbuckets_minus_first when resizing. Compile-tested only. Signed-off-by: Thorsten Blum Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 13 ++++++++----- fs/bcachefs/buckets_types.h | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ec7d9a59bea9..8bd17667e243 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1266,8 +1266,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) BUG_ON(resize && ca->buckets_nouse); - if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets), + GFP_KERNEL|__GFP_ZERO); + if (!bucket_gens) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } @@ -1285,11 +1286,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - + bucket_gens->nbuckets = min(bucket_gens->nbuckets, + old_bucket_gens->nbuckets); + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; memcpy(bucket_gens->b, old_bucket_gens->b, - n); + bucket_gens->nbuckets); } rcu_assign_pointer(ca->bucket_gens, bucket_gens); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 28bd09a253c8..7174047b8e92 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -24,7 +24,7 @@ struct bucket_gens { u16 first_bucket; size_t nbuckets; size_t nbuckets_minus_first; - u8 b[]; + u8 b[] __counted_by(nbuckets); }; struct bch_dev_usage { From 104688e7ed283a31735c61f4c3f95c339df42f8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:11:29 -0400 Subject: [PATCH 036/233] bcachefs: avoid 'unsigned flags' flags should have actual types, where possible: fix btree_update.h helpers Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 70b3c989fac2..7e71c4d1111d 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -244,7 +244,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags, + struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags, unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); @@ -261,8 +262,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str return mut; } -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags) +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); } @@ -274,7 +276,8 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags|BTREE_ITER_intent, type); @@ -289,7 +292,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); } @@ -297,7 +300,8 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); @@ -318,7 +322,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned min_bytes) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); } @@ -326,7 +331,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); } @@ -337,7 +342,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - unsigned flags, unsigned type, unsigned val_size) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned val_size) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); int ret; From e86231fc642b4d428c941fbd8285ca66873375db Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:16:16 -0400 Subject: [PATCH 037/233] bcachefs: use bch2_data_update_opts_to_text() in trace_move_extent_fail() Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 8e75a852b358..5ea432bc0052 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -110,11 +110,8 @@ static void trace_move_extent_fail2(struct data_update *m, { struct bch_fs *c = m->op.c; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - const union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct extent_ptr_decoded p; struct printbuf buf = PRINTBUF; - unsigned i, rewrites_found = 0; + unsigned rewrites_found = 0; if (!trace_move_extent_fail_enabled()) return; @@ -122,27 +119,25 @@ static void trace_move_extent_fail2(struct data_update *m, prt_str(&buf, msg); if (insert) { - i = 0; + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) - rewrites_found |= 1U << i; - i++; + rewrites_found |= ptr_bit; + ptr_bit <<= 1; } } - prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", - (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); + prt_str(&buf, "rewrites found:\t"); + bch2_prt_u64_base2(&buf, rewrites_found); + prt_newline(&buf); - prt_printf(&buf, "\nrewrites found: %u%u%u%u", - (rewrites_found & (1 << 0)) != 0, - (rewrites_found & (1 << 1)) != 0, - (rewrites_found & (1 << 2)) != 0, - (rewrites_found & (1 << 3)) != 0); + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, old); From 7813e014b5c83791825eff8850c42bd9dfe25471 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:21:43 -0400 Subject: [PATCH 038/233] bcachefs: bch2_io_opts_fixups() Centralize some io path option fixups - they weren't always being applied correctly: - background_compression uses compression if unset - background_target uses foreground_target if unset - nocow disables most fancy io path options Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 4 ++-- fs/bcachefs/extents.c | 2 +- fs/bcachefs/inode.c | 4 ++-- fs/bcachefs/opts.c | 5 ++++- fs/bcachefs/opts.h | 12 ++++++++++-- fs/bcachefs/rebalance.c | 4 ++-- 6 files changed, 21 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 5ea432bc0052..a176e5439cbf 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -535,7 +535,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_newline(out); prt_str(out, "compression:\t"); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); + bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); prt_str(out, "opts.replicas:\t"); @@ -647,7 +647,7 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = background_compression(io_opts); + m->op.compression_opt = io_opts.background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; unsigned durability_have = 0, durability_removing = 0; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 85b98c782e1b..45a67daf0d64 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1504,7 +1504,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bkey_s k = bkey_i_to_s(_k); struct bch_extent_rebalance *r; unsigned target = opts->background_target; - unsigned compression = background_compression(*opts); + unsigned compression = opts->background_compression; bool needs_rebalance; if (!bkey_extent_is_direct_data(k.k)) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 43653cf050e9..fbdd11802bdf 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -14,6 +14,7 @@ #include "extent_update.h" #include "fs.h" #include "inode.h" +#include "opts.h" #include "str_hash.h" #include "snapshot.h" #include "subvolume.h" @@ -1145,8 +1146,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, BCH_INODE_OPTS() #undef x - if (opts->nocow) - opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; + bch2_io_opts_fixups(opts); } int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 0e2ee262fbd4..1f5843284e9e 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -710,11 +710,14 @@ void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { - return (struct bch_io_opts) { + struct bch_io_opts opts = { #define x(_name, _bits) ._name = src._name, BCH_INODE_OPTS() #undef x }; + + bch2_io_opts_fixups(&opts); + return opts; } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 23dda014e331..dd27ef556611 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -626,9 +626,17 @@ struct bch_io_opts { #undef x }; -static inline unsigned background_compression(struct bch_io_opts opts) +static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) { - return opts.background_compression ?: opts.compression; + if (!opts->background_target) + opts->background_target = opts->foreground_target; + if (!opts->background_compression) + opts->background_compression = opts->compression; + if (opts->nocow) { + opts->compression = opts->background_compression = 0; + opts->data_checksum = 0; + opts->erasure_code = 0; + } } struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index cd6647374353..2f93ae8781bb 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -257,12 +257,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, if (k.k->p.inode) { target = io_opts->background_target; - compression = background_compression(*io_opts); + compression = io_opts->background_compression; } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : background_compression(*io_opts); + compression = r ? r->compression : io_opts->background_compression; } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); From 88c26043ea0b7fb1d68b5e7ca2befc1e3077020b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:32:55 -0400 Subject: [PATCH 039/233] bcachefs: small cleanup for extent ptr bitmasks Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 30 +++++++++++++++--------------- fs/bcachefs/extents.c | 12 ++++++------ fs/bcachefs/io_read.c | 6 +++--- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index a176e5439cbf..90da57a26962 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -189,7 +189,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct bpos next_pos; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned rewrites_found = 0, durability, i; + unsigned rewrites_found = 0, durability, ptr_bit; bch2_trans_begin(trans); @@ -226,16 +226,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * * Fist, drop rewrite_ptrs from @new: */ - i = 0; + ptr_bit = 1; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { bch2_extent_ptr_set_cached(c, &m->op.opts, bkey_i_to_s(insert), ptr); - rewrites_found |= 1U << i; + rewrites_found |= ptr_bit; } - i++; + ptr_bit <<= 1; } if (m->data_opts.rewrite_ptrs && @@ -609,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; /* @@ -652,17 +652,17 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned durability_have = 0, durability_removing = 0; - i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached) { rcu_read_lock(); - if (BIT(i) & m->data_opts.rewrite_ptrs) { + if (ptr_bit & m->data_opts.rewrite_ptrs) { if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { + } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } @@ -682,7 +682,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - i++; + ptr_bit <<= 1; } unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); @@ -745,14 +745,14 @@ int bch2_data_update_init(struct btree_trans *trans, void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { - if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { - opts->kill_ptrs |= 1U << i; - opts->rewrite_ptrs ^= 1U << i; + if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { + opts->kill_ptrs |= ptr_bit; + opts->rewrite_ptrs ^= ptr_bit; } - i++; + ptr_bit <<= 1; } } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 45a67daf0d64..243cb15b74b3 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1414,7 +1414,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, unsigned compression_type = bch2_compression_opt_to_type(compression); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || @@ -1424,18 +1424,18 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, } if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= 1U << i; - i++; + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; } } incompressible: if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) - rewrite_ptrs |= 1U << i; - i++; + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; } } diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index b3b934a87c6d..cbc3cc1f6d03 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -231,11 +231,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, update_opts.target = opts.foreground_target; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= BIT(i); - i++; + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; } } From ef92eb2a63e2a1ae573455c11784519fee7bd70c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 01:40:19 -0400 Subject: [PATCH 040/233] bcachefs: kill bch2_bkey_needs_rebalance() Dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 17 ----------------- fs/bcachefs/extents.h | 1 - 2 files changed, 18 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 243cb15b74b3..6ad5ff7c8239 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1442,23 +1442,6 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, return rewrite_ptrs; } -bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - /* - * If it's an indirect extent, we don't delete the rebalance entry when - * done so that we know what options were applied - check if it still - * needs work done: - */ - if (r && - k.k->type == KEY_TYPE_reflink_v && - !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) - r = NULL; - - return r != NULL; -} - static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, unsigned target, unsigned compression) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index bcffcf60aaaf..9374599b384d 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -713,7 +713,6 @@ void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); -bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, From 48cc0c6bed2eb17b1bba38273cb257533a83c408 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:14:53 -0400 Subject: [PATCH 041/233] bcachefs: kill __bch2_bkey_sectors_need_rebalance() Single caller, fold into bch2_bkey_sectors_need_rebalance() Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6ad5ff7c8239..5ec0fec597f7 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1442,16 +1442,19 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, return rewrite_ptrs; } -static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, - unsigned target, unsigned compression) +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { + const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + if (!opts) + return 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; - if (compression) { - unsigned compression_type = bch2_compression_opt_to_type(compression); + if (opts->compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->compression); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || @@ -1465,22 +1468,16 @@ static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c } } incompressible: - if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + if (opts->target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->target)) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->target)) sectors += p.crc.compressed_size; } return sectors; } -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; -} - int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, struct bch_io_opts *opts) { From 875a3128db4cabe615187b9ca841b85bde923fd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:21:28 -0400 Subject: [PATCH 042/233] bcachefs: rename bch_extent_rebalance fields to match other opts structs Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 36 ++++++++++++++++++------------------ fs/bcachefs/extents_format.h | 8 ++++---- fs/bcachefs/rebalance.c | 12 ++++++------ 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 5ec0fec597f7..ee0e86f0becd 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1161,11 +1161,11 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, prt_str(out, "rebalance: target "); if (c) - bch2_target_to_text(out, c, r->target); + bch2_target_to_text(out, c, r->background_target); else - prt_printf(out, "%u", r->target); + prt_printf(out, "%u", r->background_target); prt_str(out, " compression "); - bch2_compression_opt_to_text(out, r->compression); + bch2_compression_opt_to_text(out, r->background_compression); break; } default: @@ -1453,8 +1453,8 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; u64 sectors = 0; - if (opts->compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->compression); + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || @@ -1468,10 +1468,10 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->target)) { + if (opts->background_target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->target)) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; } @@ -1500,14 +1500,14 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, * they're referenced by different inodes with different * options: */ - if (r->target) - target = r->target; - if (r->compression) - compression = r->compression; + if (r->background_target) + target = r->background_target; + if (r->background_compression) + compression = r->background_compression; } - r->target = target; - r->compression = compression; + r->background_target = target; + r->background_compression = compression; } needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); @@ -1515,10 +1515,10 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, if (needs_rebalance && !r) { union bch_extent_entry *new = bkey_val_end(k); - new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; - new->rebalance.compression = compression; - new->rebalance.target = target; - new->rebalance.unused = 0; + new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; + new->rebalance.background_compression = compression; + new->rebalance.background_target = target; + new->rebalance.unused = 0; k.k->u64s += extent_entry_u64s(new); } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { /* diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 3bd2fdbb0817..2cc3e60f3b12 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -205,11 +205,11 @@ struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; + background_compression:8, /* enum bch_compression_opt */ + background_target:16; #elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, + __u64 background_target:16, + background_compression:8, unused:34, type:6; #endif diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 2f93ae8781bb..dc70e6feaf79 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -157,8 +157,8 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, memset(data_opts, 0, sizeof(*data_opts)); data_opts->rewrite_ptrs = - bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); - data_opts->target = r->target; + bch2_bkey_ptrs_need_rebalance(c, k, r->background_target, r->background_compression); + data_opts->target = r->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { @@ -179,9 +179,9 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct printbuf buf = PRINTBUF; prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, r->target); + bch2_target_to_text(&buf, c, r->background_target); prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, r->compression); + bch2_compression_opt_to_text(&buf, r->background_compression); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); @@ -261,8 +261,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, } else { const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - target = r ? r->target : io_opts->background_target; - compression = r ? r->compression : io_opts->background_compression; + target = r ? r->background_target : io_opts->background_target; + compression = r ? r->background_compression : io_opts->background_compression; } data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); From bdad263284332336642784929b9e4bf3887560b8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:28:51 -0400 Subject: [PATCH 043/233] bcachefs: io_opts_to_rebalance_opts() New helper to simplify bch2_bkey_set_needs_rebalance() Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/extents.c | 60 ++++++++++-------------------------- fs/bcachefs/extents.h | 3 +- fs/bcachefs/extents_format.h | 5 +++ fs/bcachefs/io_misc.c | 2 +- fs/bcachefs/io_write.c | 2 +- fs/bcachefs/opts.h | 13 ++++++++ fs/bcachefs/reflink.c | 2 +- 8 files changed, 39 insertions(+), 50 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 90da57a26962..e4af2ccdf4c8 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -357,7 +357,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: + bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index ee0e86f0becd..a134aa5a76bb 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1478,55 +1478,27 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) return sectors; } -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - struct bch_io_opts *opts) +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_i *_k) { - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *r; - unsigned target = opts->background_target; - unsigned compression = opts->background_compression; - bool needs_rebalance; - - if (!bkey_extent_is_direct_data(k.k)) + if (!bkey_extent_is_direct_data(&_k->k)) return 0; - /* get existing rebalance entry: */ - r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (r) { - if (k.k->type == KEY_TYPE_reflink_v) { - /* - * indirect extents: existing options take precedence, - * so that we don't move extents back and forth if - * they're referenced by different inodes with different - * options: - */ - if (r->background_target) - target = r->background_target; - if (r->background_compression) - compression = r->background_compression; + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + + if (k.k->type == KEY_TYPE_reflink_v || + bch2_bkey_ptrs_need_rebalance(c, k.s_c, opts->background_target, opts->background_compression)) { + if (!old) { + old = bkey_val_end(k); + k.k->u64s += sizeof(*old) / sizeof(u64); } - r->background_target = target; - r->background_compression = compression; - } - - needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); - - if (needs_rebalance && !r) { - union bch_extent_entry *new = bkey_val_end(k); - - new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; - new->rebalance.background_compression = compression; - new->rebalance.background_target = target; - new->rebalance.unused = 0; - k.k->u64s += extent_entry_u64s(new); - } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { - /* - * For indirect extents, don't delete the rebalance entry when - * we're finished so that we know we specifically moved it or - * compressed it to its current location/compression type - */ - extent_entry_drop(k, (union bch_extent_entry *) r); + *old = io_opts_to_rebalance_opts(opts); + } else { + if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); } return 0; diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 9374599b384d..97af0d6e4319 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -715,8 +715,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - struct bch_io_opts *); +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); /* Generic extent code: */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 2cc3e60f3b12..520697f236c0 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -215,6 +215,11 @@ struct bch_extent_rebalance { #endif }; +/* subset of BCH_INODE_OPTS */ +#define BCH_REBALANCE_OPTS() \ + x(background_compression) \ + x(background_target) + union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 unsigned long type; diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index f283051758d6..e2acf21ac9b0 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -461,7 +461,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 96720adcfee0..f2f69e5e0910 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -369,7 +369,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index dd27ef556611..13555bc35f00 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -642,4 +642,17 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); +/* rebalance opts: */ + +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) +{ + return (struct bch_extent_rebalance) { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + ._name = opts->_name, + BCH_REBALANCE_OPTS() +#undef x + }; +}; + #endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index f457925fa362..8a36ebd9dd9c 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -547,7 +547,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, From fc23ffb93c5ba25558186bb77216ad0d1baf59b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 23:26:11 -0400 Subject: [PATCH 044/233] bcachefs: Add bch_io_opts fields for indicating whether the opts came from the inode This is going to be used in the bch_extent_rebalance improvements, which propagate io_path options into the extent (important for rebalance, which needs something present in the extent for transactionally tagging them in the rebalance_work btree, and also for indirect extents). By tracking in bch_extent_rebalance whether the option came from the filesystem or the inode we can correctly handle options being changed on indirect extents. Signed-off-by: Kent Overstreet --- fs/bcachefs/inode.c | 8 +++++++- fs/bcachefs/opts.h | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index fbdd11802bdf..5dd9d3edae77 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -1142,7 +1142,13 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, struct bch_inode_unpacked *inode) { -#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); +#define x(_name, _bits) \ + if ((inode)->bi_##_name) { \ + opts->_name = inode->bi_##_name - 1; \ + opts->_name##_from_inode = true; \ + } else { \ + opts->_name = c->opts._name; \ + } BCH_INODE_OPTS() #undef x diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 13555bc35f00..918eb6730117 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -624,6 +624,9 @@ struct bch_io_opts { #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x +#define x(_name, _bits) u64 _name##_from_inode:1; + BCH_INODE_OPTS() +#undef x }; static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) From 86066b111be8a1ef502332ab0511d2cf65766bcd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2024 01:06:53 -0400 Subject: [PATCH 045/233] bcachefs: copygc_enabled, rebalance_enabled now opts.h options They can now be set at mount time Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 -- fs/bcachefs/movinggc.c | 4 ++-- fs/bcachefs/opts.h | 12 ++++++++++++ fs/bcachefs/rebalance.c | 4 ++-- fs/bcachefs/rebalance_types.h | 2 -- fs/bcachefs/super.c | 3 --- fs/bcachefs/sysfs.c | 31 +++++++------------------------ 7 files changed, 23 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d4d95ef6791f..e1ab67c533f0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1096,8 +1096,6 @@ struct bch_fs { u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; - unsigned copy_gc_enabled:1; - struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index d658be90f737..725292d69fd6 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -350,9 +350,9 @@ static int bch2_copygc_thread(void *arg) bch2_trans_unlock_long(ctxt.trans); cond_resched(); - if (!c->copy_gc_enabled) { + if (!c->opts.copygc_enabled) { move_buckets_wait(&ctxt, buckets, true); - kthread_wait_freezable(c->copy_gc_enabled || + kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 918eb6730117..e0e23c29c2d6 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -473,6 +473,18 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ + x(copygc_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable copygc: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ + x(rebalance_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable rebalance: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index dc70e6feaf79..d8cb346ac138 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -338,9 +338,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!r->enabled) { + if (!c->opts.rebalance_enabled) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(r->enabled || + kthread_wait_freezable(c->opts.rebalance_enabled || kthread_should_stop()); } diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 0fffb536c1d0..fe5098c17dfc 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -30,8 +30,6 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; - - unsigned enabled:1; }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d6411324cd3f..7e2431de3a94 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -810,9 +810,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->vfs_inodes_list); mutex_init(&c->vfs_inodes_lock); - c->copy_gc_enabled = 1; - c->rebalance.enabled = 1; - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 3270bfab9466..4ab0ccba2ab5 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -213,10 +213,8 @@ BCH_PERSISTENT_COUNTERS() rw_attribute(discard); rw_attribute(label); -rw_attribute(copy_gc_enabled); read_attribute(copy_gc_wait); -rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); @@ -340,9 +338,6 @@ SHOW(bch2_fs) if (attr == &sysfs_gc_gens_pos) bch2_gc_gens_pos_to_text(out, c); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - - sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ if (attr == &sysfs_copy_gc_wait) @@ -419,23 +414,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - if (attr == &sysfs_copy_gc_enabled) { - ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) - ?: (ssize_t) size; - - if (c->copygc_thread) - wake_up_process(c->copygc_thread); - return ret; - } - - if (attr == &sysfs_rebalance_enabled) { - ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) - ?: (ssize_t) size; - - rebalance_wakeup(c); - return ret; - } - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); /* Debugging: */ @@ -611,10 +589,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_gc_gens_pos, - &sysfs_copy_gc_enabled, &sysfs_copy_gc_wait, - &sysfs_rebalance_enabled, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, @@ -683,6 +659,13 @@ STORE(bch2_fs_opts_dir) (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); + if (v && id == Opt_rebalance_enabled) + rebalance_wakeup(c); + + if (v && id == Opt_copygc_enabled && + c->copygc_thread) + wake_up_process(c->copygc_thread); + ret = size; err: bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); From e1d67b5d67f83789f0cf57d8c822941a5050a454 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 01:14:53 -0400 Subject: [PATCH 046/233] bcachefs: bch2_prt_csum_opt() bounds checking helper Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 2 +- fs/bcachefs/checksum.h | 2 +- fs/bcachefs/opts.c | 3 ++- fs/bcachefs/opts.h | 7 ++++--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c5e3824d5771..dc14bfe37e3b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1030,7 +1030,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) x(crc64, 2) \ x(xxhash, 3) -enum bch_csum_opts { +enum bch_csum_opt { #define x(t, n) BCH_CSUM_OPT_##t = n, BCH_CSUM_OPTS() #undef x diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index e40499fde9a4..43b9d71f2f2b 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -109,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool); void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, bool data) { switch (type) { diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 1f5843284e9e..49c59aec6954 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -48,7 +48,7 @@ static const char * const __bch2_csum_types[] = { NULL }; -const char * const bch2_csum_opts[] = { +const char * const __bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; @@ -113,6 +113,7 @@ void bch2_prt_##name(struct printbuf *out, type t) \ PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index e0e23c29c2d6..f6dc0628b025 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,7 +16,7 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_opts[]; +extern const char * const __bch2_csum_opts[]; extern const char * const bch2_compression_opts[]; extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; @@ -27,6 +27,7 @@ extern const char * const bch2_d_types[]; void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); void bch2_prt_data_type(struct printbuf *, enum bch_data_type); +void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); @@ -171,12 +172,12 @@ enum fsck_err_opts { "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ From a5fe1d1656b8471db2cd0854062977245aadd80a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 21:41:20 -0400 Subject: [PATCH 047/233] bcachefs: New bch_extent_rebalance fields - Add more io path options to bch_extent_rebalance - For each option, track whether it came from the filesystem or the inode This will be used for improved rebalance support for reflinked data. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 65 ++++++++++++++++++++++++++++++------ fs/bcachefs/extents_format.h | 34 +++++++++++++++++-- fs/bcachefs/opts.h | 3 +- 3 files changed, 87 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index a134aa5a76bb..467ffed0809e 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1121,6 +1121,57 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_cr bch2_prt_compression_type(out, crc->compression_type); } +static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_extent_rebalance *r) +{ + prt_str(out, "rebalance:"); + + prt_printf(out, " replicas=%u", r->data_replicas); + if (r->data_replicas_from_inode) + prt_str(out, " (inode)"); + + prt_str(out, " checksum="); + bch2_prt_csum_opt(out, r->data_checksum); + if (r->data_checksum_from_inode) + prt_str(out, " (inode)"); + + if (r->background_compression || r->background_compression_from_inode) { + prt_str(out, " background_compression="); + bch2_compression_opt_to_text(out, r->background_compression); + + if (r->background_compression_from_inode) + prt_str(out, " (inode)"); + } + + if (r->background_target || r->background_target_from_inode) { + prt_str(out, " background_target="); + if (c) + bch2_target_to_text(out, c, r->background_target); + else + prt_printf(out, "%u", r->background_target); + + if (r->background_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->promote_target || r->promote_target_from_inode) { + prt_str(out, " promote_target="); + if (c) + bch2_target_to_text(out, c, r->promote_target); + else + prt_printf(out, "%u", r->promote_target); + + if (r->promote_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->erasure_code || r->erasure_code_from_inode) { + prt_printf(out, " ec=%u", r->erasure_code); + if (r->erasure_code_from_inode) + prt_str(out, " (inode)"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -1156,18 +1207,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, (u64) ec->idx, ec->block); break; } - case BCH_EXTENT_ENTRY_rebalance: { - const struct bch_extent_rebalance *r = &entry->rebalance; - - prt_str(out, "rebalance: target "); - if (c) - bch2_target_to_text(out, c, r->background_target); - else - prt_printf(out, "%u", r->background_target); - prt_str(out, " compression "); - bch2_compression_opt_to_text(out, r->background_compression); + case BCH_EXTENT_ENTRY_rebalance: + bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; - } + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 520697f236c0..222eed6b46d8 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -204,21 +204,49 @@ struct bch_extent_stripe_ptr { struct bch_extent_rebalance { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:6, - unused:34, + unused:3, + + promote_target_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + background_compression_from_inode:1, + data_replicas_from_inode:1, + background_target_from_inode:1, + + promote_target:16, + erasure_code:1, + data_checksum:4, + data_replicas:4, background_compression:8, /* enum bch_compression_opt */ background_target:16; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 background_target:16, background_compression:8, - unused:34, + data_replicas:4, + data_checksum:4, + erasure_code:1, + promote_target:16, + + background_target_from_inode:1, + data_replicas_from_inode:1, + background_compression_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, + + unused:3, type:6; #endif }; /* subset of BCH_INODE_OPTS */ #define BCH_REBALANCE_OPTS() \ + x(data_checksum) \ x(background_compression) \ - x(background_target) + x(data_replicas) \ + x(promote_target) \ + x(background_target) \ + x(erasure_code) union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index f6dc0628b025..39cdc185fa73 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -665,7 +665,8 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_i return (struct bch_extent_rebalance) { .type = BIT(BCH_EXTENT_ENTRY_rebalance), #define x(_name) \ - ._name = opts->_name, + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, BCH_REBALANCE_OPTS() #undef x }; From e2aeaaa5c9eee014aba96c54c6730d016c00ca67 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 20:53:53 -0400 Subject: [PATCH 048/233] bcachefs: bch2_write_inode() now checks for changing rebalance options Previously, BCHFS_IOC_REINHERIT_ATTRS didn't trigger rebalance scans when changing rebalance options - it had been missed, only the xattr interface triggered them. Ideally they'd be done by the transactional trigger, but unpacking the inode to get the options is too heavy to be done in the low level trigger - the inode trigger is run on every extent update, since the bch_inode.bi_journal_seq has to be updated for fsync. bch2_write_inode() is a good compromise, it already unpacks and repacks and is not run in any super-fast paths. Additionally, creating the new rebalance entry to trigger the scan is now done in the same transaction as the inode update that changed the options. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 26 +++++++++++++++++++++----- fs/bcachefs/inode.h | 8 ++++++++ fs/bcachefs/rebalance.c | 4 ++-- fs/bcachefs/rebalance.h | 1 + fs/bcachefs/xattr.c | 7 ------- 5 files changed, 32 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 646b74494a3f..e0ffe4648bb8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -23,6 +23,7 @@ #include "journal.h" #include "keylist.h" #include "quota.h" +#include "rebalance.h" #include "snapshot.h" #include "super.h" #include "xattr.h" @@ -89,10 +90,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(trans); - ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent) ?: - (set ? set(trans, inode, &inode_u, p) : 0) ?: - bch2_inode_write(trans, &iter, &inode_u) ?: + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); + if (ret) + goto err; + + struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + ret = (set ? set(trans, inode, &inode_u, p) : 0); + if (ret) + goto err; + + struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + if (memcmp(&old_r, &new_r, sizeof(new_r))) { + ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); + if (ret) + goto err; + } + + ret = bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* @@ -101,7 +117,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, */ if (!ret) bch2_inode_update_after_write(trans, inode, &inode_u, fields); - +err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index bdeb6be76038..f52336cb298f 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -262,6 +262,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +static inline struct bch_extent_rebalance +bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) +{ + struct bch_io_opts io_opts; + bch2_inode_opts_get(&io_opts, c, inode); + return io_opts_to_rebalance_opts(&io_opts); +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d8cb346ac138..926b9d5eba45 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -33,7 +33,7 @@ static const char * const bch2_rebalance_state_strs[] = { #undef x }; -static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) { struct btree_iter iter; struct bkey_s_c k; @@ -73,7 +73,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_lazy_rw, - __bch2_set_rebalance_needs_scan(trans, inum)); + bch2_set_rebalance_needs_scan_trans(trans, inum)); rebalance_wakeup(c); return ret; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 28a52638f16c..791649c04ff5 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -4,6 +4,7 @@ #include "rebalance_types.h" +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs *); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index bf3c6bb50495..ed418a747cdd 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -565,13 +565,6 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); err: mutex_unlock(&inode->ei_update_lock); - - if (value && - (opt_id == Opt_background_target || - opt_id == Opt_background_compression || - (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) - bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); - err_class_exit: return bch2_err_class(ret); } From f14488c8cf907f1d69716465c9c8ab11b9ff2261 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 21:41:20 -0400 Subject: [PATCH 049/233] bcachefs: get_update_rebalance_opts() bch2_move_get_io_opts() now synchronizes options loaded from the filesystem and inode (if present, i.e. not walking the reflink btree directly) with options from the bch_extent_rebalance_entry, updating the extent if necessary. Since bch_extent_rebalance tracks where its option came from we can preserve "inode options override filesystem options", even for indirect extents where we don't have access to the inode the options came from. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 17 ++++++++ fs/bcachefs/extents.h | 1 + fs/bcachefs/move.c | 94 ++++++++++++++++++++++++++++++----------- fs/bcachefs/move.h | 5 +-- fs/bcachefs/rebalance.c | 2 +- 5 files changed, 91 insertions(+), 28 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 467ffed0809e..4988056ab4f1 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1521,6 +1521,23 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) return sectors; } +bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_s_c k) +{ + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); + + if (k.k->type == KEY_TYPE_reflink_v || + bch2_bkey_ptrs_need_rebalance(c, k, opts->background_target, opts->background_compression)) { + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + return old == NULL || memcmp(old, &new, sizeof(new)); + } else { + return old != NULL; + } +} + int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_i *_k) { diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 97af0d6e4319..abe7d4b2fc6b 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -715,6 +715,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, unsigned, unsigned); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +bool bch2_bkey_rebalance_needs_update(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); /* Generic extent code: */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0ef4a86850bb..1003f7fe4f50 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -379,14 +379,57 @@ int bch2_move_extent(struct moving_context *ctxt, return ret; } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +static int get_update_rebalance_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct btree_iter *iter, + struct bkey_s_c k) +{ + BUG_ON(iter->flags & BTREE_ITER_is_extents); + BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); + + const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v + ? bch2_bkey_rebalance_opts(k) : NULL; + if (r) { +#define x(_name) \ + if (r->_name##_from_inode) { \ + io_opts->_name = r->_name; \ + io_opts->_name##_from_inode = true; \ + } + BCH_REBALANCE_OPTS() +#undef x + } + + if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) + return 0; + + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + /* On successfull transaction commit, @k was invalidated: */ + + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; +} + +static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { struct bch_fs *c = trans->c; u32 restart_count = trans->restart_count; + struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; + if (extent_k.k->type == KEY_TYPE_reflink_v) + goto out; + if (io_opts->cur_inum != extent_k.k->p.inode) { io_opts->d.nr = 0; @@ -415,43 +458,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) - return &i->io_opts; - - return &io_opts->fs_io_opts; + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { + opts_ret = &i->io_opts; + break; + } +out: + ret = get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); + if (ret) + return ERR_PTR(ret); + return opts_ret; } int bch2_move_get_io_opts_one(struct btree_trans *trans, struct bch_io_opts *io_opts, + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; + struct bch_fs *c = trans->c; + + *io_opts = bch2_opts_to_inode_opts(c->opts); /* reflink btree? */ - if (!extent_k.k->p.inode) { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); - return 0; - } + if (!extent_k.k->p.inode) + goto out; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + struct btree_iter inode_iter; + struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), BTREE_ITER_cached); - ret = bkey_err(k); + int ret = bkey_err(inode_k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - if (!ret && bkey_is_inode(k.k)) { + if (!ret && bkey_is_inode(inode_k.k)) { struct bch_inode_unpacked inode; - bch2_inode_unpack(k, &inode); - bch2_inode_opts_get(io_opts, trans->c, &inode); - } else { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + bch2_inode_unpack(inode_k, &inode); + bch2_inode_opts_get(io_opts, c, &inode); } - - bch2_trans_iter_exit(trans, &iter); - return 0; + bch2_trans_iter_exit(trans, &inode_iter); +out: + return get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } int bch2_move_ratelimit(struct moving_context *ctxt) @@ -552,7 +598,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, &iter, k); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; @@ -728,7 +774,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 9baf3093a678..51e0505a8156 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt darray_exit(&io_opts->d); } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bkey_s_c); -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); +int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, + struct btree_iter *, struct bkey_s_c); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 926b9d5eba45..e79459a5891d 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -216,7 +216,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, if (ret || !k.k) goto out; - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + ret = bch2_move_get_io_opts_one(trans, &io_opts, extent_iter, k); if (ret) goto out; From 8aa17c262842fedae48ab6b38e63284664971560 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Oct 2024 21:41:20 -0400 Subject: [PATCH 050/233] bcachefs: Simplify option logic in rebalance Since bch2_move_get_io_opts() now synchronizes io_opts with options from bch_extent_rebalance, delete the ad-hoc logic in rebalance.c that previously did this. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 20 +++++++++--------- fs/bcachefs/extents.h | 3 +-- fs/bcachefs/rebalance.c | 47 +++++++++++++---------------------------- 3 files changed, 26 insertions(+), 44 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 4988056ab4f1..bee083d787f2 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1447,14 +1447,15 @@ const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) return NULL; } -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, - unsigned target, unsigned compression) +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned rewrite_ptrs = 0; - if (compression) { - unsigned compression_type = bch2_compression_opt_to_type(compression); + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned ptr_bit = 1; @@ -1472,11 +1473,12 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, } } incompressible: - if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { + if (opts->background_target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } @@ -1529,8 +1531,7 @@ bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - if (k.k->type == KEY_TYPE_reflink_v || - bch2_bkey_ptrs_need_rebalance(c, k, opts->background_target, opts->background_compression)) { + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); return old == NULL || memcmp(old, &new, sizeof(new)); } else { @@ -1548,8 +1549,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, struct bch_extent_rebalance *old = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (k.k->type == KEY_TYPE_reflink_v || - bch2_bkey_ptrs_need_rebalance(c, k.s_c, opts->background_target, opts->background_compression)) { + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { if (!old) { old = bkey_val_end(k); k.k->u64s += sizeof(*old) / sizeof(u64); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index abe7d4b2fc6b..156fbb8e04d5 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -711,8 +711,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, void bch2_ptr_swab(struct bkey_s); const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, - unsigned, unsigned); +unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); bool bch2_bkey_rebalance_needs_update(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index e79459a5891d..3be9c85dd55d 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -121,6 +121,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + if (!bch2_bkey_rebalance_opts(k)) + return 0; + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); int ret = PTR_ERR_OR_ZERO(n); if (ret) @@ -134,31 +137,27 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct bpos work_pos, struct btree_iter *extent_iter, + struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; - struct bkey_s_c k; bch2_trans_iter_exit(trans, extent_iter); bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, BTREE_ITER_all_snapshots); - k = bch2_btree_iter_peek_slot(extent_iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; - const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; - if (!r) { - /* raced due to btree write buffer, nothing to do */ - return bkey_s_c_null; - } + int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); memset(data_opts, 0, sizeof(*data_opts)); - - data_opts->rewrite_ptrs = - bch2_bkey_ptrs_need_rebalance(c, k, r->background_target, r->background_compression); - data_opts->target = r->background_target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { @@ -179,9 +178,9 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct printbuf buf = PRINTBUF; prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, r->background_target); + bch2_target_to_text(&buf, c, io_opts->background_target); prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, r->background_compression); + bch2_compression_opt_to_text(&buf, io_opts->background_compression); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); @@ -212,14 +211,10 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); ret = bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &data_opts)); + extent_iter, &io_opts, &data_opts)); if (ret || !k.k) goto out; - ret = bch2_move_get_io_opts_one(trans, &io_opts, extent_iter, k); - if (ret) - goto out; - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); /* @@ -253,20 +248,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - unsigned target, compression; - - if (k.k->p.inode) { - target = io_opts->background_target; - compression = io_opts->background_compression; - } else { - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - target = r ? r->background_target : io_opts->background_target; - compression = r ? r->background_compression : io_opts->background_compression; - } - - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); - data_opts->target = target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; return data_opts->rewrite_ptrs != 0; } From 9cf36d9f4d281af3dc65f1e819f5ec84613db899 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 01:42:57 -0400 Subject: [PATCH 051/233] bcachefs: Improve trace_rebalance_extent We now say explicitly which pointers are being moved or compressed Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 35 +++-------------------------- fs/bcachefs/rebalance.c | 26 +++++++++++++++++----- fs/bcachefs/rebalance.h | 49 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 37 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index bee083d787f2..6f9514c19b2f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -21,6 +21,7 @@ #include "extents.h" #include "inode.h" #include "journal.h" +#include "rebalance.h" #include "replicas.h" #include "super.h" #include "super-io.h" @@ -1452,39 +1453,9 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned rewrite_ptrs = 0; - if (opts->background_compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ptr_bit = 1; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - rewrite_ptrs = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } -incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { - unsigned ptr_bit = 1; - - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } - - return rewrite_ptrs; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | + bch2_bkey_ptrs_need_move(c, opts, ptrs); } u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 3be9c85dd55d..124da250cbe7 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -177,12 +177,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (trace_rebalance_extent_enabled()) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, io_opts->background_target); - prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, io_opts->background_compression); - prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); + if (p) { + prt_str(&buf, "compression="); + bch2_compression_opt_to_text(&buf, io_opts->background_compression); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } + + p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); + if (p) { + prt_str(&buf, "move="); + bch2_target_to_text(&buf, c, io_opts->background_target); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } trace_rebalance_extent(c, buf.buf); printbuf_exit(&buf); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 791649c04ff5..606c88f49f7f 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -2,8 +2,57 @@ #ifndef _BCACHEFS_REBALANCE_H #define _BCACHEFS_REBALANCE_H +#include "compress.h" +#include "disk_groups.h" #include "rebalance_types.h" +static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_compression) + return 0; + + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) + return 0; + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_target || + !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) + return 0; + + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs *); From f6240723f7d66143b5a74b620666795112162b4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 23:23:18 -0400 Subject: [PATCH 052/233] bcachefs: Move bch_extent_rebalance code to rebalance.c Signed-off-by: Kent Overstreet --- fs/bcachefs/buckets.c | 1 + fs/bcachefs/extents.c | 99 ------------------ fs/bcachefs/extents.h | 7 -- fs/bcachefs/extents_format.h | 48 +-------- fs/bcachefs/move.c | 43 +------- fs/bcachefs/rebalance.c | 186 +++++++++++++++++++++++++++++++++ fs/bcachefs/rebalance.h | 52 ++------- fs/bcachefs/rebalance_format.h | 53 ++++++++++ 8 files changed, 251 insertions(+), 238 deletions(-) create mode 100644 fs/bcachefs/rebalance_format.h diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 8bd17667e243..c4123fa4f250 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -18,6 +18,7 @@ #include "error.h" #include "inode.h" #include "movinggc.h" +#include "rebalance.h" #include "recovery.h" #include "reflink.h" #include "replicas.h" diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 6f9514c19b2f..bc7cfdb66687 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1436,105 +1436,6 @@ void bch2_ptr_swab(struct bkey_s k) } } -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - return &entry->rebalance; - - return NULL; -} - -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - - return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | - bch2_bkey_ptrs_need_move(c, opts, ptrs); -} - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); - if (!opts) - return 0; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 sectors = 0; - - if (opts->background_compression) { - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - sectors = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - sectors += p.crc.compressed_size; - } - } -incompressible: - if (opts->background_target && - bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) - sectors += p.crc.compressed_size; - } - - return sectors; -} - -bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_s_c k) -{ - if (!bkey_extent_is_direct_data(k.k)) - return 0; - - const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { - struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); - return old == NULL || memcmp(old, &new, sizeof(new)); - } else { - return old != NULL; - } -} - -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, - struct bkey_i *_k) -{ - if (!bkey_extent_is_direct_data(&_k->k)) - return 0; - - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *old = - (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - - if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { - if (!old) { - old = bkey_val_end(k); - k.k->u64s += sizeof(*old) / sizeof(u64); - } - - *old = io_opts_to_rebalance_opts(opts); - } else { - if (old) - extent_entry_drop(k, (union bch_extent_entry *) old); - } - - return 0; -} - /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 156fbb8e04d5..ba33788fee36 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -710,13 +710,6 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, void bch2_ptr_swab(struct bkey_s); -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); - -bool bch2_bkey_rebalance_needs_update(struct bch_fs *, struct bch_io_opts *, struct bkey_s_c); -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); - /* Generic extent code: */ enum bch_extent_overlap { diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 222eed6b46d8..c198dfc376d6 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -201,52 +201,8 @@ struct bch_extent_stripe_ptr { #endif }; -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:3, - - promote_target_from_inode:1, - erasure_code_from_inode:1, - data_checksum_from_inode:1, - background_compression_from_inode:1, - data_replicas_from_inode:1, - background_target_from_inode:1, - - promote_target:16, - erasure_code:1, - data_checksum:4, - data_replicas:4, - background_compression:8, /* enum bch_compression_opt */ - background_target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 background_target:16, - background_compression:8, - data_replicas:4, - data_checksum:4, - erasure_code:1, - promote_target:16, - - background_target_from_inode:1, - data_replicas_from_inode:1, - background_compression_from_inode:1, - data_checksum_from_inode:1, - erasure_code_from_inode:1, - promote_target_from_inode:1, - - unused:3, - type:6; -#endif -}; - -/* subset of BCH_INODE_OPTS */ -#define BCH_REBALANCE_OPTS() \ - x(data_checksum) \ - x(background_compression) \ - x(data_replicas) \ - x(promote_target) \ - x(background_target) \ - x(erasure_code) +/* bch_extent_rebalance: */ +#include "rebalance_format.h" union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 1003f7fe4f50..d6e68265e039 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -21,6 +21,7 @@ #include "journal_reclaim.h" #include "keylist.h" #include "move.h" +#include "rebalance.h" #include "replicas.h" #include "snapshot.h" #include "super-io.h" @@ -379,44 +380,6 @@ int bch2_move_extent(struct moving_context *ctxt, return ret; } -static int get_update_rebalance_opts(struct btree_trans *trans, - struct bch_io_opts *io_opts, - struct btree_iter *iter, - struct bkey_s_c k) -{ - BUG_ON(iter->flags & BTREE_ITER_is_extents); - BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); - - const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v - ? bch2_bkey_rebalance_opts(k) : NULL; - if (r) { -#define x(_name) \ - if (r->_name##_from_inode) { \ - io_opts->_name = r->_name; \ - io_opts->_name##_from_inode = true; \ - } - BCH_REBALANCE_OPTS() -#undef x - } - - if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) - return 0; - - struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); - int ret = PTR_ERR_OR_ZERO(n); - if (ret) - return ret; - - bkey_reassemble(n, k); - - /* On successfull transaction commit, @k was invalidated: */ - - return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: - bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; -} - static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, struct btree_iter *extent_iter, @@ -463,7 +426,7 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, break; } out: - ret = get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); + ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); if (ret) return ERR_PTR(ret); return opts_ret; @@ -497,7 +460,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &inode_iter); out: - return get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } int bch2_move_ratelimit(struct moving_context *ctxt) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 124da250cbe7..d1b580e76ba4 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -24,6 +24,192 @@ #include #include +/* bch_extent_rebalance: */ + +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_compression) + return 0; + + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) + return 0; + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_target || + !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) + return 0; + + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | + bch2_bkey_ptrs_need_move(c, opts, ptrs); +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + if (!opts) + return 0; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (opts->background_target && + bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + sectors += p.crc.compressed_size; + } + + return sectors; +} + +static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_s_c k) +{ + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + return old == NULL || memcmp(old, &new, sizeof(new)); + } else { + return old != NULL; + } +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_i *_k) +{ + if (!bkey_extent_is_direct_data(&_k->k)) + return 0; + + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { + if (!old) { + old = bkey_val_end(k); + k.k->u64s += sizeof(*old) / sizeof(u64); + } + + *old = io_opts_to_rebalance_opts(opts); + } else { + if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); + } + + return 0; +} + +int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct btree_iter *iter, + struct bkey_s_c k) +{ + BUG_ON(iter->flags & BTREE_ITER_is_extents); + BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); + + const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v + ? bch2_bkey_rebalance_opts(k) : NULL; + if (r) { +#define x(_name) \ + if (r->_name##_from_inode) { \ + io_opts->_name = r->_name; \ + io_opts->_name##_from_inode = true; \ + } + BCH_REBALANCE_OPTS() +#undef x + } + + if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) + return 0; + + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + /* On successfull transaction commit, @k was invalidated: */ + + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; +} + #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) static const char * const bch2_rebalance_state_strs[] = { diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 606c88f49f7f..0a0821ab895d 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -6,52 +6,12 @@ #include "disk_groups.h" #include "rebalance_types.h" -static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_s_c k, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_compression) - return 0; - - unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) - return 0; - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} - -static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, - struct bch_io_opts *opts, - struct bkey_ptrs_c ptrs) -{ - if (!opts->background_target || - !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) - return 0; - - unsigned ptr_bit = 1; - unsigned rewrite_ptrs = 0; - - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) - rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - - return rewrite_ptrs; -} +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); +int bch2_get_update_rebalance_opts(struct btree_trans *, + struct bch_io_opts *, + struct btree_iter *, + struct bkey_s_c); int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h new file mode 100644 index 000000000000..ff9a1342a22b --- /dev/null +++ b/fs/bcachefs/rebalance_format.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_FORMAT_H +#define _BCACHEFS_REBALANCE_FORMAT_H + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:3, + + promote_target_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + background_compression_from_inode:1, + data_replicas_from_inode:1, + background_target_from_inode:1, + + promote_target:16, + erasure_code:1, + data_checksum:4, + data_replicas:4, + background_compression:8, /* enum bch_compression_opt */ + background_target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 background_target:16, + background_compression:8, + data_replicas:4, + data_checksum:4, + erasure_code:1, + promote_target:16, + + background_target_from_inode:1, + data_replicas_from_inode:1, + background_compression_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, + + unused:3, + type:6; +#endif +}; + +/* subset of BCH_INODE_OPTS */ +#define BCH_REBALANCE_OPTS() \ + x(data_checksum) \ + x(background_compression) \ + x(data_replicas) \ + x(promote_target) \ + x(background_target) \ + x(erasure_code) + +#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ + From 9e6755da18a187eb1389a205680049587d97c62a Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 29 Oct 2024 20:53:50 +0800 Subject: [PATCH 053/233] bcachefs: remove write permission for gc_gens_pos sysfs interface The gc_gens_pos is used to show the status of bucket gen gc. There is no need to assign write permissions for this attribute. Here we can use read_attribute helper to define this attribute. ``` [Before] $ ll internal/gc_gens_pos -rw-r--r-- 1 root root 4096 Oct 28 15:27 internal/gc_gens_pos [After] $ ll internal/gc_gens_pos -r--r--r-- 1 root root 4096 Oct 28 17:27 internal/gc_gens_pos ``` Fixes: ac516d0e7db7 ("bcachefs: Add the status of bucket gen gc to sysfs") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 4ab0ccba2ab5..47ac8d5ab562 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -146,7 +146,7 @@ write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); -rw_attribute(gc_gens_pos); +read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); From 3289204eca4e4f1f873ceb5630f855b66373576e Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 29 Oct 2024 20:54:08 +0800 Subject: [PATCH 054/233] bcachefs: use attribute define helper for sysfs attribute The sysfs attribute definition has been wrapped into macro: rw_attribute, read_attribute and write_attribute, we can use these helpers to uniform the attribute definition. Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sysfs.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 47ac8d5ab562..97733c766948 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -211,6 +211,7 @@ BCH_PERSISTENT_COUNTERS() #undef x rw_attribute(discard); +read_attribute(state); rw_attribute(label); read_attribute(copy_gc_wait); @@ -235,11 +236,6 @@ write_attribute(perf_test); BCH_TIME_STATS() #undef x -static struct attribute sysfs_state_rw = { - .name = "state", - .mode = 0444, -}; - static size_t bch2_btree_cache_size(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -774,7 +770,7 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state_rw) { + if (attr == &sysfs_state) { prt_string_option(out, bch2_member_states, ca->mi.state); prt_char(out, '\n'); } @@ -854,7 +850,7 @@ struct attribute *bch2_dev_files[] = { /* settings: */ &sysfs_discard, - &sysfs_state_rw, + &sysfs_state, &sysfs_label, &sysfs_has_data, From 0877e537bcecaaa8b2a7926a130fab809a83e6da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 03:35:41 -0400 Subject: [PATCH 055/233] bcachefs: Add assert for use of journal replay keys for updates The journal replay keys mechanism can only be used for updates in early recovery, when still single threaded. Add some asserts to make sure we never accidentally use it elsewhere. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 6 ++++++ fs/bcachefs/btree_trans_commit.c | 2 ++ fs/bcachefs/super.c | 5 +++++ 3 files changed, 13 insertions(+) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e1ab67c533f0..c59a58b93a92 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -743,6 +743,12 @@ struct bch_fs { #else struct percpu_ref writes; #endif + /* + * Certain operations are only allowed in single threaded mode, during + * recovery, and we want to assert that this is the case: + */ + struct task_struct *recovery_task; + /* * Analagous to c->writes, for asynchronous ops that don't necessarily * need fs to be read-write diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index b47f11881fe4..529a5a19ab8a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -999,6 +999,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; + BUG_ON(current != c->recovery_task); + trans_for_each_update(trans, i) { int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7e2431de3a94..7e0ff17a6dbb 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -441,6 +441,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) { int ret; + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; @@ -1031,9 +1033,12 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + c->recovery_task = current; ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); + c->recovery_task = NULL; + if (ret) goto err; From bf924eff0e7b0dff40eee7d9237e382918022f6a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 03:39:32 -0400 Subject: [PATCH 056/233] bcachefs: Kill BCH_TRANS_COMMIT_lazy_rw We unconditionally go read-write, if we're going to do so, before journal replay: lazy_rw is obsolete. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_trans_commit.c | 31 +++++-------------------------- fs/bcachefs/btree_update.c | 3 +-- fs/bcachefs/btree_update.h | 1 - fs/bcachefs/lru.c | 2 +- fs/bcachefs/rebalance.c | 3 +-- fs/bcachefs/snapshot.c | 8 ++++++-- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/super.h | 10 ---------- 9 files changed, 16 insertions(+), 46 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 3c4e66da1ca4..833d743dee0c 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -908,7 +908,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c) POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), BTREE_ITER_slots|BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_alloc_write_key(trans, &iter, ca, k))); if (ret) { bch2_dev_put(ca); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 529a5a19ab8a..3aca746d08f6 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -971,24 +971,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, return ret; } -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || - test_bit(BCH_FS_started, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - /* * This is for updates done in the early part of fsck - btree_gc - before we've * gone RW. we only add the new key to the list of keys for journal replay to @@ -1037,16 +1019,13 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) + ret = do_bch2_trans_commit_to_journal_replay(trans); + else + ret = -BCH_ERR_erofs_trans_commit; + goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 79a274dcd17b..a9a29fba4902 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -865,8 +865,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, memcpy(l->d, buf.buf, buf.pos); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { - ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_lazy_rw|commit_flags, + ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, __bch2_trans_log_msg(trans, &buf, u64s)); } err: diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 7e71c4d1111d..3bc57d43aa83 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -24,7 +24,6 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, #define BCH_TRANS_COMMIT_FLAGS() \ x(no_enospc, "don't check for enospc") \ x(no_check_rw, "don't attempt to take a ref on c->writes") \ - x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ x(no_journal_res, "don't take a journal reservation, instead " \ "pin journal entry referred to by trans->journal_res.seq") \ x(journal_reclaim, "operation required for journal reclaim; may return error" \ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 10857eccdeaf..c18242748ca3 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -192,7 +192,7 @@ int bch2_check_lrus(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_lru_key(trans, &iter, k, &last_flushed))); bch2_bkey_buf_exit(&last_flushed, c); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d1b580e76ba4..4adc74cd3f70 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -257,8 +257,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_lazy_rw, + BCH_TRANS_COMMIT_no_enospc, bch2_set_rebalance_needs_scan_trans(trans, inum)); rebalance_wakeup(c); return ret; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 34e01bd8127f..6a52090485dc 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1733,8 +1733,12 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) + return; + + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 80e5efaff524..cb45ef769c54 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -675,7 +675,7 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) /* set bi_subvol on root inode */ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_fs_upgrade_for_subvolumes(trans)); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index dada09331d2e..fa6d52216510 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -34,16 +34,6 @@ void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); -/* - * Only for use in the recovery/fsck path: - */ -static inline void bch2_fs_lazy_rw(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_rw, &c->flags) && - !test_bit(BCH_FS_was_rw, &c->flags)) - bch2_fs_read_write_early(c); -} - void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); From 657e12389cdd2f954012666fd5a2ea336950bd56 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 00:25:36 -0400 Subject: [PATCH 057/233] bcachefs: Improved check_topology() assert On interior btree node updates, we always verify that we're not introducing topology errors: child nodes should exactly span the range of the parent node. single_device.ktest small_nodes has been popping this assert: change it to give us more information. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d62de3f79b29..865c4724d550 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1418,15 +1418,26 @@ bch2_btree_insert_keys_interior(struct btree_update *as, (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; - while (!bch2_keylist_empty(keys)) { - insert = bch2_keylist_front(keys); - - if (bpos_gt(insert->k.p, b->key.k.p)) - break; - + for (; + insert != keys->top && bpos_le(insert->k.p, b->key.k.p); + insert = bkey_next(insert)) bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - bch2_keylist_pop_front(keys); + + if (bch2_btree_node_check_topology(trans, b)) { + struct printbuf buf = PRINTBUF; + + for (struct bkey_i *k = keys->keys; + k != insert; + k = bkey_next(k)) { + bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); + prt_newline(&buf); + } + + panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); } + + memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); + keys->top_p -= insert->_data - keys->keys_p; } static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) @@ -1575,8 +1586,6 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - - BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1827,8 +1836,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); - - BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* From e17c5f5f191f21665cc9e48cd68d92ef4cc377ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2024 22:00:05 -0500 Subject: [PATCH 058/233] bcachefs: Fix unhandled transaction restart in evacuate_bucket() Generally, releasing a transaction within a transaction restart means an unhandled transaction restart: but this can happen legitimately within the move code, e.g. when bch2_move_ratelimit() tells us to exit before we've retried. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index d6e68265e039..a6b503278519 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -197,6 +197,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) list_del(&ctxt->list); mutex_unlock(&c->moving_context_lock); + /* + * Generally, releasing a transaction within a transaction restart means + * an unhandled transaction restart: but this can happen legitimately + * within the move code, e.g. when bch2_move_ratelimit() tells us to + * exit before we've retried + */ + bch2_trans_begin(ctxt->trans); bch2_trans_put(ctxt->trans); memset(ctxt, 0, sizeof(*ctxt)); } From 58d33a0a41804c2ab68c85fc61f79e91f4b9f98b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Oct 2024 23:33:57 -0400 Subject: [PATCH 059/233] bcachefs: Assert we're not in a restart in bch2_trans_put() This always indicates a transaction restart handling bug Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 07bce85dafaf..98375c66021a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3261,6 +3261,9 @@ void bch2_trans_put(struct btree_trans *trans) { struct bch_fs *c = trans->c; + if (trans->restarted) + bch2_trans_in_restart_error(trans); + bch2_trans_unlock(trans); trans_for_each_update(trans, i) From efaa4e4ea6ceb2cdee4b0dca156e0606bbc98f8d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Oct 2024 23:52:51 -0400 Subject: [PATCH 060/233] bcachefs: Better in_restart error We're ramping up on checking transaction restart handling correctness - so, in debug mode we now save a backtrace for where the restart was emitted, which makes it much easier to track down the incorrect handling. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 12 ++++++++++++ fs/bcachefs/btree_iter.h | 4 ++++ fs/bcachefs/btree_types.h | 3 +++ 3 files changed, 19 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 98375c66021a..acf70aaf2fd2 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1427,9 +1427,17 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct printbuf buf = PRINTBUF; + bch2_prt_backtrace(&buf, &trans->last_restarted_trace); + panic("in transaction restart: %s, last restarted by\n%s", + bch2_err_str(trans->restarted), + buf.buf); +#else panic("in transaction restart: %s, last restarted by %pS\n", bch2_err_str(trans->restarted), (void *) trans->last_restarted_ip); +#endif } void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) @@ -3287,6 +3295,10 @@ void bch2_trans_put(struct btree_trans *trans) closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); +#endif + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index dda07a320488..36899c6b134e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -350,6 +350,10 @@ static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned l trans->restarted = err; trans->last_restarted_ip = ip; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); + bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); +#endif return -err; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4568a41fefaf..baab5288ecc9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -513,6 +513,9 @@ struct btree_trans { u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; +#ifdef CONFIG_BCACHEFS_DEBUG + bch_stacktrace last_restarted_trace; +#endif unsigned long last_unlock_ip; unsigned long srcu_lock_time; From e398d6fb0f5aace9f5d181fab0b9dfb7c1025938 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 19:32:40 -0400 Subject: [PATCH 061/233] bcachefs: bch2_trans_verify_not_unlocked_or_in_restart() Fold two asserts into one. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 36 ++++++++++++++++------------- fs/bcachefs/btree_iter.h | 20 +++++----------- fs/bcachefs/btree_locking.h | 2 +- fs/bcachefs/btree_trans_commit.c | 9 +++----- fs/bcachefs/btree_update_interior.c | 3 +-- fs/bcachefs/btree_update_interior.h | 2 +- 6 files changed, 32 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index acf70aaf2fd2..1efc77fc9abf 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -327,7 +327,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); struct btree_path *path; struct trans_for_each_path_inorder_iter iter; @@ -1265,7 +1265,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, { int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON(!trans->paths[path_idx].ref); trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); @@ -1425,7 +1425,7 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ (void *) trans->last_begin_ip); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { #ifdef CONFIG_BCACHEFS_DEBUG struct printbuf buf = PRINTBUF; @@ -1440,10 +1440,16 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) #endif } -void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans) { - panic("trans should be locked, unlocked by %pS\n", - (void *) trans->last_unlock_ip); + if (trans->restarted) + bch2_trans_in_restart_error(trans); + + if (!trans->locked) + panic("trans should be locked, unlocked by %pS\n", + (void *) trans->last_unlock_ip); + + BUG(); } noinline __cold @@ -1724,8 +1730,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, struct trans_for_each_path_inorder_iter iter; btree_path_idx_t path_pos = 0, path_idx; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); @@ -1877,7 +1882,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_trans *trans = iter->trans; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), @@ -1952,7 +1957,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2161,8 +2166,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; - bch2_trans_verify_not_in_restart(trans); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) @@ -2302,7 +2306,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e struct bpos iter_pos; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { @@ -2475,7 +2479,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_path_idx_t saved_path = 0; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON(btree_iter_path(trans, iter)->cached || btree_iter_path(trans, iter)->level); @@ -2614,7 +2618,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); @@ -3136,7 +3140,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->notrace_relock_fail = false; } - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); return trans->restart_count; } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 36899c6b134e..6b1c46e95432 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -236,12 +236,12 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, unsigned, unsigned long); -static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -326,20 +326,12 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, bch2_trans_restart_error(trans, restart_count); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *); +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); -static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) { - if (trans->restarted) - bch2_trans_in_restart_error(trans); -} - -void __noreturn bch2_trans_unlocked_error(struct btree_trans *); - -static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) -{ - if (!trans->locked) - bch2_trans_unlocked_error(trans); + if (trans->restarted || !trans->locked) + bch2_trans_unlocked_or_in_restart_error(trans); } __always_inline diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7c07f9fa9add..ca4aeefd631e 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -282,7 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 3aca746d08f6..cf313477567a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -619,8 +619,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, unsigned u64s = 0; int ret = 0; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); @@ -1008,8 +1007,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (!trans->nr_updates && !trans->journal_entries_u64s) @@ -1070,8 +1068,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) } retry: errored_at = NULL; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 865c4724d550..c11babe31f54 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1960,8 +1960,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; - bch2_trans_verify_not_in_restart(trans); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 10f400957f21..1c6cf3e2e6a9 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -159,7 +159,7 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, unsigned level, unsigned flags) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: From f97b3e7fd8f371eba3ae114eb8eb0dd3c6842771 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 22:31:20 -0400 Subject: [PATCH 062/233] bcachefs: Assert that we're not violating key cache coherency rules We're not allowed to have a dirty key in the key cache if the key doesn't exist at all in the btree - creation has to bypass the key cache, so that iteration over the btree can check if the key is present in the key cache. Things break in subtle ways if cache coherency is broken, so this needs an assert. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 244610b1d0b5..3bd40ea0fa3d 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -424,8 +424,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - ret = bch2_btree_iter_traverse(&b_iter) ?: - bch2_trans_update(trans, &b_iter, ck->k, + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + ret = bkey_err(btree_k); + if (ret) + goto err; + + /* * Check that we're not violating cache coherency rules: */ + BUG_ON(bkey_deleted(btree_k.k)); + + ret = bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_key_cache_reclaim| BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun) ?: @@ -433,7 +440,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| commit_flags); - +err: bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && From 95918915a6d31158bbec7a2abd3eeffed4decab3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2024 18:39:59 -0400 Subject: [PATCH 063/233] bcachefs: Rename btree_iter_peek_upto() -> btree_iter_peek_max() We'll be introducing btree_iter_peek_prev_min(), so rename for consistency. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 6 +++--- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/btree_iter.c | 10 ++++----- fs/bcachefs/btree_iter.h | 36 ++++++++++++++++---------------- fs/bcachefs/btree_journal_iter.c | 4 ++-- fs/bcachefs/btree_journal_iter.h | 2 +- fs/bcachefs/btree_update.c | 6 +++--- fs/bcachefs/dirent.c | 4 ++-- fs/bcachefs/ec.c | 2 +- fs/bcachefs/extent_update.c | 2 +- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io.c | 8 +++---- fs/bcachefs/fs.c | 2 +- fs/bcachefs/fsck.c | 8 +++---- fs/bcachefs/inode.c | 6 +++--- fs/bcachefs/io_misc.c | 6 +++--- fs/bcachefs/io_write.c | 4 ++-- fs/bcachefs/movinggc.c | 2 +- fs/bcachefs/reflink.c | 2 +- fs/bcachefs/str_hash.h | 6 +++--- fs/bcachefs/subvolume.h | 12 +++++------ fs/bcachefs/tests.c | 26 +++++++++++------------ fs/bcachefs/xattr.c | 2 +- 23 files changed, 80 insertions(+), 80 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c84a91572a1d..af791f4dab99 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1045,7 +1045,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); @@ -1886,7 +1886,7 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, + for_each_btree_key_max(trans, iter, BTREE_ID_need_discard, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), 0, k, @@ -2101,7 +2101,7 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter { struct bkey_s_c k; again: - k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 833d743dee0c..e45cf32a6403 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -904,7 +904,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c) for_each_member_device(c, ca) { ret = bch2_trans_run(c, - for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, + for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), BTREE_ITER_slots|BTREE_ITER_prefetch, k, diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 1efc77fc9abf..21cadc98bdae 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2113,7 +2113,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, { struct btree_path *path = btree_iter_path(trans, iter); - return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + return bch2_journal_keys_peek_max(trans->c, iter->btree_id, path->level, path->pos, end_pos, @@ -2291,14 +2291,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } /** - * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) { struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); @@ -2682,7 +2682,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); @@ -2693,7 +2693,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_upto(iter, end); + k = bch2_btree_iter_peek_max(iter, end); if (unlikely(bkey_err(k))) bch2_btree_iter_set_pos(iter, pos); else diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 6b1c46e95432..cd9022ce15a5 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -381,12 +381,12 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { - return bch2_btree_iter_peek_upto(iter, SPOS_MAX); + return bch2_btree_iter_peek_max(iter, SPOS_MAX); } struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); @@ -672,12 +672,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, bch2_btree_iter_peek(iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, struct bpos end, unsigned flags) { if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_upto(iter, end); + return bch2_btree_iter_peek_max(iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; @@ -741,7 +741,7 @@ transaction_restart: \ _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key_upto_continue(_trans, _iter, \ +#define for_each_btree_key_max_continue(_trans, _iter, \ _end, _flags, _k, _do) \ ({ \ struct bkey_s_c _k; \ @@ -749,7 +749,7 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ @@ -763,9 +763,9 @@ transaction_restart: \ }) #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ - for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _do) \ ({ \ bch2_trans_begin(trans); \ @@ -774,12 +774,12 @@ transaction_restart: \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ + for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ }) #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ SPOS_MAX, _flags, _k, _do) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ @@ -823,33 +823,33 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ +#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ for (; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ + for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) #define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ @@ -861,7 +861,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); bch2_btree_iter_rewind(&(_iter))) #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) /* * This should not be used in a fastpath, without first trying _do in diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 924b5e3a4390..c9dee4b4627a 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -61,7 +61,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, } /* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, struct bpos end_pos, size_t *idx) { @@ -112,7 +112,7 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree { size_t idx = 0; - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); + return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); } static void journal_iter_verify(struct journal_iter *iter) diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 1653de9d609b..754939f604d5 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -43,7 +43,7 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a9a29fba4902..6afd77c68411 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_with_updates| BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -323,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto out; next: bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -721,7 +721,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index faffc98d5605..4c22f78b0484 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -500,7 +500,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { @@ -549,7 +549,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) bch2_bkey_buf_init(&sk); int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, POS(inum.inum, ctx->pos), POS(inum.inum, U64_MAX), inum.subvol, 0, k, ({ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 015107e241cc..d6560bccd87c 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2308,7 +2308,7 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) { return bch2_trans_run(c, - for_each_btree_key_upto_commit(trans, iter, + for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 5f4fecb358da..45c87c019f6b 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, bch2_trans_copy_iter(©, iter); - for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { + for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { unsigned offset = 0; if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 1d4910ea0f1d..51a499c5a7b6 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -199,7 +199,7 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, unsigned folio_idx = 0; return bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inum.inum, offset), POS(inum.inum, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2456c41b215e..0021db191480 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -222,7 +222,7 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos end) { return bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, subvol, 0, k, ({ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); }))); @@ -806,7 +806,7 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 sectors = end - start; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, start), POS(inode->v.i_ino, end - 1), @@ -922,7 +922,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, 0, k, ({ @@ -958,7 +958,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index e0ffe4648bb8..91fce04272a1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1294,7 +1294,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, end); + k = bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) continue; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c96025b8b65d..2229f0dcc860 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -73,7 +73,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, { u64 sectors = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -90,7 +90,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, { u64 subdirs = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -1751,7 +1751,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter1, btree, pos1, BTREE_ITER_all_snapshots| BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); + k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; @@ -1776,7 +1776,7 @@ static int overlapping_extents_found(struct btree_trans *trans, while (1) { bch2_btree_iter_advance(&iter2); - k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 5dd9d3edae77..5c603ab66be0 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -618,7 +618,7 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter struct bkey_s_c k; int ret = 0; - for_each_btree_key_upto_norestart(trans, *iter, btree, + for_each_btree_key_max_norestart(trans, *iter, btree, bpos_successor(pos), SPOS(pos.inode, pos.offset, U32_MAX), flags|BTREE_ITER_all_snapshots, k, ret) @@ -653,7 +653,7 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) struct bkey_s_c k; int ret = 0; - for_each_btree_key_upto_norestart(trans, iter, + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), BTREE_ITER_all_snapshots| BTREE_ITER_with_updates, k, ret) @@ -967,7 +967,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, end); + k = bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index e2acf21ac9b0..ff661a072000 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -164,9 +164,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(iter, snapshot); /* - * peek_upto() doesn't have ideal semantics for extents: + * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_upto(iter, end_pos); + k = bch2_btree_iter_peek_max(iter, end_pos); if (!k.k) break; @@ -427,7 +427,7 @@ case LOGGED_OP_FINSERT_shift_extents: k = insert ? bch2_btree_iter_peek_prev(&iter) - : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f2f69e5e0910..f11e11279f01 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -164,7 +164,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_upto_continue_norestart(iter, + for_each_btree_key_max_continue_norestart(iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), @@ -1165,7 +1165,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct btree_trans *trans = bch2_trans_get(c); for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 725292d69fd6..85c361e78ba5 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,7 +167,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8a36ebd9dd9c..96cf50f4705d 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -409,7 +409,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { + for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ec2b1feea520..00c785055d22 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -160,7 +160,7 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), BTREE_ITER_slots|flags, k, ret) { @@ -210,7 +210,7 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), BTREE_ITER_slots|BTREE_ITER_intent, k, ret) @@ -265,7 +265,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, bool found = false; int ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f897d106e142..07b23dc08614 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -34,7 +34,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end, +bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, u32 subvolid, unsigned flags) { u32 snapshot; @@ -43,10 +43,10 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos return bkey_s_c_err(ret); bch2_btree_iter_set_snapshot(iter, snapshot); - return bch2_btree_iter_peek_upto_type(iter, end, flags); + return bch2_btree_iter_peek_max_type(iter, end, flags); } -#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ +#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ _end, _subvolid, _flags, _k, _do) \ ({ \ struct bkey_s_c _k; \ @@ -54,7 +54,7 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ _end, _subvolid, (_flags)); \ if (!(_k).k) \ break; \ @@ -67,14 +67,14 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos _ret3; \ }) -#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ +#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ _start, _end, _subvolid, _flags, _k, _do) \ ({ \ struct btree_iter _iter; \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ _end, _subvolid, _flags, _k, _do); \ }) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index fb5c1543e52f..6c6469814637 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); @@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); @@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -259,7 +259,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_slots, k, ({ if (i >= nr * 2) @@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); @@ -320,7 +320,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_slots, k, ({ if (i == nr) @@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; @@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { return bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index ed418a747cdd..820c1791545a 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -309,7 +309,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) u64 offset = 0, inum = inode->ei_inode.bi_inum; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, POS(inum, offset), POS(inum, U64_MAX), inode->ei_inum.subvol, 0, k, ({ From 0ad36d94fec984615bee1a89d402d8ad942b3eab Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 01:48:26 -0400 Subject: [PATCH 064/233] bcachefs: Simplify btree_iter_peek() filter_snapshots Collapse all the BTREE_ITER_filter_snapshots handling down into a single block; btree iteration is much simpler in the !filter_snapshots case. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 131 +++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 68 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 21cadc98bdae..580fee86a965 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1855,7 +1855,6 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * return (struct bkey_s_c) { u, NULL }; } - void bch2_set_btree_iter_dontneed(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; @@ -2212,8 +2211,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp bch2_btree_iter_verify(iter); while (1) { - struct btree_path_level *l; - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); @@ -2227,7 +2224,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } struct btree_path *path = btree_iter_path(trans, iter); - l = path_l(path); + struct btree_path_level *l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ @@ -2303,10 +2300,11 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; - struct bpos iter_pos; + struct bpos iter_pos = iter->pos; int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { @@ -2315,8 +2313,6 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en iter->update_path = 0; } - bch2_btree_iter_verify_entry_exit(iter); - while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (unlikely(!k.k)) @@ -2324,77 +2320,76 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en if (unlikely(bkey_err(k))) goto out_no_locked; - /* - * We need to check against @end before FILTER_SNAPSHOTS because - * if we get to a different inode that requested we might be - * seeing keys for a different snapshot tree that will all be - * filtered out. - * - * But we can't do the full check here, because bkey_start_pos() - * isn't monotonically increasing before FILTER_SNAPSHOTS, and - * that's what we check against in extents mode: - */ - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? bkey_gt(k.k->p, end) - : k.k->p.inode > end.inode)) - goto end; + if (iter->flags & BTREE_ITER_filter_snapshots) { + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode that requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + * + * But we can't do the full check here, because bkey_start_pos() + * isn't monotonically increasing before FILTER_SNAPSHOTS, and + * that's what we check against in extents mode: + */ + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) + ? bkey_gt(k.k->p, end) + : k.k->p.inode > end.inode)) + goto end; - if (iter->update_path && - !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - } + if (iter->update_path && + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); + iter->update_path = 0; + } - if ((iter->flags & BTREE_ITER_filter_snapshots) && - (iter->flags & BTREE_ITER_intent) && - !(iter->flags & BTREE_ITER_is_extents) && - !iter->update_path) { - struct bpos pos = k.k->p; + if ((iter->flags & BTREE_ITER_intent) && + !(iter->flags & BTREE_ITER_is_extents) && + !iter->update_path) { + struct bpos pos = k.k->p; - if (pos.snapshot < iter->snapshot) { + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } + + pos.snapshot = iter->snapshot; + + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_intent, + _THIS_IP_); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + } + + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if (!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { search_key = bpos_successor(k.k->p); continue; } - pos.snapshot = iter->snapshot; - - /* - * advance, same as on exit for iter->path, but only up - * to snapshot - */ - __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); - iter->update_path = iter->path; - - iter->update_path = bch2_btree_path_set_pos(trans, - iter->update_path, pos, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out_no_locked; + if (bkey_whiteout(k.k)) { + search_key = bkey_successor(iter, k.k->p); + continue; } } - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: - */ - if ((iter->flags & BTREE_ITER_filter_snapshots) && - !bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_all_snapshots)) { - search_key = bkey_successor(iter, k.k->p); - continue; - } - /* * iter->pos should be mononotically increasing, and always be * equal to the key we just returned - except extents can From 2cb00966dd7d318400633b66864ceb34dfdcfdc8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 22:16:19 -0400 Subject: [PATCH 065/233] bcachefs: Kill unnecessary iter_rewind() in bkey_get_empty_slot() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 6afd77c68411..f3d7ca3d92b9 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -588,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - struct bkey_s_c k; - int ret = 0; - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); - k = bch2_btree_iter_prev(iter); - ret = bkey_err(k); + struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); + int ret = bkey_err(k); if (ret) goto err; From 45f667488e390bd9771163eddbcdc304798f32a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2024 13:13:25 -0500 Subject: [PATCH 066/233] bcachefs: Move fsck ioctl code to fsck.c chardev.c and fs-ioctl.c are not organized by subject; let's try to fix this. Signed-off-by: Kent Overstreet --- fs/bcachefs/chardev.c | 219 +----------------------------------------- fs/bcachefs/fsck.c | 218 +++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/fsck.h | 3 + 3 files changed, 222 insertions(+), 218 deletions(-) diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 2182b555c112..46e9e32105a9 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -6,11 +6,11 @@ #include "buckets.h" #include "chardev.h" #include "disk_accounting.h" +#include "fsck.h" #include "journal.h" #include "move.h" #include "recovery_passes.h" #include "replicas.h" -#include "super.h" #include "super-io.h" #include "thread_with_file.h" @@ -127,130 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif -struct fsck_thread { - struct thread_with_stdio thr; - struct bch_fs *c; - struct bch_opts opts; -}; - -static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -{ - struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - kfree(thr); -} - -static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - int ret = PTR_ERR_OR_ZERO(c); - if (ret) - return ret; - - ret = bch2_fs_start(thr->c); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - ret |= 1; - } - if (test_bit(BCH_FS_error, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - ret |= 4; - } -err: - bch2_fs_stop(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_offline_thread_fn, -}; - -static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -{ - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; - darray_str(devs) = {}; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - for (size_t i = 0; i < arg.nr_devs; i++) { - u64 dev_u64; - ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); - if (ret) - goto err; - - char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); - ret = PTR_ERR_OR_ZERO(dev_str); - if (ret) - goto err; - - ret = darray_push(&devs, dev_str); - if (ret) { - kfree(dev_str); - goto err; - } - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - opt_set(thr->opts, read_only, 1); - opt_set(thr->opts, ratelimit_errors, 0); - - /* We need request_key() to be called before we punt to kthread: */ - opt_set(thr->opts, nostart, true); - - bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - - thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); - - if (!IS_ERR(thr->c) && - thr->c->opts.errors == BCH_ON_ERROR_panic) - thr->c->opts.errors = BCH_ON_ERROR_ro; - - ret = __bch2_run_thread_with_stdio(&thr->thr); -out: - darray_for_each(devs, i) - kfree(*i); - darray_exit(&devs); - return ret; -err: - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - goto out; -} - static long bch2_global_ioctl(unsigned cmd, void __user *arg) { long ret; @@ -775,99 +651,6 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - c->stdio_filter = current; - c->stdio = &thr->thr.stdio; - - /* - * XXX: can we figure out a way to do this without mucking with c->opts? - */ - unsigned old_fix_errors = c->opts.fix_errors; - if (opt_defined(thr->opts, fix_errors)) - c->opts.fix_errors = thr->opts.fix_errors; - else - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); - - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); - - clear_bit(BCH_FS_fsck_running, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_online_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_online_thread_fn, -}; - -static long bch2_ioctl_fsck_online(struct bch_fs *c, - struct bch_ioctl_fsck_online arg) -{ - struct fsck_thread *thr = NULL; - long ret = 0; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - - if (down_trylock(&c->online_fsck_mutex)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->c = c; - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); -err: - if (ret < 0) { - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - } - return ret; -} - #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 2229f0dcc860..e0335265de3d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bcachefs_ioctl.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" @@ -16,6 +17,7 @@ #include "recovery_passes.h" #include "snapshot.h" #include "super.h" +#include "thread_with_file.h" #include "xattr.h" #include @@ -3192,3 +3194,219 @@ int bch2_fix_reflink_p(struct bch_fs *c) bch_err_fn(c, ret); return ret; } + +struct fsck_thread { + struct thread_with_stdio thr; + struct bch_fs *c; + struct bch_opts opts; +}; + +static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +{ + struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); + kfree(thr); +} + +static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; + + ret = bch2_fs_start(thr->c); + if (ret) + goto err; + + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + ret |= 4; + } +err: + bch2_fs_stop(c); + return ret; +} + +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + darray_str(devs) = {}; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; + + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; + + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + opt_set(thr->opts, read_only, 1); + opt_set(thr->opts, ratelimit_errors, 0); + + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + + ret = __bch2_run_thread_with_stdio(&thr->thr); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); + return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; +} + +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + c->stdio_filter = current; + c->stdio = &thr->thr.stdio; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? + */ + unsigned old_fix_errors = c->opts.fix_errors; + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + else + c->opts.fix_errors = FSCK_FIX_ask; + + c->opts.fsck = true; + set_bit(BCH_FS_fsck_running, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + int ret = bch2_run_online_recovery_passes(c); + + clear_bit(BCH_FS_fsck_running, &c->flags); + bch_err_fn(c, ret); + + c->stdio = NULL; + c->stdio_filter = NULL; + c->opts.fix_errors = old_fix_errors; + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + return ret; +} + +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + +long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->c = c; + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_exit(&thr->thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } + return ret; +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 1cca31011530..4481b40a881d 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -14,4 +14,7 @@ int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); +long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); + #endif /* _BCACHEFS_FSCK_H */ From 88c2aa59c3f62a04f89a62985f9f0ece694fe066 Mon Sep 17 00:00:00 2001 From: Integral Date: Wed, 23 Oct 2024 18:00:33 +0800 Subject: [PATCH 067/233] bcachefs: add support for true/false & yes/no in bool-type options Here is the patch which uses existing constant table: Currently, when using bcachefs-tools to set options, bool-type options can only accept 1 or 0. Add support for accepting true/false and yes/no for these options. Signed-off-by: Integral Signed-off-by: Kent Overstreet Acked-by: David Howells --- fs/bcachefs/opts.c | 16 +++++++++------- fs/fs_parser.c | 3 ++- include/linux/fs_parser.h | 2 ++ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 49c59aec6954..0ba58d74c21f 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include "bcachefs.h" #include "compress.h" @@ -334,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: if (val) { - ret = kstrtou64(val, 10, res); + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; + } else { + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } } else { - ret = 0; *res = 1; } - if (ret < 0 || (*res != 0 && *res != 1)) { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret < 0 ? ret : -BCH_ERR_option_not_bool; - } break; case BCH_OPT_UINT: if (!val) { diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 24727ec34e5a..6521e9a9d6ef 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -13,7 +13,7 @@ #include #include "internal.h" -static const struct constant_table bool_names[] = { +const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, @@ -22,6 +22,7 @@ static const struct constant_table bool_names[] = { { "yes", true }, { }, }; +EXPORT_SYMBOL(bool_names); static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 6cf713a7e6c6..0974cd33bcba 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -83,6 +83,8 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); +extern const struct constant_table bool_names[]; + #ifdef CONFIG_VALIDATE_FS_PARSER extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, int low, int high, int special); From 19128c53d438d7435d855625e5abe6a0a929ae63 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 16 Oct 2024 09:50:26 +0800 Subject: [PATCH 068/233] bcachefs: Correct the description of the '--bucket=size' options Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/opts.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 39cdc185fa73..6b29339ea725 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -501,7 +501,7 @@ enum fsck_err_opts { OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ BCH2_NO_SB_OPT, 0, \ - "size", "Size of filesystem on device") \ + "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ From a57150fe53c1e85babe43129d047b502b0316765 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jul 2024 09:11:33 +0800 Subject: [PATCH 069/233] bcachefs: Add support for FS_IOC_GETFSUUID Use super_set_uuid() to set `sb->s_uuid_len` to avoid returning `-ENOTTY` with sb->s_uuid_len being 0. Original patch link: [1]: https://lore.kernel.org/all/20240207025624.1019754-2-kent.overstreet@linux.dev/ Signed-off-by: Kent Overstreet Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 91fce04272a1..396a8f677621 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2216,7 +2216,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); - sb->s_uuid = c->sb.user_uuid; + super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); From 797a14eb7da2e1db8b8c768b62035f6567bf4b80 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 9 Jul 2024 09:11:34 +0800 Subject: [PATCH 070/233] bcachefs: Add support for FS_IOC_GETFSSYSFSPATH [TEST]: ``` $ cat ioctl_getsysfspath.c #include #include #include #include #include #include int main(int argc, char *argv[]) { int fd; struct fs_sysfs_path sysfs_path = {}; if (argc != 2) { fprintf(stderr, "Usage: %s \n", argv[0]); exit(EXIT_FAILURE); } fd = open(argv[1], O_RDONLY); if (fd == -1) { perror("open"); exit(EXIT_FAILURE); } if (ioctl(fd, FS_IOC_GETFSSYSFSPATH, &sysfs_path) == -1) { perror("ioctl FS_IOC_GETFSSYSFSPATH"); close(fd); exit(EXIT_FAILURE); } printf("FS_IOC_GETFSSYSFSPATH: %s\n", sysfs_path.name); close(fd); return 0; } $ gcc ioctl_getsysfspath.c $ sudo bcachefs format /dev/sda $ sudo mount.bcachefs /dev/sda /mnt $ sudo ./a.out /mnt FS_IOC_GETFSSYSFSPATH: bcachefs/c380b4ab-fbb6-41d2-b805-7a89cae9cadb ``` Original patch link: [1]: https://lore.kernel.org/all/20240207025624.1019754-8-kent.overstreet@linux.dev/ Signed-off-by: Kent Overstreet Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 396a8f677621..7a269dbcf44b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -2217,6 +2217,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); + super_set_sysfs_name_uuid(sb); sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); From 0fe251fd827271cb8d40490db395ca9c58a21509 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Tue, 24 Sep 2024 10:53:50 +0800 Subject: [PATCH 071/233] bcachefs: Removes NULL pointer checks for __filemap_get_folio return values __filemap_get_folio the return value cannot be NULL, so unnecessary checks are removed. Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 2 +- fs/bcachefs/fs-io-pagecache.c | 2 +- fs/bcachefs/fs-io.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 0923f38a2fcd..b853cecd3c1b 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -686,7 +686,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) + if (IS_ERR(folio)) goto err_unlock; offset = pos - folio_pos(folio); diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 51a499c5a7b6..e072900e6a5b 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, break; f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR_OR_NULL(f)) + if (IS_ERR(f)) break; BUG_ON(fs->nr && folio_pos(f) != pos); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0021db191480..c6fdfec51082 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -256,7 +256,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR_OR_NULL(folio)) { + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } From db6b114bd5c8426f9efa74209e987f3ff2a7bf5f Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 27 Sep 2024 16:40:42 +0800 Subject: [PATCH 072/233] bcachefs: Remove redundant initialization in bch2_vfs_inode_init() `inode->v.i_ino` has been initialized to `inum.inum`. If `inum.inum` and `bi->bi_inum` are not equal, BUG_ON() is triggered in bch2_inode_update_after_write(). Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7a269dbcf44b..f852dbf30aa2 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1752,7 +1752,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; - inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; From c03828056d84e06f5687d296a2808b3ee5e4dfce Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Wed, 16 Oct 2024 09:49:11 +0800 Subject: [PATCH 073/233] bcachefs: Simplify code in bch2_dev_alloc() - Remove unnecessary variable 'ret'. - Remove unnecessary bch2_dev_free() operations. Signed-off-by: Youling Tang Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 7e0ff17a6dbb..ab678231afd4 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1369,7 +1369,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; - int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; @@ -1381,10 +1380,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; bch2_dev_attach(c, ca, dev_idx); - return ret; + return 0; err: - if (ca) - bch2_dev_free(ca); return -BCH_ERR_ENOMEM_dev_alloc; } From 53f02a6929c291960b8d44540a7085940b82366d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2024 19:15:38 -0500 Subject: [PATCH 074/233] bcachefs: Don't use page allocator for sb_read_scratch Kill another unnecessary dependency on PAGE_SIZE Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 7 ++++--- fs/bcachefs/super-io.h | 2 ++ fs/bcachefs/super.c | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index c83bd3dedb1b..4c29f8215d54 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -892,14 +892,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) struct bch_sb *sb = ca->disk_sb.sb; struct bio *bio = ca->disk_sb.bio; + memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); + bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], - bio_sectors(bio)); + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); closure_bio_submit(bio, &c->sb_write); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index fadd364e2802..90e7b176cdd0 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -10,6 +10,8 @@ #include +#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 + static inline bool bch2_version_compatible(u16 version) { return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ab678231afd4..6ab93db52eca 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1201,7 +1201,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bch2_dev_buckets_free(ca); - free_page((unsigned long) ca->sb_read_scratch); + kfree(ca->sb_read_scratch); bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); @@ -1340,7 +1340,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || + !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; From a0678f9c859bf25e60a535e65aff5864ccf66eb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 7 Nov 2024 21:50:00 -0500 Subject: [PATCH 075/233] bcachefs: Fix shutdown message Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6ab93db52eca..37eee352fa21 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -290,7 +290,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_fs_journal_stop(&c->journal); - bch_info(c, "%sshutdown complete, journal seq %llu", + bch_info(c, "%sclean shutdown complete, journal seq %llu", test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", c->journal.seq_ondisk); From 2228901e43af2add536866df466a21117faef0b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Nov 2024 03:53:30 -0500 Subject: [PATCH 076/233] bcachefs: delete dead code Signed-off-by: Kent Overstreet --- fs/bcachefs/error.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 6551ada926b6..81af0b8ddb52 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -54,26 +54,6 @@ int bch2_topology_error(struct bch_fs *); _ret; \ }) -/* - * Later we might want to mark only the particular device inconsistent, not the - * entire filesystem: - */ - -#define bch2_dev_inconsistent(ca, ...) \ -do { \ - bch_err(ca, __VA_ARGS__); \ - bch2_inconsistent_error((ca)->fs); \ -} while (0) - -#define bch2_dev_inconsistent_on(cond, ca, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - \ - if (_ret) \ - bch2_dev_inconsistent(ca, __VA_ARGS__); \ - _ret; \ -}) - /* * When a transaction update discovers or is causing a fs inconsistency, it's * helpful to also dump the pending updates: From 7fda5f15087a016cc2ef2d449bcba4e4b9b795ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 23:35:03 -0400 Subject: [PATCH 077/233] bcachefs: bch2_btree_bit_mod_iter() factor out a new helper, make it handle extents bitset btrees (freespace). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 57 ++++++---------------------------- fs/bcachefs/btree_update.c | 41 ++++++++++++------------ fs/bcachefs/btree_update.h | 3 +- 3 files changed, 31 insertions(+), 70 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index af791f4dab99..a1bd75a44d79 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -671,44 +671,31 @@ static int bch2_bucket_do_index(struct btree_trans *trans, bool set) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - struct bkey_i *k; enum btree_id btree; + struct bpos pos; enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; - enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; struct printbuf buf = PRINTBUF; - int ret; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) return 0; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.type = new_type; - switch (a->data_type) { case BCH_DATA_free: btree = BTREE_ID_freespace; - k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); - bch2_key_resize(&k->k, 1); + pos = alloc_freespace_pos(alloc_k.k->p, *a); break; case BCH_DATA_need_discard: btree = BTREE_ID_need_discard; - k->k.p = alloc_k.k->p; + pos = alloc_k.k->p; break; default: return 0; } - old = bch2_bkey_get_iter(trans, &iter, btree, - bkey_start_pos(&k->k), - BTREE_ITER_intent); - ret = bkey_err(old); + struct btree_iter iter; + struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); + int ret = bkey_err(old); if (ret) return ret; @@ -728,7 +715,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, goto err; } - ret = bch2_trans_update(trans, &iter, k, 0); + ret = bch2_btree_bit_mod_iter(trans, &iter, set); err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1163,18 +1150,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, bch2_bkey_types[k.k->type], bch2_bkey_types[discard_key_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = discard_key_type; - update->k.p = discard_iter->pos; - - ret = bch2_trans_update(trans, discard_iter, update, 0); + ret = bch2_btree_bit_mod_iter(trans, discard_iter, !!discard_key_type); if (ret) goto err; } @@ -1194,19 +1170,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, bch2_bkey_types[freespace_key_type], (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = freespace_key_type; - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, 1); - - ret = bch2_trans_update(trans, freespace_iter, update, 0); + ret = bch2_btree_bit_mod_iter(trans, freespace_iter, !!freespace_key_type); if (ret) goto err; } @@ -1420,8 +1384,7 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_delete_extent_at(trans, iter, - iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: + ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); goto out; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index f3d7ca3d92b9..06fd5aa62296 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -669,25 +669,17 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, bch2_btree_insert_trans(trans, id, k, iter_flags)); } -int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, - unsigned len, unsigned update_flags) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = iter->pos; - bch2_key_resize(&k->k, len); - return bch2_trans_update(trans, iter, k, update_flags); -} - int bch2_btree_delete_at(struct btree_trans *trans, struct btree_iter *iter, unsigned update_flags) { - return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; + + bkey_init(&k->k); + k->k.p = iter->pos; + return bch2_trans_update(trans, iter, k, update_flags); } int bch2_btree_delete(struct btree_trans *trans, @@ -791,8 +783,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) +int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); int ret = PTR_ERR_OR_ZERO(k); @@ -801,13 +792,21 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, bkey_init(&k->k); k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; + k->k.p = iter->pos; + if (iter->flags & BTREE_ITER_is_extents) + bch2_key_resize(&k->k, 1); + return bch2_trans_update(trans, iter, k, 0); +} + +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ struct btree_iter iter; bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, 0); + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_bit_mod_iter(trans, &iter, set); bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 3bc57d43aa83..58df20194306 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -46,8 +46,6 @@ enum bch_trans_commit_flags { void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); -int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, - unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); @@ -65,6 +63,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); From 7ad0ba0e1849e010a7be3e90f6588b4460a469e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 22:52:06 -0400 Subject: [PATCH 078/233] bcachefs: Delete dead code from bch2_discard_one_bucket() alloc key validation ensures that if a bucket is in need_discard state the sector counts are all zero - we don't have to check for that. The NEED_INC_GEN check appears to be dead code, as well: we only see buckets in the need_discard btree, and it's an error if they aren't in the need_discard state. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a1bd75a44d79..38df36f8e70a 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1756,22 +1756,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (bch2_bucket_sectors_total(a->v)) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "attempting to discard bucket with dirty data\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; - goto out; - } - if (a->v.data_type != BCH_DATA_need_discard) { - if (data_type_is_empty(a->v.data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; - } - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, trans, "bucket incorrectly set in need_discard btree\n" "%s", @@ -1814,7 +1799,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); -write: alloc_data_type_set(&a->v, a->v.data_type); ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: From d6ad842cf77fe209875d699d3b2d64b764d5edd2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Oct 2024 01:17:08 -0400 Subject: [PATCH 079/233] bcachefs: lru errors are expected when reconstructing alloc Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 431698189090..7086a7226989 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -113,6 +113,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); From 4a370320dcb4ea925e38a852baf85724baf9711a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 21:27:23 -0400 Subject: [PATCH 080/233] bcachefs: Kill FSCK_NEED_FSCK If we find an error that indicates that we need to run fsck, we can specify that directly with run_explicit_recovery_pass(). These are now log_fsck_err() calls: we're just logging in the superblock that an error occurred - and possibly doing an emergency shutdown, depending on policy. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 18 ++++++------------ fs/bcachefs/buckets.c | 29 +++++++++++++++++------------ fs/bcachefs/error.c | 21 +++++++++++++-------- fs/bcachefs/error.h | 12 ++++++------ fs/bcachefs/sb-errors_format.h | 5 ++--- 5 files changed, 44 insertions(+), 41 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index c11babe31f54..faa2816e02a0 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -62,7 +62,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->min_key, POS_MIN)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); - need_fsck_err(trans, btree_root_bad_min_key, + log_fsck_err(trans, btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf); goto topology_repair; } @@ -70,7 +70,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->max_key, SPOS_MAX)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); - need_fsck_err(trans, btree_root_bad_max_key, + log_fsck_err(trans, btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf); goto topology_repair; } @@ -106,7 +106,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n next "); bch2_bkey_val_to_text(&buf, c, k); - need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); goto topology_repair; } @@ -123,7 +123,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); goto topology_repair; } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { bch2_topology_error(c); @@ -136,7 +136,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) prt_str(&buf, "\n last key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); goto topology_repair; } out: @@ -146,13 +146,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) printbuf_exit(&buf); return ret; topology_repair: - if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { - bch2_inconsistent_error(c); - ret = -BCH_ERR_btree_need_topology_repair; - } else { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - } + ret = bch2_topology_error(c); goto out; } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index c4123fa4f250..5b42f0a7b0cb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -20,6 +20,7 @@ #include "movinggc.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "subvolume.h" @@ -402,8 +403,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_gen_newer_than_bucket_gen, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -416,8 +417,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_too_stale, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -436,8 +437,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (b_gen != ptr->gen) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - stale_dirty_ptr, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -452,8 +453,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_bucket_data_type_mismatch, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -467,8 +468,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if ((u64) *bucket_sectors + sectors > U32_MAX) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - bucket_sector_count_overflow, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -486,7 +487,9 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, printbuf_exit(&buf); return ret; err: +fsck_err: bch2_dump_trans_updates(trans); + bch2_inconsistent_error(c); ret = -BCH_ERR_bucket_ref_update; goto out; } @@ -952,6 +955,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { + struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; @@ -961,8 +965,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - bucket_metadata_type_mismatch, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, @@ -980,6 +984,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: +fsck_err: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index b679def8fb98..22b0fa405a39 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -385,9 +385,7 @@ int __bch2_fsck_err(struct bch_fs *c, prt_str(out, ", not "); prt_actioning(out, action); } - } else if (flags & FSCK_NEED_FSCK) { - prt_str(out, " (run fsck to correct)"); - } else { + } else if (!(flags & FSCK_CAN_IGNORE)) { prt_str(out, " (repair unimplemented)"); } @@ -424,11 +422,18 @@ int __bch2_fsck_err(struct bch_fs *c, if (inconsistent) bch2_inconsistent_error(c); - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); + /* + * We don't yet track whether the filesystem currently has errors, for + * log_fsck_err()s: that would require us to track for every error type + * which recovery pass corrects it, to get the fsck exit status correct: + */ + if (flags & FSCK_CAN_FIX) { + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_errors_fixed, &c->flags); + } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); + } } err: if (action != action_orig) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 81af0b8ddb52..24c41a9994df 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -129,12 +129,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ }) -#define need_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) - -#define need_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) - #define mustfix_fsck_err(c, _err_type, ...) \ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) @@ -147,6 +141,12 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) +#define log_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + +#define log_fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + enum bch_validate_flags; __printf(5, 6) int __bch2_bkey_fsck_err(struct bch_fs *, diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9feb6739f77a..f2b38493356d 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -5,9 +5,8 @@ enum bch_fsck_flags { FSCK_CAN_FIX = 1 << 0, FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, - FSCK_AUTOFIX = 1 << 4, + FSCK_NO_RATELIMIT = 1 << 2, + FSCK_AUTOFIX = 1 << 3, }; #define BCH_SB_ERRS() \ From b36ff5dc0842ed3c4aa202107da727cc74b59cd0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Oct 2024 23:43:16 -0400 Subject: [PATCH 081/233] bcachefs: Reserve 8 bits in bch_reflink_p Better repair for reflink pointers, as well as propagating new inode options to indirect extents, are going to require a few extra bits bch_reflink_p: so claim a few from the high end of the destination index. Also add some missing bounds checking. Signed-off-by: Kent Overstreet --- fs/bcachefs/extent_update.c | 2 +- fs/bcachefs/extents.c | 2 +- fs/bcachefs/io_read.c | 14 ++++------- fs/bcachefs/reflink.c | 45 +++++++++++++++++++++++++----------- fs/bcachefs/reflink_format.h | 4 +++- 5 files changed, 41 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 45c87c019f6b..6aac579a692a 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans, break; case KEY_TYPE_reflink_p: { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); + u64 idx = REFLINK_P_IDX(p.v); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); struct btree_iter iter; diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index bc7cfdb66687..98bb680b3860 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1495,7 +1495,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) case KEY_TYPE_reflink_p: { struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - le64_add_cpu(&p.v->idx, sub); + SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); break; } case KEY_TYPE_inline_data: diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index cbc3cc1f6d03..c700a95df89e 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -754,17 +754,13 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, struct bkey_buf *orig_k) { + struct bkey_i_reflink_p *p = bkey_i_to_reflink_p(orig_k->k); + u64 reflink_offset = REFLINK_P_IDX(&p->v) + *offset_into_extent; + struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; - - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, POS(0, reflink_offset), 0); - ret = bkey_err(k); + int ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 96cf50f4705d..addaf5f74624 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -35,10 +35,10 @@ int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), + bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", - le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); fsck_err: return ret; } @@ -49,7 +49,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); prt_printf(out, "idx %llu front_pad %u back_pad %u", - le64_to_cpu(p.v->idx), + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); } @@ -65,7 +65,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r */ return false; - if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) return false; bch2_key_resize(l.k, l.k->size + r.k->size); @@ -115,12 +115,12 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + REFLINK_P_IDX(v) - bkey_start_offset(&k->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + k->k.p.offset - p.k->size - REFLINK_P_IDX(v)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } @@ -147,8 +147,8 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; - u64 start = le64_to_cpu(p.v->idx); - u64 end = le64_to_cpu(p.v->idx) + p.k->size; + u64 start = REFLINK_P_IDX(p.v); + u64 end = start + p.k->size; u64 next_idx = end + le32_to_cpu(p.v->back_pad); s64 ret = 0; struct printbuf buf = PRINTBUF; @@ -210,8 +210,8 @@ static int __trigger_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); + u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); + u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); if (flags & BTREE_TRIGGER_transactional) { while (idx < end && !ret) @@ -258,7 +258,16 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { - return bch2_bkey_ptrs_validate(c, k, flags); + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), + c, reflink_v_pos_bad, + "indirect extent above maximum position 0:%llu", + REFLINK_P_IDX_MAX); + + ret = bch2_bkey_ptrs_validate(c, k, flags); +fsck_err: + return ret; } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -358,6 +367,14 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (ret) goto err; + /* + * XXX: we're assuming that 56 bits will be enough for the life of the + * filesystem: we need to implement wraparound, with a cursor in the + * logged ops btree: + */ + if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) + return -ENOSPC; + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) @@ -394,7 +411,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, memset(&r_p->v, 0, sizeof(r_p->v)); #endif - r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, BTREE_UPDATE_internal_snapshot_node); @@ -533,11 +550,11 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_i_reflink_p *dst_p = bkey_reflink_p_init(new_dst.k); - u64 offset = le64_to_cpu(src_p.v->idx) + + u64 offset = REFLINK_P_IDX(src_p.v) + (src_want.offset - bkey_start_offset(src_k.k)); - dst_p->v.idx = cpu_to_le64(offset); + SET_REFLINK_P_IDX(&dst_p->v, offset); } else { BUG(); } diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 6772eebb1fc6..0d8de13b9ddf 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -4,7 +4,7 @@ struct bch_reflink_p { struct bch_val v; - __le64 idx; + __le64 idx_flags; /* * A reflink pointer might point to an indirect extent which is then * later split (by copygc or rebalance). If we only pointed to part of @@ -17,6 +17,8 @@ struct bch_reflink_p { __le32 back_pad; } __packed __aligned(8); +LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); + struct bch_reflink_v { struct bch_val v; __le64 refcount; From 6cf666ffb5694d38860eb3f46d773825487e7f7e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 31 Oct 2024 01:25:09 -0400 Subject: [PATCH 082/233] bcachefs: Reorganize reflink.c a bit Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 92 ------------------ fs/bcachefs/reflink.c | 214 ++++++++++++++++++++++++++++++----------- fs/bcachefs/reflink.h | 3 + 3 files changed, 160 insertions(+), 149 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e45cf32a6403..2e8cfc4d3265 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -937,98 +937,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c) return ret; } -static int bch2_gc_write_reflink_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - size_t *idx) -{ - struct bch_fs *c = trans->c; - const __le64 *refcount = bkey_refcount_c(k); - struct printbuf buf = PRINTBUF; - struct reflink_gc *r; - int ret = 0; - - if (!refcount) - return 0; - - while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && - r->offset < k.k->p.offset) - ++*idx; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), - trans, reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); - ret = bch2_trans_update(trans, iter, new, 0); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_reflink_done(struct bch_fs *c) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_reflink_key(trans, &iter, k, &idx))); - c->reflink_gc_nr = 0; - return ret; -} - -static int bch2_gc_reflink_start(struct bch_fs *c) -{ - c->reflink_gc_nr = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, ({ - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, - c->reflink_gc_nr++, GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} - static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index addaf5f74624..36fb1e9473ff 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -72,6 +72,66 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r return true; } +/* indirect extents */ + +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), + c, reflink_v_pos_bad, + "indirect extent above maximum position 0:%llu", + REFLINK_P_IDX_MAX); + + ret = bch2_bkey_ptrs_validate(c, k, flags); +fsck_err: + return ret; +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +#if 0 +Currently disabled, needs to be debugged: + +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} +#endif + +/* indirect inline data */ + +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + enum bch_validate_flags flags) +{ + return 0; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + +/* reflink pointer trigger */ + static int trans_trigger_reflink_p_segment(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 *idx, enum btree_iter_update_trigger_flags flags) @@ -253,44 +313,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); } -/* indirect extents */ - -int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) -{ - int ret = 0; - - bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), - c, reflink_v_pos_bad, - "indirect extent above maximum position 0:%llu", - REFLINK_P_IDX_MAX); - - ret = bch2_bkey_ptrs_validate(c, k, flags); -fsck_err: - return ret; -} - -void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); - - bch2_bkey_ptrs_to_text(out, c, k); -} - -#if 0 -Currently disabled, needs to be debugged: - -bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); - - return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -} -#endif +/* indirect extent trigger */ static inline void check_indirect_extent_deleting(struct bkey_s new, @@ -316,25 +339,6 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, return bch2_trigger_extent(trans, btree_id, level, old, new, flags); } -/* indirect inline data */ - -int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) -{ - return 0; -} - -void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "refcount %llu datalen %u: %*phN", - le64_to_cpu(d.v->refcount), datalen, - min(datalen, 32U), d.v->data); -} - int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, @@ -345,6 +349,8 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *trans, return 0; } +/* create */ + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *orig) @@ -608,3 +614,97 @@ s64 bch2_remap_range(struct bch_fs *c, return dst_done ?: ret ?: ret2; } + +/* fsck */ + +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) +{ + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); + struct printbuf buf = PRINTBUF; + struct reflink_gc *r; + int ret = 0; + + if (!refcount) + return 0; + + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; + + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), + trans, reflink_v_refcount_wrong, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_gc_reflink_done(struct bch_fs *c) +{ + size_t idx = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx))); + c->reflink_gc_nr = 0; + return ret; +} + +int bch2_gc_reflink_start(struct bch_fs *c) +{ + c->reflink_gc_nr = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, ({ + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); + + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 51afe11d8ed6..6ec3a9ea6bb4 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -76,4 +76,7 @@ static inline __le64 *bkey_refcount(struct bkey_s k) s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, subvol_inum, u64, u64, u64, s64 *); +int bch2_gc_reflink_done(struct bch_fs *); +int bch2_gc_reflink_start(struct bch_fs *); + #endif /* _BCACHEFS_REFLINK_H */ From df4270ccd3907ff2fc8ba3cba6328a229a5bd203 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 20:27:44 -0400 Subject: [PATCH 083/233] bcachefs: Don't delete reflink pointers to missing indirect extents To avoid tragic loss in the event of transient errors (i.e., a btree node topology error that was later corrected by btree node scan), we can't delete reflink pointers to correct errors. This adds a new error bit to bch_reflink_p, indicating that it is known to point to a missing indirect extent, and the error has already been reported. Indirect extent lookups now use bch2_lookup_indirect_extent(), which on error reports it as a fsck_err() and sets the error bit, and clears it if necessary on succesful lookup. This also gets rid of the bch2_inconsistent_error() call in __bch2_read_indirect_extent, and in the reflink_p trigger: part of the online self healing project. An on disk format change isn't necessary here: setting the error bit will be interpreted by older versions as pointing to a different index, which will also be missing - which is fine. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 5 +- fs/bcachefs/fs.c | 8 +- fs/bcachefs/io_read.c | 45 +------ fs/bcachefs/io_read.h | 28 +++- fs/bcachefs/reflink.c | 243 +++++++++++++++++++++++++++-------- fs/bcachefs/reflink.h | 4 + fs/bcachefs/reflink_format.h | 1 + 7 files changed, 223 insertions(+), 111 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index b853cecd3c1b..d55e215e8aa6 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -164,7 +164,8 @@ static void bchfs_read(struct btree_trans *trans, BTREE_ITER_slots); while (1) { struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; + unsigned bytes, sectors; + s64 offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -197,7 +198,7 @@ static void bchfs_read(struct btree_trans *trans, k = bkey_i_to_s_c(sk.k); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); if (readpages_iter) { ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f852dbf30aa2..50d323fca001 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1261,7 +1261,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1308,9 +1307,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, continue; } - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&cur, c, k); @@ -1322,7 +1320,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index c700a95df89e..eb8d12fd6398 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -21,6 +21,7 @@ #include "io_read.h" #include "io_misc.h" #include "io_write.h" +#include "reflink.h" #include "subvolume.h" #include "trace.h" @@ -750,41 +751,6 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct bkey_i_reflink_p *p = bkey_i_to_reflink_p(orig_k->k); - u64 reflink_offset = REFLINK_P_IDX(&p->v) + *offset_into_extent; - - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - int ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -BCH_ERR_missing_indirect_extent; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, @@ -1160,7 +1126,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1180,9 +1145,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) goto err; - offset_into_extent = iter.pos.offset - + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&sk, c, k); @@ -1197,9 +1162,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: */ - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index d9c18bb7d403..a82e8a94ccb6 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "reflink.h" struct bch_read_bio { struct bch_fs *c; @@ -79,19 +80,32 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_buf *); - static inline int bch2_read_indirect_extent(struct btree_trans *trans, enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) + s64 *offset_into_extent, + struct bkey_buf *extent) { - if (k->k->k.type != KEY_TYPE_reflink_p) + if (extent->k->k.type != KEY_TYPE_reflink_p) return 0; *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, + offset_into_extent, + bkey_i_to_s_c_reflink_p(extent->k), + true, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + return -BCH_ERR_missing_indirect_extent; + } + + bch2_bkey_buf_reassemble(extent, trans->c, k); + bch2_trans_iter_exit(trans, &iter); + return 0; } enum bch_read_flags { diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 36fb1e9473ff..38db5a011702 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -15,6 +15,17 @@ #include +static inline bool bkey_extent_is_reflink_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_reflink_v: + case KEY_TYPE_indirect_inline_data: + return true; + default: + return false; + } +} + static inline unsigned bkey_type_to_indirect(const struct bkey *k) { switch (k->type) { @@ -68,6 +79,9 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) return false; + if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) + return false; + bch2_key_resize(l.k, l.k->size + r.k->size); return true; } @@ -130,6 +144,144 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out, min(datalen, 32U), d.v->data); } +/* lookup */ + +static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + bool should_commit) +{ + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + SET_REFLINK_P_ERROR(&new->v, false); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + return ret; + + if (!should_commit) + return 0; + + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +} + +static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 missing_start, u64 missing_end, + bool should_commit) +{ + if (REFLINK_P_ERROR(p.v)) + return -BCH_ERR_missing_indirect_extent; + + struct bch_fs *c = trans->c; + u64 live_start = REFLINK_P_IDX(p.v); + u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; + u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); + u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(missing_start < refd_start); + BUG_ON(missing_end > refd_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + missing_start, missing_end)) { + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + /* + * Is the missing range not actually needed? + * + * p.v->idx refers to the data that we actually want, but if the + * indirect extent we point to was bigger, front_pad and back_pad + * indicate the range we took a reference on. + */ + + if (missing_end <= live_start) { + new->v.front_pad = cpu_to_le32(live_start - missing_end); + } else if (missing_start >= live_end) { + new->v.back_pad = cpu_to_le32(missing_start - live_end); + } else { + struct bpos new_start = bkey_start_pos(&new->k); + struct bpos new_end = new->k.p; + + if (missing_start > live_start) + new_start.offset += missing_start - live_start; + if (missing_end < live_end) + new_end.offset -= live_end - missing_end; + + bch2_cut_front(new_start, &new->k_i); + bch2_cut_back(new_end, &new->k_i); + + SET_REFLINK_P_ERROR(&new->v, true); + } + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + goto err; + + if (should_commit) + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * This is used from the read path, which doesn't expect to have to do a + * transaction commit, and from triggers, which should not be doing a commit: + */ +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, + struct btree_iter *iter, + s64 *offset_into_extent, + struct bkey_s_c_reflink_p p, + bool should_commit, + unsigned iter_flags) +{ + BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); + BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); + + u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; + + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, + POS(0, reflink_offset), iter_flags); + if (bkey_err(k)) + return k; + + if (unlikely(!bkey_extent_is_reflink_data(k.k))) { + bch2_trans_iter_exit(trans, iter); + + unsigned size = min((u64) k.k->size, + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - + reflink_offset); + bch2_key_resize(&iter->k, size); + + int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, + k.k->p.offset, should_commit); + if (ret) + return bkey_s_c_err(ret); + } else if (unlikely(REFLINK_P_ERROR(p.v))) { + bch2_trans_iter_exit(trans, iter); + + int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); + if (ret) + return bkey_s_c_err(ret); + } + + *offset_into_extent = reflink_offset - bkey_start_offset(k.k); + return k; +} + /* reflink pointer trigger */ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, @@ -137,37 +289,37 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; struct printbuf buf = PRINTBUF; - int ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_with_updates); - ret = PTR_ERR_OR_ZERO(k); + s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bkey_deleted(k.k)) { + if (!(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_missing_indirect_extent; + goto next; + } + + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); if (ret) goto err; - refcount = bkey_refcount(bkey_i_to_s(k)); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; - } - + __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + log_fsck_err(trans, reflink_refcount_underflow, + "indirect extent refcount underflow while marking\n %s", + buf.buf); + goto next; } if (flags & BTREE_TRIGGER_insert) { @@ -175,25 +327,26 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - REFLINK_P_IDX(v) - bkey_start_offset(&k->k)); + REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - REFLINK_P_IDX(v)); + new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } - le64_add_cpu(refcount, add); + le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); + ret = bch2_trans_update(trans, &iter, new, 0); if (ret) goto err; - - *idx = k->k.p.offset; +next: + *idx = k.k->p.offset; err: +fsck_err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; @@ -207,9 +360,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; - u64 start = REFLINK_P_IDX(p.v); - u64 end = start + p.k->size; - u64 next_idx = end + le32_to_cpu(p.v->back_pad); + u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); s64 ret = 0; struct printbuf buf = PRINTBUF; @@ -228,36 +379,14 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, *idx = r->offset; return 0; not_found: - BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); - - if (fsck_err(trans, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); - ret = PTR_ERR_OR_ZERO(update); + if (flags & BTREE_TRIGGER_check_repair) { + ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); if (ret) goto err; - - if (next_idx <= start) { - bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); - } else if (*idx >= end) { - bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); - } else { - bkey_error_init(update); - update->k.p = p.k->p; - update->k.size = p.k->size; - set_bkey_val_u64s(&update->k, 0); - } - - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); } *idx = next_idx; err: -fsck_err: printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 6ec3a9ea6bb4..b61a4bdd8e82 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -73,6 +73,10 @@ static inline __le64 *bkey_refcount(struct bkey_s k) } } +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, + s64 *, struct bkey_s_c_reflink_p, + bool, unsigned); + s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, subvol_inum, u64, u64, u64, s64 *); diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 0d8de13b9ddf..53502627b2c5 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -18,6 +18,7 @@ struct bch_reflink_p { } __packed __aligned(8); LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); +LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); struct bch_reflink_v { struct bch_val v; From 3de116ce179b69eb2d92498a0872e5ab786cf4ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 00:05:54 -0400 Subject: [PATCH 084/233] bcachefs: kill inconsistent err in invalidate_one_bucket() Change it to a normal fsck_err() - meaning it'll get repaired at runtime when that's flipped on. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 38df36f8e70a..72ba7354adac 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1977,8 +1977,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, return 1; if (!bch2_dev_bucket_exists(c, bucket)) { - prt_str(&buf, "lru entry points to invalid bucket"); - goto err; + if (fsck_err(trans, lru_entry_to_invalid_bucket, + "lru key points to nonexistent device:bucket %llu:%llu", + bucket.inode, bucket.offset)) + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + goto out; } if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) @@ -2019,28 +2022,9 @@ static int invalidate_one_bucket(struct btree_trans *trans, trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: +fsck_err: printbuf_exit(&buf); return ret; -err: - prt_str(&buf, "\n lru key: "); - bch2_bkey_val_to_text(&buf, c, lru_k); - - prt_str(&buf, "\n lru entry: "); - bch2_lru_pos_to_text(&buf, lru_iter->pos); - - prt_str(&buf, "\n alloc key: "); - if (!a) - bch2_bpos_to_text(&buf, bucket); - else - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - - bch_err(c, "%s", buf.buf); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { - bch2_inconsistent_error(c); - ret = -EINVAL; - } - - goto out; } static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, From ecadaf9ae3c9d091cd04c6998341db98bdf683ce Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 20:47:03 -0400 Subject: [PATCH 085/233] bcachefs: rework bch2_bucket_alloc_freelist() freelist iteration Prep work for converting try_alloc_bucket() to use bch2_check_discard_freespace_key(). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 59 ++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 372178c8d416..645d8a269142 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -276,9 +276,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, u64 free_entry, + enum bch_watermark watermark, struct bucket_alloc_state *s, - struct bkey_s_c freespace_k, + struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; @@ -287,8 +287,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct open_bucket *ob; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - u64 b = free_entry & ~(~0ULL << 56); - unsigned genbits = free_entry >> 56; + u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); + unsigned genbits = freespace_iter->pos.offset >> 56; struct printbuf buf = PRINTBUF; int ret; @@ -296,7 +296,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" " freespace key ", ca->mi.first_bucket, ca->mi.nbuckets); - bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_bkey_to_text(&buf, &freespace_iter->k); bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); goto err; @@ -321,7 +321,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc prt_printf(&buf, "non free bucket in freespace btree\n" " freespace key "); - bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_bkey_to_text(&buf, &freespace_iter->k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); @@ -334,7 +334,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" " freespace key ", genbits, alloc_freespace_genbits(*a) >> 56); - bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_bkey_to_text(&buf, &freespace_iter->k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); @@ -492,17 +492,20 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, BUG_ON(ca->new_fs_bucket_idx); again: - for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, alloc_cursor), 0, k, ret) { - if (k.k->p.inode != ca->dev_idx) - break; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, alloc_cursor), + POS(ca->dev_idx, U64_MAX), + 0, k, ret) { + /* + * peek normally dosen't trim extents - they can span iter.pos, + * which is not what we want here: + */ + iter.k.size = iter.k.p.offset - iter.pos.offset; - for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); - alloc_cursor < k.k->p.offset; - alloc_cursor++) { + while (iter.k.size) { s->buckets_seen++; - u64 bucket = alloc_cursor & ~(~0ULL << 56); + u64 bucket = iter.pos.offset & ~(~0ULL << 56); if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -511,32 +514,36 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, goto fail; bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket) + 1, + round_up(bucket_to_sector(ca, bucket + 1), 1ULL << ca->mi.btree_bitmap_shift)); - u64 genbits = alloc_cursor >> 56; - alloc_cursor = bucket | (genbits << 56); + alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - if (alloc_cursor > k.k->p.offset) - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); s->skipped_mi_btree_bitmap++; - continue; + goto next; } - ob = try_alloc_bucket(trans, ca, watermark, - alloc_cursor, s, k, cl); + ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); if (ob) { + if (!IS_ERR(ob)) + *dev_alloc_cursor = iter.pos.offset; bch2_set_btree_iter_dontneed(&iter); break; } - } + iter.k.size--; + iter.pos.offset++; + } +next: if (ob || ret) break; } fail: bch2_trans_iter_exit(trans, &iter); - if (!ob && ret) + BUG_ON(ob && ret); + + if (ret) ob = ERR_PTR(ret); if (!ob && alloc_start > ca->mi.first_bucket) { @@ -544,8 +551,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, goto again; } - *dev_alloc_cursor = alloc_cursor; - return ob; } From c51b6019074d107e2c60b23dc23e5c7886a27a4e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 27 Oct 2024 00:40:43 -0400 Subject: [PATCH 086/233] bcachefs: try_alloc_bucket() now uses bch2_check_discard_freespace_key() check_discard_freespace_key() was doing all the same checks as try_alloc_bucket(), but with repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 80 ++++++++++++++++------------- fs/bcachefs/alloc_background.h | 2 + fs/bcachefs/alloc_foreground.c | 93 ++++++---------------------------- 3 files changed, 62 insertions(+), 113 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 72ba7354adac..1f42dd208957 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1332,51 +1332,53 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, return ret; } -static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter) +int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter; - struct bkey_s_c alloc_k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - u64 genbits; - struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? BCH_DATA_need_discard : BCH_DATA_free; struct printbuf buf = PRINTBUF; - int ret; - pos = iter->pos; - pos.offset &= ~(~0ULL << 56); - genbits = iter->pos.offset & (~0ULL << 56); + struct bpos bucket = iter->pos; + bucket.offset &= ~(~0ULL << 56); + u64 genbits = iter->pos.offset & (~0ULL << 56); - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); - ret = bkey_err(alloc_k); + struct btree_iter alloc_iter; + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, BTREE_ITER_cached); + int ret = bkey_err(alloc_k); if (ret) return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), - trans, need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) - goto delete; + if (!bch2_dev_bucket_exists(c, bucket)) { + if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + goto delete; + ret = 1; + goto out; + } - a = bch2_alloc_to_v4(alloc_k, &a_convert); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); - if (fsck_err_on(a->data_type != state || - (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a)), - trans, need_discard_freespace_key_bad, - "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_id_str(iter->btree_id), - iter->pos.inode, - iter->pos.offset, - a->data_type == state, - genbits >> 56, alloc_freespace_genbits(*a) >> 56)) - goto delete; + if (a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a))) { + if (fsck_err(trans, need_discard_freespace_key_bad, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_id_str(iter->btree_id), + iter->pos.inode, + iter->pos.offset, + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) + goto delete; + ret = 1; + goto out; + } + + *gen = a->gen; out: fsck_err: bch2_set_btree_iter_dontneed(&alloc_iter); @@ -1386,10 +1388,18 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran delete: ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); + BCH_TRANS_COMMIT_no_enospc) ?: + 1; goto out; } +static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) +{ + u8 gen; + int ret = bch2_check_discard_freespace_key(trans, iter, &gen); + return ret < 0 ? ret : 0; +} + /* * We've already checked that generation numbers in the bucket_gens btree are * valid for buckets that exist; this just checks for keys for nonexistent @@ -1544,7 +1554,7 @@ int bch2_check_alloc_info(struct bch_fs *c) ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key(trans, &iter)); + bch2_check_discard_freespace_key_fsck(trans, &iter)); if (ret) goto err; @@ -1557,7 +1567,7 @@ int bch2_check_alloc_info(struct bch_fs *c) break; ret = bkey_err(k) ?: - bch2_check_discard_freespace_key(trans, &iter); + bch2_check_discard_freespace_key_fsck(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 163a67b97a40..57723a37abb8 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -307,6 +307,8 @@ int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); + +int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 645d8a269142..955ea6ae868f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -207,9 +207,8 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) } static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 bucket, + u64 bucket, u8 gen, enum bch_watermark watermark, - const struct bch_alloc_v4 *a, struct bucket_alloc_state *s, struct closure *cl) { @@ -261,7 +260,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->dev = ca->dev_idx; - ob->gen = a->gen; + ob->gen = gen; ob->bucket = bucket; spin_unlock(&ob->lock); @@ -282,98 +281,36 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct closure *cl) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct bkey_s_c k; - struct open_bucket *ob; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - unsigned genbits = freespace_iter->pos.offset >> 56; - struct printbuf buf = PRINTBUF; - int ret; + u8 gen; - if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { - prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" - " freespace key ", - ca->mi.first_bucket, ca->mi.nbuckets); - bch2_bkey_to_text(&buf, &freespace_iter->k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } + int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen); + if (ret < 0) + return ERR_PTR(ret); + if (ret) + return NULL; - k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } - - a = bch2_alloc_to_v4(k, &a_convert); - - if (a->data_type != BCH_DATA_free) { - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { - ob = NULL; - goto err; - } - - prt_printf(&buf, "non free bucket in freespace btree\n" - " freespace key "); - bch2_bkey_to_text(&buf, &freespace_iter->k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - if (genbits != (alloc_freespace_genbits(*a) >> 56) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" - " freespace key ", - genbits, alloc_freespace_genbits(*a) >> 56); - bch2_bkey_to_text(&buf, &freespace_iter->k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { + if (unlikely(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers)) { struct bch_backpointer bp; struct bpos bp_pos = POS_MIN; ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, &bp_pos, &bp, BTREE_ITER_nopreserve); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } + if (ret) + return ERR_PTR(ret); if (!bkey_eq(bp_pos, POS_MAX)) { /* * Bucket may have data in it - we don't call - * bc2h_trans_inconnsistent() because fsck hasn't + * bch2_trans_inconsistent() because fsck hasn't * finished yet */ - ob = NULL; - goto err; + return NULL; } } - ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); - if (!ob) - bch2_set_btree_iter_dontneed(&iter); -err: - if (iter.path) - bch2_set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ob; + return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); } /* @@ -452,7 +389,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, a->gen, watermark, s, cl); next: bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); From cbc079bcff7d5eb38f54f3e7d378100d919e028a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 22:21:20 -0400 Subject: [PATCH 087/233] bcachefs: bch2_bucket_do_index(): inconsistent_err -> fsck_err Factor out a common helper, need_discard_or_freespace_err(), which is now used by both fsck and the runtime checks, and can repair. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 83 ++++++++++++++++++---------------- fs/bcachefs/error.c | 7 +-- fs/bcachefs/error.h | 6 ++- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1f42dd208957..0c044201787f 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -664,17 +664,44 @@ int bch2_alloc_read(struct bch_fs *c) /* Free space/discard btree: */ +static int __need_discard_or_freespace_err(struct btree_trans *trans, + struct bkey_s_c alloc_k, + bool set, bool discard, bool repair) +{ + struct bch_fs *c = trans->c; + enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); + enum bch_sb_error_id err_id = discard + ? BCH_FSCK_ERR_need_discard_key_wrong + : BCH_FSCK_ERR_freespace_key_wrong; + enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, alloc_k); + + int ret = __bch2_fsck_err(NULL, trans, flags, err_id, + "bucket incorrectly %sset in %s btree\n" + " %s", + set ? "" : "un", + bch2_btree_id_str(btree), + buf.buf); + printbuf_exit(&buf); + return ret; +} + +#define need_discard_or_freespace_err(...) \ + fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) + +#define need_discard_or_freespace_err_on(cond, ...) \ + (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) + static int bch2_bucket_do_index(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { - struct bch_fs *c = trans->c; enum btree_id btree; struct bpos pos; - enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; - struct printbuf buf = PRINTBUF; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) @@ -699,26 +726,14 @@ static int bch2_bucket_do_index(struct btree_trans *trans, if (ret) return ret; - if (ca->mi.freespace_initialized && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && - bch2_trans_inconsistent_on(old.k->type != old_type, trans, - "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" - " for %s", - set ? "setting" : "clearing", - bch2_btree_id_str(btree), - iter.pos.inode, - iter.pos.offset, - bch2_bkey_types[old.k->type], - bch2_bkey_types[old_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = -EIO; - goto err; - } + need_discard_or_freespace_err_on(ca->mi.freespace_initialized && + !old.k->type != set, + trans, alloc_k, set, + btree == BTREE_ID_need_discard, false); ret = bch2_btree_bit_mod_iter(trans, &iter, set); -err: +fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } @@ -1116,7 +1131,6 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; @@ -1136,41 +1150,30 @@ int bch2_check_alloc_key(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); - discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) goto err; - if (fsck_err_on(k.k->type != discard_key_type, - trans, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_btree_bit_mod_iter(trans, discard_iter, !!discard_key_type); + bool is_discarded = a->data_type == BCH_DATA_need_discard; + if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, + trans, alloc_k, !is_discarded, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); if (ret) goto err; } - freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; - if (fsck_err_on(k.k->type != freespace_key_type, - trans, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = bch2_btree_bit_mod_iter(trans, freespace_iter, !!freespace_key_type); + bool is_free = a->data_type == BCH_DATA_free; + if (need_discard_or_freespace_err_on(!!k.k->type != is_free, + trans, alloc_k, !is_free, false, true)) { + ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); if (ret) goto err; } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 22b0fa405a39..2960baa023f6 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -256,9 +256,10 @@ int __bch2_fsck_err(struct bch_fs *c, !trans && bch2_current_has_btree_trans(c)); - if ((flags & FSCK_CAN_FIX) && - test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_fix; + if (test_bit(err, c->sb.errors_silent)) + return flags & FSCK_CAN_FIX + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; bch2_sb_error_count(c, err); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 24c41a9994df..8327a3461535 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -103,9 +103,9 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, void bch2_flush_fsck_errs(struct bch_fs *); -#define __fsck_err(c, _flags, _err_type, ...) \ +#define fsck_err_wrap(_do) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \ + int _ret = _do; \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ ret = _ret; \ @@ -115,6 +115,8 @@ void bch2_flush_fsck_errs(struct bch_fs *); _ret == -BCH_ERR_fsck_fix; \ }) +#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) + /* These macros return true if error should be fixed: */ /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ From a7df326af032869e31b0d2a7e3c03190caf3e381 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 26 Oct 2024 23:25:17 -0400 Subject: [PATCH 088/233] bcachefs: discard_one_bucket() now uses need_discard_or_freespace_err() More conversion of inconsistent errors to fsck errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 0c044201787f..e90561b6def6 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1770,11 +1770,13 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; if (a->v.data_type != BCH_DATA_need_discard) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; + if (need_discard_or_freespace_err(trans, k, true, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); + if (ret) + goto out; + goto commit; + } + goto out; } @@ -1814,16 +1816,20 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); alloc_data_type_set(&a->v, a->v.data_type); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto out; +commit: + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; count_event(c, bucket_discard); s->discarded++; out: +fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); s->seen++; From 632bcf38651efbbf9507cf35ae63d6ac291dca24 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Oct 2024 22:12:37 -0400 Subject: [PATCH 089/233] bcachefs: Implement bch2_btree_iter_prev_min() A user contributed a filessytem dump, where the dump was actually corrupted (due to being taken while the filesystem was online), but which exposed an interesting bug in fsck - reconstruct_inode(). When itearting in BTREE_ITER_filter_snapshots mode, it's required to give an end position for the iteration and it can't span inode numbers; continuing into the next inode might mean we start seeing keys from a different snapshot tree, that the is_ancestor() checks always filter, thus we're never able to return a key and stop iterating. Backwards iteration never implemented the end position because nothing else needed it - except for reconstuct_inode(). Additionally, backwards iteration is now able to overlay keys from the journal, which will be useful if we ever decide to start doing journal replay in the background. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 276 +++++++++++++++++++++---------- fs/bcachefs/btree_iter.h | 8 +- fs/bcachefs/btree_journal_iter.c | 46 ++++++ fs/bcachefs/btree_journal_iter.h | 2 + fs/bcachefs/errcode.h | 1 - fs/bcachefs/fsck.c | 4 +- fs/bcachefs/io_misc.c | 2 +- 7 files changed, 244 insertions(+), 95 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 580fee86a965..d66d773a37b4 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && iter->pos.snapshot != iter->snapshot); - BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || - bkey_gt(iter->pos, iter->k.p)); + BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : + !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) : + (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p))); } static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) @@ -2152,6 +2154,37 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, return k; } +static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) +{ + struct btree_path *path = btree_iter_path(trans, iter); + + return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, + &iter->journal_idx); +} + +static noinline +struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *next_journal = + bch2_btree_journal_peek_prev(trans, iter, + k.k ? k.k->p : path_l(path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } + + return k; +} + /* * Checks btree key cache for key at iter->pos and returns it if present, or * bkey_s_c_null: @@ -2457,111 +2490,66 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } -/** - * bch2_btree_iter_peek_prev() - returns first key less than or equal to - * iterator's current position - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). - */ -struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bpos search_key = iter->pos; - struct bkey_s_c k; - struct bkey saved_k; - const struct bch_val *saved_v; - btree_path_idx_t saved_path = 0; - int ret; - - bch2_trans_verify_not_unlocked_or_in_restart(trans); - EBUG_ON(btree_iter_path(trans, iter)->cached || - btree_iter_path(trans, iter)->level); - - if (iter->flags & BTREE_ITER_with_journal) - return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); + struct bkey_s_c k, k2; bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); - - if (iter->flags & BTREE_ITER_filter_snapshots) - search_key.snapshot = U32_MAX; while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); + iter->flags & BTREE_ITER_intent, + btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out_no_locked; + break; } struct btree_path *path = btree_iter_path(trans, iter); + struct btree_path_level *l = path_l(path); - k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); - if (!k.k || - ((iter->flags & BTREE_ITER_is_extents) - ? bpos_ge(bkey_start_pos(k.k), search_key) - : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + break; + } + + btree_path_set_should_be_locked(trans, path); + + k = btree_path_level_peek_all(trans->c, l, &iter->k); + if (!k.k || bpos_gt(k.k->p, search_key)) { + k = btree_path_level_prev(trans, path, l, &iter->k); + + BUG_ON(k.k && bpos_gt(k.k->p, search_key)); + } + + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + if (bkey_err(k2)) { + bch2_btree_iter_set_pos(iter, iter->pos); + break; + } + } + + if (unlikely(iter->flags & BTREE_ITER_with_journal)) + k = btree_trans_peek_prev_journal(trans, iter, k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_prev_updates(trans, iter, &k); - if (likely(k.k)) { - if (iter->flags & BTREE_ITER_filter_snapshots) { - if (k.k->p.snapshot == iter->snapshot) - goto got_key; - - /* - * If we have a saved candidate, and we're no - * longer at the same _key_ (not pos), return - * that candidate - */ - if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { - bch2_path_put_nokeep(trans, iter->path, - iter->flags & BTREE_ITER_intent); - iter->path = saved_path; - saved_path = 0; - iter->k = saved_k; - k.v = saved_v; - goto got_key; - } - - if (bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - if (saved_path) - bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_intent); - saved_path = btree_path_clone(trans, iter->path, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - path = btree_iter_path(trans, iter); - trace_btree_path_save_pos(trans, path, trans->paths + saved_path); - saved_k = *k.k; - saved_v = k.v; - } - - search_key = bpos_predecessor(k.k->p); - continue; - } -got_key: - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_all_snapshots)) { - search_key = bkey_predecessor(iter, k.k->p); - if (iter->flags & BTREE_ITER_filter_snapshots) - search_key.snapshot = U32_MAX; - continue; - } - - btree_path_set_should_be_locked(trans, path); + if (likely(k.k && !bkey_deleted(k.k))) { break; + } else if (k.k) { + search_key = bpos_predecessor(k.k->p); } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ search_key = bpos_predecessor(path->l[0].b->data->min_key); @@ -2569,15 +2557,120 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* Start of btree: */ bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; - goto out_no_locked; + break; } } - EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); + bch2_btree_iter_verify(iter); + return k; +} + +/** + * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys greater than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; + struct bkey_s_c k; + btree_path_idx_t saved_path = 0; + + bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + + if (iter->flags & BTREE_ITER_filter_snapshots) + search_key.snapshot = U32_MAX; + + while (1) { + k = __bch2_btree_iter_peek_prev(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) + goto out_no_locked; + + if (iter->flags & BTREE_ITER_filter_snapshots) { + struct btree_path *s = saved_path ? trans->paths + saved_path : NULL; + if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { + /* + * If we have a saved candidate, and we're past + * the last possible snapshot overwrite, return + * it: + */ + bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_intent); + iter->path = saved_path; + saved_path = 0; + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); + break; + } + + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode that requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + */ + if (unlikely(bkey_lt(k.k->p, end))) + goto end; + + if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { + search_key = bpos_predecessor(k.k->p); + continue; + } + + if (k.k->p.snapshot != iter->snapshot) { + /* + * Have a key visible in iter->snapshot, but + * might have overwrites: - save it and keep + * searching. Unless it's a whiteout - then drop + * our previous saved candidate: + */ + if (saved_path) { + bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_intent); + saved_path = 0; + } + + if (!bkey_whiteout(k.k)) { + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_intent, + _THIS_IP_); + trace_btree_path_save_pos(trans, + trans->paths + iter->path, + trans->paths + saved_path); + } + + search_key = bpos_predecessor(k.k->p); + continue; + } + + if (bkey_whiteout(k.k)) { + search_key = bkey_predecessor(iter, k.k->p); + search_key.snapshot = U32_MAX; + continue; + } + } + + EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) : + iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) : + bkey_gt(k.k->p, iter->pos)); + + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) : + iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) : + bkey_lt(k.k->p, end))) + goto end; + + break; + } /* Extents can straddle iter->pos: */ - if (bkey_lt(k.k->p, iter->pos)) - iter->pos = k.k->p; + iter->pos = bpos_min(iter->pos, k.k->p);; if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; @@ -2587,8 +2680,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; } /** diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index cd9022ce15a5..3477fc8c0396 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -389,7 +389,13 @@ static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) return bch2_btree_iter_peek_max(iter, SPOS_MAX); } -struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); + +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_prev_min(iter, POS_MIN); +} + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index c9dee4b4627a..c44889ef9817 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -107,6 +107,52 @@ struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_ return NULL; } +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; + + BUG_ON(*idx > keys->nr); +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) + return NULL; + + if (k->overwritten) { + --(*idx); + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) >= 0) + return k->k; + + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos) { diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 754939f604d5..fa8c4f82c9c7 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -45,6 +45,8 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 40bf1e5775a9..18c995d41203 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -193,7 +193,6 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ - x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e0335265de3d..e10abd2e6c69 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -620,7 +620,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct btree_iter iter = {}; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); bch2_trans_iter_exit(trans, &iter); int ret = bkey_err(k); if (ret) @@ -1649,7 +1649,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", w->last_pos.inode, i->snapshot, i->count, count2); - return -BCH_ERR_internal_fsck_err; + i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index ff661a072000..524e31e7411b 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -426,7 +426,7 @@ case LOGGED_OP_FINSERT_shift_extents: bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev(&iter) + ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; From 3140d0052a47723243dbd8f5a1f49ebb5eda2e9e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 Oct 2024 20:41:06 -0400 Subject: [PATCH 090/233] bcachefs: peek_prev_min(): Search forwards for extents, snapshots With extents and snapshots, for slightly different reasons, we may have to search forwards to find a key that compares equal to iter->pos (i.e. a key that peek_prev() should return, as it returns keys <= iter->pos). peek_slot() does this, and is an easy way to fix this case. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index d66d773a37b4..ed74f0655d98 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2575,6 +2575,26 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru */ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) { + if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && + !bkey_eq(iter->pos, POS_MAX)) { + /* + * bkey_start_pos(), for extents, is not monotonically + * increasing until after filtering for snapshots: + * + * Thus, for extents we need to search forward until we find a + * real visible extents - easiest to just use peek_slot() (which + * internally uses peek() for extents) + */ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) + return k; + + if (!bkey_deleted(k.k) && + (!(iter->flags & BTREE_ITER_is_extents) || + bkey_lt(bkey_start_pos(k.k), iter->pos))) + return k; + } + struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; @@ -2584,9 +2604,6 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); - if (iter->flags & BTREE_ITER_filter_snapshots) - search_key.snapshot = U32_MAX; - while (1) { k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) From 59fad23c7abcecc8d4022e76050295c2f37c1bfb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 21:28:40 -0500 Subject: [PATCH 091/233] bcachefs: Delete backpointers check in try_alloc_bucket() try_alloc_bucket() has a "safety" check, which avoids allocating a bucket if there's any backpointers present. But backpointers are not the source of truth for live data in a bucket, the bucket sector counts are; this check was fairly useless, and we're also deferring backpointers checks from fsck to runtime in the near future. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 955ea6ae868f..6d665b720f72 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -290,26 +290,6 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (ret) return NULL; - if (unlikely(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers)) { - struct bch_backpointer bp; - struct bpos bp_pos = POS_MIN; - - ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, - &bp_pos, &bp, - BTREE_ITER_nopreserve); - if (ret) - return ERR_PTR(ret); - - if (!bkey_eq(bp_pos, POS_MAX)) { - /* - * Bucket may have data in it - we don't call - * bch2_trans_inconsistent() because fsck hasn't - * finished yet - */ - return NULL; - } - } - return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); } From 7fdfb0cbea34b8dcc319be4b4898d89350a7f40f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 21:53:38 -0500 Subject: [PATCH 092/233] bcachefs: Kill bch2_get_next_backpointer() Since for quite some time backpointers have only been stored in the backpointers btree, not alloc keys (an aborted experiment, support for which has been removed) - we can replace get_next_backpointer() with simple btree iteration. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 125 +++++++++++-------------------------- fs/bcachefs/backpointers.h | 11 ++-- fs/bcachefs/ec.c | 41 +++++------- fs/bcachefs/move.c | 41 ++++++------ 4 files changed, 75 insertions(+), 143 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index f323ce4b0b33..a9ffbea277bd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -215,59 +215,9 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; } -/* - * Find the next backpointer >= *bp_offset: - */ -int bch2_get_next_backpointer(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, int gen, - struct bpos *bp_pos, - struct bch_backpointer *bp, - unsigned iter_flags) -{ - struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); - struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; - struct bkey_s_c k; - int ret = 0; - - if (bpos_ge(*bp_pos, bp_end_pos)) - goto done; - - if (gen >= 0) { - k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_cached|iter_flags); - ret = bkey_err(k); - if (ret) - goto out; - - if (k.k->type != KEY_TYPE_alloc_v4 || - bkey_s_c_to_alloc_v4(k).v->gen != gen) - goto done; - } - - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); - - for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, - *bp_pos, iter_flags, k, ret) { - if (bpos_ge(k.k->p, bp_end_pos)) - break; - - *bp_pos = k.k->p; - *bp = *bkey_s_c_to_backpointer(k).v; - goto out; - } -done: - *bp_pos = SPOS_MAX; -out: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - -static void backpointer_not_found(struct btree_trans *trans, - struct bpos bp_pos, - struct bch_backpointer bp, - struct bkey_s_c k) +static void backpointer_target_not_found(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_s_c target_k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -281,22 +231,22 @@ static void backpointer_not_found(struct btree_trans *trans, return; struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) return; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", - bp.level ? "btree node" : "extent"); + bp.v->level ? "btree node" : "extent"); prt_printf(&buf, "bucket: "); bch2_bpos_to_text(&buf, bucket); prt_printf(&buf, "\n "); prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp_pos); + bch2_bpos_to_text(&buf, bp.k->p); prt_printf(&buf, "\n "); - bch2_backpointer_to_text(&buf, &bp); + bch2_backpointer_to_text(&buf, bp.v); prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); + bch2_bkey_val_to_text(&buf, c, target_k); if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) bch_err_ratelimited(c, "%s", buf.buf); else @@ -306,21 +256,20 @@ static void backpointer_not_found(struct btree_trans *trans, } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp, unsigned iter_flags) { - if (likely(!bp.level)) { + if (likely(!bp.v->level)) { struct bch_fs *c = trans->c; struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) return bkey_s_c_err(-EIO); bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, 0, iter_flags); struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); @@ -329,14 +278,15 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return k; } - if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bucket, *bp.v)) return k; bch2_trans_iter_exit(trans, iter); - backpointer_not_found(trans, bp_pos, bp, k); + backpointer_target_not_found(trans, bp, k); return bkey_s_c_null; } else { - struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + struct btree *b = bch2_backpointer_get_node(trans, bp, iter); if (IS_ERR_OR_NULL(b)) { bch2_trans_iter_exit(trans, iter); @@ -347,39 +297,38 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp) + struct bkey_s_c_backpointer bp, + struct btree_iter *iter) { struct bch_fs *c = trans->c; - BUG_ON(!bp.level); + BUG_ON(!bp.v->level); struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) + if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) return ERR_PTR(-EIO); bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, - bp.level - 1, + bp.v->level - 1, 0); struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; - BUG_ON(b->c.level != bp.level - 1); + BUG_ON(b->c.level != bp.v->level - 1); - if (extent_matches_bp(c, bp.btree_id, bp.level, + if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, bkey_i_to_s_c(&b->key), - bucket, bp)) + bucket, *bp.v)) return b; if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); + backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key)); b = NULL; } err: @@ -581,10 +530,10 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer) goto missing; - struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; + struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -603,7 +552,7 @@ static int check_bp_exists(struct btree_trans *trans, bch_err(c, "%s", buf.buf); if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); + ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bucket.inode); if (ret) goto err; goto out; @@ -615,7 +564,7 @@ static int check_bp_exists(struct btree_trans *trans, } } - ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); if (ret < 0) goto err; if (ret) { @@ -623,7 +572,7 @@ static int check_bp_exists(struct btree_trans *trans, goto missing; } - ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); + ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.v->btree_id, other_extent, bucket.inode); if (ret < 0) goto err; if (ret) { @@ -964,18 +913,16 @@ static int check_one_backpointer(struct btree_trans *trans, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); struct bch_fs *c = trans->c; - struct btree_iter iter; struct bbpos pos = bp_to_bbpos(*bp.v); - struct bkey_s_c k; struct printbuf buf = PRINTBUF; - int ret; if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0); + int ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 3b29fdf519dd..74c96aee713e 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -165,13 +165,10 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); } -int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, - struct bpos *, struct bch_backpointer *, unsigned); -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer, - unsigned); -struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *, unsigned); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d6560bccd87c..aa8ada4f0ec0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1276,11 +1276,10 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - struct bpos *bp_pos) + struct bkey_s_c_backpointer bp) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; - struct bch_backpointer bp; struct btree_iter iter; struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; @@ -1289,33 +1288,26 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, ca, bucket, gen, - bp_pos, &bp, BTREE_ITER_cached); - if (ret) - return ret; - if (bpos_eq(*bp_pos, SPOS_MAX)) - return 0; - - if (bp.level) { + if (bp.v->level) { struct printbuf buf = PRINTBUF; struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &node_iter); bch2_trans_iter_exit(trans, &node_iter); if (!b) return 0; prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_backpointer_to_text(&buf, &bp); + bch2_backpointer_to_text(&buf, bp.v); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); + k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent); ret = bkey_err(k); if (ret) return ret; @@ -1374,7 +1366,6 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_extent_ptr ptr = v->ptrs[block]; - struct bpos bp_pos = POS_MIN; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); @@ -1383,18 +1374,20 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - while (1) { - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); - if (ret) - break; - if (bkey_eq(bp_pos, POS_MAX)) + ret = for_each_btree_key_commit(trans, bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(ca, bucket_pos, 0), 0, bp_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ({ + if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) break; - bp_pos = bpos_nosnap_successor(bp_pos); - } + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, + bkey_s_c_to_backpointer(bp_k)); + })); bch2_dev_put(ca); return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index a6b503278519..88ab9d7e1a1b 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -670,16 +670,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter; + struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; - struct bch_backpointer bp; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct bkey_s_c k; struct data_update_opts data_opts; unsigned dirty_sectors, bucket_size; u64 fragmentation; - struct bpos bp_pos = POS_MIN; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); @@ -695,21 +691,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_cached); - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(ca, bucket, 0), 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - dirty_sectors = bch2_bucket_sectors_dirty(*a); - bucket_size = ca->mi.bucket_size; - fragmentation = alloc_lru_idx_fragmentation(*a, ca); - ret = bch2_btree_write_buffer_tryflush(trans); bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) @@ -721,18 +709,24 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, ca, bucket, gen, - &bp_pos, &bp, - BTREE_ITER_cached); + k = bch2_btree_iter_peek(&bp_iter); + ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (bkey_eq(bp_pos, POS_MAX)) + + if (!k.k || + bkey_ge(k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0))) break; - if (!bp.level) { - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); + if (k.k->type != KEY_TYPE_backpointer) + goto next; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + if (!bp.v->level) { + k = bch2_backpointer_get_key(trans, bp, &iter, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -785,7 +779,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } else { struct btree *b; - b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) continue; @@ -814,11 +808,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } } next: - bp_pos = bpos_nosnap_successor(bp_pos); + bch2_btree_iter_advance(&bp_iter); } trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); err: + bch2_trans_iter_exit(trans, &bp_iter); bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); return ret; From d5b149f3108a40e2bc88e8fcd9bc5d70096fa6c3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 03:31:01 -0500 Subject: [PATCH 093/233] bcachefs: add missing BTREE_ITER_intent this fixes excessive transaction restarts due to trans_commit having to upgrade Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f11e11279f01..f97ebb30f6c0 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -216,6 +216,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), + BTREE_ITER_intent| BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) From 3c0fc088af9edef54fb6fb410f928df0268a7f63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Nov 2024 21:03:53 -0500 Subject: [PATCH 094/233] bcachefs: compression workspaces should be indexed by opt, not type type includes lz4 and lz4_old, which do not get different compression workspaces, and incompressible, a fake type - BCH_COMPRESSION_OPTS() is the correct enum to use. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/compress.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c59a58b93a92..60ad547c52a8 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -982,7 +982,7 @@ struct bch_fs { struct rhashtable promote_table; mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; + mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; mempool_t decompress_workspace; size_t zstd_workspace_size; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 1410365a8891..4f541a195c84 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -394,8 +394,11 @@ static unsigned __bio_compress(struct bch_fs *c, unsigned pad; int ret = 0; - BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); - BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + /* bch2_compression_decode catches unknown compression types: */ + BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); + + mempool_t *workspace_pool = &c->compress_workspace[compression.type]; + BUG_ON(!mempool_initialized(workspace_pool)); /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) @@ -404,7 +407,7 @@ static unsigned __bio_compress(struct bch_fs *c, dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); *src_len = src->bi_iter.bi_size; *dst_len = dst->bi_iter.bi_size; @@ -447,7 +450,7 @@ static unsigned __bio_compress(struct bch_fs *c, *src_len = round_down(*src_len, block_bytes(c)); } - mempool_free(workspace, &c->compress_workspace[compression_type]); + mempool_free(workspace, workspace_pool); if (ret) goto err; @@ -576,17 +579,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) struct { unsigned feature; - enum bch_compression_type type; + enum bch_compression_opts type; size_t compress_workspace; size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, + { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), 0 }, - { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, + { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, - { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, + { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, c->zstd_workspace_size, zstd_dctx_workspace_bound() }, }, *i; From 3a1897837a020cf57b2fa9ceb69f488762e89255 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 00:52:20 -0500 Subject: [PATCH 095/233] bcachefs: Don't use a shared decompress workspace mempool gzip and zstd require different decompress workspace sizes, and if we start with one and then start using the other at runtime we may not get the correct size Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/compress.c | 52 +++++++++++++++++++++++++----------------- fs/bcachefs/errcode.h | 1 - 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 60ad547c52a8..7a947d43d504 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -983,7 +983,6 @@ struct bch_fs { mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; - mempool_t decompress_workspace; size_t zstd_workspace_size; struct crypto_shash *sha256; diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 4f541a195c84..2813e4556f0d 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -9,6 +9,24 @@ #include #include +static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) +{ + switch (type) { + case BCH_COMPRESSION_TYPE_none: + case BCH_COMPRESSION_TYPE_incompressible: + return BCH_COMPRESSION_OPT_none; + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: + return BCH_COMPRESSION_OPT_lz4; + case BCH_COMPRESSION_TYPE_gzip: + return BCH_COMPRESSION_OPT_gzip; + case BCH_COMPRESSION_TYPE_zstd: + return BCH_COMPRESSION_OPT_zstd; + default: + BUG(); + } +} + /* Bounce buffer: */ struct bbuf { void *b; @@ -158,6 +176,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, void *workspace; int ret; + enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); + mempool_t *workspace_pool = &c->compress_workspace[opt]; + BUG_ON(!mempool_initialized(workspace_pool)); + src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { @@ -176,13 +198,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, .avail_out = dst_len, }; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); ret = zlib_inflate(&strm, Z_FINISH); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != Z_STREAM_END) goto err; @@ -195,14 +217,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, if (real_src_len > src_len - 4) goto err; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ret = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != dst_len) goto err; @@ -562,7 +584,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) { unsigned i; - mempool_exit(&c->decompress_workspace); for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) mempool_exit(&c->compress_workspace[i]); mempool_exit(&c->compression_bounce[WRITE]); @@ -571,7 +592,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { - size_t decompress_workspace_size = 0; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); @@ -581,17 +601,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) unsigned feature; enum bch_compression_opts type; size_t compress_workspace; - size_t decompress_workspace; } compression_types[] = { { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), - 0 }, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, - zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), - zlib_inflate_workspacesize(), }, + max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize()) }, { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, - c->zstd_workspace_size, - zstd_dctx_workspace_bound() }, + max(c->zstd_workspace_size, + zstd_dctx_workspace_bound()) }, }, *i; bool have_compressed = false; @@ -616,9 +634,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) { - decompress_workspace_size = - max(decompress_workspace_size, i->decompress_workspace); - if (!(features & (1 << i->feature))) continue; @@ -631,11 +646,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return -BCH_ERR_ENOMEM_compression_workspace_init; } - if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) - return -BCH_ERR_ENOMEM_decompression_workspace_init; - return 0; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 18c995d41203..3affdafc2c04 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,7 +54,6 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ - x(ENOMEM, ENOMEM_decompression_workspace_init) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ x(ENOMEM, ENOMEM_usage_init) \ From b287adb628223810c78703e6bcad624944dde679 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 23:03:40 -0500 Subject: [PATCH 096/233] bcachefs: Don't BUG_ON() when superblock feature wasn't set for compressed data We don't allocate the mempools for compression/decompression unless we need them - but that means there's an inconsistency to check for. Reported-by: syzbot+cb3fbcfb417448cfd278@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 29 +++++++++++++++++++++++++++-- fs/bcachefs/errcode.h | 1 + fs/bcachefs/opts.c | 2 +- fs/bcachefs/opts.h | 1 + fs/bcachefs/sb-errors_format.h | 4 +++- 5 files changed, 33 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 2813e4556f0d..f99ff1819597 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -2,7 +2,9 @@ #include "bcachefs.h" #include "checksum.h" #include "compress.h" +#include "error.h" #include "extents.h" +#include "opts.h" #include "super-io.h" #include @@ -178,7 +180,16 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; - BUG_ON(!mempool_initialized(workspace_pool)); + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_type_not_marked_in_sb, + "compression type %s set but not marked in superblock", + __bch2_compression_types[crc.compression_type])) + ret = bch2_check_set_has_compressed_data(c, opt); + else + ret = -BCH_ERR_compression_workspace_not_initialized; + if (ret) + goto out; + } src_data = bio_map_or_bounce(c, src, READ); @@ -234,6 +245,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, BUG(); } ret = 0; +fsck_err: out: bio_unmap_or_unbounce(c, src_data); return ret; @@ -420,7 +432,17 @@ static unsigned __bio_compress(struct bch_fs *c, BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); mempool_t *workspace_pool = &c->compress_workspace[compression.type]; - BUG_ON(!mempool_initialized(workspace_pool)); + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_opt_not_marked_in_sb, + "compression opt %s set but not marked in superblock", + bch2_compression_opts[compression.type])) { + ret = bch2_check_set_has_compressed_data(c, compression.type); + if (ret) /* memory allocation failure, don't compress */ + return 0; + } else { + return 0; + } + } /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) @@ -502,6 +524,9 @@ static unsigned __bio_compress(struct bch_fs *c, err: ret = BCH_COMPRESSION_TYPE_incompressible; goto out; +fsck_err: + ret = 0; + goto out; } unsigned bch2_bio_compress(struct bch_fs *c, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 3affdafc2c04..2dda7f962e5b 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,6 +54,7 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(EIO, compression_workspace_not_initialized) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ x(ENOMEM, ENOMEM_usage_init) \ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 0ba58d74c21f..6772faf385a5 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -54,7 +54,7 @@ const char * const __bch2_csum_opts[] = { NULL }; -static const char * const __bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 6b29339ea725..ea69099e681d 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -17,6 +17,7 @@ extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; extern const char * const __bch2_csum_opts[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index f2b38493356d..d5b18ff1645c 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -305,7 +305,9 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ - x(MAX, 295, 0) + x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ + x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ + x(MAX, 297, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From ed144047ef65601342eb7a821a8648b19d6b44a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Nov 2024 23:54:19 -0500 Subject: [PATCH 097/233] bcachefs: kill bch2_journal_entries_free() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 17 ++++++----------- fs/bcachefs/btree_journal_iter.h | 2 -- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index c44889ef9817..39898baa8854 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -527,16 +527,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, /* sort and dedup all keys in the journal: */ -void bch2_journal_entries_free(struct bch_fs *c) -{ - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - kvfree(*i); - genradix_free(&c->journal_entries); -} - /* * When keys compare equal, oldest compares first: */ @@ -569,7 +559,12 @@ void bch2_journal_keys_put(struct bch_fs *c) keys->data = NULL; keys->nr = keys->gap = keys->size = 0; - bch2_journal_entries_free(c); + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + kvfree(*i); + genradix_free(&c->journal_entries); } static void __journal_keys_sort(struct journal_keys *keys) diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index fa8c4f82c9c7..5ddbb7571770 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -81,8 +81,6 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c) c->journal_keys.initial_ref_held = false; } -void bch2_journal_entries_free(struct bch_fs *); - int bch2_journal_keys_sort(struct bch_fs *); void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, From 1d1374a0837b8ba85c6ef9bf48efe52bf975cd51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 14:20:35 -0500 Subject: [PATCH 098/233] bcachefs: journal keys: sort keys for interior nodes first There's an unavoidable issue with btree lookups when we're overlaying journal keys and the journal has many deletions for keys present in the btree - peek operations will have to iterate over all those deletions to find the next live key to return. This is mainly a problem for lookups in interior nodes, if we have to traverse to a leaf. Looking up an insert position in a leaf (for journal replay) doesn't have to find the next live key, but walking down the btree does. So to ameloriate this, change journal key sort ordering so that we replay keys from roots and interior nodes first. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 10 ++++------ fs/bcachefs/btree_journal_iter.h | 13 ++++++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 39898baa8854..dbc9bc233cca 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -172,9 +172,8 @@ static void journal_iter_verify(struct journal_iter *iter) if (iter->idx < keys->size) { struct journal_key *k = keys->data + iter->idx; - int cmp = cmp_int(k->btree_id, iter->btree_id) ?: - cmp_int(k->level, iter->level); - BUG_ON(cmp < 0); + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + BUG_ON(cmp > 0); } } @@ -365,9 +364,8 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; - int cmp = cmp_int(k->btree_id, iter->btree_id) ?: - cmp_int(k->level, iter->level); - if (cmp > 0) + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + if (cmp < 0) break; BUG_ON(cmp); diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 5ddbb7571770..118ada4cdd1b 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -28,14 +28,21 @@ struct btree_and_journal_iter { bool prefetch; }; +static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, + unsigned l_level, + const struct journal_key *r) +{ + return -cmp_int(l_level, r->level) ?: + cmp_int(l_btree_id, r->btree_id); +} + static inline int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, const struct journal_key *r) { - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); + return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: + bpos_cmp(l_pos, r->k->k.p); } static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) From 1a8f5adc2028bd7a11a96f85abae6a0e051c7ba4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 14:39:46 -0500 Subject: [PATCH 099/233] bcachefs: btree_and_journal_iter: don't iterate over too many whiteouts when prefetching To help ameloriate issues with peek operations having to skip over deletions in the journal - just bail out if all we're doing is prefetching btree nodes. Since btree node prefetching runs every time we iterate to a new node, and has to sequentially scan ahead, this avoids another O(n^2). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 ++ fs/bcachefs/btree_journal_iter.c | 7 +++++++ fs/bcachefs/btree_journal_iter.h | 1 + 3 files changed, 10 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index ed74f0655d98..89f9665ce70d 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -825,6 +825,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p bch2_bkey_buf_init(&tmp); + jiter->fail_if_too_many_whiteouts = true; + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index dbc9bc233cca..cc7f5fad90c6 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -426,6 +426,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter : (level > 1 ? 1 : 16); iter.prefetch = false; + iter.fail_if_too_many_whiteouts = true; bch2_bkey_buf_init(&tmp); while (nr--) { @@ -444,6 +445,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; + size_t iters = 0; if (iter->prefetch && iter->journal.level) btree_and_journal_iter_prefetch(iter); @@ -451,6 +453,11 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * if (iter->at_end) return bkey_s_c_null; + iters++; + + if (iters > 20 && iter->fail_if_too_many_whiteouts) + return bkey_s_c_null; + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 118ada4cdd1b..9e8f8ab1c6ff 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -26,6 +26,7 @@ struct btree_and_journal_iter { struct bpos pos; bool at_end; bool prefetch; + bool fail_if_too_many_whiteouts; }; static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, From 92084feca4fd9d534b7d1d9e1425faeeaf91c3fa Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 02:23:24 -0500 Subject: [PATCH 100/233] bcachefs: fix O(n^2) issue with whiteouts in journal keys The journal_keys array can't be substantially modified after we go RW, because lookups need to be able to check it locklessly - thus we're limited on what we can do when a key in the journal has been overwritten. This is a problem when there's many overwrites to skip over for peek() operations. To fix this, add tracking of ranges of overwrites: we create a range entry when there's more than one contiguous whiteout. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 23 +--- fs/bcachefs/btree_journal_iter.c | 156 ++++++++++++++++++++++--- fs/bcachefs/btree_journal_iter.h | 2 + fs/bcachefs/btree_journal_iter_types.h | 36 ++++++ fs/bcachefs/super.c | 3 +- 5 files changed, 179 insertions(+), 41 deletions(-) create mode 100644 fs/bcachefs/btree_journal_iter_types.h diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7a947d43d504..11f9ed42a9da 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -205,6 +205,7 @@ #include #include "bcachefs_format.h" +#include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" @@ -658,28 +659,6 @@ struct journal_seq_blacklist_table { } entries[]; }; -struct journal_keys { - /* must match layout in darray_types.h */ - size_t nr, size; - struct journal_key { - u64 journal_seq; - u32 journal_offset; - enum btree_id btree_id:8; - unsigned level:8; - bool allocated; - bool overwritten; - struct bkey_i *k; - } *data; - /* - * Gap buffer: instead of all the empty space in the array being at the - * end of the buffer - from @nr to @size - the empty space is at @gap. - * This means that sequential insertions are O(n) instead of O(n^2). - */ - size_t gap; - atomic_t ref; - bool initial_ref_held; -}; - struct btree_trans_buf { struct btree_trans *trans; }; diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index cc7f5fad90c6..de3db161d6ab 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -16,6 +16,17 @@ * operations for the regular btree iter code to use: */ +static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) +{ + size_t gap_size = keys->size - keys->nr; + + BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); + + if (pos >= keys->gap) + pos -= gap_size; + return pos; +} + static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) { size_t gap_size = keys->size - keys->nr; @@ -84,27 +95,37 @@ struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_ } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; + break; if (k->overwritten) { - (*idx)++; + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->end; + else + *idx += 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { + ret = k->k; + break; + } (*idx)++; iters++; if (iters == 10) { *idx = 0; + rcu_read_unlock(); goto search; } } - return NULL; + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, @@ -130,17 +151,25 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) - return NULL; + break; if (k->overwritten) { - --(*idx); + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->start - 1; + else + *idx -= 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) >= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { + ret = k->k; + break; + } --(*idx); iters++; @@ -150,7 +179,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b } } - return NULL; + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, @@ -163,6 +193,7 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree static void journal_iter_verify(struct journal_iter *iter) { +#ifdef CONFIG_BCACHEFS_DEBUG struct journal_keys *keys = iter->keys; size_t gap_size = keys->size - keys->nr; @@ -175,6 +206,7 @@ static void journal_iter_verify(struct journal_iter *iter) int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); BUG_ON(cmp > 0); } +#endif } static void journal_iters_fix(struct bch_fs *c) @@ -335,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, bkey_deleted(&keys->data[idx].k->k)); } +static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) +{ + struct journal_key *k = keys->data + pos; + size_t idx = pos_to_idx(keys, pos); + + k->overwritten = true; + + struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; + struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; + + bool prev_overwritten = prev && prev->overwritten; + bool next_overwritten = next && next->overwritten; + + struct journal_key_range_overwritten *prev_range = + prev_overwritten ? prev->overwritten_range : NULL; + struct journal_key_range_overwritten *next_range = + next_overwritten ? next->overwritten_range : NULL; + + BUG_ON(prev_range && prev_range->end != idx); + BUG_ON(next_range && next_range->start != idx + 1); + + if (prev_range && next_range) { + prev_range->end = next_range->end; + + keys->data[pos].overwritten_range = prev_range; + for (size_t i = next_range->start; i < next_range->end; i++) { + struct journal_key *ip = keys->data + idx_to_pos(keys, i); + BUG_ON(ip->overwritten_range != next_range); + ip->overwritten_range = prev_range; + } + + kfree_rcu_mightsleep(next_range); + } else if (prev_range) { + prev_range->end++; + k->overwritten_range = prev_range; + if (next_overwritten) { + prev_range->end++; + next->overwritten_range = prev_range; + } + } else if (next_range) { + next_range->start--; + k->overwritten_range = next_range; + if (prev_overwritten) { + next_range->start--; + prev->overwritten_range = next_range; + } + } else if (prev_overwritten || next_overwritten) { + struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return; + + r->start = idx - (size_t) prev_overwritten; + r->end = idx + 1 + (size_t) next_overwritten; + + rcu_assign_pointer(k->overwritten_range, r); + if (prev_overwritten) + prev->overwritten_range = r; + if (next_overwritten) + next->overwritten_range = r; + } +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { @@ -344,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, if (idx < keys->size && keys->data[idx].btree_id == btree && keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos)) - keys->data[idx].overwritten = true; + bpos_eq(keys->data[idx].k->k.p, pos) && + !keys->data[idx].overwritten) { + mutex_lock(&keys->overwrite_lock); + __bch2_journal_key_overwritten(keys, idx); + mutex_unlock(&keys->overwrite_lock); + } } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -359,8 +457,11 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { + struct bkey_s_c ret = bkey_s_c_null; + journal_iter_verify(iter); + rcu_read_lock(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; @@ -369,13 +470,19 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) break; BUG_ON(cmp); - if (!k->overwritten) - return bkey_i_to_s_c(k->k); + if (!k->overwritten) { + ret = bkey_i_to_s_c(k->k); + break; + } - bch2_journal_iter_advance(iter); + if (k->overwritten_range) + iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); + else + bch2_journal_iter_advance(iter); } + rcu_read_unlock(); - return bkey_s_c_null; + return ret; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -556,9 +663,15 @@ void bch2_journal_keys_put(struct bch_fs *c) move_gap(keys, keys->nr); - darray_for_each(*keys, i) + darray_for_each(*keys, i) { + if (i->overwritten_range && + (i == &darray_last(*keys) || + i->overwritten_range != i[1].overwritten_range)) + kfree(i->overwritten_range); + if (i->allocated) kfree(i->k); + } kvfree(keys->data); keys->data = NULL; @@ -682,3 +795,12 @@ void bch2_journal_keys_dump(struct bch_fs *c) } printbuf_exit(&buf); } + +void bch2_fs_journal_keys_init(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + atomic_set(&keys->ref, 1); + keys->initial_ref_held = true; + mutex_init(&keys->overwrite_lock); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 9e8f8ab1c6ff..2a3082919b8d 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -97,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, void bch2_journal_keys_dump(struct bch_fs *); +void bch2_fs_journal_keys_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h new file mode 100644 index 000000000000..8b773823704f --- /dev/null +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H + +struct journal_key_range_overwritten { + size_t start, end; +}; + +struct journal_key { + u64 journal_seq; + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; + bool overwritten; + struct journal_key_range_overwritten __rcu * + overwritten_range; + struct bkey_i *k; +}; + +struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; + struct journal_key *data; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). + */ + size_t gap; + atomic_t ref; + bool initial_ref_held; + struct mutex overwrite_lock; +}; + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 37eee352fa21..08170a3d524f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -773,8 +773,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - atomic_set(&c->journal_keys.ref, 1); - c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -784,6 +782,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); + bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); From 16de1298962eb607d154962cbf2b6ce6bdbd5f8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Dec 2024 06:18:49 -0500 Subject: [PATCH 101/233] bcachefs: Fix evacuate_bucket tracepoint 86a494c8eef9 ("bcachefs: Kill bch2_get_next_backpointer()") dropped some things the tracepoint emitted because bch2_evacuate_bucket() no longer looks at the alloc key - but we did want at least some of that. We still no longer look at the alloc key so we can't report on the fragmentation number, but that's a direct function of dirty_sectors and a copygc concern anyways - copygc should get its own tracepoint that includes information from the fragmentation LRU. But we can report on the number of sectors we moved and the bucket size. Co-developed-by: Piotr Zalewski Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 21 +++++++++++++-------- fs/bcachefs/trace.h | 10 ++++------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 88ab9d7e1a1b..74839268d6ab 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -674,8 +674,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_buf sk; struct bkey_s_c k; struct data_update_opts data_opts; - unsigned dirty_sectors, bucket_size; - u64 fragmentation; + unsigned sectors_moved = 0; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); @@ -748,14 +747,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; + unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ unsigned i = 0; - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == bucket.inode) { - data_opts.rewrite_ptrs |= 1U << i; - if (ptr->cached) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + if (p.ptr.dev == bucket.inode) { + if (p.ptr.cached) { bch2_trans_iter_exit(trans, &iter); goto next; } + data_opts.rewrite_ptrs |= 1U << i; + break; } i++; } @@ -775,7 +778,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + sectors_moved += sectors; } else { struct btree *b; @@ -806,12 +810,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, atomic64_add(sectors, &ctxt->stats->sectors_seen); atomic64_add(sectors, &ctxt->stats->sectors_moved); } + sectors_moved += btree_sectors(c); } next: bch2_btree_iter_advance(&bp_iter); } - trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); + trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: bch2_trans_iter_exit(trans, &bp_iter); bch2_dev_put(ca); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5597b9d6297f..2d5932d2881e 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -848,8 +848,8 @@ TRACE_EVENT(move_data, TRACE_EVENT(evacuate_bucket, TP_PROTO(struct bch_fs *c, struct bpos *bucket, unsigned sectors, unsigned bucket_size, - u64 fragmentation, int ret), - TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), + int ret), + TP_ARGS(c, bucket, sectors, bucket_size, ret), TP_STRUCT__entry( __field(dev_t, dev ) @@ -857,7 +857,6 @@ TRACE_EVENT(evacuate_bucket, __field(u64, bucket ) __field(u32, sectors ) __field(u32, bucket_size ) - __field(u64, fragmentation ) __field(int, ret ) ), @@ -867,15 +866,14 @@ TRACE_EVENT(evacuate_bucket, __entry->bucket = bucket->offset; __entry->sectors = sectors; __entry->bucket_size = bucket_size; - __entry->fragmentation = fragmentation; __entry->ret = ret; ), - TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", + TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->member, __entry->bucket, __entry->sectors, __entry->bucket_size, - __entry->fragmentation, __entry->ret) + __entry->ret) ); TRACE_EVENT(copygc, From 62b185571a65ab3088546fbe1f40aeb085bc7267 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 22:49:40 -0500 Subject: [PATCH 102/233] bcachefs: fix bp_pos_to_bucket_nodev_noerror _noerror means don't produce inconsistent errors, so it should be using bch2_dev_rcu_noerror(). Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 74c96aee713e..eda3a78a5e2b 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -46,7 +46,7 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); rcu_read_unlock(); From 3f2e467845296b24eb0e0d026e1f551ef73f0896 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Nov 2024 00:16:52 -0500 Subject: [PATCH 103/233] bcachefs: check for backpointers to invalid device Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 4 ++++ fs/bcachefs/sb-errors_format.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index a9ffbea277bd..1c7ddaed6c1c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -59,6 +59,10 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, "backpointer level bad: %u >= %u", bp.v->level, BTREE_MAX_DEPTH); + bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, + c, backpointer_dev_bad, + "backpointer for BCH_SB_MEMBER_INVALID"); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (!ca) { diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d5b18ff1645c..9e3425f533bc 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -138,6 +138,7 @@ enum bch_fsck_flags { x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ + x(backpointer_dev_bad, 297, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ x(backpointer_to_missing_ptr, 128, 0) \ @@ -307,7 +308,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 297, 0) + x(MAX, 298, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From bbc2ccccfd24a9da72cdcd9b26568883250ff7a4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 16:30:30 -0500 Subject: [PATCH 104/233] bcachefs: bucket_pos_to_bp_end() Better helpers for iterating over backpointers within a specific bucket Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.h | 10 ++++++++++ fs/bcachefs/ec.c | 5 +++-- fs/bcachefs/move.c | 5 ++--- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index eda3a78a5e2b..595db7960939 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -80,6 +80,16 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, return ret; } +static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket) +{ + return bucket_pos_to_bp(ca, bucket, 0); +} + +static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) +{ + return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); +} + int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index aa8ada4f0ec0..a4a2555d7c4f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1374,8 +1374,9 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - ret = for_each_btree_key_commit(trans, bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(ca, bucket_pos, 0), 0, bp_k, + ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket_pos), + bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, NULL, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc, ({ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 74839268d6ab..460175464762 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -691,7 +691,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(ca, bucket, 0), 0); + bucket_pos_to_bp_start(ca, bucket), 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) @@ -715,8 +715,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, if (ret) goto err; - if (!k.k || - bkey_ge(k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0))) + if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) break; if (k.k->type != KEY_TYPE_backpointer) From 165ca83f5581716dad0494addfffd2360fa52445 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 17:45:44 -0500 Subject: [PATCH 105/233] bcachefs: Drop swab code for backpointers in alloc keys Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e90561b6def6..ae9fdb5ad758 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -322,7 +322,6 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - struct bch_backpointer *bp, *bps; a->journal_seq = swab64(a->journal_seq); a->flags = swab32(a->flags); @@ -333,13 +332,6 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); a->stripe_sectors = swab32(a->stripe_sectors); - - bps = alloc_v4_backpointers(a); - for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { - bp->bucket_offset = swab40(bp->bucket_offset); - bp->bucket_len = swab32(bp->bucket_len); - bch2_bpos_swab(&bp->pos); - } } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) From ad5834890f182b4f9ccf169c1b469dd0e9b9a135 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 17:36:09 -0500 Subject: [PATCH 106/233] bcachefs: bch_backpointer -> bkey_i_backpointer Since we no longer store backpointers in alloc keys, there's no reason not to pass around bkey_i_backpointers; this means we don't have to pass the bucket pos separately. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 215 +++++++++++++++---------------------- fs/bcachefs/backpointers.h | 51 ++++----- fs/bcachefs/buckets.c | 8 +- fs/bcachefs/ec.c | 2 +- 4 files changed, 111 insertions(+), 165 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 1c7ddaed6c1c..24804f0bb2fd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -14,40 +14,6 @@ #include -static bool extent_matches_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct bpos bucket, - struct bch_backpointer bp) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - rcu_read_lock(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket2; - struct bch_backpointer bp2; - - if (p.ptr.cached) - continue; - - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (!ca) - continue; - - bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); - if (bpos_eq(bucket, bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) { - rcu_read_unlock(); - return true; - } - } - rcu_read_unlock(); - - return false; -} - int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, enum bch_validate_flags flags) { @@ -78,23 +44,15 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || !bpos_eq(bp.k->p, bp_pos), c, backpointer_bucket_offset_wrong, - "backpointer bucket_offset wrong"); + "backpointer bucket_offset wrong (%llu)", (u64) bp.v->bucket_offset); fsck_err: return ret; } -void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - bch2_btree_id_level_to_text(out, bp->btree_id, bp->level); - prt_printf(out, " offset=%llu:%u len=%u pos=", - (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp->bucket_len); - bch2_bpos_to_text(out, bp->pos); -} + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); -void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); if (ca) { @@ -107,7 +65,12 @@ void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct b rcu_read_unlock(); } - bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); + bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_printf(out, " offset=%llu:%u len=%u pos=", + (u64) (bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp.v->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp.v->bucket_len); + bch2_bpos_to_text(out, bp.v->pos); } void bch2_backpointer_swab(struct bkey_s k) @@ -119,10 +82,43 @@ void bch2_backpointer_swab(struct bkey_s k) bch2_bpos_swab(&bp.v->pos); } +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bkey_s_c_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + rcu_read_lock(); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket2; + struct bkey_i_backpointer bp2; + + if (p.ptr.cached) + continue; + + struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (!ca) + continue; + + bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); + if (bpos_eq(bp.k->p, bp2.k.p) && + !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + + return false; +} + static noinline int backpointer_mod_err(struct btree_trans *trans, - struct bch_backpointer bp, - struct bkey_s_c bp_k, struct bkey_s_c orig_k, + struct bkey_i_backpointer *new_bp, + struct bkey_s_c found_bp, bool insert) { struct bch_fs *c = trans->c; @@ -130,12 +126,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, if (insert) { prt_printf(&buf, "existing backpointer found when inserting "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); printbuf_indent_add(&buf, 2); prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -147,11 +143,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -170,50 +166,35 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { struct btree_iter bp_iter; - struct bkey_s_c k; - struct bkey_i_backpointer *bp_k; - int ret; - - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; - - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - bp_k->v = bp; - - if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); - } - - k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp_k->k.p, + struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp->k.p, BTREE_ITER_intent| BTREE_ITER_slots| BTREE_ITER_with_updates); - ret = bkey_err(k); + int ret = bkey_err(k); if (ret) - goto err; + return ret; if (insert ? k.k->type : (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { - ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { + ret = backpointer_mod_err(trans, orig_k, bp, k, insert); if (ret) goto err; } - ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); + if (!insert) { + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); + } + + ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); err: bch2_trans_iter_exit(trans, &bp_iter); return ret; @@ -234,22 +215,11 @@ static void backpointer_target_not_found(struct btree_trans *trans, if (likely(!bch2_backpointers_no_use_write_buffer)) return; - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) - return; - prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.v->level ? "btree node" : "extent"); - prt_printf(&buf, "bucket: "); - bch2_bpos_to_text(&buf, bucket); + bch2_bkey_val_to_text(&buf, c, bp.s_c); prt_printf(&buf, "\n "); - prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp.k->p); - prt_printf(&buf, "\n "); - - bch2_backpointer_to_text(&buf, bp.v); - prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, target_k); if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) bch_err_ratelimited(c, "%s", buf.buf); @@ -267,10 +237,6 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, if (likely(!bp.v->level)) { struct bch_fs *c = trans->c; - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) - return bkey_s_c_err(-EIO); - bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, @@ -283,7 +249,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, } if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bucket, *bp.v)) + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; bch2_trans_iter_exit(trans, iter); @@ -308,10 +274,6 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, BUG_ON(!bp.v->level); - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp.k->p, &bucket)) - return ERR_PTR(-EIO); - bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, @@ -325,8 +287,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, BUG_ON(b->c.level != bp.v->level - 1); if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, - bkey_i_to_s_c(&b->key), - bucket, *bp.v)) + bkey_i_to_s_c(&b->key), bp)) return b; if (btree_node_will_make_reachable(b)) { @@ -480,8 +441,7 @@ static int check_extent_checksum(struct btree_trans *trans, static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, - struct bpos bucket, - struct bch_backpointer bp, + struct bkey_i_backpointer *bp, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; @@ -491,30 +451,28 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c bp_k; int ret = 0; - struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); + struct bch_dev *ca = bch2_dev_tryget_noerror(c, bp->k.p.inode); if (!ca) { - prt_str(&buf, "extent for nonexistent device:bucket "); - bch2_bpos_to_text(&buf, bucket); - prt_str(&buf, "\n "); + prt_printf(&buf, "extent for nonexistent device %llu\n", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; } + struct bpos bucket = bp_pos_to_bucket(ca, bp->k.p); + if (bpos_lt(bucket, s->bucket_start) || bpos_gt(bucket, s->bucket_end)) goto out; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(ca, bucket, bp.bucket_offset), - 0); + bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); ret = bkey_err(bp_k); if (ret) goto err; if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); if (ret) goto err; @@ -561,14 +519,17 @@ static int check_bp_exists(struct btree_trans *trans, goto err; goto out; } else { - ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); + ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bucket.inode); if (ret) goto err; goto missing; } } - ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + ret = check_extent_checksum(trans, + other_bp.v->btree_id, other_extent, + bp->v.btree_id, orig_k, + bucket.inode); if (ret < 0) goto err; if (ret) { @@ -576,7 +537,8 @@ static int check_bp_exists(struct btree_trans *trans, goto missing; } - ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.v->btree_id, other_extent, bucket.inode); + ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, + other_bp.v->btree_id, other_extent, bucket.inode); if (ret < 0) goto err; if (ret) { @@ -594,22 +556,15 @@ static int check_bp_exists(struct btree_trans *trans, goto err; missing: printbuf_reset(&buf); - prt_str(&buf, "missing backpointer for btree="); - bch2_btree_id_to_text(&buf, bp.btree_id); - prt_printf(&buf, " l=%u ", bp.level); + prt_str(&buf, "missing backpointer "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, orig_k); prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); - struct bkey_i_backpointer n_bp_k; - bkey_backpointer_init(&n_bp_k.k_i); - n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - n_bp_k.v = bp; - prt_printf(&buf, "\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); + ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); goto out; } @@ -627,8 +582,8 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos = POS_MIN; - struct bch_backpointer bp; + struct bpos bucket_pos; + struct bkey_i_backpointer bp; if (p.ptr.cached) continue; @@ -642,7 +597,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (!ca) continue; - ret = check_bp_exists(trans, s, bucket_pos, bp, k); + ret = check_bp_exists(trans, s, &bp, k); if (ret) return ret; } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 595db7960939..5f34a25b599a 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -19,13 +19,12 @@ static inline u64 swab40(u64 x) } int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); -void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); -void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ .key_validate = bch2_backpointer_validate, \ - .val_to_text = bch2_backpointer_k_to_text, \ + .val_to_text = bch2_backpointer_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ }) @@ -53,12 +52,6 @@ static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos return ca != NULL; } -static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) -{ - return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), - c, "backpointer for missing device %llu", bp_pos.inode); -} - static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) @@ -90,31 +83,25 @@ static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, - struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, + struct bkey_s_c, + struct bkey_i_backpointer *, + bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); - - struct bkey_i_backpointer bp_k; - - bkey_backpointer_init(&bp_k.k_i); - bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - bp_k.v = bp; + return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { - bp_k.k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k.k, 0); + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); } - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); } static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, @@ -148,17 +135,21 @@ static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bch_backpointer *bp, + struct bpos *bucket, struct bkey_i_backpointer *bp, u64 sectors) { u32 bucket_offset; - *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); - *bp = (struct bch_backpointer) { + *bucket = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); + + u64 bp_bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; + + bkey_backpointer_init(&bp->k_i); + bp->k.p = bucket_pos_to_bp(ca, *bucket, bp_bucket_offset); + bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + - p.crc.offset, + .bucket_offset = bp_bucket_offset, .bucket_len = sectors, .pos = k.k->p, }; @@ -168,7 +159,7 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bch_backpointer *bp) + struct bpos *bucket_pos, struct bkey_i_backpointer *bp) { u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 5b42f0a7b0cb..1547141ba2a0 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -585,18 +585,18 @@ static int bch2_trigger_pointer(struct btree_trans *trans, } struct bpos bucket; - struct bch_backpointer bp; + struct bkey_i_backpointer bp; __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v); if (ret) goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); if (ret) goto err; } @@ -614,7 +614,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new); alloc_to_bucket(g, new); bucket_unlock(g); err_unlock: diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index a4a2555d7c4f..f6b7b8b54f62 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1300,7 +1300,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return 0; prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_backpointer_to_text(&buf, bp.v); + bch2_bkey_val_to_text(&buf, c, bp.s_c); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); From 283dcbb80c18b9d9cc2f17dc81bfb2a049cbef0e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Nov 2024 00:32:57 -0500 Subject: [PATCH 107/233] bcachefs: Fix check_backpointers_to_extents range limiting bch2_get_btree_in_memory_pos() will return positions that refer directly to the btree it's checking will fit in memory - i.e. backpointer positions, not buckets. This also means check_bp_exists() no longer has to refer to the device, and we can delete some code. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 63 +++++++++++++++----------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 24804f0bb2fd..5963217cd90c 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -352,8 +352,8 @@ int bch2_check_btree_backpointers(struct bch_fs *c) } struct extents_to_bp_state { - struct bpos bucket_start; - struct bpos bucket_end; + struct bpos bp_start; + struct bpos bp_end; struct bkey_buf last_flushed; }; @@ -445,29 +445,16 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = {}; struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; - struct bkey_s_c bp_k; - int ret = 0; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, bp->k.p.inode); - if (!ca) { - prt_printf(&buf, "extent for nonexistent device %llu\n", bp->k.p.inode); - bch2_bkey_val_to_text(&buf, c, orig_k); - bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; - } + if (bpos_lt(bp->k.p, s->bp_start) || + bpos_gt(bp->k.p, s->bp_end)) + return 0; - struct bpos bucket = bp_pos_to_bucket(ca, bp->k.p); - - if (bpos_lt(bucket, s->bucket_start) || - bpos_gt(bucket, s->bucket_end)) - goto out; - - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); - ret = bkey_err(bp_k); + struct btree_iter bp_iter; + struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); + int ret = bkey_err(bp_k); if (ret) goto err; @@ -484,7 +471,6 @@ static int check_bp_exists(struct btree_trans *trans, fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -514,12 +500,13 @@ static int check_bp_exists(struct btree_trans *trans, bch_err(c, "%s", buf.buf); if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bucket.inode); + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); if (ret) goto err; goto out; } else { - ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bucket.inode); + ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); if (ret) goto err; goto missing; @@ -529,7 +516,7 @@ static int check_bp_exists(struct btree_trans *trans, ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp->v.btree_id, orig_k, - bucket.inode); + bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -538,7 +525,7 @@ static int check_bp_exists(struct btree_trans *trans, } ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, - other_bp.v->btree_id, other_extent, bucket.inode); + other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -547,7 +534,7 @@ static int check_bp_exists(struct btree_trans *trans, } printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, other_extent); @@ -811,7 +798,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bucket_start = POS_MIN }; + struct extents_to_bp_state s = { .bp_start = POS_MIN }; int ret; bch2_bkey_buf_init(&s.last_flushed); @@ -822,35 +809,35 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ret = bch2_get_btree_in_memory_pos(trans, BIT_ULL(BTREE_ID_backpointers), BIT_ULL(BTREE_ID_backpointers), - BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); + BBPOS(BTREE_ID_backpointers, s.bp_start), &end); if (ret) break; - s.bucket_end = end.pos; + s.bp_end = end.pos; - if ( bpos_eq(s.bucket_start, POS_MIN) && - !bpos_eq(s.bucket_end, SPOS_MAX)) + if ( bpos_eq(s.bp_start, POS_MIN) && + !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(s.bucket_start, POS_MIN) || - !bpos_eq(s.bucket_end, SPOS_MAX)) { + if (!bpos_eq(s.bp_start, POS_MIN) || + !bpos_eq(s.bp_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bucket_start); + bch2_bpos_to_text(&buf, s.bp_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bucket_end); + bch2_bpos_to_text(&buf, s.bp_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) + if (ret || bpos_eq(s.bp_end, SPOS_MAX)) break; - s.bucket_start = bpos_successor(s.bucket_end); + s.bp_start = bpos_successor(s.bp_end); } bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); From da89857b5fee06e4424cc235c2534edd4621dc6f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 18:26:54 -0500 Subject: [PATCH 108/233] bcachefs: kill bch_backpointer.bucket_offset usage bch_backpointer.bucket_offset is going away - it's no longer needed since we no longer store backpointers in alloc keys, the same information is in the key position itself. And we'll be reclaiming the space in bch_backpointer for the bucket generation number. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 15 +++++++-------- fs/bcachefs/backpointers.h | 8 ++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5963217cd90c..620fa67db7a6 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -54,21 +54,20 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (ca) { - struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + u32 bucket_offset; + struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bucket); - prt_str(out, " "); + prt_printf(out, "bucket=%llu:%llu:%u", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); + prt_printf(out, "sector=%llu:%llu", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_printf(out, " offset=%llu:%u len=%u pos=", - (u64) (bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) bp.v->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + prt_printf(out, " suboffset=%u len=%u pos=", + (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len); bch2_bpos_to_text(out, bp.v->pos); } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 5f34a25b599a..d8a15f5fa767 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -42,6 +42,14 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, + u32 *bucket_offset) +{ + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); +} + static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { rcu_read_lock(); From 1f3c4ab3fbb3400f5527087427c590632a4a75df Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 21:34:43 -0500 Subject: [PATCH 109/233] bcachefs: New backpointers helpers - bch2_backpointer_del() - bch2_backpointer_maybe_flush() Kill a bit of open coding and make sure we're properly handling the btree write buffer. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 58 +++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 620fa67db7a6..cfd9b9ead473 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -199,6 +199,22 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, return ret; } +static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0); +} + +static int bch2_backpointers_maybe_flush(struct btree_trans *trans, + struct bkey_s_c visiting_k, + struct bkey_buf *last_flushed) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) + : 0; +} + static void backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct bkey_s_c target_k) @@ -300,9 +316,12 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, return b; } -static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, - struct bkey_s_c k) +static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, + struct bkey_buf *last_flushed) { + if (k.k->type != KEY_TYPE_backpointer) + return 0; + struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; struct bkey_s_c alloc_k; @@ -311,10 +330,14 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, bp_iter, 0); + ret = bch2_backpointer_del(trans, k.k->p); goto out; } @@ -323,13 +346,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ if (ret) goto out; - if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, - trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); - goto out; + if (alloc_k.k->type != KEY_TYPE_alloc_v4) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + + if (fsck_err(trans, backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); } out: fsck_err: @@ -341,11 +367,17 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_btree_backpointer(trans, &iter, k))); + bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } @@ -874,7 +906,7 @@ static int check_one_backpointer(struct btree_trans *trans, return ret; if (!k.k) { - ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); + ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed); if (ret) goto out; @@ -882,7 +914,7 @@ static int check_one_backpointer(struct btree_trans *trans, "backpointer for missing %s\n %s", bp.v->level ? "btree node" : "extent", (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + ret = bch2_backpointer_del(trans, bp.k->p); goto out; } } From 01d8d04564c46cfafc454511601610af2fb4a8fb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 16:27:47 -0500 Subject: [PATCH 110/233] bcachefs: Can now block journal activity without closing cur entry Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 44 +++++++++++++++++++++++++++++++++++-- fs/bcachefs/journal.h | 3 ++- fs/bcachefs/journal_types.h | 2 ++ 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2cf8f24d50cc..bfbb1ac60c3d 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -217,6 +217,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); bch2_journal_do_writes(j); + + /* + * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an + * open journal entry + */ + wake_up(&j->wait); } /* @@ -251,6 +257,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t if (!__journal_entry_is_open(old)) return; + if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) + old.cur_entry_offset = j->cur_entry_offset_if_blocked; + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -868,16 +877,44 @@ int bch2_journal_meta(struct journal *j) void bch2_journal_unblock(struct journal *j) { spin_lock(&j->lock); - j->blocked--; + if (!--j->blocked && + j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + new.v = old.v; + new.cur_entry_offset = j->cur_entry_offset_if_blocked; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } spin_unlock(&j->lock); journal_wake(j); } +static void __bch2_journal_block(struct journal *j) +{ + if (!j->blocked++) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + j->cur_entry_offset_if_blocked = old.cur_entry_offset; + + if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) + break; + + new.v = old.v; + new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } +} + void bch2_journal_block(struct journal *j) { spin_lock(&j->lock); - j->blocked++; + __bch2_journal_block(j); spin_unlock(&j->lock); journal_quiesce(j); @@ -1481,6 +1518,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) case JOURNAL_ENTRY_CLOSED_VAL: prt_printf(out, "closed\n"); break; + case JOURNAL_ENTRY_BLOCKED_VAL: + prt_printf(out, "blocked\n"); + break; default: prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2762be6f9814..6d3c839bbbef 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq spin_lock(&j->lock); bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); - } + } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) + wake_up(&j->wait); } /* diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 19183fcf7ad7..425d1abb257e 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -112,6 +112,7 @@ union journal_res_state { */ #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) @@ -193,6 +194,7 @@ struct journal { * insufficient devices: */ enum journal_errors cur_entry_error; + unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; /* From 709336f96d44a887179a4ae58d86bdd01640eb3c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 18:21:12 -0500 Subject: [PATCH 111/233] bcachefs: trivial btree write buffer refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 64 ++++++++++++++++---------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1639c60dffa0..1bd26221f156 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -19,8 +19,6 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); -static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); - static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { return (cmp_int(l->hi, r->hi) ?: @@ -481,13 +479,38 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) return ret; } -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } + + spin_lock(&c->journal.lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); +out: + ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; + return ret; +} + +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq))) { ret = bch2_journal_keys_to_write_buffer(c, buf); mutex_unlock(&j->buf_lock); } @@ -495,7 +518,7 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, bool *did_work) { struct bch_fs *c = trans->c; @@ -505,7 +528,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, do { bch2_trans_unlock(trans); - fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; @@ -518,8 +541,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, mutex_unlock(&wb->flushing.lock); } while (!ret && (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); + (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); return ret; } @@ -771,31 +794,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ return ret; } -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); -out: - ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; - return ret; -} - static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) { if (wb->keys.size >= new_size) From 2c9a60bc315537ac764ee026635daf588c1beb9b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 16:47:10 -0500 Subject: [PATCH 112/233] bcachefs: Bias reads more in favor of faster device Per reports of performance issues on mixed multi device filesystems where we're issuing too much IO to the spinning rust - tweak this algorithm. Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 98bb680b3860..83aeceb68847 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -89,6 +89,14 @@ static inline bool ptr_better(struct bch_fs *c, u64 l1 = dev_latency(c, p1.ptr.dev); u64 l2 = dev_latency(c, p2.ptr.dev); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + l1 *= l1; + l2 *= l2; + /* Pick at random, biased in favor of the faster device: */ return bch2_rand_range(l1 + l2) > l1; From 4e378cabba8d65835f76a60f05ddc2b44ce44f45 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 21 Nov 2024 20:09:45 -0500 Subject: [PATCH 113/233] bcachefs: discard fastpath now uses bch2_discard_one_bucket() The discard bucket fastpath previously was using its own code for discarding buckets and clearing them in the need_discard btree, which didn't have any of the consistency checks of the main discard path. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 75 +++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index ae9fdb5ad758..1e9f53db4bb8 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1725,7 +1725,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - struct discard_buckets_state *s) + struct discard_buckets_state *s, + bool fastpath) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1782,10 +1783,12 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (discard_in_flight_add(ca, iter.pos.offset, true)) - goto out; + if (!fastpath) { + if (discard_in_flight_add(ca, iter.pos.offset, true)) + goto out; - discard_locked = true; + discard_locked = true; + } if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { @@ -1799,6 +1802,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, ca->mi.bucket_size, GFP_KERNEL); *discard_pos_done = iter.pos; + s->discarded++; ret = bch2_trans_relock_notrace(trans); if (ret) @@ -1819,12 +1823,12 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; count_event(c, bucket_discard); - s->discarded++; out: fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); - s->seen++; + if (!ret) + s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; @@ -1848,7 +1852,7 @@ static void bch2_do_discards_work(struct work_struct *work) BTREE_ID_need_discard, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), 0, k, - bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -1881,27 +1885,31 @@ void bch2_do_discards(struct bch_fs *c) bch2_dev_do_discards(ca); } -static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +static int bch2_do_discards_fast_one(struct btree_trans *trans, + struct bch_dev *ca, + u64 bucket, + struct bpos *discard_pos_done, + struct discard_buckets_state *s) { - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - int ret = bkey_err(k); + struct btree_iter need_discard_iter; + struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, + BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); + int ret = bkey_err(discard_k); if (ret) - goto err; + return ret; - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; + if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, + trans, discarding_bucket_not_in_need_discard_btree, + "attempting to discard bucket %u:%llu not in need_discard btree", + ca->dev_idx, bucket)) { + /* log it in the superblock and continue: */ + goto out; + } - BUG_ON(a->v.dirty_sectors); - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); - - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); + ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); +out: +fsck_err: + bch2_trans_iter_exit(trans, &need_discard_iter); return ret; } @@ -1909,6 +1917,10 @@ static void bch2_do_discards_fast_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); struct bch_fs *c = ca->fs; + struct discard_buckets_state s = {}; + struct bpos discard_pos_done = POS_MAX; + struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; while (1) { bool got_bucket = false; @@ -1929,16 +1941,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) if (!got_bucket) break; - if (ca->mi.discard && !c->opts.nochanges) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, - GFP_KERNEL); - - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); + ret = lockrestart_do(trans, + bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); bch_err_fn(c, ret); discard_in_flight_remove(ca, bucket); @@ -1947,6 +1951,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } From 0e796cf8047d1e35c77573a8ef62f13e6d242b76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 23 Apr 2024 02:18:18 -0400 Subject: [PATCH 114/233] bcachefs: btree_write_buffer_flush_seq() no longer closes journal Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_write_buffer.c | 19 ++++++++++++++----- fs/bcachefs/journal.c | 27 ++++++++++++++++++++------- fs/bcachefs/journal.h | 2 +- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1bd26221f156..49ce2d1e5c02 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -495,10 +495,6 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); out: ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; return ret; @@ -508,11 +504,24 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; + bool blocked; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { ret = bch2_journal_keys_to_write_buffer(c, buf); + + if (!blocked && !ret) { + spin_lock(&j->lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&j->lock); + } + mutex_unlock(&j->buf_lock); + + if (blocked) { + bch2_journal_unblock(j); + break; + } } return ret; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bfbb1ac60c3d..699db0d0749a 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -908,6 +908,8 @@ static void __bch2_journal_block(struct journal *j) new.v = old.v; new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -920,7 +922,8 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret = NULL; @@ -937,13 +940,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { - if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); - ret = journal_state_count(s, idx) + unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); + + if (open && !*blocked) { + __bch2_journal_block(j); + *blocked = true; + } + + ret = journal_state_count(s, idx) > open ? ERR_PTR(-EAGAIN) : buf; break; @@ -956,11 +963,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou return ret; } -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret; + *blocked = false; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, + max_seq, blocked)) != ERR_PTR(-EAGAIN)); + if (IS_ERR_OR_NULL(ret) && *blocked) + bch2_journal_unblock(j); - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); return ret; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 6d3c839bbbef..71a50846967f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -425,7 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); From 2ae6c5e05ddc1693a90fdad3e3443aed353a0b38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Nov 2024 22:12:58 -0500 Subject: [PATCH 115/233] bcachefs: BCH_ERR_btree_node_read_error_cached Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 6 +++--- fs/bcachefs/errcode.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index a0a406b0c7bc..36dfa6a48aa6 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1131,7 +1131,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1221,7 +1221,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1303,7 +1303,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_error); + b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 2dda7f962e5b..131b9bef21a0 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -242,6 +242,7 @@ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ From 3ed349d91ec1912da074e1e4acf2af7892b659c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 20:15:30 -0500 Subject: [PATCH 116/233] bcachefs: Use separate rhltable for bch2_inode_or_descendents_is_open() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/fs.c | 39 ++++++++++++++++++++++++++++++--------- fs/bcachefs/fs.h | 1 + 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 11f9ed42a9da..f1d8c821d27a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1020,6 +1020,7 @@ struct bch_fs { struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; struct rhashtable vfs_inodes_table; + struct rhltable vfs_inodes_by_inum_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 50d323fca001..c6e7df7c67fa 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -176,8 +177,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; + siphash_key_t k = { .key[0] = seed }; - return jhash(&inum->inum, sizeof(inum->inum), seed); + return siphash_2u64(inum->subvol, inum->inum, &k); } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) @@ -206,11 +208,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { + .head_offset = offsetof(struct bch_inode_info, by_inum_hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), + .key_len = sizeof(u64), + .automatic_shrinking = true, +}; + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { struct bch_fs *c = trans->c; - struct rhashtable *ht = &c->vfs_inodes_table; - subvol_inum inum = (subvol_inum) { .inum = p.offset }; + struct rhltable *ht = &c->vfs_inodes_by_inum_table; + u64 inum = p.offset; DARRAY(u32) subvols; int ret = 0; @@ -235,15 +244,15 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) struct rhash_lock_head __rcu *const *bkt; struct rhash_head *he; unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); restart: - hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); bkt = rht_bucket(tbl, hash); do { struct bch_inode_info *inode; rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum.inum) { + if (inode->ei_inum.inum == inum) { ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, GFP_NOWAIT|__GFP_NOWARN); if (ret) { @@ -264,7 +273,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) /* Ensure we see any new tables. */ smp_rmb(); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); if (unlikely(tbl)) goto restart; rcu_read_unlock(); @@ -343,7 +352,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod spin_unlock(&inode->v.i_lock); if (remove) { - int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + + ret = rhashtable_remove_fast(&c->vfs_inodes_table, &inode->hash, bch2_vfs_inodes_params); BUG_ON(ret); inode->v.i_hash.pprev = NULL; @@ -388,6 +401,11 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, discard_new_inode(&inode->v); return old; } else { + int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, + bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + inode_fake_hash(&inode->v); inode_sb_list_add(&inode->v); @@ -2359,13 +2377,16 @@ static int bch2_init_fs_context(struct fs_context *fc) void bch2_fs_vfs_exit(struct bch_fs *c) { + if (c->vfs_inodes_by_inum_table.ht.tbl) + rhltable_destroy(&c->vfs_inodes_by_inum_table); if (c->vfs_inodes_table.tbl) rhashtable_destroy(&c->vfs_inodes_table); } int bch2_fs_vfs_init(struct bch_fs *c) { - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: + rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 59f9f7ae728d..dd2198541455 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -14,6 +14,7 @@ struct bch_inode_info { struct inode v; struct rhash_head hash; + struct rhlist_head by_inum_hash; subvol_inum ei_inum; struct list_head ei_vfs_inode_list; From f3542deaa920f10ee70f33bbf435c84e37a33a65 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 21:49:08 -0500 Subject: [PATCH 117/233] bcachefs: errcode cleanup: journal errors Instead of throwing standard error codes, we should be throwing dedicated private error codes, this greatly improves debugability. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 2 ++ fs/bcachefs/journal.c | 4 ++-- fs/bcachefs/journal.h | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 131b9bef21a0..c989ce4f715f 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -241,6 +241,8 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, journal_shutdown) \ + x(EIO, journal_flush_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 699db0d0749a..bbdd0b17ae69 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -673,7 +673,7 @@ void bch2_journal_entry_res_resize(struct journal *j, * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -EIO if @seq will never be flushed + * -BCH_ERR_journal_flush_err if @seq will never be flushed * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary @@ -696,7 +696,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -EIO; + ret = -BCH_ERR_journal_flush_err; goto out; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 71a50846967f..a6a2e888c59b 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -412,7 +412,7 @@ void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? -EIO : 0; + ? -BCH_ERR_journal_shutdown : 0; } struct bch_dev; From 8b5373916097e3993c6539930f4181ca05b29e20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:23:41 -0500 Subject: [PATCH 118/233] bcachefs: disk_accounting: bch2_dev_rcu -> bch2_dev_rcu_noerror Accounting keys that reference invalid devices are corrected by fsck, they shouldn't cause an emergency shutdown. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 4 ++-- fs/bcachefs/disk_accounting.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 55a00018dc8b..fa821d278c45 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -805,7 +805,7 @@ int bch2_accounting_read(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); @@ -911,7 +911,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 6639535dc91c..8b2b2f83e6a4 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -142,7 +142,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); From ba91f39cd42bcfac86b9a7165d56b6964c314c81 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:28:41 -0500 Subject: [PATCH 119/233] bcachefs: Fix accounting_read when we rewind If we rewind recovery to run topology repair, that causes accounting_read to run twice. This fixes accounting being double counted. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index fa821d278c45..bb5dbbf71d04 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -700,6 +700,21 @@ int bch2_accounting_read(struct bch_fs *c) struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. + * + * Instead, zero out any accounting we have: + */ + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + percpu_up_write(&c->mark_lock); + int ret = for_each_btree_key(trans, iter, BTREE_ID_accounting, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ From 658ca218178763396a5647ea63cf3e2f1a669e4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:45:25 -0500 Subject: [PATCH 120/233] bcachefs: backpointer_to_missing_ptr is now autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9e3425f533bc..d45d0789f1b1 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -141,7 +141,7 @@ enum bch_fsck_flags { x(backpointer_dev_bad, 297, 0) \ x(backpointer_to_missing_device, 126, 0) \ x(backpointer_to_missing_alloc, 127, 0) \ - x(backpointer_to_missing_ptr, 128, 0) \ + x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ x(lru_entry_bad, 131, FSCK_AUTOFIX) \ From 3307caf863385f8b670a6b496083571a88d3c0bc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 22:57:01 -0500 Subject: [PATCH 121/233] bcachefs: Fix btree node scan when unknown btree IDs are present btree_root entries for unknown btree IDs are created during recovery, before reading those btree roots. But btree_node_scan may find btree nodes with unknown btree IDs when we haven't seen roots for those btrees. Reported-by: syzbot+1f202d4da221ec6ebf8e@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 11 ++++++++--- fs/bcachefs/btree_cache.h | 9 +++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 36dfa6a48aa6..1f06e24e53fc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1406,9 +1406,14 @@ void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, unsi void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { bch2_btree_id_to_text(out, b->c.btree_id); - prt_printf(out, " level %u/%u\n ", - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); + prt_printf(out, " level %u/", b->c.level); + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + if (r) + prt_printf(out, "%u", r->level); + else + prt_printf(out, "(unknown)"); + prt_printf(out, "\n "); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 6cfacacb6769..dcc34fe4996d 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -128,14 +128,19 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i } else { unsigned idx = id - BTREE_ID_NR; - EBUG_ON(idx >= c->btree_roots_extra.nr); + /* This can happen when we're called from btree_node_scan */ + if (idx >= c->btree_roots_extra.nr) + return NULL; + return &c->btree_roots_extra.data[idx]; } } static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) { - return bch2_btree_id_root(c, b->c.btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + + return r ? r->b : NULL; } const char *bch2_btree_id_str(enum btree_id); /* avoid */ From 873a885d1af707d5a44eec5439e49fe4ff4a3b02 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 23:28:21 -0500 Subject: [PATCH 122/233] bcachefs: Kill bch2_bucket_alloc_new_fs() The early-early allocation path, bch2_bucket_alloc_new_fs(), is no longer needed - and inconsistencies around new_fs_bucket_idx have been a frequent source of bugs. Reported-by: syzbot+592425844580a6598410@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 40 ++++++++++++++-------------------- fs/bcachefs/alloc_foreground.h | 2 -- fs/bcachefs/bcachefs.h | 1 - fs/bcachefs/buckets.c | 25 +++++++++++++++++++++ fs/bcachefs/buckets.h | 21 +----------------- fs/bcachefs/journal.c | 34 +++++++++++++---------------- fs/bcachefs/journal_reclaim.c | 3 +++ fs/bcachefs/recovery.c | 5 +---- fs/bcachefs/super.c | 12 +++++----- 9 files changed, 66 insertions(+), 77 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 6d665b720f72..4d1ff7f1f302 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -156,6 +156,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } +static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +{ + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + return false; + + return bch2_is_superblock_bucket(ca, b); +} + static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= @@ -175,20 +183,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { - u64 b = ca->new_fs_bucket_idx++; - - if (!is_superblock_bucket(ca, b) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) - return b; - } - - return -1; -} - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { @@ -214,6 +208,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * { struct open_bucket *ob; + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { s->skipped_nouse++; return NULL; @@ -295,9 +292,6 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc /* * This path is for before the freespace btree is initialized: - * - * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & - * journal buckets - journal buckets will be < ca->new_fs_bucket_idx */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, @@ -309,7 +303,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 first_bucket = ca->mi.first_bucket; u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; @@ -332,10 +326,6 @@ bch2_bucket_alloc_early(struct btree_trans *trans, if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (ca->new_fs_bucket_idx && - is_superblock_bucket(ca, k.k->p.offset)) - continue; - if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -406,8 +396,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; - - BUG_ON(ca->new_fs_bucket_idx); again: for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, POS(ca->dev_idx, alloc_cursor), @@ -551,6 +539,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bch2_dev_do_invalidates(ca); if (!avail) { + if (watermark > BCH_WATERMARK_normal && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + goto alloc; + if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); waiting = true; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 1a16fd5bd4f8..4f87745df97e 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct bch_devs_mask *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -long bch2_bucket_alloc_new_fs(struct bch_dev *); - static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) { return bch2_dev_have_ref(c, ob->dev); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index f1d8c821d27a..a85b3bcc6383 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -560,7 +560,6 @@ struct bch_dev { struct bch_dev_usage __percpu *usage; /* Allocator: */ - u64 new_fs_bucket_idx; u64 alloc_cursor[3]; unsigned nr_open_buckets; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 1547141ba2a0..afd35c93fcfb 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1161,6 +1161,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); } +bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + for (i = 0; i < ca->journal.nr; i++) + if (b == ca->journal.buckets[i]) + return true; + + return false; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ccc78bfe2fd4..3bebc4c3044f 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -308,26 +308,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} +bool bch2_is_superblock_bucket(struct bch_dev *, u64); static inline const char *bch2_data_type_str(enum bch_data_type type) { diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bbdd0b17ae69..95cccda3b22c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1002,19 +1002,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } for (nr_got = 0; nr_got < nr_want; nr_got++) { - if (new_fs) { - bu[nr_got] = bch2_bucket_alloc_new_fs(ca); - if (bu[nr_got] < 0) { - ret = -BCH_ERR_ENOSPC_bucket_alloc; - break; - } - } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, - BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; + enum bch_watermark watermark = new_fs + ? BCH_WATERMARK_btree + : BCH_WATERMARK_normal; + ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, + BCH_DATA_journal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, @@ -1024,9 +1022,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch_err_msg(c, ret, "marking new journal buckets"); break; } - - bu[nr_got] = ob[nr_got]->bucket; } + + bu[nr_got] = ob[nr_got]->bucket; } if (!nr_got) @@ -1066,8 +1064,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) goto err_unblock; - if (!new_fs) - bch2_write_super(c); + bch2_write_super(c); /* Commit: */ if (c) @@ -1101,9 +1098,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bu[i], BCH_DATA_free, 0, BTREE_TRIGGER_transactional)); err_free: - if (!new_fs) - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); kfree(new_bucket_seq); kfree(new_buckets); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 3d8fc2642425..1aabbbe328d9 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { + if (!ja->nr) + return 0; + unsigned available = (journal_space_from(ja, from) - ja->cur_idx - 1 + ja->nr) % ja->nr; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 7086a7226989..547c78a323f7 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1070,7 +1070,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); @@ -1111,9 +1110,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(c, ca) - ca->new_fs_bucket_idx = 0; - ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1172,6 +1168,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 08170a3d524f..14157820705d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1750,11 +1750,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca, true); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err; - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1805,11 +1800,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err_late; - ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + up_write(&c->state_lock); return 0; From 46522a75a47ed8db6da54f37c4dcf934e12fe540 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 00:21:27 -0500 Subject: [PATCH 123/233] bcachefs: Bad btree roots are now autofix Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index d45d0789f1b1..89d9dc2c859b 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -68,8 +68,8 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_format, 55, 0) \ x(btree_node_bad_bkey, 56, 0) \ x(btree_node_bkey_out_of_order, 57, 0) \ - x(btree_root_bkey_invalid, 58, 0) \ - x(btree_root_read_error, 59, 0) \ + x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ + x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ x(btree_node_read_error, 62, 0) \ From 2d66d3160dd7ee36bb8a42111516373ec1cc4d25 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 01:26:56 -0500 Subject: [PATCH 124/233] bcachefs: Fix dup/misordered check in btree node read We were checking for out of order keys, but not duplicate keys. Reported-by: syzbot+dedbd67513939979f84f@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 89a42ee81e5c..2b5da566fbac 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -857,6 +857,14 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); } +static inline int btree_node_read_bkey_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); +} + static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, bool have_retry, bool *saw_error) @@ -917,7 +925,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BSET_BIG_ENDIAN(i), write, &b->format, k); - if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { struct bkey up = bkey_unpack_key(b, prev); printbuf_reset(&buf); From ec3ca7c9e05b1253ccce6a20ca1269750a71bd2a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 02:05:02 -0500 Subject: [PATCH 125/233] bcachefs: Don't try to en/decrypt when encryption not available If a btree node says it's encrypted, but the superblock never had an encryptino key - whoops, that needs to be handled. Reported-by: syzbot+026f1857b12f5eb3f9e9@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 109 +++++++++++++++++----------------- fs/bcachefs/btree_node_scan.c | 3 + fs/bcachefs/checksum.c | 10 +++- fs/bcachefs/errcode.h | 1 + fs/bcachefs/io_read.c | 14 ++++- 5 files changed, 80 insertions(+), 57 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 2b5da566fbac..3bb6db9bd4a4 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1045,39 +1045,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; - struct nonce nonce; bool first = !b->written; - bool csum_bad; - if (!b->written) { + if (first) { + bne = NULL; i = &b->data->keys; + } else { + bne = write_block(b); + i = &bne->keys; - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + if (i->seq != b->data->keys.seq) + break; + } - nonce = btree_nonce(i, b->written << 9); + struct nonce nonce = btree_nonce(i, b->written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + btree_err_on(!good_csum_type, + bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) + ? -BCH_ERR_btree_node_read_err_must_retry + : -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + bool csum_bad = bch2_crc_cmp(b->data->csum, csum); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), @@ -1088,37 +1100,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sectors = vstruct_sectors(b->data, c->block_bits); } else { - bne = write_block(b); - i = &bne->keys; + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + bool csum_bad = bch2_crc_cmp(bne->csum, csum); + if (ca && csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - if (i->seq != b->data->keys.seq) - break; + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - csum_bad = bch2_crc_cmp(bne->csum, csum); - if (ca && csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } sectors = vstruct_sectors(bne, c->block_bits); } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 4b4df31d4b95..327f1a1859b9 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -159,6 +159,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, return; if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + if (!c->chacha20) + return; + struct nonce nonce = btree_nonce(&bn->keys, 0); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index ce8fc677bef9..23a383577d4c 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "checksum.h" #include "errcode.h" +#include "error.h" #include "super.h" #include "super-io.h" @@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (!bch2_csum_type_is_encryption(type)) return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; + return do_encrypt(c->chacha20, nonce, data, len); } @@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, size_t sgl_len = 0; int ret = 0; - if (!bch2_csum_type_is_encryption(type)) - return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; darray_init(&sgl); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index c989ce4f715f..a12050e9c191 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -260,6 +260,7 @@ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ + x(EIO, no_encryption_key) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index eb8d12fd6398..4b6b6d25725b 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -830,7 +830,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (!pick_ret) goto hole; - if (pick_ret < 0) { + if (unlikely(pick_ret < 0)) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); @@ -843,6 +843,18 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto err; } + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "attempting to read encrypted data without encryption key\n %s", + buf.buf); + printbuf_exit(&buf); + goto err; + } + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* From 72177d492d17031c6cbbb7f6802a23cbdabd37ad Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Nov 2024 17:03:13 -0500 Subject: [PATCH 126/233] bcachefs: Change "disk accounting version 0" check to commit only 6.11 had a bug where we'd sometimes create disk accounting keys with version 0, which causes issues for journal replay - but we don't need to delete existing accounting keys with version 0. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index bb5dbbf71d04..c5e61265b709 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -134,7 +134,8 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, void *end = &acc_k + 1; int ret = 0; - bkey_fsck_err_on(bversion_zero(k.k->bversion), + bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && + bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); From d6dd534eb3db761193e9913d669357a177c2b659 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 15:16:57 -0500 Subject: [PATCH 127/233] bcachefs: Fix bch2_btree_node_update_key_early() Fix an assertion pop from the recent btree cache freelist fixes. Fixes: baefd3f849ed ("bcachefs: btree_cache.freeable list fixes") Reported-by: Tyler Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1f06e24e53fc..1117be901cf0 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -326,7 +326,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); From 839c29d574315fe2f3c3f707f248d75348cf769d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 21:27:16 -0500 Subject: [PATCH 128/233] bcachefs: Go RW earlier, for normal rw mount Previously, when mounting read-write after a clean shutdown, we wouldn't go read-write until after all the recovery passes completed. Now, go RW early in recovery, the same as any other situation we'll need to go read-write. This fixes a bug where we discover unlinked inodes after a clean shutdown: repair fails because we're read only. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 1240c5c19fea..f6d3a99cb63e 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -46,7 +46,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) return bch2_fs_read_write_early(c); return 0; } From 33213a5be19ee403b58d57f4f311bd65ee261e3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 26 Nov 2024 22:59:27 -0500 Subject: [PATCH 129/233] bcachefs: Fix null ptr deref in btree_path_lock_root() Historically, we required that all btree node roots point to a valid (possibly fake) node, but we're improving our ability to continue in the presence of errors. Reported-by: syzbot+e22007d6acb9c87c2362@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 89f9665ce70d..80c3b55ce763 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -722,7 +722,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, path->btree_id); enum six_lock_type lock_type; unsigned i; int ret; @@ -730,7 +730,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, EBUG_ON(path->nodes_locked); while (1) { - b = READ_ONCE(*rootp); + struct btree *b = READ_ONCE(r->b); + if (unlikely(!b)) { + BUG_ON(!r->error); + return r->error; + } + path->level = READ_ONCE(b->c.level); if (unlikely(path->level < depth_want)) { @@ -755,7 +760,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, BUG(); } - if (likely(b == READ_ONCE(*rootp) && + if (likely(b == READ_ONCE(r->b) && b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) From aa492d53186526b0879bdcd62079c555d611e7a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 01:03:41 -0500 Subject: [PATCH 130/233] bcachefs: Ignore empty btree root journal entries There's no reason to treat them as errors: just ignore them, and go with a previous btree root if we had one. Reported-by: syzbot+e22007d6acb9c87c2362@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 547c78a323f7..727e894762f5 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -442,7 +442,9 @@ static int journal_replay_entry_early(struct bch_fs *c, switch (entry->type) { case BCH_JSET_ENTRY_btree_root: { - struct btree_root *r; + + if (unlikely(!entry->u64s)) + return 0; if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, c, invalid_btree_id, @@ -456,15 +458,11 @@ static int journal_replay_entry_early(struct bch_fs *c, return ret; } - r = bch2_btree_id_root(c, entry->btree_id); + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - if (entry->u64s) { - r->level = entry->level; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - r->error = 0; - } else { - r->error = -BCH_ERR_btree_node_read_error; - } + r->level = entry->level; + bkey_copy(&r->key, (struct bkey_i *) entry->start); + r->error = 0; r->alive = true; break; } From 4d13c89412e72913f81b03b655e0f03ab0c8605d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 00:29:52 -0500 Subject: [PATCH 131/233] bcachefs: struct bkey_validate_context Add a new parameter to bkey validate functions, and use it to improve invalid bkey error messages: we can now print the btree and depth it came from, or if it came from the journal, or is a btree root. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 10 +++---- fs/bcachefs/alloc_background.h | 16 ++++++----- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/backpointers.h | 3 +- fs/bcachefs/bkey.h | 7 ----- fs/bcachefs/bkey_methods.c | 29 ++++++++++--------- fs/bcachefs/bkey_methods.h | 15 +++++----- fs/bcachefs/bkey_types.h | 26 +++++++++++++++++ fs/bcachefs/btree_io.c | 44 ++++++++++++++++++++++++----- fs/bcachefs/btree_node_scan.c | 7 ++++- fs/bcachefs/btree_trans_commit.c | 7 ++++- fs/bcachefs/btree_update_interior.c | 11 ++++++-- fs/bcachefs/data_update.c | 7 +++-- fs/bcachefs/dirent.c | 4 +-- fs/bcachefs/dirent.h | 4 +-- fs/bcachefs/disk_accounting.c | 4 +-- fs/bcachefs/disk_accounting.h | 3 +- fs/bcachefs/ec.c | 4 +-- fs/bcachefs/ec.h | 5 ++-- fs/bcachefs/error.c | 20 ++++++++++--- fs/bcachefs/error.h | 4 +-- fs/bcachefs/extents.c | 20 ++++++------- fs/bcachefs/extents.h | 9 +++--- fs/bcachefs/inode.c | 16 +++++------ fs/bcachefs/inode.h | 9 +++--- fs/bcachefs/journal_io.c | 35 ++++++++++++++--------- fs/bcachefs/lru.c | 2 +- fs/bcachefs/lru.h | 2 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/quota.h | 4 +-- fs/bcachefs/recovery.c | 1 + fs/bcachefs/reflink.c | 8 +++--- fs/bcachefs/reflink.h | 10 +++---- fs/bcachefs/snapshot.c | 4 +-- fs/bcachefs/snapshot.h | 7 ++--- fs/bcachefs/subvolume.c | 2 +- fs/bcachefs/subvolume.h | 5 ++-- fs/bcachefs/xattr.c | 2 +- fs/bcachefs/xattr.h | 3 +- 39 files changed, 231 insertions(+), 142 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 1e9f53db4bb8..8846daaa1162 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -198,7 +198,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; @@ -213,7 +213,7 @@ int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_alloc_unpacked u; int ret = 0; @@ -226,7 +226,7 @@ int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_alloc_unpacked u; int ret = 0; @@ -239,7 +239,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bch_alloc_v4 a; int ret = 0; @@ -509,7 +509,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 57723a37abb8..8cacddd188f4 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,8 +8,6 @@ #include "debug.h" #include "super.h" -enum bch_validate_flags; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -245,10 +243,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -282,7 +284,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); }) int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index cfd9b9ead473..ff08afd667a0 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -15,7 +15,7 @@ #include int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); int ret = 0; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index d8a15f5fa767..95caeabb8978 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -18,7 +18,8 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, + struct bkey_validate_context); void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 41df24a53d97..054e2d5e8448 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -9,13 +9,6 @@ #include "util.h" #include "vstructs.h" -enum bch_validate_flags { - BCH_VALIDATE_write = BIT(0), - BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_journal = BIT(2), - BCH_VALIDATE_silent = BIT(3), -}; - #if 0 /* diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index e7ac227ba7e8..15c93576b5c2 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -28,7 +28,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -42,7 +42,7 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -59,7 +59,7 @@ static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, }) static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -83,7 +83,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -124,7 +124,7 @@ const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -140,7 +140,7 @@ int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, if (!ops->key_validate) return 0; - ret = ops->key_validate(c, k, flags); + ret = ops->key_validate(c, k, from); fsck_err: return ret; } @@ -161,9 +161,10 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) } int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags) + struct bkey_validate_context from) { + enum btree_node_type type = __btree_node_type(from.level, from.btree); + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -177,7 +178,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, return 0; bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && + (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", @@ -228,15 +229,15 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags) + struct bkey_validate_context from) { - return __bch2_bkey_validate(c, k, type, flags) ?: - bch2_bkey_val_validate(c, k, flags); + return __bch2_bkey_validate(c, k, from) ?: + bch2_bkey_val_validate(c, k, from); } int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, enum bch_validate_flags flags) + struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 018fb72e32d3..bf34111cdf00 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -22,7 +22,7 @@ extern const struct bkey_ops bch2_bkey_null_ops; */ struct bkey_ops { int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags); + struct bkey_validate_context from); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -48,13 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags); -int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context from); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h index c9ae9e42b385..2af6279b02a9 100644 --- a/fs/bcachefs/bkey_types.h +++ b/fs/bcachefs/bkey_types.h @@ -210,4 +210,30 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ BCH_BKEY_TYPES(); #undef x +enum bch_validate_flags { + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_journal = BIT(2), + BCH_VALIDATE_silent = BIT(3), +}; + +#define BKEY_VALIDATE_CONTEXTS() \ + x(unknown) \ + x(commit) \ + x(journal) \ + x(btree_root) \ + x(btree_node) + +struct bkey_validate_context { + enum { +#define x(n) BKEY_VALIDATE_##n, + BKEY_VALIDATE_CONTEXTS() +#undef x + } from:8; + u8 level; + enum btree_id btree; + bool root:1; + enum bch_validate_flags flags:8; +}; + #endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 3bb6db9bd4a4..eedcb2445b99 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -831,13 +831,32 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, return ret; } +static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + enum bch_validate_flags flags) +{ + return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags + }); +} + static int bset_key_validate(struct bch_fs *c, struct btree *b, struct bkey_s_c k, - bool updated_range, int rw) + bool updated_range, + enum bch_validate_flags flags) { - return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: - (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0); + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags, + }; + return __bch2_bkey_validate(c, k, from) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: + (flags & BCH_VALIDATE_write ? btree_node_bkey_val_validate(c, b, k, flags) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -854,7 +873,13 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); + return !__bch2_bkey_validate(c, u.s_c, + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_silent + }); } static inline int btree_node_read_bkey_cmp(const struct btree *b, @@ -1224,7 +1249,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - ret = bch2_bkey_val_validate(c, u.s_c, READ); + ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && !bversion_cmp(u.k->bversion, MAX_VERSION))) { @@ -1943,7 +1968,12 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, bool saw_error; int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE); + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level + 1, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_write, + }); if (ret) { bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 327f1a1859b9..eeafb5e7354e 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -538,7 +538,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); - BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = level + 1, + .btree = btree, + })); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index cf313477567a..78d72c26083d 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -726,7 +726,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags); + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_commit, + .level = i->level, + .btree = i->btree_id, + .flags = invalid_flags, + }); if (unlikely(ret)){ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index faa2816e02a0..56a70e95ef9a 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1360,9 +1360,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), - btree_node_type(b), BCH_VALIDATE_write) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_commit, + }; + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index e4af2ccdf4c8..31b2aeb0c6e6 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -318,8 +318,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), - BCH_VALIDATE_commit); + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), + (struct bkey_validate_context) { + .btree = m->btree_id, + .flags = BCH_VALIDATE_commit, + }); if (invalid) { struct printbuf buf = PRINTBUF; diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4c22f78b0484..41813f9ce831 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -101,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); @@ -120,7 +120,7 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't exceed the max length * (older keys may be larger.) */ - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 53ad99666022..362b3b2f2f2e 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,10 +4,10 @@ #include "str_hash.h" -enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index c5e61265b709..71c49a7ee2fe 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -127,14 +127,14 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); void *end = &acc_k + 1; int ret = 0; - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 8b2b2f83e6a4..566aa2a8539d 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -83,7 +83,8 @@ int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); -int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index f6b7b8b54f62..7d6c33f04092 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -110,7 +110,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; @@ -130,7 +130,7 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, "invalid csum granularity (%u >= 64)", s->csum_granularity_bits); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 43326370b410..583ca6a226da 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -6,9 +6,8 @@ #include "buckets_types.h" #include "extents_types.h" -enum bch_validate_flags; - -int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 2960baa023f6..9a695322b33c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_iter.h" #include "error.h" #include "journal.h" @@ -443,23 +444,34 @@ int __bch2_fsck_err(struct bch_fs *c, return ret; } +static const char * const bch2_bkey_validate_contexts[] = { +#define x(n) #n, + BKEY_VALIDATE_CONTEXTS() +#undef x + NULL +}; + int __bch2_bkey_fsck_err(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags validate_flags, + struct bkey_validate_context from, enum bch_sb_error_id err, const char *fmt, ...) { - if (validate_flags & BCH_VALIDATE_silent) + if (from.flags & BCH_VALIDATE_silent) return -BCH_ERR_fsck_delete_bkey; unsigned fsck_flags = 0; - if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) + if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; struct printbuf buf = PRINTBUF; va_list args; - prt_str(&buf, "invalid bkey "); + prt_printf(&buf, "invalid bkey in %s btree=", + bch2_bkey_validate_contexts[from.from]); + bch2_btree_id_to_text(&buf, from.btree); + prt_printf(&buf, " level=%u: ", from.level); + bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\n "); va_start(args, fmt); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 8327a3461535..3b278f28e56b 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -153,7 +153,7 @@ enum bch_validate_flags; __printf(5, 6) int __bch2_bkey_fsck_err(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, + struct bkey_validate_context from, enum bch_sb_error_id, const char *, ...); @@ -163,7 +163,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *, */ #define bkey_fsck_err(c, _err_type, _err_msg, ...) \ do { \ - int _ret = __bch2_bkey_fsck_err(c, k, flags, \ + int _ret = __bch2_bkey_fsck_err(c, k, from, \ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ if (_ret != -BCH_ERR_fsck_fix && \ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 83aeceb68847..aa3b88291814 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -178,7 +178,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -186,7 +186,7 @@ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -198,7 +198,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; @@ -212,13 +212,13 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_v2_min_key_bad, "min_key > key"); - if ((flags & BCH_VALIDATE_write) && + if ((from.flags & BCH_VALIDATE_write) && c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) bkey_fsck_err_on(!bp.v->sectors_written, c, btree_ptr_v2_written_0, "sectors_written == 0"); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -405,7 +405,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; @@ -1231,7 +1231,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, static int extent_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, + struct bkey_validate_context from, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata) @@ -1274,7 +1274,7 @@ static int extent_ptr_validate(struct bch_fs *c, } int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1301,7 +1301,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); + ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); if (ret) return ret; @@ -1348,7 +1348,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(crc_is_encoded(crc) && (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), c, ptr_crc_uncompressed_size_too_big, "too large encoded extent"); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index ba33788fee36..620b284aa34f 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,6 @@ struct bch_fs; struct btree_trans; -enum bch_validate_flags; /* extent entries: */ @@ -410,12 +409,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -452,7 +451,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -696,7 +695,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, struct bch_extent_ptr ptr2) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 5c603ab66be0..8818e41883f2 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -429,7 +429,7 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) } static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bch_inode_unpacked unpacked; int ret = 0; @@ -469,7 +469,7 @@ static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; @@ -479,13 +479,13 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; @@ -495,13 +495,13 @@ int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; @@ -519,7 +519,7 @@ int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } @@ -780,7 +780,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index f52336cb298f..927c875976da 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,15 +7,14 @@ #include "opts.h" #include "snapshot.h" -enum bch_validate_flags; extern const char * const bch2_inode_opts[]; int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); @@ -60,7 +59,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 768a3b950997..1627f3e16517 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -327,11 +327,11 @@ static void journal_entry_err_msg(struct printbuf *out, static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - unsigned level, enum btree_id btree_id, struct bkey_i *k, - unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from, + unsigned version, int big_endian) { + enum bch_validate_flags flags = from.flags; int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); int ret = 0; @@ -366,11 +366,10 @@ static int journal_validate_key(struct bch_fs *c, } if (!write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); - ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write); + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -381,7 +380,7 @@ static int journal_validate_key(struct bch_fs *c, goto fsck_err; if (write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: return ret; @@ -394,13 +393,15 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, enum bch_validate_flags flags) { struct bkey_i *k = entry->start; + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .level = entry->level, + .btree = entry->btree_id, + .flags = flags|BCH_VALIDATE_journal, + }; while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, - entry->level, - entry->btree_id, - k, version, big_endian, - flags|BCH_VALIDATE_journal); + int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) continue; else if (ret) @@ -455,8 +456,14 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, flags); + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .level = entry->level + 1, + .btree = entry->btree_id, + .root = true, + .flags = flags, + }; + ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) ret = 0; fsck_err: diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index c18242748ca3..ce794d55818f 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -12,7 +12,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index e6a7d8241bb8..f31a6cf1514c 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -33,7 +33,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 74f45a8162ad..8b857fc33244 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -60,7 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index a62abcc5332a..1551800ff44c 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,10 +5,10 @@ #include "inode.h" #include "quota_types.h" -enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 727e894762f5..e361057ffad4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -569,6 +569,7 @@ static int read_btree_roots(struct bch_fs *c) r->error = 0; ret = bch2_btree_lost_data(c, i); + BUG_ON(ret); } } diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 38db5a011702..e1911b9beb61 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -41,7 +41,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; @@ -89,7 +89,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -98,7 +98,7 @@ int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, "indirect extent above maximum position 0:%llu", REFLINK_P_IDX_MAX); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -128,7 +128,7 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect inline data */ int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index b61a4bdd8e82..f119316adc81 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,9 +2,8 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -enum bch_validate_flags; - -int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, @@ -19,7 +18,8 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 16, \ }) -int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, @@ -34,7 +34,7 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 6a52090485dc..f368270d6d9b 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -32,7 +32,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -225,7 +225,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_snapshot s; u32 i, id; diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 29c94716293e..ae23d45fad66 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -2,11 +2,9 @@ #ifndef _BCACHEFS_SNAPSHOT_H #define _BCACHEFS_SNAPSHOT_H -enum bch_validate_flags; - void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ .key_validate = bch2_snapshot_tree_validate, \ @@ -19,7 +17,8 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index cb45ef769c54..5e5ae405cb28 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -207,7 +207,7 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 07b23dc08614..d53d292c22d7 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,12 +5,11 @@ #include "darray.h" #include "subvolume_types.h" -enum bch_validate_flags; - int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 820c1791545a..aed7c6984173 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 2c96de051f3e..132fbbd15a66 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,8 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ From 502a010a6c8b57555514ff5518adbe6ee5fe6ace Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 24 Nov 2024 21:28:07 -0500 Subject: [PATCH 132/233] bcachefs: Make topology errors autofix These repair paths are well tested, we can repair them without explicit user intervention This also tweaks bch2_topology_error() so that we run topology repair if we're in recovery, not just fsck. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 2 +- fs/bcachefs/recovery.c | 31 +++++++++++++++++++++++++------ fs/bcachefs/sb-errors_format.h | 12 ++++++------ 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 2e8cfc4d3265..19db4d8aca88 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -348,7 +348,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_unreadable, + trans, btree_node_read_error, "Topology repair: unreadable btree node at\n" " %s", buf.buf)) { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index e361057ffad4..64bb330eac86 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -40,19 +40,42 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) int ret = 0; mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (!(c->sb.btrees_lost_data & b)) { struct printbuf buf = PRINTBUF; bch2_btree_id_to_text(&buf, btree); bch_err(c, "flagging btree %s lost data", buf.buf); printbuf_exit(&buf); - bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); + ext->btrees_lost_data |= cpu_to_le64(b); } + /* Once we have runtime self healing for topology errors we won't need this: */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + + /* Btree node accounting will be off: */ + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + +#ifdef CONFIG_BCACHEFS_DEBUG + /* + * These are much more minor, and don't need to be corrected right away, + * but in debug mode we want the next fsck run to be clean: + */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; +#endif + switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; @@ -75,7 +98,6 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) goto out; default: ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; goto out; } out: @@ -748,9 +770,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); - if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 89d9dc2c859b..917ef6aa4a23 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -72,12 +72,12 @@ enum bch_fsck_flags { x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ - x(btree_node_read_error, 62, 0) \ - x(btree_node_topology_bad_min_key, 63, 0) \ - x(btree_node_topology_bad_max_key, 64, 0) \ - x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ - x(btree_node_topology_overwritten_by_next_node, 66, 0) \ - x(btree_node_topology_interior_node_empty, 67, 0) \ + x(btree_node_read_error, 62, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ + x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ From 8afb03592ffda8f215e4b6816f847a32ad2ac53d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 03:00:54 -0500 Subject: [PATCH 133/233] bcachefs: BCH_FS_recovery_running If we're autofixing topology errors, we shouldn't shutdown if we're still in recovery. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/error.c | 2 +- fs/bcachefs/recovery.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index a85b3bcc6383..d88129503bc5 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -614,6 +614,7 @@ struct bch_dev { x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ + x(recovery_running) \ x(fsck_running) \ x(initial_gc_unfixed) \ x(need_delete_dead_snapshots) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 9a695322b33c..5b67361b0cf1 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -34,7 +34,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if (!test_bit(BCH_FS_recovery_running, &c->flags)) { bch2_inconsistent_error(c); return -BCH_ERR_btree_need_topology_repair; } else { diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 64bb330eac86..c50dede64785 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -774,6 +774,7 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) set_bit(BCH_FS_clean_recovery, &c->flags); + set_bit(BCH_FS_recovery_running, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -925,6 +926,7 @@ int bch2_fs_recovery(struct bch_fs *c) */ set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_recovery_running, &c->flags); /* in case we don't run journal replay, i.e. norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); From a5d7cf346646cb3d58221d896ed65224a306bf8f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 21:58:43 -0500 Subject: [PATCH 134/233] bcachefs: Guard against journal seq overflow Wraparound is impractical to handle since in various places we use 0 as a sentinal value - but 64 bits (or 56, because the btree write buffer steals a few bits) is enough for all practical purposes. Reported-by: syzbot+73ed43fbe826227bd4e0@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 9 +++++++++ fs/bcachefs/journal_types.h | 3 +++ 2 files changed, 12 insertions(+) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 95cccda3b22c..dc66521964b7 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -382,6 +382,10 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; + if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, + c, "cannot start: journal seq overflow")) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + BUG_ON(!j->cur_entry_sectors); buf->expires = @@ -1270,6 +1274,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = cur_seq, nr, seq; + if (cur_seq >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + return -EINVAL; + } + genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 425d1abb257e..e9bd716fbb71 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -9,6 +9,9 @@ #include "super_types.h" #include "fifo.h" +/* btree write buffer steals 8 bits for its own purposes: */ +#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) + #define JOURNAL_BUF_BITS 2 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) From 6248d420a98583d960b736bec0fc52d1a4134894 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 22:09:29 -0500 Subject: [PATCH 135/233] bcachefs: Issue a transaction restart after commit in repair transaction commits invalidate pointers to btree values, and they also downgrade intent locks. This breaks the interior btree update path, which takes intent locks and then calls into the allocator. This isn't an ideal solution: we can't unconditionally issue a restart after a transaction commit, because that would break other codepaths. Reported-by: syzbot+78d82470c16a49702682@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 2 +- fs/bcachefs/errcode.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 8846daaa1162..79af226ca609 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1384,7 +1384,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - 1; + -BCH_ERR_transaction_restart_commit; goto out; } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a12050e9c191..a0cfc0f286f4 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -148,6 +148,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(BCH_ERR_transaction_restart, transaction_restart_commit) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ From fbb140ee4560aaf301b11de4e22cf7821ee8ab0b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2024 22:29:54 -0500 Subject: [PATCH 136/233] bcachefs: Guard against backpointers to unknown btrees Reported-by: syzbot+997f0573004dcb964555@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 7 +++++-- fs/bcachefs/sb-errors_format.h | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ff08afd667a0..702bf62d7fa7 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -249,9 +249,12 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree_iter *iter, unsigned iter_flags) { - if (likely(!bp.v->level)) { - struct bch_fs *c = trans->c; + struct bch_fs *c = trans->c; + if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) + return bkey_s_c_null; + + if (likely(!bp.v->level)) { bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 917ef6aa4a23..e73d1c60198e 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -67,7 +67,7 @@ enum bch_fsck_flags { x(btree_node_bkey_past_bset_end, 54, 0) \ x(btree_node_bkey_bad_format, 55, 0) \ x(btree_node_bad_bkey, 56, 0) \ - x(btree_node_bkey_out_of_order, 57, 0) \ + x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ From 5a1b4c8d17569dd893b47115fb744bb860031a28 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 15:10:24 -0500 Subject: [PATCH 137/233] bcachefs: Fix journal_iter list corruption Fix exiting an iterator that wasn't initialized. Reported-by: syzbot+2f7c2225ed8a5cb24af1@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 56a70e95ef9a..5eabd532e388 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -58,6 +58,10 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { printbuf_reset(&buf); @@ -77,11 +81,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (!b->c.level) - return 0; - - bch2_bkey_buf_init(&prev); - bkey_init(&prev.k->k); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + goto out; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) From 1a7e03622b24f52ac9991bf5eb9b5345776d1fb2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:09:04 -0500 Subject: [PATCH 138/233] bcachefs: add missing printbuf_reset() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 19db4d8aca88..e59924cfe2bc 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -521,6 +521,7 @@ int bch2_check_topology(struct bch_fs *c) struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; + printbuf_reset(&buf); bch2_btree_id_to_text(&buf, i); if (r->error) { From b7b7f5ab552920578736a6a25cb370df0096d1df Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:09:15 -0500 Subject: [PATCH 139/233] bcachefs: mark more errors AUTOFIX mark errors as autofix where syzbot has hit the repair paths Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index e73d1c60198e..382fcafa815a 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -124,9 +124,9 @@ enum bch_fsck_flags { x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ x(bucket_sector_count_overflow, 112, 0) \ x(bucket_metadata_type_mismatch, 113, 0) \ - x(need_discard_key_wrong, 114, 0) \ - x(freespace_key_wrong, 115, 0) \ - x(freespace_hole_missing, 116, 0) \ + x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ + x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ + x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ x(bucket_gens_val_size_bad, 117, 0) \ x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ @@ -288,7 +288,7 @@ enum bch_fsck_flags { x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ x(snapshot_node_missing, 264, 0) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ - x(btree_bitmap_not_marked, 266, 0) \ + x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ x(sb_clean_entry_overrun, 267, 0) \ x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ From ca04ac9a4aeaf4a70f9670befc2056fd85c517bb Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:14:06 -0500 Subject: [PATCH 140/233] bcachefs: Don't error out when logging fsck error Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 +++++--- fs/bcachefs/error.c | 29 +++++++++++++++++------------ fs/bcachefs/error.h | 14 +++++++++----- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 79af226ca609..6de0387ede99 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -676,6 +676,10 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, set ? "" : "un", bch2_btree_id_str(btree), buf.buf); + if (ret == -BCH_ERR_fsck_ignore || + ret == -BCH_ERR_fsck_errors_not_fixed) + ret = 0; + printbuf_exit(&buf); return ret; } @@ -1901,10 +1905,8 @@ static int bch2_do_discards_fast_one(struct btree_trans *trans, if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, trans, discarding_bucket_not_in_need_discard_btree, "attempting to discard bucket %u:%llu not in need_discard btree", - ca->dev_idx, bucket)) { - /* log it in the superblock and continue: */ + ca->dev_idx, bucket)) goto out; - } ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); out: diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 5b67361b0cf1..23b9ecbcf3cf 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -227,7 +227,7 @@ int __bch2_fsck_err(struct bch_fs *c, { struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false, inconsistent = false; + bool print = true, suppressing = false, inconsistent = false, exiting = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; @@ -320,13 +320,19 @@ int __bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if ((flags & FSCK_CAN_FIX) && - (flags & FSCK_AUTOFIX) && + if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) { prt_str(out, ", "); - prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + if (flags & FSCK_CAN_FIX) { + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + + goto print; } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { @@ -396,14 +402,13 @@ int __bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - bool exiting = - test_bit(BCH_FS_fsck_running, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore); - - if (exiting) + if (test_bit(BCH_FS_fsck_running, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) { + exiting = true; print = true; - + } +print: if (print) { if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s\n", out->buf); diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 3b278f28e56b..12ca5287e20a 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -45,12 +45,11 @@ int bch2_topology_error(struct bch_fs *); bch2_inconsistent_error(c); \ }) -#define bch2_fs_inconsistent_on(cond, c, ...) \ +#define bch2_fs_inconsistent_on(cond, ...) \ ({ \ bool _ret = unlikely(!!(cond)); \ - \ if (_ret) \ - bch2_fs_inconsistent(c, __VA_ARGS__); \ + bch2_fs_inconsistent(__VA_ARGS__); \ _ret; \ }) @@ -146,8 +145,13 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define log_fsck_err(c, _err_type, ...) \ __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) -#define log_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) +#define log_fsck_err_on(cond, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + if (_ret) \ + log_fsck_err(__VA_ARGS__); \ + _ret; \ +}) enum bch_validate_flags; __printf(5, 6) From 422310542e0699139ceba54439ed022097b55ebd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:25:41 -0500 Subject: [PATCH 141/233] bcachefs: do_fsck_ask_yn() __bch2_fsck_err() is huge, and badly needs more refactoring Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 59 ++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 23b9ecbcf3cf..0517782ca57a 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -219,6 +219,30 @@ static const u8 fsck_flags_extra[] = { #undef x }; +static int do_fsck_ask_yn(struct bch_fs *c, + struct btree_trans *trans, + struct printbuf *question, + const char *action) +{ + prt_str(question, ", "); + prt_str(question, action); + + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", question->buf); + else + bch2_print_string_as_lines(KERN_ERR, question->buf); + + int ask = bch2_fsck_ask_yn(c, trans); + + if (trans) { + int ret = bch2_trans_relock(trans); + if (ret) + return ret; + } + + return ask; +} + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, @@ -291,16 +315,14 @@ int __bch2_fsck_err(struct bch_fs *c, */ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; - mutex_unlock(&c->fsck_error_msgs_lock); - goto err; + goto err_unlock; } kfree(s->last_msg); s->last_msg = kstrdup(buf.buf, GFP_KERNEL); if (!s->last_msg) { - mutex_unlock(&c->fsck_error_msgs_lock); ret = -ENOMEM; - goto err; + goto err_unlock; } if (c->opts.ratelimit_errors && @@ -356,31 +378,18 @@ int __bch2_fsck_err(struct bch_fs *c, : c->opts.fix_errors; if (fix == FSCK_FIX_ask) { - prt_str(out, ", "); - prt_str(out, action); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", out->buf); - else - bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - int ask = bch2_fsck_ask_yn(c, trans); + ret = do_fsck_ask_yn(c, trans, out, action); + if (ret < 0) + goto err_unlock; - if (trans) { - ret = bch2_trans_relock(trans); - if (ret) { - mutex_unlock(&c->fsck_error_msgs_lock); - goto err; - } - } - - if (ask >= YN_ALLNO && s) - s->fix = ask == YN_ALLNO + if (ret >= YN_ALLNO && s) + s->fix = ret == YN_ALLNO ? FSCK_FIX_no : FSCK_FIX_yes; - ret = ask & 1 + ret = ret & 1 ? -BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; } else if (fix == FSCK_FIX_yes || @@ -424,8 +433,6 @@ int __bch2_fsck_err(struct bch_fs *c, if (s) s->ret = ret; - mutex_unlock(&c->fsck_error_msgs_lock); - if (inconsistent) bch2_inconsistent_error(c); @@ -442,6 +449,8 @@ int __bch2_fsck_err(struct bch_fs *c, set_bit(BCH_FS_error, &c->flags); } } +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); err: if (action != action_orig) kfree(action); From 6efc86ff29d2010ce9ffe0d8121ab64c89832a4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 16:59:40 -0500 Subject: [PATCH 142/233] bcachefs: Check for bucket journal seq in the future This fixes an assertion pop in bch2_journal_noflush_seq() - log the error to the superblock and continue instead. Reported-by: syzbot+85700120f75fc10d4e18@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 63 +++++++++++++++++++--------------- fs/bcachefs/sb-errors_format.h | 3 +- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 6de0387ede99..e8c246e5803c 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -926,37 +926,43 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - u64 journal_seq = trans->journal_res.seq; - u64 bucket_journal_seq = new_a->journal_seq; + u64 transaction_seq = trans->journal_res.seq; - if ((flags & BTREE_TRIGGER_insert) && - data_type_is_empty(old_a->data_type) != - data_type_is_empty(new_a->data_type) && - new.k->type == KEY_TYPE_alloc_v4) { - struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; + if (log_fsck_err_on(transaction_seq && new_a->journal_seq > transaction_seq, + trans, alloc_key_journal_seq_in_future, + "bucket journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) + new_a->journal_seq = transaction_seq; - /* - * If the btree updates referring to a bucket weren't flushed - * before the bucket became empty again, then the we don't have - * to wait on a journal flush before we can reuse the bucket: - */ - v->journal_seq = bucket_journal_seq = - data_type_is_empty(new_a->data_type) && - (journal_seq == v->journal_seq || - bch2_journal_noflush_seq(&c->journal, v->journal_seq)) - ? 0 : journal_seq; - } + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); - if (!data_type_is_empty(old_a->data_type) && - data_type_is_empty(new_a->data_type) && - bucket_journal_seq) { - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - bucket_journal_seq); - if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) - goto err; + /* Record journal sequence number of empty -> nonempty transition: */ + if (is_empty_delta < 0) + new_a->journal_seq = max(new_a->journal_seq, transaction_seq); + + /* + * Bucket becomes empty: mark it as waiting for a journal flush, + * unless updates since empty -> nonempty transition were never + * flushed - we may need to ask the journal not to flush + * intermediate sequence numbers: + */ + if (is_empty_delta > 0) { + if (new_a->journal_seq == transaction_seq || + bch2_journal_noflush_seq(&c->journal, new_a->journal_seq)) + new_a->journal_seq = 0; + else { + new_a->journal_seq = transaction_seq; + + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + transaction_seq); + if (bch2_fs_fatal_err_on(ret, c, + "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) + goto err; + } } if (new_a->gen != old_a->gen) { @@ -1004,6 +1010,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, rcu_read_unlock(); } err: +fsck_err: printbuf_exit(&buf); bch2_dev_put(ca); return ret; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 382fcafa815a..8e3a6c5da10d 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -122,6 +122,7 @@ enum bch_fsck_flags { x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ x(bucket_sector_count_overflow, 112, 0) \ x(bucket_metadata_type_mismatch, 113, 0) \ x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ @@ -308,7 +309,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 298, 0) + x(MAX, 299, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From 9d59eb0be24ff7e84dfd8bc80f2da35e819decfe Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 17:48:20 -0500 Subject: [PATCH 143/233] bcachefs: Check for inode journal seq in the future More check and repair code: this fixes a warning in bch2_journal_flush_seq_async() Reported-by: syzbot+d119b445ec739e7f3068@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 35 +++++++++++++++++++++++++++++++--- fs/bcachefs/fsck.c | 13 ++++++++++++- fs/bcachefs/sb-errors_format.h | 3 ++- 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c6fdfec51082..33d0e7080bf6 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* fsync: */ +static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, + u64 *seq) +{ + struct printbuf buf = PRINTBUF; + struct bch_inode_unpacked u; + struct btree_iter iter; + int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); + if (ret) + return ret; + + u64 cur_seq = journal_cur_seq(&trans->c->journal); + *seq = min(cur_seq, u.bi_journal_seq); + + if (fsck_err_on(u.bi_journal_seq > cur_seq, + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + cur_seq, + (bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = cur_seq; + ret = bch2_inode_write(trans, &iter, &u); + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + /* * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an * insert trigger: look up the btree inode instead @@ -180,9 +208,10 @@ static int bch2_flush_inode(struct bch_fs *c, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) return -EROFS; - struct bch_inode_unpacked u; - int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: - bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: + u64 seq; + int ret = bch2_trans_commit_do(c, NULL, NULL, 0, + bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: + bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); bch2_write_ref_put(c, BCH_WRITE_REF_fsync); return ret; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e10abd2e6c69..f2174528ee5f 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1392,7 +1392,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, - "inode %llu%u unlinked and not open", + "inode %llu:%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); @@ -1441,6 +1441,17 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } + + if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = journal_cur_seq(&c->journal); + do_update = true; + } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 8e3a6c5da10d..342eda8ab69f 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -233,6 +233,7 @@ enum bch_fsck_flags { x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ + x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ @@ -309,7 +310,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 299, 0) + x(MAX, 300, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From 1b1f8623fbdcc0fc2ea1c087e146f77e30e94b1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 17:57:55 -0500 Subject: [PATCH 144/233] bcachefs: cryptographic MACs on superblock are not (yet?) supported We should add support for cryptographic macs on the superblock - and it won't be hard, but it'll need an incompatible feature bit (and we have a new incompatible feature versioning scheme coming). For now, just add a guard to avoid a dull ptr deref in gen_poly_key(). Reported-by: syzbot+dd3d9835055dacb66f35@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 4c29f8215d54..6a086c1c4b14 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -677,7 +677,8 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf } enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); - if (csum_type >= BCH_CSUM_NR) { + if (csum_type >= BCH_CSUM_NR || + bch2_csum_type_is_encryption(csum_type)) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -BCH_ERR_invalid_sb_csum_type; } From 4271983e79a7da2a3eb02eeee4cdfe35d3c7bc99 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 18:05:06 -0500 Subject: [PATCH 145/233] bcachefs: bch2_trans_relock() is trylock for lockdep fix some spurious lockdep splats Reported-by: syzbot+e088be3c2d5c05aaac35@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 8 ++++---- fs/bcachefs/btree_locking.c | 2 +- fs/bcachefs/btree_locking.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 80c3b55ce763..9c54891c737a 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1007,7 +1007,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) bch2_trans_unlock(trans); cond_resched(); - trans_set_locked(trans); + trans_set_locked(trans, false); if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -3248,7 +3248,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; - trans_set_locked(trans); + trans_set_locked(trans, false); if (trans->restarted) { bch2_btree_path_traverse_all(trans); @@ -3354,7 +3354,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; - trans_set_locked(trans); + trans_set_locked(trans, false); closure_init_stack_release(&trans->ref); return trans; @@ -3622,7 +3622,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) #ifdef CONFIG_LOCKDEP fs_reclaim_acquire(GFP_KERNEL); struct btree_trans *trans = bch2_trans_get(c); - trans_set_locked(trans); + trans_set_locked(trans, false); bch2_trans_put(trans); fs_reclaim_release(GFP_KERNEL); #endif diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index efe2a007b482..d343df9f0ad2 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -782,7 +782,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) return bch2_trans_relock_fail(trans, path, &f, trace); } - trans_set_locked(trans); + trans_set_locked(trans, true); out: bch2_trans_verify_locks(trans); return 0; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index ca4aeefd631e..7474ab6ce019 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -188,10 +188,10 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ -static inline void trans_set_locked(struct btree_trans *trans) +static inline void trans_set_locked(struct btree_trans *trans, bool try) { if (!trans->locked) { - lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_); + lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); trans->locked = true; trans->last_unlock_ip = 0; From 12b2baa0b5cb956abfd3ef05a1cc6ed21b565006 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 19:02:18 -0500 Subject: [PATCH 146/233] bcachefs: Check for extent crc uncompressed/compressed size mismatch When not compressed, these must be equal - this fixes an assertion pop in bch2_rechecksum_bio(). Reported-by: syzbot+50d3544c9b8db9c99fd2@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/extents.c | 22 +++++++++++++--------- fs/bcachefs/sb-errors_format.h | 5 +++-- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index aa3b88291814..2fc9ace5533c 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1323,9 +1323,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, - c, ptr_crc_uncompressed_size_too_small, - "checksum offset + key size > uncompressed size"); bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, ptr_crc_csum_type_unknown, "invalid checksum type"); @@ -1333,6 +1330,19 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, c, ptr_crc_compression_type_unknown, "invalid compression type"); + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + c, ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + bkey_fsck_err_on(!crc_is_compressed(crc) && + crc.compressed_size != crc.uncompressed_size, + c, ptr_crc_uncompressed_size_mismatch, + "not compressed but compressed != uncompressed size"); + if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; @@ -1346,12 +1356,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, "redundant crc entry"); crc_since_last_ptr = true; - bkey_fsck_err_on(crc_is_encoded(crc) && - (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), - c, ptr_crc_uncompressed_size_too_big, - "too large encoded extent"); - size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 342eda8ab69f..3bbda181f314 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -172,10 +172,11 @@ enum bch_fsck_flags { x(ptr_bucket_data_type_mismatch, 155, 0) \ x(ptr_cached_and_erasure_coded, 156, 0) \ x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ x(ptr_crc_csum_type_unknown, 158, 0) \ x(ptr_crc_compression_type_unknown, 159, 0) \ x(ptr_crc_redundant, 160, 0) \ - x(ptr_crc_uncompressed_size_too_big, 161, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ @@ -310,7 +311,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 300, 0) + x(MAX, 301, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, From 5bb89aa54d003da949edb90ce6e399e25d7ba2bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Nov 2024 19:30:23 -0500 Subject: [PATCH 147/233] bcachefs: Don't recurse in check_discard_freespace_key When calling check_discard_freeespace_key from the allocator, we can't repair without recursing - run it asynchronously instead. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 72 ++++++++++++++++++++++++++++++---- fs/bcachefs/alloc_background.h | 2 +- fs/bcachefs/alloc_foreground.c | 2 +- fs/bcachefs/bcachefs.h | 1 + 4 files changed, 67 insertions(+), 10 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index e8c246e5803c..b2d570453351 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -1338,7 +1338,40 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, return ret; } -int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen) +struct check_discard_freespace_key_async { + struct work_struct work; + struct bch_fs *c; + struct bbpos pos; +}; + +static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + u8 gen; + ret = k.k->type != KEY_TYPE_set + ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) + : 0; + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void check_discard_freespace_key_work(struct work_struct *work) +{ + struct check_discard_freespace_key_async *w = + container_of(work, struct check_discard_freespace_key_async, work); + + bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); + bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + kfree(w); +} + +int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, + bool async_repair) { struct bch_fs *c = trans->c; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard @@ -1351,7 +1384,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite u64 genbits = iter->pos.offset & (~0ULL << 56); struct btree_iter alloc_iter; - struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, BTREE_ITER_cached); + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, BTREE_ITER_cached); int ret = bkey_err(alloc_k); if (ret) return ret; @@ -1392,17 +1426,39 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; - goto out; + if (!async_repair) { + ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_commit; + goto out; + } else { + /* + * We can't repair here when called from the allocator path: the + * commit will recurse back into the allocator + */ + struct check_discard_freespace_key_async *w = + kzalloc(sizeof(*w), GFP_KERNEL); + if (!w) + goto out; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + kfree(w); + goto out; + } + + INIT_WORK(&w->work, check_discard_freespace_key_work); + w->c = c; + w->pos = BBPOS(iter->btree_id, iter->pos); + queue_work(c->write_ref_wq, &w->work); + goto out; + } } static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) { u8 gen; - int ret = bch2_check_discard_freespace_key(trans, iter, &gen); + int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); return ret < 0 ? ret : 0; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 8cacddd188f4..de25ba4ee94b 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -310,7 +310,7 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); -int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *); +int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 4d1ff7f1f302..c40a76df76b8 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -281,7 +281,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); u8 gen; - int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen); + int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); if (ret < 0) return ERR_PTR(ret); if (ret) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index d88129503bc5..c16937e54734 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -680,6 +680,7 @@ struct btree_trans_buf { x(dio_write) \ x(discard) \ x(discard_fast) \ + x(check_discard_freespace_key) \ x(invalidate) \ x(delete_dead_snapshots) \ x(gc_gens) \ From 9a956407c26a40a289626d58669c62bd3417e368 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 29 Nov 2024 14:38:27 +0800 Subject: [PATCH 148/233] bcachefs: Add missing parameter description to bch2_bucket_alloc_trans() The function bch2_bucket_alloc_trans() lacked a description for the nowait parameter in its documentation comment block. This patch adds the missing description to ensure all parameters are properly documented. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=12179 Signed-off-by: Yang Li Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index c40a76df76b8..095bfe7c53bd 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -505,6 +505,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * @watermark: how important is this allocation? * @data_type: BCH_DATA_journal, btree, user... * @cl: if not NULL, closure to be used to wait if buckets not available + * @nowait: if true, do not wait for buckets to become available * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. From 428a2c2d6b128c18f3dcd289f549bf510933679a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 21:12:47 -0500 Subject: [PATCH 149/233] bcachefs: Fix fsck.c build in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f2174528ee5f..cc15ff135cd6 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3206,6 +3206,8 @@ int bch2_fix_reflink_p(struct bch_fs *c) return ret; } +#ifndef NO_BCACHEFS_CHARDEV + struct fsck_thread { struct thread_with_stdio thr; struct bch_fs *c; @@ -3421,3 +3423,5 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) } return ret; } + +#endif /* NO_BCACHEFS_CHARDEV */ From 2ab8d3198995c8db970dd9b4716d5acba215d48b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 28 Sep 2024 15:40:49 -0400 Subject: [PATCH 150/233] bcachefs: bch2_inum_to_path() Add a function for walking backpointers to find a path from a given inode number, and convert various error messages to use it. Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/error.c | 34 +++++++ fs/bcachefs/error.h | 6 ++ fs/bcachefs/fs-common.c | 81 +++++++++++++++ fs/bcachefs/fs-common.h | 2 + fs/bcachefs/fs-io-buffered.c | 10 +- fs/bcachefs/fsck.c | 12 ++- fs/bcachefs/io_misc.c | 12 ++- fs/bcachefs/io_read.c | 185 +++++++++++++++++++++++++---------- 9 files changed, 279 insertions(+), 64 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a0cfc0f286f4..47387f7d6202 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -116,6 +116,7 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 0517782ca57a..abaa9570cd62 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,6 +3,7 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" +#include "fs-common.h" #include "journal.h" #include "recovery_passes.h" #include "super.h" @@ -515,3 +516,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } + +int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + /* XXX: we don't yet attempt to print paths when we don't know the subvol */ + if (inum.subvol) + ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + + return trans_was_restarted(trans, restart_count); +} + +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + int ret = bch2_inum_err_msg_trans(trans, out, inum); + prt_printf(out, " offset %llu: ", offset); + return ret; +} + +void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +{ + bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); +} + +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 12ca5287e20a..7acf2a27ca28 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -238,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); _ret; \ }) +int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); +int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); + +void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); +void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 7e10a9ddcfd9..dcaa47f68f31 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -548,3 +548,84 @@ int bch2_rename_trans(struct btree_trans *trans, bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } + +static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) +{ + bch2_printbuf_make_room(out, n); + + unsigned can_print = min(n, printbuf_remaining(out)); + + b += n; + + for (unsigned i = 0; i < can_print; i++) + out->buf[out->pos++] = *((char *) --b); + + printbuf_nul_terminate(out); +} + +static inline void reverse_bytes(void *b, size_t n) +{ + char *e = b + n, *s = b; + + while (s < e) { + --e; + swap(*s, *e); + s++; + } +} + +/* XXX: we don't yet attempt to print paths when we don't know the subvol */ +int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +{ + unsigned orig_pos = path->pos; + int ret = 0; + + while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && + inum.inum == BCACHEFS_ROOT_INO)) { + struct bch_inode_unpacked inode; + ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + if (ret) + goto err; + + if (!inode.bi_dir && !inode.bi_dir_offset) { + ret = -BCH_ERR_ENOENT_inode_no_backpointer; + goto err; + } + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + struct btree_iter d_iter; + struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, + BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), + 0, dirent); + ret = bkey_err(d.s_c); + if (ret) + goto err; + + struct qstr dirent_name = bch2_dirent_get_name(d); + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); + + prt_char(path, '/'); + + if (d.v->d_type == DT_SUBVOL) + inum.subvol = le32_to_cpu(d.v->d_parent_subvol); + inum.inum = d.k->p.inode; + + bch2_trans_iter_exit(trans, &d_iter); + } + + if (orig_pos == path->pos) + prt_char(path, '/'); + + ret = path->allocation_failure ? -ENOMEM : 0; + if (ret) + goto err; + + reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); + return 0; +err: + return ret; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index c934e807b380..2b59210bb5e8 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -42,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *, bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); +int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); + #endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index d55e215e8aa6..ff8b8df50bf3 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -231,10 +231,12 @@ static void bchfs_read(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index cc15ff135cd6..1a5a07112779 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -212,6 +212,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); + struct btree_iter lostfound_iter = { NULL }; u64 inum = 0; unsigned d_type = 0; int ret; @@ -290,11 +291,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * XXX: we could have a nicer log message here if we had a nice way to * walk backpointers to print a path */ - bch_notice(c, "creating lost+found in subvol %llu snapshot %u", - root_inum.subvol, le32_to_cpu(st.root_snapshot)); + struct printbuf path = PRINTBUF; + ret = bch2_inum_to_path(trans, root_inum, &path); + if (ret) + goto err; + + bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", + path.buf, root_inum.subvol, snapshot); + printbuf_exit(&path); u64 now = bch2_current_time(c); - struct btree_iter lostfound_iter = { NULL }; u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 524e31e7411b..5353979117b0 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans, err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) - bch_err_inum_offset_ratelimited(c, - inum.inum, - iter->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + if (should_print_err(ret)) { + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 4b6b6d25725b..34a3569d085a 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -322,6 +322,20 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, /* Read */ +static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + return bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { rbio->subvol, read_pos.inode }, + read_pos.offset << 9); +} + +static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); +} + #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -500,6 +514,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } +static void bch2_read_io_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bio *bio = &rbio->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); + + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_read); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); +} + static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { @@ -563,6 +600,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) __bch2_rbio_narrow_crcs(trans, rbio)); } +static void bch2_read_csum_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bio *src = &rbio->bio; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "data "); + bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decompress_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decompression error"); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decrypt_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decrypt error"); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); +} + /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { @@ -669,33 +773,13 @@ static void __bch2_read_endio(struct work_struct *work) goto out; } - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data %s", buf.buf); - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - } - printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; } @@ -716,16 +800,8 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bio->bi_status) { - if (ca) { - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status)); - bch2_io_error(ca, BCH_MEMBER_ERROR_read); - } - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + if (unlikely(bio->bi_status)) { + bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); return; } @@ -832,25 +908,22 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (unlikely(pick_ret < 0)) { struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); bch2_bkey_val_to_text(&buf, c, k); - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from: %s\n %s", - bch2_err_str(pick_ret), - buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); goto err; } if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); bch2_bkey_val_to_text(&buf, c, k); - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "attempting to read encrypted data without encryption key\n %s", - buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); goto err; } @@ -1036,11 +1109,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); + if (unlikely(!rbio->have_ioref)) { + struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); + prt_printf(&buf, "no device to read from:\n "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -1202,16 +1279,20 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, } bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); } + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_read_exit(struct bch_fs *c) From 67434cd4b78571347f6e72f6e083a7ed2629ca1b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Nov 2024 23:08:57 -0500 Subject: [PATCH 151/233] bcachefs: Convert write path errors to inum_to_path() Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 91 +++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index f97ebb30f6c0..bae045e76055 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -396,6 +396,21 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ +static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, + u64 offset) +{ + bch2_inum_offset_err_msg(op->c, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + +static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +{ + __bch2_write_op_error(out, op, op->pos.offset); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -532,14 +547,14 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); - if (ret && !bch2_err_matches(ret, EROFS)) { + if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "%s write error while doing btree update: %s", - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) @@ -1081,11 +1096,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, *_dst = dst; return more; csum_err: - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", - op->flags & BCH_WRITE_MOVE ? "move" : "user"); + { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1176,11 +1194,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "%s write error while doing btree update: %s", - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) { @@ -1340,17 +1358,19 @@ static void bch2_nocow_write(struct bch_write_op *op) if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_trans_put(trans); + darray_exit(&buckets); + if (ret) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, op->pos.offset << 9, - "%s: btree lookup error %s", __func__, bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); op->error = ret; op->flags |= BCH_WRITE_SUBMITTED; } - bch2_trans_put(trans); - darray_exit(&buckets); - /* fallback to cow write path? */ if (!(op->flags & BCH_WRITE_SUBMITTED)) { closure_sync(&op->cl); @@ -1463,14 +1483,14 @@ static void __bch2_write(struct bch_write_op *op) if (ret <= 0) { op->flags |= BCH_WRITE_SUBMITTED; - if (ret < 0) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s(): %s error: %s", __func__, - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); + if (unlikely(ret < 0)) { + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } op->error = ret; break; } @@ -1596,12 +1616,11 @@ CLOSURE_CALLBACK(bch2_write) bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s write error: misaligned write", - op->flags & BCH_WRITE_MOVE ? "move" : "user"); + if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "misaligned write"); + printbuf_exit(&buf); op->error = -EIO; goto err; } From 5e415199384c50c686f34275a94dd0f831ed480d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 19:13:54 -0500 Subject: [PATCH 152/233] bcachefs: list_pop_entry() Signed-off-by: Kent Overstreet --- fs/bcachefs/ec.c | 6 ++---- fs/bcachefs/io_write.c | 4 +--- fs/bcachefs/util.h | 13 +++++++++++++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 7d6c33f04092..250e73897d95 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -2465,11 +2465,9 @@ void bch2_fs_ec_exit(struct bch_fs *c) while (1) { mutex_lock(&c->ec_stripe_head_lock); - h = list_first_entry_or_null(&c->ec_stripe_head_list, - struct ec_stripe_head, list); - if (h) - list_del(&h->list); + h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); mutex_unlock(&c->ec_stripe_head_lock); + if (!h) break; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index bae045e76055..3e71860f66b9 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -637,9 +637,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) while (1) { spin_lock_irq(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op) - list_del(&op->wp_list); + op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); wp_update_state(wp, op != NULL); spin_unlock_irq(&wp->writes_lock); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fb02c1c36004..5e4820c8fa44 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -317,6 +317,19 @@ do { \ _ptr ? container_of(_ptr, type, member) : NULL; \ }) +static inline struct list_head *list_pop(struct list_head *head) +{ + if (list_empty(head)) + return NULL; + + struct list_head *ret = head->next; + list_del_init(ret); + return ret; +} + +#define list_pop_entry(head, type, member) \ + container_of_or_null(list_pop(head), type, member) + /* Does linear interpolation between powers of two */ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) { From 28d5570cd27b3ec683df66093c223751371c95a7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 18:20:42 -0500 Subject: [PATCH 153/233] bcachefs: bkey_fsck_err now respects errors_silent Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index abaa9570cd62..9e34374960f3 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -476,11 +476,16 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, return -BCH_ERR_fsck_delete_bkey; unsigned fsck_flags = 0; - if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) + if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { + if (test_bit(err, c->sb.errors_silent)) + return -BCH_ERR_fsck_delete_bkey; + fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; + } + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + fsck_flags |= fsck_flags_extra[err]; struct printbuf buf = PRINTBUF; - va_list args; prt_printf(&buf, "invalid bkey in %s btree=", bch2_bkey_validate_contexts[from.from]); @@ -489,9 +494,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\n "); + + va_list args; va_start(args, fmt); prt_vprintf(&buf, fmt, args); va_end(args); + prt_str(&buf, ": delete?"); int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); From 247a12f3a216cb2368d596a152f0c32c2cb9d9c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 18:17:00 -0500 Subject: [PATCH 154/233] bcachefs: If we did repair on a btree node, make sure we rewrite it Ensure that "invalid bkey" repair gets persisted, so that it doesn't repeatedly spam the logs. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index eedcb2445b99..9df9fc1c5e2b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -997,6 +997,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, got_good_key: le16_add_cpu(&i->u64s, -next_good_key); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_node_need_rewrite(b); } fsck_err: printbuf_exit(&buf); @@ -1259,6 +1260,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); + set_btree_node_need_rewrite(b); continue; } if (ret) @@ -1372,15 +1374,18 @@ static void btree_node_read_work(struct work_struct *work) rb->start_time); bio_put(&rb->bio); - if (saw_error && + if ((saw_error || + btree_node_need_rewrite(b)) && !btree_node_read_error(b) && c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", - __func__, buf.buf); + if (saw_error) { + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", + __func__, buf.buf); + } bch2_btree_node_rewrite_async(c, b); } From 0b5819b73c40535daa75a83347e6aaf5ce32ea55 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 18:53:26 -0500 Subject: [PATCH 155/233] bcachefs: bch2_async_btree_node_rewrites_flush() Add a method to flush btree node rewrites at the end of recovery, to ensure that corrected errors are persisted. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 7 +- fs/bcachefs/btree_update_interior.c | 153 ++++++++++++++++------------ fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/recovery.c | 2 + 4 files changed, 97 insertions(+), 66 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index c16937e54734..b12c9c78beec 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -829,9 +829,10 @@ struct bch_fs { struct work_struct btree_interior_update_work; struct workqueue_struct *btree_node_rewrite_worker; - - struct list_head pending_node_rewrites; - struct mutex pending_node_rewrites_lock; + struct list_head btree_node_rewrites; + struct list_head btree_node_rewrites_pending; + spinlock_t btree_node_rewrites_lock; + struct closure_waitlist btree_node_rewrites_wait; /* btree_io.c: */ spinlock_t btree_write_error_lock; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 5eabd532e388..f2a1d5d3d8d5 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2206,42 +2206,50 @@ struct async_btree_rewrite { struct list_head list; enum btree_id btree_id; unsigned level; - struct bpos pos; - __le64 seq; + struct bkey_buf key; }; static int async_btree_node_rewrite_trans(struct btree_trans *trans, struct async_btree_rewrite *a) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct btree *b; - int ret; - - bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + bch2_trans_node_iter_init(trans, &iter, + a->btree_id, a->key.k->k.p, BTREE_MAX_DEPTH, a->level, 0); - b = bch2_btree_iter_peek_node(&iter); - ret = PTR_ERR_OR_ZERO(b); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; - if (!b || b->data->keys.seq != a->seq) { + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); + ret = found + ? bch2_btree_node_rewrite(trans, &iter, b, 0) + : -ENOENT; + +#if 0 + /* Tracepoint... */ + if (!ret || ret == -ENOENT) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null"); - bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", - __func__, a->seq, buf.buf); + if (!ret) { + prt_printf(&buf, "rewrite node:\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + } else { + prt_printf(&buf, "node to rewrite not found:\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + prt_printf(&buf, "\n got: "); + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null)"); + } + bch_info(c, "%s", buf.buf); printbuf_exit(&buf); - goto out; } - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +#endif out: bch2_trans_iter_exit(trans, &iter); - return ret; } @@ -2252,81 +2260,96 @@ static void async_btree_node_rewrite_work(struct work_struct *work) struct bch_fs *c = a->c; int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); - bch_err_fn_ratelimited(c, ret); + if (ret != -ENOENT) + bch_err_fn_ratelimited(c, ret); + + spin_lock(&c->btree_node_rewrites_lock); + list_del(&a->list); + spin_unlock(&c->btree_node_rewrites_lock); + + closure_wake_up(&c->btree_node_rewrites_wait); + + bch2_bkey_buf_exit(&a->key, c); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { - struct async_btree_rewrite *a; - int ret; - - a = kmalloc(sizeof(*a), GFP_NOFS); - if (!a) { - bch_err(c, "%s: error allocating memory", __func__); + struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); + if (!a) return; - } a->c = c; a->btree_id = b->c.btree_id; a->level = b->c.level; - a->pos = b->key.k.p; - a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - mutex_lock(&c->pending_node_rewrites_lock); - list_add(&a->list, &c->pending_node_rewrites); - mutex_unlock(&c->pending_node_rewrites_lock); - return; + bch2_bkey_buf_init(&a->key); + bch2_bkey_buf_copy(&a->key, c, &b->key); + + bool now = false, pending = false; + + spin_lock(&c->btree_node_rewrites_lock); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + list_add(&a->list, &c->btree_node_rewrites); + now = true; + } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + list_add(&a->list, &c->btree_node_rewrites_pending); + pending = true; } + spin_unlock(&c->btree_node_rewrites_lock); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_started, &c->flags)) { - bch_err(c, "%s: error getting c->writes ref", __func__); - kfree(a); - return; - } - - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "going read-write"); - if (ret) { - kfree(a); - return; - } - - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + if (now) { + queue_work(c->btree_node_rewrite_worker, &a->work); + } else if (pending) { + /* bch2_do_pending_node_rewrites will execute */ + } else { + bch2_bkey_buf_exit(&a->key, c); + kfree(a); } +} - queue_work(c->btree_node_rewrite_worker, &a->work); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) +{ + closure_wait_event(&c->btree_node_rewrites_wait, + list_empty(&c->btree_node_rewrites)); } void bch2_do_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + if (a) + list_add(&a->list, &c->btree_node_rewrites); + spin_unlock(&c->btree_node_rewrites_lock); - mutex_lock(&c->pending_node_rewrites_lock); - list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - list_del(&a->list); + if (!a) + break; bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } - mutex_unlock(&c->pending_node_rewrites_lock); } void bch2_free_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + spin_unlock(&c->btree_node_rewrites_lock); - mutex_lock(&c->pending_node_rewrites_lock); - list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - list_del(&a->list); + if (!a) + break; + bch2_bkey_buf_exit(&a->key, c); kfree(a); } - mutex_unlock(&c->pending_node_rewrites_lock); } static int __bch2_btree_node_update_key(struct btree_trans *trans, @@ -2683,6 +2706,9 @@ void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + WARN_ON(!list_empty(&c->btree_node_rewrites)); + WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); + if (c->btree_node_rewrite_worker) destroy_workqueue(c->btree_node_rewrite_worker); if (c->btree_interior_update_worker) @@ -2698,8 +2724,9 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) mutex_init(&c->btree_interior_update_lock); INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); - INIT_LIST_HEAD(&c->pending_node_rewrites); - mutex_init(&c->pending_node_rewrites_lock); + INIT_LIST_HEAD(&c->btree_node_rewrites); + INIT_LIST_HEAD(&c->btree_node_rewrites_pending); + spin_lock_init(&c->btree_node_rewrites_lock); } int bch2_fs_btree_interior_update_init(struct bch_fs *c) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 1c6cf3e2e6a9..7930ffea3075 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -334,6 +334,7 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *, unsigned long); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *); void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index c50dede64785..a342744fd275 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -931,6 +931,8 @@ int bch2_fs_recovery(struct bch_fs *c) /* in case we don't run journal replay, i.e. norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); + bch2_async_btree_node_rewrites_flush(c); + /* fsync if we fixed errors */ if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_journal_flush_all_pins(&c->journal); From a5b377f77372811d1d080f97c5d9b05e8c887435 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 30 Nov 2024 23:27:45 -0500 Subject: [PATCH 156/233] bcachefs: fix bch2_journal_key_insert_take() seq Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_journal_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index de3db161d6ab..6d25e3f85ce8 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -259,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, * Ensure these keys are done last by journal replay, to unblock * journal reclaim: */ - .journal_seq = U32_MAX, + .journal_seq = U64_MAX, }; struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); From 030d6ebb78879e795f75533755f2b7b656806165 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 16:39:54 -0500 Subject: [PATCH 157/233] bcachefs: Improve "unable to allocate journal write" message Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 1627f3e16517..bb69d80886b5 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2036,8 +2036,9 @@ CLOSURE_CALLBACK(bch2_journal_write) struct printbuf buf = PRINTBUF; buf.atomic++; - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), bch2_err_str(ret)); __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); From e37f4286d41e5e8ddddd1b0716a08c0395deaf4a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 2 Dec 2024 23:36:38 -0500 Subject: [PATCH 158/233] bcachefs: Fix allocating too big journal entry The "journal space available" calculations didn't take into account mismatched bucket sizes; we need to take the minimum space available out of our devices. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 1aabbbe328d9..b7936ad3ae7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -140,6 +140,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + unsigned min_bucket_size = U32_MAX; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); @@ -148,6 +149,8 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne if (!ca->journal.nr) continue; + min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); + space = journal_dev_space_available(j, ca, from); if (!space.next_entry) continue; @@ -167,7 +170,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ - return dev_space[nr_devs_want - 1]; + space = dev_space[nr_devs_want - 1]; + space.next_entry = min(space.next_entry, min_bucket_size); + return space; } void bch2_journal_space_available(struct journal *j) From 8a2713582e65edd634af55042a1580d3065e1ae9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 3 Dec 2024 17:40:10 +0100 Subject: [PATCH 159/233] bcachefs: BCACHEFS_PATH_TRACEPOINTS should depend on TRACING When tracing is disabled, there is no point in asking the user about enabling extra btree_path tracepoints in bcachefs. Fixes: 32ed4a620c5405be ("bcachefs: Btree path tracepoints") Signed-off-by: Geert Uytterhoeven Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5bac803ea367..e8549d04dcb8 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -89,7 +89,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN config BCACHEFS_PATH_TRACEPOINTS bool "Extra btree_path tracepoints" - depends on BCACHEFS_FS + depends on BCACHEFS_FS && TRACING help Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. From 354ae858ba2d7fa5f387dddcfe12cdf7810217a9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Dec 2024 21:22:26 -0500 Subject: [PATCH 160/233] bcachefs: rcu_pending now works in userspace Introduce a typedef to handle the difference between unsigned long/struct urcu_gp_poll_state. Signed-off-by: Kent Overstreet --- fs/bcachefs/rcu_pending.c | 40 ++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index 67522aa344a7..bef2aa1b8bcd 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -25,21 +25,37 @@ enum rcu_pending_special { #define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) #define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) -static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) +#ifdef __KERNEL__ +typedef unsigned long rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l == r; +} +#else +typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l.grace_period_id == r.grace_period_id; +} +#endif + +static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? get_state_synchronize_srcu(ssp) : get_state_synchronize_rcu(); } -static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) +static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? start_poll_synchronize_srcu(ssp) : start_poll_synchronize_rcu(); } -static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) { return ssp ? poll_state_synchronize_srcu(ssp, cookie) @@ -71,13 +87,13 @@ struct rcu_pending_seq { GENRADIX(struct rcu_head *) objs; size_t nr; struct rcu_head **cursor; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_list { struct rcu_head *head; struct rcu_head *tail; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_pcpu { @@ -316,10 +332,10 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu) } static __always_inline struct rcu_pending_seq * -get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) +get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) { darray_for_each_reverse(p->objs, objs) - if (objs->seq == seq) + if (rcu_gp_poll_cookie_eq(objs->seq, seq)) return objs; if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) @@ -329,7 +345,7 @@ get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) } static noinline bool -rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, struct rcu_head *head, void *ptr, unsigned long *flags) { @@ -364,7 +380,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, again: for (struct rcu_pending_list *i = p->lists; i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (i->seq == seq) { + if (rcu_gp_poll_cookie_eq(i->seq, seq)) { rcu_pending_list_add(i, head); return false; } @@ -408,7 +424,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, struct rcu_pending_pcpu *p; struct rcu_pending_seq *objs; struct genradix_node *new_node = NULL; - unsigned long seq, flags; + unsigned long flags; bool start_gp = false; BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); @@ -416,7 +432,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, local_irq_save(flags); p = this_cpu_ptr(pending->p); spin_lock(&p->lock); - seq = __get_state_synchronize_rcu(pending->srcu); + rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && unlikely(process_finished_items(pending, p, flags))) @@ -478,9 +494,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, */ if (!p->cb_armed) { p->cb_armed = true; - spin_unlock_irqrestore(&p->lock, flags); __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb); - goto free_node; } else { __start_poll_synchronize_rcu(pending->srcu); } From 3ac87fa03f2ff6527539a7ec7be84488813841da Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 21:35:11 -0500 Subject: [PATCH 161/233] bcachefs: logged ops only use inum 0 of logged ops btree we wish to use the logged ops btree for other items that aren't strictly logged ops: cursors for inode allocation There's no reason to create another cached btree for inode allocator cursors - so reserve different parts of the keyspace for different purposes. Older versions will ignore or delete the cursors. Signed-off-by: Kent Overstreet --- fs/bcachefs/logged_ops.c | 10 +++++----- fs/bcachefs/logged_ops_format.h | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 60e00702d1a4..1ac51af16299 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -63,8 +63,9 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, int bch2_resume_logged_ops(struct bch_fs *c) { int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_logged_ops, POS_MIN, + for_each_btree_key_max(trans, iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -74,9 +75,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; - int ret; - - ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX)); if (ret) return ret; diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 6a4bf7129dba..0b370a963ac6 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H +#define LOGGED_OPS_INUM 0 + struct bch_logged_op_truncate { struct bch_val v; __le32 subvol; From 52a0da6fcd27f7fd8ab591735479e135c3cda3af Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 3 Dec 2024 22:03:18 -0500 Subject: [PATCH 162/233] bcachefs: Simplify disk accounting validate late The validate late path was iterating over accounting entries in eytzinger order, which is unnecessarily tricky when we may have to remove entries. Signed-off-by: Kent Overstreet --- fs/bcachefs/darray.h | 2 +- fs/bcachefs/disk_accounting.c | 28 ++++++++++++++-------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 8f4c3f0665c4..c6151495985f 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 71c49a7ee2fe..a915d9dc8de4 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -765,15 +765,16 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); - unsigned i = 0; - while (i < acc->k.nr) { - unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); + bpos_to_disk_accounting_pos(&acc_k, i->pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); + memset(v, 0, sizeof(v)); + + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); /* * If the entry counters are zeroed, it should be treated as @@ -782,26 +783,25 @@ int bch2_accounting_read(struct bch_fs *c) * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ - ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, - v, acc->k.data[idx].nr_counters); + : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(acc->k.data[idx].v[0]); - free_percpu(acc->k.data[idx].v[1]); - darray_remove_item(&acc->k, &acc->k.data[idx]); - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); ret = 0; continue; } if (ret) goto fsck_err; - i++; } + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); From fdfafffb03e233615fbe7ba3fab3d7b0bbe78bd9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 01:19:28 -0500 Subject: [PATCH 163/233] bcachefs: Advance to next bp on BCH_ERR_backpointer_to_overwritten_btree_node Don't spin. Fixes: de95cc201a97 ("bcachefs: Kill bch2_get_next_backpointer()") Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 460175464762..6f21e36d89f7 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -785,7 +785,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, b = bch2_backpointer_get_node(trans, bp, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - continue; + goto next; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) From 2a21c9dea9ea0babf4adcbab544204ae50ae8164 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:44:25 -0500 Subject: [PATCH 164/233] bcachefs: trace_accounting_mem_insert Add a tracepoint for inserting new accounting entries: we're seeing odd spinning behaviour in accounting read. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 8 ++++++++ fs/bcachefs/trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a915d9dc8de4..a0061bcf9159 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -324,6 +324,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); + printbuf_exit(&buf); + } return 0; err: free_percpu(n.v[1]); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 2d5932d2881e..7baf66beee22 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* disk_accounting.c */ + +TRACE_EVENT(accounting_mem_insert, + TP_PROTO(struct bch_fs *c, const char *acc), + TP_ARGS(c, acc), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned, new_nr ) + __string(acc, acc ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->new_nr = c->accounting.k.nr; + __assign_str(acc); + ), + + TP_printk("%d,%d entries %u added %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_nr, + __get_str(acc)) +); + /* fs.c: */ TRACE_EVENT(bch2_sync_fs, TP_PROTO(struct super_block *sb, int wait), From fe236881929435323d0f4f144e1c3807d443c60c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:48:06 -0500 Subject: [PATCH 165/233] bcachefs: Silence "unable to allocate journal write" if we're already RO Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index bb69d80886b5..e7a43400a587 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -2032,7 +2032,7 @@ CLOSURE_CALLBACK(bch2_journal_write) bch2_journal_do_discards(j); } - if (ret) { + if (ret && !bch2_journal_error(j)) { struct printbuf buf = PRINTBUF; buf.atomic++; @@ -2044,8 +2044,9 @@ CLOSURE_CALLBACK(bch2_journal_write) spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - goto err; } + if (ret) + goto err; /* * write is allocated, no longer need to account for it in From fd2a164b5c5fd7b690cc98c313d0f1512e0d2647 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 17:53:38 -0500 Subject: [PATCH 166/233] bcachefs: BCH_ERR_insufficient_journal_devices kill another standard error code use Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 1 + fs/bcachefs/journal_io.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 47387f7d6202..5e4dd85ac669 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -263,6 +263,7 @@ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ + x(EIO, insufficient_journal_devices) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e7a43400a587..e5fce5e497f2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1503,8 +1503,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); if (replicas >= replicas_want) goto done; @@ -1544,7 +1543,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) From effc7a1c0683576324bc3ef92d83e51091a8bca6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 18:16:25 -0500 Subject: [PATCH 167/233] bcachefs: Fix failure to allocate journal write on discard retry When allocating a journal write fails, then retries after doing discards, we were failing to count already allocated replicas. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index e5fce5e497f2..d7dfea5f0181 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1498,6 +1498,15 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) READ_ONCE(c->opts.metadata_replicas_required)); rcu_read_lock(); + + /* We might run more than once if we have to stop and do discards: */ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); + bkey_for_each_ptr(ptrs, p) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); + if (ca) + replicas += ca->mi.durability; + } + retry: devs = target_rw_devs(c, BCH_DATA_journal, target); From 0a1a0391c46b3128dc7f9b1b845bc98832f233a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:21:22 -0500 Subject: [PATCH 168/233] bcachefs: dev_alloc_list.devs -> dev_alloc_list.data This lets us use darray macros on dev_alloc_list (and it will become a darray eventually, when we increase the maximum number of devices). Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 60 ++++++++++++++-------------------- fs/bcachefs/alloc_foreground.h | 2 +- fs/bcachefs/journal_io.c | 21 +++++------- 3 files changed, 34 insertions(+), 49 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 095bfe7c53bd..49c9275465f9 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -626,9 +626,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.devs[ret.nr++] = i; + ret.data[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + bubble_sort(ret.data, ret.nr, dev_stripe_cmp); return ret; } @@ -700,18 +700,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, stripe, devs_may_alloc); int ret = -BCH_ERR_insufficient_devices; BUG_ON(*nr_effective >= nr_replicas); - for (unsigned i = 0; i < devs_sorted.nr; i++) { - struct bch_dev_usage usage; - struct open_bucket *ob; - - unsigned dev = devs_sorted.devs[i]; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; @@ -720,8 +715,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -765,10 +761,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted; - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, ec_idx; int ret = 0; if (nr_replicas < 2) @@ -777,34 +769,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + struct ec_stripe_head *h = + bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - - for (i = 0; i < devs_sorted.nr; i++) - for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) + for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; - ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) - goto got_bucket; - } - goto out_put_head; -got_bucket: - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); + struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); -out_put_head: + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, ob); + goto out; + } + } +out: bch2_ec_stripe_head_put(c, h); return ret; } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 4f87745df97e..f25481a0d1a0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); struct dev_alloc_list { unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; + u8 data[BCH_SB_MEMBERS_MAX]; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index d7dfea5f0181..9a1647297d11 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1422,25 +1422,22 @@ int bch2_journal_read(struct bch_fs *c, static void __journal_write_alloc(struct journal *j, struct journal_buf *w, - struct dev_alloc_list *devs_sorted, + struct dev_alloc_list *devs, unsigned sectors, unsigned *replicas, unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja; - struct bch_dev *ca; - unsigned i; if (*replicas >= replicas_want) return; - for (i = 0; i < devs_sorted->nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; /* * Check that we can use this device, and aren't already using @@ -1486,13 +1483,11 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; - struct journal_device *ja; - struct bch_dev *ca; struct dev_alloc_list devs_sorted; unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned i, replicas = 0, replicas_want = + unsigned replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); @@ -1517,12 +1512,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) if (replicas >= replicas_want) goto done; - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; if (sectors > ja->sectors_free && sectors <= ca->mi.bucket_size && From 70feb569f2ce915068ac3d2050b843322cb5218c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 18:14:14 -0500 Subject: [PATCH 169/233] bcachefs: Journal write path refactoring, debug improvements Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 6 ++++ fs/bcachefs/journal_io.c | 70 ++++++++++++++++++++++------------------ 2 files changed, 45 insertions(+), 31 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index dc66521964b7..04a9ccf76d75 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1564,6 +1564,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + if (!ca->mi.durability) + continue; + struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1573,6 +1576,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) continue; prt_printf(out, "dev %u:\n", ca->dev_idx); + prt_printf(out, "durability %u:\n", ca->mi.durability); printbuf_indent_add(out, 2); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); @@ -1584,6 +1588,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); } + prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); + rcu_read_unlock(); --out->atomic; diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 9a1647297d11..2f4daa8bd498 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1420,6 +1420,35 @@ int bch2_journal_read(struct bch_fs *c, /* journal write: */ +static void journal_advance_devs_to_next_bucket(struct journal *j, + struct dev_alloc_list *devs, + unsigned sectors, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) + continue; + + struct journal_device *ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); + } + } +} + static void __journal_write_alloc(struct journal *j, struct journal_buf *w, struct dev_alloc_list *devs, @@ -1429,9 +1458,6 @@ static void __journal_write_alloc(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); - if (*replicas >= replicas_want) - return; - darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1491,6 +1517,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); + bool advance_done = false; rcu_read_lock(); @@ -1502,45 +1529,26 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) replicas += ca->mi.durability; } -retry: +retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); - +retry_alloc: __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); - if (replicas >= replicas_want) + if (likely(replicas >= replicas_want)) goto done; - darray_for_each(devs_sorted, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); - if (!ca) - continue; - - struct journal_device *ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } + if (!advance_done) { + journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); + advance_done = true; + goto retry_alloc; } - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); - if (replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; - goto retry; + advance_done = false; + goto retry_target; } done: rcu_read_unlock(); From 8ea098f248cceb4838f41d7a01389eb26c8109e4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:41:38 -0500 Subject: [PATCH 170/233] bcachefs: Call bch2_btree_lost_data() on btree read error Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e59924cfe2bc..24f2f3bdf704 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -29,6 +29,7 @@ #include "move.h" #include "recovery_passes.h" #include "reflink.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -359,11 +360,9 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct if (ret) break; - if (!btree_id_is_alloc(b->c.btree_id)) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - break; - } + ret = bch2_btree_lost_data(c, b->c.btree_id); + if (ret) + break; continue; } @@ -525,7 +524,7 @@ int bch2_check_topology(struct bch_fs *c) bch2_btree_id_to_text(&buf, i); if (r->error) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + ret = bch2_btree_lost_data(c, i); if (ret) break; reconstruct_root: @@ -741,7 +740,7 @@ static int bch2_gc_btrees(struct bch_fs *c) (printbuf_reset(&buf), bch2_btree_id_to_text(&buf, btree), buf.buf))) - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + ret = bch2_btree_lost_data(c, btree); } fsck_err: printbuf_exit(&buf); From 61ab7cbbaae3ba7afa6e637dfd8a8207daf1c244 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 19:46:35 -0500 Subject: [PATCH 171/233] bcachefs: Make sure __bch2_run_explicit_recovery_pass() signals to rewind We should always signal to rewind if the requested pass hasn't been run, even if called multiple times. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 1 + fs/bcachefs/recovery_passes.c | 52 +++++++++++++++++------------------ 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b12c9c78beec..e6cd93e1ed0f 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1044,6 +1044,7 @@ struct bch_fs { * for signaling to the toplevel code which pass we want to run now. */ enum bch_recovery_pass curr_recovery_pass; + enum bch_recovery_pass next_recovery_pass; /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index f6d3a99cb63e..0b3c951c32da 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -103,27 +103,31 @@ u64 bch2_recovery_passes_from_stable(u64 v) static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - if (c->opts.recovery_passes & BIT_ULL(pass)) - return 0; - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) return -BCH_ERR_not_in_recovery; + if (c->recovery_passes_complete & BIT_ULL(pass)) + return 0; + + bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - bch_info(c, "need recovery pass %s (%u), but already rw", - bch2_recovery_passes[pass], pass); + if (print) + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); return -BCH_ERR_cannot_rewind_recovery; } - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + if (print) + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); c->opts.recovery_passes |= BIT_ULL(pass); - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; + if (c->curr_recovery_pass > pass) { + c->next_recovery_pass = pass; c->recovery_passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { @@ -264,7 +268,9 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + c->next_recovery_pass = c->curr_recovery_pass + 1; + spin_lock_irq(&c->recovery_pass_lock); unsigned pass = c->curr_recovery_pass; @@ -285,31 +291,25 @@ int bch2_run_recovery_passes(struct bch_fs *c) ret = bch2_run_recovery_pass(c, pass) ?: bch2_journal_flush(&c->journal); + if (!ret && !test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); + spin_lock_irq(&c->recovery_pass_lock); - if (c->curr_recovery_pass < pass) { + if (c->next_recovery_pass < c->curr_recovery_pass) { /* * bch2_run_explicit_recovery_pass() was called: we * can't always catch -BCH_ERR_restart_recovery because * it may have been called from another thread (btree * node read completion) */ - spin_unlock_irq(&c->recovery_pass_lock); - continue; - } else if (c->curr_recovery_pass == pass) { - c->curr_recovery_pass++; + ret = 0; + c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); } else { - BUG(); + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); } + c->curr_recovery_pass = c->next_recovery_pass; spin_unlock_irq(&c->recovery_pass_lock); - - if (ret) - break; - - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); } return ret; From cea5427fbab13e53bbd2885955b47c3849c1befc Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 20:43:01 -0500 Subject: [PATCH 172/233] bcachefs: Don't call bch2_btree_interior_update_will_free_node() until after update succeeds Originally, btree splits always succeeded once we got to the point of recursing to the btree_insert_node() call. But that changed when we switched to not taking intent locks all the way up to the root, and that introduced a bug, because bch2_btree_interior_update_will_free_node() cancels paending writes and reparents a node that's going to be made visible on disk by another btree update to the current btree update. This was discovered in recent backpointers work, because bch2_btree_interior_update_will_free_node() also clears the will_make_reachable flag, causing backpointer target lookup to spuriously thing it had found a dangling backpointer (when the backpointer just hadn't been created yet by btree_update_nodes_written()). Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index f2a1d5d3d8d5..7d9dab95bdcf 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1607,8 +1607,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) return ret; - bch2_btree_interior_update_will_free_node(as, b); - if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; @@ -1707,6 +1705,8 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -2063,9 +2063,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, trace_and_count(c, btree_node_merge, trans, b); - bch2_btree_interior_update_will_free_node(as, b); - bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, @@ -2101,6 +2098,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err_free_update; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); @@ -2155,8 +2155,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto out; - bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_build_aux_trees(n); @@ -2180,6 +2178,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); From 7dacc22d765601ef7a2f13ec006a36724e82be6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 23:40:26 -0500 Subject: [PATCH 173/233] bcachefs: kill flags param to bch2_subvolume_get() Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 2 +- fs/bcachefs/fs-common.c | 4 +--- fs/bcachefs/fs.c | 7 +++---- fs/bcachefs/fsck.c | 7 +++---- fs/bcachefs/snapshot.c | 5 ++--- fs/bcachefs/subvolume.c | 14 ++++++-------- fs/bcachefs/subvolume.h | 2 +- 7 files changed, 17 insertions(+), 24 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 41813f9ce831..600eee936f13 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, &s); target->inum = le64_to_cpu(s.inode); } diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index dcaa47f68f31..f8d27244e1d6 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!snapshot_src.inum) { /* Inode wasn't specified, just snapshot: */ struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, - BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); if (ret) goto err; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c6e7df7c67fa..3f83f131d0e8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -499,7 +499,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_unpacked inode_u; struct bch_subvolume subvol; int ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); @@ -569,8 +569,7 @@ __bch2_create(struct mnt_idmap *idmap, inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_with_updates, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -651,7 +650,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1a5a07112779..1e00b2694db7 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -109,7 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -226,8 +226,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), false, &subvol); bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", le32_to_cpu(st.master_subvol), snapshot); if (ret) @@ -1421,7 +1420,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_subvol) { struct bch_subvolume s; - ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index f368270d6d9b..99f045518312 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -570,8 +570,7 @@ static int check_snapshot_tree(struct btree_trans *trans, goto err; } - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -811,7 +810,7 @@ static int check_snapshot(struct btree_trans *trans, if (should_have_subvol) { id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + ret = bch2_subvolume_get(trans, id, false, &subvol); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 5e5ae405cb28..0e756e35c3d9 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -286,11 +286,11 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - iter_flags, subvolume, s); + BTREE_ITER_cached| + BTREE_ITER_with_updates, subvolume, s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found, trans->c, "missing subvolume %u", subvol); @@ -299,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); } int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) { struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); + int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); if (ret) return ret; @@ -328,7 +327,7 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, struct bch_snapshot snap; return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); } int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, @@ -396,8 +395,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d struct bch_subvolume s; return lockrestart_do(trans, - bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_cached, &s)) ?: + bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index d53d292c22d7..910f6196700e 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -24,7 +24,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, int, struct bch_subvolume *); + bool, struct bch_subvolume *); int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *, bool); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); From 0ecfac8b60c8ad86a9d60aa6e5ec3acaeeb96064 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 4 Dec 2024 23:36:33 -0500 Subject: [PATCH 174/233] bcachefs: factor out str_hash.c Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/fsck.c | 214 ++--------------------------------------- fs/bcachefs/fsck.h | 8 ++ fs/bcachefs/str_hash.c | 209 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/str_hash.h | 7 ++ 5 files changed, 232 insertions(+), 207 deletions(-) create mode 100644 fs/bcachefs/str_hash.c diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 56d20e219f59..d2689388d5e8 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -82,6 +82,7 @@ bcachefs-y := \ siphash.o \ six.o \ snapshot.o \ + str_hash.o \ subvolume.o \ super.o \ super-io.o \ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1e00b2694db7..22a33b9ba30d 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -941,69 +941,16 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -{ - if (d.v->d_type == DT_SUBVOL) { - u32 snap; - u64 inum; - int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - return !ret; - } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -} - /* * Prefer to delete the first one, since that will be the one at the wrong * offset: * return value: 0 -> delete k1, 1 -> delete k2 */ -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) -{ - if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && - !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) - return 0; - - switch (desc.btree_id) { - case BTREE_ID_dirents: { - int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); - if (ret < 0) - return ret; - if (!ret) - return 0; - - ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); - if (ret < 0) - return ret; - if (!ret) - return 1; - return 2; - } - default: - return 0; - } -} - -static int fsck_update_backpointers(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_i *new) +int bch2_fsck_update_backpointers(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_i *new) { if (new->k.type != KEY_TYPE_dirent) return 0; @@ -1031,153 +978,6 @@ static int fsck_update_backpointers(struct btree_trans *trans, return ret; } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) -{ - struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_dirent_init(&new->k_i); - dirent_copy_target(new, old); - new->k.p = old.k->p; - - for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); - - if (u64s > U8_MAX) - return -EINVAL; - - new->k.u64s = u64s; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - (subvol_inum) { 0, old.k->p.inode }, - old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) - break; - } - - if (ret) - return ret; - - return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); -} - -static int hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - u64 hash; - int ret = 0; - - if (hash_k.k->type != desc.key_type) - return 0; - - hash = desc.hash_bkey(hash_info, hash_k); - - if (likely(hash == hash_k.k->p.offset)) - return 0; - - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc.btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - - if (k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k)) - goto duplicate_entries; - - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - goto bad_hash; - } - } -out: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -bad_hash: - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - fsck_update_backpointers(trans, s, desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? "" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; -} - static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -2496,7 +2296,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2610,7 +2410,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 4481b40a881d..574948278cd4 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -2,6 +2,14 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H +#include "str_hash.h" + +int bch2_fsck_update_backpointers(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct bkey_i *); + int bch2_check_inodes(struct bch_fs *); int bch2_check_extents(struct bch_fs *); int bch2_check_indirect_extents(struct bch_fs *); diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c new file mode 100644 index 000000000000..c3276a7e7324 --- /dev/null +++ b/fs/bcachefs/str_hash.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "dirent.h" +#include "fsck.h" +#include "str_hash.h" +#include "subvolume.h" + +static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) +{ + if (d.v->d_type == DT_SUBVOL) { + struct bch_subvolume subvol; + int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), + false, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + return !ret; + } else { + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +} + +static int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_dirent_init(&new->k_i); + dirent_copy_target(new, old); + new->k.p = old.k->p; + + for (unsigned i = 0; i < 1000; i++) { + unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + + if (u64s > U8_MAX) + return -EINVAL; + + new->k.u64s = u64s; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, + BTREE_UPDATE_internal_snapshot_node); + if (!bch2_err_matches(ret, EEXIST)) + break; + } + + if (ret) + return ret; + + return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); +} + +static int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) + return 0; + + switch (desc.btree_id) { + case BTREE_ID_dirents: { + int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); + if (ret < 0) + return ret; + if (!ret) + return 0; + + ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); + if (ret < 0) + return ret; + if (!ret) + return 1; + return 2; + } + default: + return 0; + } +} + +int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; + + if (hash_k.k->type != desc.key_type) + return 0; + + hash = desc.hash_bkey(hash_info, hash_k); + + if (likely(hash == hash_k.k->p.offset)) + return 0; + + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_slots, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if (k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k)) + goto duplicate_entries; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + } +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +bad_hash: + if (fsck_err(trans, hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", + bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); + if (IS_ERR(new)) + return PTR_ERR(new); + + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + (subvol_inum) { 0, hash_k.k->p.inode }, + hash_k.k->p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k) + goto duplicate_entries; + + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } +fsck_err: + goto out; +duplicate_entries: + ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? "" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + break; + case 2: + ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; +} diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 00c785055d22..0c20f3af03f8 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -393,4 +393,11 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +struct snapshots_seen; +int bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + #endif /* _BCACHEFS_STR_HASH_H */ From 6315b49e95cf0af5196f1931a019d63a02d2a2a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Dec 2024 12:35:17 -0500 Subject: [PATCH 175/233] bcachefs: Journal space calculations should skip durability=0 devices Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index b7936ad3ae7f..3c8242606da7 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -146,7 +146,8 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr) + if (!ca->journal.nr || + !ca->mi.durability) continue; min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); From 8ceb549abdc9d97f2faba341ae7755d876d480a1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 20:11:16 -0500 Subject: [PATCH 176/233] bcachefs: fix bch2_btree_node_header_to_text() format string Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 9df9fc1c5e2b..d99f8a78d286 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -26,7 +26,7 @@ static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) { bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); - prt_printf(out, " seq %llux\n", bn->keys.seq); + prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); prt_str(out, "min: "); bch2_bpos_to_text(out, bn->min_key); prt_newline(out); From 2c77b170156262024d9b91795ee8f1531b444d70 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 19:49:46 -0500 Subject: [PATCH 177/233] bcachefs: Mark more errors autofix tested repairing from a bug uncovered by the merge_torture_flakey test Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-errors_format.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 3bbda181f314..0bc4cec2926c 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -58,7 +58,7 @@ enum bch_fsck_flags { x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ x(bset_blacklisted_journal_seq, 47, 0) \ - x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ x(btree_node_bad_min_key, 51, 0) \ @@ -168,7 +168,7 @@ enum bch_fsck_flags { x(ptr_to_incorrect_stripe, 151, 0) \ x(ptr_gen_newer_than_bucket_gen, 152, 0) \ x(ptr_too_stale, 153, 0) \ - x(stale_dirty_ptr, 154, 0) \ + x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ x(ptr_bucket_data_type_mismatch, 155, 0) \ x(ptr_cached_and_erasure_coded, 156, 0) \ x(ptr_crc_uncompressed_size_too_small, 157, 0) \ From e0e0d738ca9a0a34e7023f42abf56f570d3106d5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 22:37:42 -0500 Subject: [PATCH 178/233] bcachefs: Minor bucket alloc optimization Check open buckets and buckets waiting for journal commit before doing other expensive lookups. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 55 ++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 49c9275465f9..57d5f14c93d0 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -200,14 +200,35 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) } } +static inline bool may_alloc_bucket(struct bch_fs *c, + struct bpos bucket, + struct bucket_alloc_state *s) +{ + if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { + s->skipped_open++; + return false; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { + s->skipped_need_journal_commit++; + return false; + } + + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { + s->skipped_nocow++; + return false; + } + + return true; +} + static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, u64 bucket, u8 gen, enum bch_watermark watermark, struct bucket_alloc_state *s, struct closure *cl) { - struct open_bucket *ob; - if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; @@ -216,22 +237,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { - s->skipped_open++; - return NULL; - } - - if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { - s->skipped_need_journal_commit++; - return NULL; - } - - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { - s->skipped_nocow++; - return NULL; - } - spin_lock(&c->freelist_lock); if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { @@ -250,10 +255,9 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - ob = bch2_open_bucket_alloc(c); + struct open_bucket *ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->dev = ca->dev_idx; @@ -279,8 +283,11 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - u8 gen; + if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + return NULL; + + u8 gen; int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); if (ret < 0) return ERR_PTR(ret); @@ -300,6 +307,7 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; @@ -359,7 +367,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, a->gen, watermark, s, cl); + ob = may_alloc_bucket(c, k.k->p, s) + ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, + watermark, s, cl) + : NULL; next: bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); From 05cb2a44a9ed0b1f5151702a334011bd884c2641 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 19:16:02 -0500 Subject: [PATCH 179/233] lib min_heap: Switch to size_t size_t is the correct type for a count of objects that can fit in memory: this also means heaps now have the same memory layout as darrays (fs/bcachefs/darray.h), and darrays can be used as heaps. Cc: Kuan-Wei Chiu Cc: Ian Rogers Cc: Andrew Morton Cc: Coly Li Cc: Peter Zijlstra Signed-off-by: Kent Overstreet --- include/linux/min_heap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 43a7b9dcf15e..fe17b4828171 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -15,8 +15,8 @@ */ #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ struct _name { \ - int nr; \ - int size; \ + size_t nr; \ + size_t size; \ _type *data; \ _type preallocated[_nr]; \ } From 488249b3f6257c8db748bcb27efad901481060e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 19:23:22 -0500 Subject: [PATCH 180/233] bcachefs: Use a heap for handling overwrites in btree node scan Fix an O(n^2) issue when we find many overlapping (overwritten) btree nodes - especially when one node overwrites many smaller nodes. This was discovered to be an issue with the bcachefs merge_torture_flakey test - if we had a large btree that was then emptied, the number of difficult overwrites can be unbounded. Cc: Kuan-Wei Chiu Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_node_scan.c | 133 ++++++++++++++++++---------- fs/bcachefs/btree_node_scan_types.h | 1 - 2 files changed, 86 insertions(+), 48 deletions(-) diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index eeafb5e7354e..a7f06deee13c 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -12,6 +12,7 @@ #include "recovery_passes.h" #include +#include #include struct find_btree_nodes_worker { @@ -31,8 +32,6 @@ static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, con if (n->range_updated) prt_str(out, " range updated"); - if (n->overwritten) - prt_str(out, " overwritten"); for (unsigned i = 0; i < n->nr_ptrs; i++) { prt_char(out, ' '); @@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r) -found_btree_node_cmp_time(l, r); } +static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) +{ + return found_btree_node_cmp_pos(l, r) < 0; +} + +static inline void found_btree_node_swap(void *_l, void *_r, void *arg) +{ + struct found_btree_node *l = _l; + struct found_btree_node *r = _r; + + swap(*l, *r); +} + +static const struct min_heap_callbacks found_btree_node_heap_cbs = { + .less = found_btree_node_cmp_pos_less, + .swp = found_btree_node_swap, +}; + static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, struct bio *bio, struct btree_node *bn, u64 offset) { @@ -295,55 +312,48 @@ static int read_btree_nodes(struct find_btree_nodes *f) return f->ret ?: ret; } -static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +static bool nodes_overlap(const struct found_btree_node *l, + const struct found_btree_node *r) { - while (n + 1 < end && - found_btree_node_cmp_pos(n, n + 1) > 0) { - swap(n[0], n[1]); - n++; - } + return (l->btree_id == r->btree_id && + l->level == r->level && + bpos_gt(l->max_key, r->min_key)); } static int handle_overwrites(struct bch_fs *c, - struct found_btree_node *start, - struct found_btree_node *end) + struct found_btree_node *l, + found_btree_nodes *nodes_heap) { - struct found_btree_node *n; -again: - for (n = start + 1; - n < end && - n->btree_id == start->btree_id && - n->level == start->level && - bpos_lt(n->min_key, start->max_key); - n++) { - int cmp = found_btree_node_cmp_time(start, n); + struct found_btree_node *r; + + while ((r = min_heap_peek(nodes_heap)) && + nodes_overlap(l, r)) { + int cmp = found_btree_node_cmp_time(l, r); if (cmp > 0) { - if (bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } else if (cmp < 0) { - BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + BUG_ON(bpos_eq(l->min_key, r->min_key)); - start->max_key = bpos_predecessor(n->min_key); - start->range_updated = true; - } else if (n->level) { - n->overwritten = true; + l->max_key = bpos_predecessor(r->min_key); + l->range_updated = true; + } else if (r->level) { + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); } else { - if (bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } } @@ -355,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) { struct find_btree_nodes *f = &c->found_btree_nodes; struct printbuf buf = PRINTBUF; + found_btree_nodes nodes_heap = {}; size_t dst; int ret = 0; @@ -409,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) bch2_print_string_as_lines(KERN_INFO, buf.buf); } - dst = 0; - darray_for_each(f->nodes, i) { - if (i->overwritten) - continue; + swap(nodes_heap, f->nodes); - ret = handle_overwrites(c, i, &darray_top(f->nodes)); + { + /* darray must have same layout as a heap */ + min_heap_char real_heap; + BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); + BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); + BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); + BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); + } + + min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); + + if (nodes_heap.nr) { + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); if (ret) goto err; - BUG_ON(i->overwritten); - f->nodes.data[dst++] = *i; + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); } - f->nodes.nr = dst; - if (c->opts.verbose) { + while (true) { + ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); + if (ret) + goto err; + + if (!nodes_heap.nr) + break; + + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); + if (ret) + goto err; + + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); + } + + for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) + BUG_ON(nodes_overlap(n, n + 1)); + + if (0 && c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); bch2_print_string_as_lines(KERN_INFO, buf.buf); + } else { + bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); err: + darray_exit(&nodes_heap); printbuf_exit(&buf); return ret; } diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h index b6c36c45d0be..2811b6857c97 100644 --- a/fs/bcachefs/btree_node_scan_types.h +++ b/fs/bcachefs/btree_node_scan_types.h @@ -6,7 +6,6 @@ struct found_btree_node { bool range_updated:1; - bool overwritten:1; u8 btree_id; u8 level; unsigned sectors_written; From e8d604148bad3ccb9a7cac1cb7ebb613b73fc51b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Dec 2024 21:36:15 -0500 Subject: [PATCH 181/233] bcachefs: Plumb bkey_validate_context to journal_entry_validate This lets us print the exact location in the journal if it was found in the journal, or correctly print if it was found in the superblock. Signed-off-by: Kent Overstreet --- fs/bcachefs/bkey_types.h | 12 ++-- fs/bcachefs/btree_trans_commit.c | 50 +++++++---------- fs/bcachefs/error.c | 9 ++- fs/bcachefs/extents.c | 13 ++--- fs/bcachefs/journal_io.c | 95 ++++++++++++++++++-------------- fs/bcachefs/journal_io.h | 2 +- fs/bcachefs/sb-clean.c | 6 +- 7 files changed, 100 insertions(+), 87 deletions(-) diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h index 2af6279b02a9..b4f328f9853c 100644 --- a/fs/bcachefs/bkey_types.h +++ b/fs/bcachefs/bkey_types.h @@ -213,16 +213,16 @@ BCH_BKEY_TYPES(); enum bch_validate_flags { BCH_VALIDATE_write = BIT(0), BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_journal = BIT(2), - BCH_VALIDATE_silent = BIT(3), + BCH_VALIDATE_silent = BIT(2), }; #define BKEY_VALIDATE_CONTEXTS() \ x(unknown) \ - x(commit) \ + x(superblock) \ x(journal) \ x(btree_root) \ - x(btree_node) + x(btree_node) \ + x(commit) struct bkey_validate_context { enum { @@ -230,10 +230,12 @@ struct bkey_validate_context { BKEY_VALIDATE_CONTEXTS() #undef x } from:8; + enum bch_validate_flags flags:8; u8 level; enum btree_id btree; bool root:1; - enum bch_validate_flags flags:8; + unsigned journal_offset; + u64 journal_seq; }; #endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 78d72c26083d..9011cc3f7190 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -719,19 +719,29 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } + struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; + + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, + CPU_BIG_ENDIAN, validate_context); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); + goto fatal_err; + } + } + trans_for_each_update(trans, i) { - enum bch_validate_flags invalid_flags = 0; + validate_context.level = i->level; + validate_context.btree = i->btree_id; - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - (struct bkey_validate_context) { - .from = BKEY_VALIDATE_commit, - .level = i->level, - .btree = i->btree_id, - .flags = invalid_flags, - }); + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); if (unlikely(ret)){ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); @@ -740,24 +750,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, btree_insert_entry_checks(trans, i); } - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_journal_entry_validate(c, NULL, i, - bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags); - if (unlikely(ret)) { - bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", - trans->fn); - goto fatal_err; - } - } - if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 9e34374960f3..038da6a61f6b 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -486,9 +486,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, fsck_flags |= fsck_flags_extra[err]; struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "invalid bkey in %s btree=", + prt_printf(&buf, "invalid bkey in %s", bch2_bkey_validate_contexts[from.from]); + + if (from.from == BKEY_VALIDATE_journal) + prt_printf(&buf, " journal seq=%llu offset=%u", + from.journal_seq, from.journal_offset); + + prt_str(&buf, " btree="); bch2_btree_id_to_text(&buf, from.btree); prt_printf(&buf, " level=%u: ", from.level); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 2fc9ace5533c..05d5f71a7ca9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1238,6 +1238,12 @@ static int extent_ptr_validate(struct bch_fs *c, { int ret = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr2) + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); + /* bad pointers are repaired by check_fix_ptrs(): */ rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); @@ -1252,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c, unsigned bucket_size = ca->mi.bucket_size; rcu_read_unlock(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, - c, ptr_to_duplicate_device, - "multiple pointers to same device (%u)", ptr->dev); - - bkey_fsck_err_on(bucket >= nbuckets, c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 2f4daa8bd498..7f2efe85a805 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -301,7 +301,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BCH_VALIDATE_write) { \ + switch (from.flags & BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -390,15 +390,12 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .level = entry->level, - .btree = entry->btree_id, - .flags = flags|BCH_VALIDATE_journal, - }; + + from.level = entry->level; + from.btree = entry->btree_id; while (k != vstruct_last(entry)) { int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); @@ -435,11 +432,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; int ret = 0; + from.root = true; + from.level = entry->level + 1; + from.btree = entry->btree_id; + if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, version, jset, entry, @@ -456,13 +457,6 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - struct bkey_validate_context from = { - .from = BKEY_VALIDATE_journal, - .level = entry->level + 1, - .btree = entry->btree_id, - .root = true, - .flags = flags, - }; ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) ret = 0; @@ -480,7 +474,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { /* obsolete, don't care: */ return 0; @@ -495,7 +489,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -522,7 +516,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -564,7 +558,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -598,7 +592,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -642,7 +636,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -682,7 +676,7 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -739,7 +733,7 @@ static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -756,10 +750,11 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { + from.flags = 0; return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -772,10 +767,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, @@ -788,7 +783,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { unsigned bytes = vstruct_bytes(entry); unsigned expected = 16; @@ -818,7 +813,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -836,11 +831,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return entry->type < BCH_JSET_ENTRY_NR ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, flags) + version, big_endian, from) : 0; } @@ -858,10 +853,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, enum bch_validate_flags flags) { + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { + from.journal_offset = (u64 *) entry - jset->_data; + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), c, version, jset, entry, journal_entry_past_jset_end, @@ -870,8 +873,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, break; } - ret = bch2_journal_entry_validate(c, jset, entry, - version, JSET_BIG_ENDIAN(jset), flags); + ret = bch2_journal_entry_validate(c, jset, entry, version, + JSET_BIG_ENDIAN(jset), from); if (ret) break; } @@ -884,13 +887,17 @@ static int jset_validate(struct bch_fs *c, struct jset *jset, u64 sector, enum bch_validate_flags flags) { - unsigned version; + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -935,15 +942,16 @@ static int jset_validate_early(struct bch_fs *c, unsigned bucket_sectors_left, unsigned sectors_read) { - size_t bytes = vstruct_bytes(jset); - unsigned version; - enum bch_validate_flags flags = BCH_VALIDATE_journal; + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -956,6 +964,7 @@ static int jset_validate_early(struct bch_fs *c, return -EINVAL; } + size_t bytes = vstruct_bytes(jset); if (bytes > (sectors_read << 9) && sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; @@ -1240,8 +1249,6 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bch_validate_flags flags = BCH_VALIDATE_journal; - i = *_i; if (journal_replay_ignore(i)) @@ -1261,6 +1268,10 @@ int bch2_journal_read(struct bch_fs *c, continue; } + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(i->j.seq), + }; if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), c, le32_to_cpu(i->j.version), &i->j, NULL, jset_last_seq_newer_than_seq, diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 2ca9cde30ea8..12b39fcb4424 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -63,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 005275281804..59c8770e4a0e 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -23,6 +23,10 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { + struct bkey_validate_context from = { + .flags = write, + .from = BKEY_VALIDATE_superblock, + }; struct jset_entry *entry; int ret; @@ -40,7 +44,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); + from); if (ret) return ret; } From 8f367a5c8eb16059e43f8a329c894c380c0e2bc1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 5 Dec 2024 12:35:43 -0500 Subject: [PATCH 182/233] bcachefs: Don't add unknown accounting types to eytzinger tree Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 16 ++++++++++++++++ fs/bcachefs/disk_accounting.h | 8 +++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index a0061bcf9159..b18cbe80936b 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -729,6 +729,16 @@ int bch2_accounting_read(struct bch_fs *c) BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + + if (k.k->type != KEY_TYPE_accounting) + continue; + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (!bch2_accounting_is_mem(acc_k)) + continue; + accounting_read_key(trans, k); })); if (ret) @@ -740,6 +750,12 @@ int bch2_accounting_read(struct bch_fs *c) darray_for_each(*keys, i) { if (i->k->k.type == KEY_TYPE_accounting) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + + if (!bch2_accounting_is_mem(acc_k)) + continue; + struct bkey_s_c k = bkey_i_to_s_c(i->k); unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 566aa2a8539d..0eeaca12c589 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -114,6 +114,12 @@ enum bch_accounting_mode { int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +{ + return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc.type != BCH_DISK_ACCOUNTING_inum; +} + /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path @@ -130,7 +136,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, EBUG_ON(gc && !acc->gc_running); - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { From f3b4692b79f930695b312a7192f0bc8f260af9ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 7 Dec 2024 20:43:07 -0500 Subject: [PATCH 183/233] bcachefs: Set bucket needs discard, inc gen on empty -> nonempty transition Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index b2d570453351..62069231c63b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -856,7 +856,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); - if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); + + if (is_empty_delta < 0) { new_a->io_time[READ] = bch2_current_io_time(c, READ); new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); From c106801642fe12b22001489702d42643103425ef Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 00:28:16 -0500 Subject: [PATCH 184/233] bcachefs: bch2_journal_noflush_seq() now takes [start, end) Harder to screw up if we're explicit about the range, and more correct as journal reservations can be outstanding on multiple journal entries simultaneously. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 4 +++- fs/bcachefs/journal.c | 11 ++++++----- fs/bcachefs/journal.h | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 62069231c63b..9ae567402b03 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -953,7 +953,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, */ if (is_empty_delta > 0) { if (new_a->journal_seq == transaction_seq || - bch2_journal_noflush_seq(&c->journal, new_a->journal_seq)) + bch2_journal_noflush_seq(&c->journal, + new_a->journal_seq, + transaction_seq)) new_a->journal_seq = 0; else { new_a->journal_seq = transaction_seq; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 04a9ccf76d75..2cd20114b74b 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -807,10 +807,11 @@ int bch2_journal_flush(struct journal *j) } /* - * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the + * range [start, end) * @seq */ -bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; @@ -819,15 +820,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) return false; spin_lock(&j->lock); - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) goto out; for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < seq; + unwritten_seq < end; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index a6a2e888c59b..cb0df0663946 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -404,7 +404,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64, unsigned); int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64); +bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); From d7f6becfe039b95593c28ff8180b3b53a2585f69 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 6 Dec 2024 23:15:05 -0500 Subject: [PATCH 185/233] bcachefs: Fix reuse of bucket before journal flush on multiple empty -> nonempty transition For each bucket we track when the bucket became nonempty and when it became empty again: if we can ensure that there will be no journal flushes in the range [nonempty, empty) (possibly because they occured at the same journal sequence number), then it's safe to reuse the bucket without waiting for a journal commit. This is a major performance optimization for erasure coding, where writes are initially replicated, but the extra replicas are quickly dropped: if those buckets are reused and overwritten without issuing a cache flush to the underlying device, then they only cost bus bandwidth. But there's a tricky corner case when there's multiple empty -> nonempty -> empty transitions in quick succession, i.e. when data is getting overwritten immediately as it's being written. If this happens and the previous empty transition hasn't been flushed, we need to continue tracking the previous nonempty transition - not start a new one. Fixing this means we now need to track both the nonempty and empty transitions in bch_alloc_v4. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 78 ++++++++++++++------------- fs/bcachefs/alloc_background_format.h | 4 +- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9ae567402b03..94e7bc889cb1 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -323,7 +323,8 @@ void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - a->journal_seq = swab64(a->journal_seq); + a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); + a->journal_seq_empty = swab64(a->journal_seq_empty); a->flags = swab32(a->flags); a->dirty_sectors = swab32(a->dirty_sectors); a->cached_sectors = swab32(a->cached_sectors); @@ -346,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu\n", a->journal_seq); - prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); - prt_printf(out, "cached_sectors %u\n", a->cached_sectors); - prt_printf(out, "stripe %u\n", a->stripe); - prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); - prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); - prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); + prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); if (ca) prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); @@ -384,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); *out = (struct bch_alloc_v4) { - .journal_seq = u.journal_seq, + .journal_seq_nonempty = u.journal_seq, .flags = u.need_discard, .gen = u.gen, .oldest_gen = u.oldest_gen, @@ -930,20 +932,29 @@ int bch2_trigger_alloc(struct btree_trans *trans, if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { u64 transaction_seq = trans->journal_res.seq; + BUG_ON(!transaction_seq); - if (log_fsck_err_on(transaction_seq && new_a->journal_seq > transaction_seq, + if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, trans, alloc_key_journal_seq_in_future, "bucket journal seq in future (currently at %llu)\n%s", journal_cur_seq(&c->journal), (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) - new_a->journal_seq = transaction_seq; + new_a->journal_seq_nonempty = transaction_seq; int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - (int) data_type_is_empty(old_a->data_type); - /* Record journal sequence number of empty -> nonempty transition: */ - if (is_empty_delta < 0) - new_a->journal_seq = max(new_a->journal_seq, transaction_seq); + /* + * Record journal sequence number of empty -> nonempty transition: + * Note that there may be multiple empty -> nonempty + * transitions, data in a bucket may be overwritten while we're + * still writing to it - so be careful to only record the first: + * */ + if (is_empty_delta < 0 && + new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { + new_a->journal_seq_nonempty = transaction_seq; + new_a->journal_seq_empty = 0; + } /* * Bucket becomes empty: mark it as waiting for a journal flush, @@ -952,20 +963,21 @@ int bch2_trigger_alloc(struct btree_trans *trans, * intermediate sequence numbers: */ if (is_empty_delta > 0) { - if (new_a->journal_seq == transaction_seq || + if (new_a->journal_seq_nonempty == transaction_seq || bch2_journal_noflush_seq(&c->journal, - new_a->journal_seq, - transaction_seq)) - new_a->journal_seq = 0; - else { - new_a->journal_seq = transaction_seq; + new_a->journal_seq_nonempty, + transaction_seq)) { + new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; + } else { + new_a->journal_seq_empty = transaction_seq; ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - transaction_seq); + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + transaction_seq); if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) + "setting bucket_needs_journal_commit: %s", + bch2_err_str(ret))) goto err; } } @@ -983,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) -#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) +#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) if (statechange(a->data_type == BCH_DATA_free) && bucket_flushed(new_a)) @@ -1845,16 +1857,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; - goto out; - } - if (!fastpath) { if (discard_in_flight_add(ca, iter.pos.offset, true)) goto out; diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index befdaa95c515..740238369a5a 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) struct bch_alloc_v4 { struct bch_val v; - __u64 journal_seq; + __u64 journal_seq_nonempty; __u32 flags; __u8 gen; __u8 oldest_gen; @@ -70,7 +70,7 @@ struct bch_alloc_v4 { __u32 stripe; __u32 nr_external_backpointers; /* end of fields in original version of alloc_v4 */ - __u64 _fragmentation_lru; /* obsolete */ + __u64 journal_seq_empty; __u32 stripe_sectors; __u32 pad; } __packed __aligned(8); From 0939c611cef4c7f917295a42dfbd2f56bc249a34 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Dec 2024 06:00:33 -0500 Subject: [PATCH 186/233] bcachefs: Don't start rewriting btree nodes until after journal replay This fixes a deadlock during journal replay when btree node read errors kick off a ton of rewrites: we don't want them competing with journal replay. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 7d9dab95bdcf..03a6eba7403d 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -2291,7 +2291,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && + bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { From 2a11567a57d5c84dd6bf99e80901d7561b677eb5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 04:11:21 -0500 Subject: [PATCH 187/233] bcachefs: Kill unnecessary mark_lock usage We can't hold mark_lock while calling fsck_err() - that's a deadlock, mark_lock is meant to be a leaf node lock. It's also unnecessary for gc_bucket() and bucket_gen(); rcu suffices since the bucket_gens array describes its size, and we can't race with device removal or resize during gc/fsck since that takes state lock. Reported-by: syzbot+38641fcbda1aaffefdd4@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 4 ---- fs/bcachefs/bcachefs.h | 6 ++--- fs/bcachefs/btree_gc.c | 7 ------ fs/bcachefs/buckets.c | 40 ++++++++++------------------------ fs/bcachefs/buckets.h | 9 ++++---- fs/bcachefs/ec.c | 6 ++--- fs/bcachefs/errcode.h | 1 + fs/bcachefs/super.c | 2 -- 8 files changed, 20 insertions(+), 55 deletions(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 57d5f14c93d0..6df41c331a52 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) return; } - percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - ob->valid = false; ob->data_type = 0; - spin_unlock(&ob->lock); - percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); bch2_open_bucket_hash_remove(c, ob); diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e6cd93e1ed0f..3a3cb79d8518 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -547,15 +547,13 @@ struct bch_dev { /* * Buckets: - * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_gens_lock, for device resize - holding any is sufficient for - * access: Or rcu_read_lock(), but only for dev_ptr_stale(): + * Per-bucket arrays are protected by either rcu_read_lock or + * state_lock, for device resize. */ GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; - struct rw_semaphore bucket_lock; struct bch_dev_usage __percpu *usage; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 24f2f3bdf704..e5ba7d1429b9 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -811,7 +811,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, old = bch2_alloc_to_v4(k, &old_convert); gc = new = *old; - percpu_down_read(&c->mark_lock); __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); old_gc = gc; @@ -822,7 +821,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type = old->data_type; gc.dirty_sectors = old->dirty_sectors; } - percpu_up_read(&c->mark_lock); /* * gc.data_type doesn't yet include need_discard & need_gc_gen states - @@ -840,11 +838,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, * safe w.r.t. transaction restarts, so fixup the gc_bucket so * we don't run it twice: */ - percpu_down_read(&c->mark_lock); struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); gc_m->data_type = gc.data_type; gc_m->dirty_sectors = gc.dirty_sectors; - percpu_up_read(&c->mark_lock); } if (fsck_err_on(new.data_type != gc.data_type, @@ -1088,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - percpu_down_read(&c->mark_lock); rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); @@ -1097,7 +1092,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (dev_ptr_stale(ca, ptr) > 16) { rcu_read_unlock(); - percpu_up_read(&c->mark_lock); goto update; } } @@ -1112,7 +1106,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, *gen = ptr->gen; } rcu_read_unlock(); - percpu_up_read(&c->mark_lock); return 0; update: u = bch2_bkey_make_mut(trans, iter, &k, 0); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index afd35c93fcfb..eb2ed4edbbbc 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -262,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_read(&c->mark_lock); - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) @@ -364,7 +362,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, bch_info(c, "new key %s", buf.buf); } - percpu_up_read(&c->mark_lock); struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); @@ -373,8 +370,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); bch2_trans_iter_exit(trans, &iter); - percpu_down_read(&c->mark_lock); - if (ret) goto err; @@ -382,7 +377,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } err: - percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; } @@ -603,13 +597,12 @@ static int bch2_trigger_pointer(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = -BCH_ERR_trigger_pointer; - goto err_unlock; + goto err; } bucket_lock(g); @@ -617,8 +610,6 @@ static int bch2_trigger_pointer(struct btree_trans *trans, ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new); alloc_to_bucket(g, new); bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); @@ -996,11 +987,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * struct bch_fs *c = trans->c; int ret = 0; - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", ca->dev_idx, bch2_data_type_str(data_type))) - goto err_unlock; + goto err; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); @@ -1010,26 +1000,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) - goto err; + goto err_unlock; if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) - goto err; + goto err_unlock; g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); bucket_unlock(g); - percpu_up_read(&c->mark_lock); ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); return ret; -err: - bucket_unlock(g); err_unlock: - percpu_up_read(&c->mark_lock); + bucket_unlock(g); +err: return -BCH_ERR_metadata_bucket_inconsistency; } @@ -1295,7 +1283,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - BUG_ON(resize && ca->buckets_nouse); + if (resize) + lockdep_assert_held(&c->state_lock); + + if (resize && ca->buckets_nouse) + return -BCH_ERR_no_resize_with_buckets_nouse; bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets), GFP_KERNEL|__GFP_ZERO); @@ -1309,11 +1301,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->nbuckets_minus_first = bucket_gens->nbuckets - bucket_gens->first_bucket; - if (resize) { - down_write(&ca->bucket_lock); - percpu_down_write(&c->mark_lock); - } - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { @@ -1331,11 +1318,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) nbuckets = ca->mi.nbuckets; - if (resize) { - percpu_up_write(&c->mark_lock); - up_write(&ca->bucket_lock); - } - ret = 0; err: if (bucket_gens) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 3bebc4c3044f..a9acdd6c0c86 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b) static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - return genradix_ptr(&ca->buckets_gc, b); + return bucket_valid(ca, b) + ? genradix_ptr(&ca->buckets_gc, b) + : NULL; } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->state_lock) || - lockdep_is_held(&ca->bucket_lock)); + lockdep_is_held(&ca->fs->state_lock)); } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 250e73897d95..45541a101344 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -305,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -BCH_ERR_mark_stripe; - goto err_unlock; + goto err; } bucket_lock(g); @@ -319,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); alloc_to_bucket(g, new); bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); + if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 5e4dd85ac669..a6a9561a890d 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -195,6 +195,7 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ + x(EINVAL, no_resize_with_buckets_nouse) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 14157820705d..2b2e0835c8fe 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1311,8 +1311,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); - init_rwsem(&ca->bucket_lock); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_quantiles_init(&ca->io_latency[READ]); From c5022a702e50321829219339841693ad5d0db035 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 20:55:03 -0500 Subject: [PATCH 188/233] bcachefs: kill sysfs internal/accounting Since we added per-inode counters there's now far too many counters to show in one shot - if we want this in the future, it'll have to be in debugfs. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 26 -------------------------- fs/bcachefs/disk_accounting.h | 1 - fs/bcachefs/sysfs.c | 5 ----- 3 files changed, 32 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b18cbe80936b..22a7db63e50c 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -471,32 +471,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc return ret; } -void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - percpu_down_read(&c->mark_lock); - out->atomic++; - - eytzinger0_for_each(i, acc->k.nr) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); - - bch2_accounting_key_to_text(out, &acc_k); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - prt_str(out, ":"); - for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) - prt_printf(out, " %llu", v[j]); - prt_newline(out); - } - - --out->atomic; - percpu_up_read(&c->mark_lock); -} - static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { darray_for_each(acc->k, e) { diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 0eeaca12c589..2560de10b09d 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -251,7 +251,6 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); -void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); int bch2_gc_accounting_start(struct bch_fs *); int bch2_gc_accounting_done(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 97733c766948..48bc6ad03f09 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -203,7 +203,6 @@ read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); -read_attribute(accounting); read_attribute(usage_base); #define x(t, n, ...) read_attribute(t); @@ -397,9 +396,6 @@ SHOW(bch2_fs) if (attr == &sysfs_alloc_debug) bch2_fs_alloc_debug_to_text(out, c); - if (attr == &sysfs_accounting) - bch2_fs_accounting_to_text(out, c); - if (attr == &sysfs_usage_base) bch2_fs_usage_base_to_text(out, c); @@ -595,7 +591,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_disk_groups, &sysfs_alloc_debug, - &sysfs_accounting, &sysfs_usage_base, NULL }; From 5cd80c5f33629a8f559b2f75a39ef49e782dbd27 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 21:42:49 -0500 Subject: [PATCH 189/233] bcachefs: Use proper errcodes for inode unpack errors Signed-off-by: Kent Overstreet --- fs/bcachefs/errcode.h | 3 +++ fs/bcachefs/inode.c | 12 ++++++------ fs/bcachefs/varint.c | 5 +++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index a6a9561a890d..5d17ceb1e83a 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -196,6 +196,8 @@ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ x(EINVAL, no_resize_with_buckets_nouse) \ + x(EINVAL, inode_unpack_error) \ + x(EINVAL, varint_decode_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -313,6 +315,7 @@ static inline long bch2_err_class(long err) #define BLK_STS_REMOVED ((__force blk_status_t)128) +#include const char *bch2_blk_status_to_str(blk_status_t); #endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 8818e41883f2..f6245b78eb78 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -48,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, u8 *p; if (in >= end) - return -1; + return -BCH_ERR_inode_unpack_error; if (!*in) - return -1; + return -BCH_ERR_inode_unpack_error; /* * position of highest set bit indicates number of bytes: @@ -61,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end, bytes = byte_table[shift - 1]; if (in + bytes > end) - return -1; + return -BCH_ERR_inode_unpack_error; p = (u8 *) be + 16 - bytes; memcpy(p, in, bytes); @@ -177,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return ret; \ \ if (field_bits > sizeof(unpacked->_name) * 8) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ \ unpacked->_name = field[1]; \ in += ret; @@ -218,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v2() @@ -269,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v3() diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index 6a78553d9b0c..6620ecae26af 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -9,6 +9,7 @@ #include #endif +#include "errcode.h" #include "varint.h" /** @@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) u64 v; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { __le64 v_le = 0; @@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { v >>= bytes; From 90ae216d588636a18155188a1ef9626896260600 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 22:00:36 -0500 Subject: [PATCH 190/233] bcachefs: Don't BUG_ON() inode unpack error Bkey validation checks that inodes are well-formed and unpack successfully, so an unpack error should always indicate memory corruption or some other kind of hardware bug - but these are still errors we can recover from. Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 37 +++++++++++++++++++++++++------------ fs/bcachefs/move.c | 4 +++- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 22a33b9ba30d..1b887f332b74 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -458,7 +458,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * continue; struct bch_inode_unpacked child_inode; - bch2_inode_unpack(k, &child_inode); + ret = bch2_inode_unpack(k, &child_inode); + if (ret) + break; if (!inode_should_reattach(&child_inode)) { ret = maybe_delete_dirent(trans, @@ -809,9 +811,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, { struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(inode, &u)); - - return darray_push(&w->inodes, ((struct inode_walker_entry) { + return bch2_inode_unpack(inode, &u) ?: + darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = inode.k->p.snapshot, })); @@ -1065,7 +1066,7 @@ static int get_snapshot_root_inode(struct btree_trans *trans, goto err; BUG(); found_root: - BUG_ON(bch2_inode_unpack(k, root)); + ret = bch2_inode_unpack(k, root); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1096,7 +1097,9 @@ static int check_inode(struct btree_trans *trans, if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + goto err; if (snapshot_root->bi_inum != u.bi_inum) { ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); @@ -1318,7 +1321,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, break; struct bch_inode_unpacked parent_inode; - bch2_inode_unpack(k, &parent_inode); + ret = bch2_inode_unpack(k, &parent_inode); + if (ret) + break; if (!inode_should_reattach(&parent_inode)) break; @@ -1341,7 +1346,9 @@ static int check_unreachable_inode(struct btree_trans *trans, return 0; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + ret = bch2_inode_unpack(k, &inode); + if (ret) + return ret; if (!inode_should_reattach(&inode)) return 0; @@ -2603,14 +2610,16 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino { struct bch_fs *c = trans->c; struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode; struct printbuf buf = PRINTBUF; u32 snapshot = inode_k.k->p.snapshot; int ret = 0; p->nr = 0; - BUG_ON(bch2_inode_unpack(inode_k, &inode)); + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(inode_k, &inode); + if (ret) + return ret; if (!S_ISDIR(inode.bi_mode)) return 0; @@ -2810,7 +2819,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, /* Should never fail, checked by bch2_inode_invalid: */ struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(k, &u)); + _ret3 = bch2_inode_unpack(k, &u); + if (_ret3) + break; /* * Backpointer and directory structure checks are sufficient for @@ -2888,7 +2899,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; if (S_ISDIR(u.bi_mode)) return 0; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6f21e36d89f7..6d38afcaaaab 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -412,7 +412,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, continue; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + _ret3 = bch2_inode_unpack(k, &inode); + if (_ret3) + break; struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); From eccc694e143329be8be7d6c4fd212a9abf6f5987 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 21:47:34 -0500 Subject: [PATCH 191/233] bcachefs: bch2_str_hash_check_key() now checks inode hash info Versions of the same inode in different snapshots must have the same hash info; this is critical for lookups to work correctly. We're going to be running the str_hash checks online, at readdir or xattr list time, so we now need str_hash_check_key() to check for inode hash seed mismatches, since it won't be run right after check_inodes(). Signed-off-by: Kent Overstreet --- fs/bcachefs/fsck.c | 6 +- fs/bcachefs/str_hash.c | 129 ++++++++++++++++++++++++++++++++--------- fs/bcachefs/str_hash.h | 25 ++++++-- 3 files changed, 126 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 1b887f332b74..b8ced64cce2c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1110,7 +1110,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), trans, inode_snapshot_mismatch, - "inodes in different snapshots don't match")) { + "inode hash info in different snapshots don't match")) { u.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); do_update = true; @@ -2303,7 +2303,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2417,7 +2417,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index c3276a7e7324..ed3c852fc0be 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -101,38 +101,108 @@ static int hash_pick_winner(struct btree_trans *trans, } } -int bch2_str_hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) +static int repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != snapshot_root->bi_inum) + break; + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + break; + + if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), + trans, inode_snapshot_mismatch, + "inode hash info in different snapshots don't match")) { + inode.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + break; + } + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * All versions of the same inode in different snapshots must have the same hash + * seed/type: verify that the hash info we're using matches the root + */ +static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found; + } + bch_err(c, "%s(): inum %llu not found", __func__, inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; +found: + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + if (memcmp(hash_info, &hash2, sizeof(hash2))) { + ret = repair_inode_hash_info(trans, &inode); + if (!ret) { + bch_err(c, "inode hash info mismatch with root, but mismatch not found"); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; struct btree_iter iter = { NULL }; struct printbuf buf = PRINTBUF; struct bkey_s_c k; - u64 hash; int ret = 0; - if (hash_k.k->type != desc.key_type) - return 0; - - hash = desc.hash_bkey(hash_info, hash_k); - - if (likely(hash == hash_k.k->p.offset)) - return 0; - + u64 hash = desc->hash_bkey(hash_info, hash_k); if (hash_k.k->p.offset < hash) goto bad_hash; - for_each_btree_key_norestart(trans, iter, desc.btree_id, + for_each_btree_key_norestart(trans, iter, desc->btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), BTREE_ITER_slots, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; - if (k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k)) + if (k.k->type == desc->key_type && + !desc->cmp_bkey(k, hash_k)) goto duplicate_entries; if (bkey_deleted(k.k)) { @@ -145,16 +215,23 @@ int bch2_str_hash_check_key(struct btree_trans *trans, printbuf_exit(&buf); return ret; bad_hash: + /* + * Before doing any repair, check hash_info itself: + */ + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); + if (ret) + goto out; + if (fsck_err(trans, hash_table_key_wrong_offset, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); if (IS_ERR(new)) return PTR_ERR(new); - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, (subvol_inum) { 0, hash_k.k->p.inode }, hash_k.k->p.snapshot, new, STR_HASH_must_create| @@ -166,9 +243,9 @@ int bch2_str_hash_check_key(struct btree_trans *trans, if (k.k) goto duplicate_entries; - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, desc, hash_info, new) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: -BCH_ERR_transaction_restart_nested; goto out; @@ -176,7 +253,7 @@ int bch2_str_hash_check_key(struct btree_trans *trans, fsck_err: goto out; duplicate_entries: - ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); + ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); if (ret < 0) goto out; @@ -192,14 +269,14 @@ int bch2_str_hash_check_key(struct btree_trans *trans, switch (ret) { case 0: - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); break; case 1: - ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); + ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); break; case 2: - ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); + ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); goto out; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 0c20f3af03f8..55a4ac7bf220 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -394,10 +394,25 @@ int bch2_hash_delete(struct btree_trans *trans, } struct snapshots_seen; -int bch2_str_hash_check_key(struct btree_trans *, - struct snapshots_seen *, - const struct bch_hash_desc, - struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c); +int __bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + +static inline int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + if (hash_k.k->type != desc->key_type) + return 0; + + if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) + return 0; + + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); +} #endif /* _BCACHEFS_STR_HASH_H */ From d54b4f311f313e2a926cc9649d3c7a97187d00ee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 22:30:19 -0500 Subject: [PATCH 192/233] bcachefs: bch2_check_key_has_snapshot() prints btree id Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 99f045518312..f65f7b191d31 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_buf.h" +#include "btree_cache.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" @@ -1097,7 +1098,9 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: From 152c28eef5bd508af933704d19146352769dd6e8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 9 Dec 2024 01:31:43 -0500 Subject: [PATCH 193/233] bcachefs: bch2_snapshot_exists() bch2_snapshot_equiv() is going away; convert users that just wanted to know if the snapshot exists to something better Signed-off-by: Kent Overstreet --- fs/bcachefs/data_update.c | 2 +- fs/bcachefs/snapshot.c | 7 ++++--- fs/bcachefs/snapshot.h | 15 +++++++++++++++ fs/bcachefs/subvolume_types.h | 1 + 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 31b2aeb0c6e6..585214931e05 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -620,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans, * and we have to check for this because we go rw before repairing the * snapshots table - just skip it, we can move it later. */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; if (!bkey_get_dev_refs(c, k)) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index f65f7b191d31..ac664888847f 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -318,6 +318,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + t->live = true; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -914,7 +915,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_equiv(c, id)) + if (bch2_snapshot_exists(c, id)) return 0; /* Do we need to reconstruct the snapshot_tree entry as well? */ @@ -1062,7 +1063,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + if (fsck_err_on(!bch2_snapshot_exists(c, *id), trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1095,7 +1096,7 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), + if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index ae23d45fad66..3ff0ffa774f5 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -119,6 +119,21 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } +static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s = snapshot_t(c, id); + return s ? s->live : 0; +} + +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + bool ret = __bch2_snapshot_exists(c, id); + rcu_read_unlock(); + + return ret; +} + static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index f2ec4277c2a5..8a7f7e87c381 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -9,6 +9,7 @@ typedef DARRAY(u32) snapshot_id_list; #define IS_ANCESTOR_BITMAP 128 struct snapshot_t { + bool live; u32 parent; u32 skip[3]; u32 depth; From 42c1d1a9549ce73cc877d785f67e0152cacfd279 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 10:29:12 -0500 Subject: [PATCH 194/233] bcachefs: trace_write_buffer_maybe_flush Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 2 +- fs/bcachefs/btree_write_buffer.c | 8 ++++++++ fs/bcachefs/trace.h | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 702bf62d7fa7..0e3b7b5d626e 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -206,7 +206,7 @@ static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0); } -static int bch2_backpointers_maybe_flush(struct btree_trans *trans, +static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 49ce2d1e5c02..746db6d5a0fb 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -632,6 +632,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + if (trace_write_buffer_maybe_flush_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, referring_k); + trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + bch2_bkey_buf_reassemble(&tmp, c, referring_k); if (bkey_is_btree_ptr(referring_k.k)) { diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 7baf66beee22..11e6547f91d6 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1436,6 +1436,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); +TRACE_EVENT(write_buffer_maybe_flush, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), + TP_ARGS(trans, caller_ip, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) +); + DEFINE_EVENT(fs_str, rebalance_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) From cee2a479acb23e673fdff0ad20041d10a678676c Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 10 Dec 2024 11:12:07 -0700 Subject: [PATCH 195/233] bcachefs: Add empty statement between label and declaration in check_inode_hash_info_matches_root() Clang 18 and newer warns (or errors with CONFIG_WERROR=y): fs/bcachefs/str_hash.c:164:2: error: label followed by a declaration is a C23 extension [-Werror,-Wc23-extensions] 164 | struct bch_inode_unpacked inode; | ^ In Clang 17 and prior, this is an unconditional hard error: fs/bcachefs/str_hash.c:164:2: error: expected expression 164 | struct bch_inode_unpacked inode; | ^ fs/bcachefs/str_hash.c:165:30: error: use of undeclared identifier 'inode' 165 | ret = bch2_inode_unpack(k, &inode); | ^ fs/bcachefs/str_hash.c:169:55: error: use of undeclared identifier 'inode' 169 | struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); | ^ fs/bcachefs/str_hash.c:171:40: error: use of undeclared identifier 'inode' 171 | ret = repair_inode_hash_info(trans, &inode); | ^ Add an empty statement between the label and the declaration to fix the warning/error without disturbing the code too much. Fixes: 2519d3b0d656 ("bcachefs: bch2_str_hash_check_key() now checks inode hash info") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412092339.QB7hffGC-lkp@intel.com/ Signed-off-by: Nathan Chancellor Signed-off-by: Kent Overstreet --- fs/bcachefs/str_hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index ed3c852fc0be..f5977c5c6743 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -160,7 +160,7 @@ static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inu bch_err(c, "%s(): inum %llu not found", __func__, inum); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; -found: +found:; struct bch_inode_unpacked inode; ret = bch2_inode_unpack(k, &inode); if (ret) From bb4ae1459d12972bad07e9197b13d0a3e3e782e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 13:23:47 -0500 Subject: [PATCH 196/233] bcachefs: Refactor c->opts.reconstruct_alloc Now handled in one place. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index a342744fd275..fbef6579d884 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -107,6 +107,12 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) return ret; } +static void kill_btree(struct bch_fs *c, enum btree_id btree) +{ + bch2_btree_id_root(c, btree)->alive = false; + bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); +} + /* for -o reconstruct_alloc: */ static void bch2_reconstruct_alloc(struct bch_fs *c) { @@ -157,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) + if (btree_id_is_alloc(i)) + kill_btree(c, i); } /* @@ -573,9 +572,6 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) - continue; - printbuf_reset(&buf); bch2_btree_id_level_to_text(&buf, i, r->level); @@ -863,15 +859,15 @@ int bch2_fs_recovery(struct bch_fs *c) c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) - bch2_reconstruct_alloc(c); - zero_out_btree_mem_ptr(&c->journal_keys); ret = journal_replay_early(c, clean); if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_reconstruct_alloc(c); + /* * After an unclean shutdown, skip then next few journal sequence * numbers as they may have been referenced by btree writes that From cf44b080f7d9c6c7f7e17a4b3cfc3a7ddf7c0111 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 8 Dec 2024 21:10:27 -0500 Subject: [PATCH 197/233] bcachefs: check_indirect_extents can run online Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 94dc20ca2065..2b3ef3980fc3 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -43,7 +43,7 @@ x(fs_upgrade_for_subvolumes, 22, 0) \ x(check_inodes, 24, PASS_FSCK) \ x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ From 52ee09e70b8a9a04eaad4d9df65a6fcfc5fad43c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 03:38:14 -0500 Subject: [PATCH 198/233] bcachefs: tidy up __bch2_btree_iter_peek() Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 9c54891c737a..368ebcaf05fd 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2260,7 +2260,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + break; } struct btree_path *path = btree_iter_path(trans, iter); @@ -2270,7 +2270,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* No btree nodes at requested level: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } btree_path_set_should_be_locked(trans, path); @@ -2281,10 +2281,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; - ret = bkey_err(k); - if (ret) { + if (bkey_err(k)) { bch2_btree_iter_set_pos(iter, iter->pos); - goto out; + break; } } @@ -2318,12 +2317,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } } -out: - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(iter); return k; } From eece59055ba456b267488446fca8dd2ba56b91e2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 06:02:24 -0500 Subject: [PATCH 199/233] bcachefs: tidy btree_trans_peek_journal() Change to match bch2_btree_trans_peek_updates() calling convention. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 368ebcaf05fd..51ebce9d5b5c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2144,21 +2144,18 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); - + k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } - - return k; } static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, @@ -2175,21 +2172,19 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_prev_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek_prev(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); + k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } - - return k; } /* @@ -2288,7 +2283,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - k = btree_trans_peek_journal(trans, iter, k); + btree_trans_peek_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) @@ -2545,7 +2540,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - k = btree_trans_peek_prev_journal(trans, iter, k); + btree_trans_peek_prev_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) From bf5ec9b976f721411eb8c4962b75f3d5a630c073 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:26:15 -0500 Subject: [PATCH 200/233] bcachefs: Fix btree_trans_peek_key_cache() BTREE_ITER_all_snapshots In BTREE_ITER_all_snapshots mode, we're required to only return keys where the snapshot field matches the iterator position - BTREE_ITER_filter_snapshots requires pulling keys into the key cache from ancestor snapshots, so we have to check for that. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 51ebce9d5b5c..e370fa327769 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2230,6 +2230,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); if (k.k && !bkey_err(k)) { + if ((iter->flags & BTREE_ITER_all_snapshots) && + !bpos_eq(pos, k.k->p)) + return bkey_s_c_null; + iter->k = u; k.k = &iter->k; } From 760fbaf1a8f921d2b413d662d17a8c9056a0bdc4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 05:29:27 -0500 Subject: [PATCH 201/233] bcachefs: Fix key cache + BTREE_ITER_all_snapshots Normally, whitouts (KEY_TYPE_whitout) are filtered from btree lookups, since they exist only to represent deletions of keys in ancestor snapshots - except, they should not be filtered in BTREE_ITER_all_snapshots mode, so that e.g. snapshot deletion can clean them up. This means that that the key cache has to store whiteouts, and key cache fills cannot filter them. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e370fa327769..b27944b62087 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -1854,7 +1854,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * !bkey_eq(path->pos, ck->key.pos)); *u = ck->k->k; - k = bkey_i_to_s_c(ck->k); + k = (struct bkey_s_c) { u, &ck->k->v }; } return k; @@ -2421,7 +2421,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en continue; } - if (bkey_whiteout(k.k)) { + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_key_cache_fill)) { search_key = bkey_successor(iter, k.k->p); continue; } @@ -2781,6 +2782,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; + + if (unlikely(k.k->type == KEY_TYPE_whiteout && + (iter->flags & BTREE_ITER_filter_snapshots) && + !(iter->flags & BTREE_ITER_key_cache_fill))) + iter->k.type = KEY_TYPE_deleted; } else { struct bpos next; struct bpos end = iter->pos; From 395d7f5e2438f5a5f70f7cbcc7b3873629e15d14 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:32:32 -0500 Subject: [PATCH 202/233] bcachefs: alloc_data_type_set() happens in alloc trigger Originally, we ran insert triggers before overwrite so that if an extent was being moved (by fallocate insert/collapse range), the bucket sector count wouldn't hit 0 partway through, and so we don't trigger state changes caused by that too soon. But this is better solved by just moving the data type change to the alloc trigger itself, where it's already called. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 1 - fs/bcachefs/buckets.c | 11 ++++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index e5ba7d1429b9..5aa11ca08c94 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1134,7 +1134,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - alloc_data_type_set(&a_mut->v, a_mut->v.data_type); return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index eb2ed4edbbbc..bbd37b1ed5d2 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -541,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, - struct bch_alloc_v4 *a) + struct bch_alloc_v4 *a, + bool insert) { u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : !p->ptr.cached ? &a->dirty_sectors : @@ -551,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, if (ret) return ret; - - alloc_data_type_set(a, ptr_data_type); + if (insert) + alloc_data_type_set(a, ptr_data_type); return 0; } @@ -585,7 +586,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); if (ret) goto err; @@ -607,7 +608,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new); + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); alloc_to_bucket(g, new); bucket_unlock(g); From 75bca410520b1bd8ec4417d1c266124c5c8b9d76 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:27:52 -0500 Subject: [PATCH 203/233] bcachefs: Don't run overwrite triggers before insert This breaks when the trigger is inserting updates for the same btree, as the inode trigger now does. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_trans_commit.c | 81 +++++++++++++++----------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9011cc3f7190..c3a3bfd11e8c 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, old, flags); } -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) { verify_update_old_key(trans, i); @@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), BTREE_TRIGGER_insert| BTREE_TRIGGER_overwrite|flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { + } else if (!i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { + } else if (!i->insert_trigger_run) { i->insert_trigger_run = true; return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { @@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned btree_id_start) + unsigned *btree_id_updates_start) { - for (int overwrite = 1; overwrite >= 0; --overwrite) { - bool trans_trigger_run; + bool trans_trigger_run; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; - for (unsigned i = btree_id_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id != btree_id) - continue; - - int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; + for (unsigned i = *btree_id_updates_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; + i++) { + if (trans->updates[i].btree_id < btree_id) { + *btree_id_updates_start = i; + continue; } - } while (trans_trigger_run); - } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && + i->btree_id == btree_id && + btree_node_type_has_trans_triggers(i->bkey_type) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); return 0; } static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - unsigned btree_id = 0, btree_id_start = 0; + unsigned btree_id = 0, btree_id_updates_start = 0; int ret = 0; /* @@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->nr_updates && - trans->updates[btree_id_start].btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); + ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); if (ret) return ret; } - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); - if (ret) - return ret; - break; - } - } + btree_id_updates_start = 0; + ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); + if (ret) + return ret; #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) From 46f92a9e9932872336ed485b0152ae560d3e2907 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 02:41:37 -0500 Subject: [PATCH 204/233] bcachefs: Kill equiv_seen arg to delete_dead_snapshots_process_key() When deleting dead snapshots, we move keys from redundant interior snapshot nodes to child nodes - unless there's already a key, in which case the ancestor key is deleted. Previously, we tracked via equiv_seen whether the child snapshot had a key, but this was tricky w.r.t. transaction restarts, and not transactionally safe w.r.t. updates in the child snapshot. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 51 ++++++++++++++---------------------------- 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ac664888847f..ca7e4e975a60 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1421,9 +1421,7 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) + snapshot_id_list *deleted) { int ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret) @@ -1434,24 +1432,10 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ return 0; - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; - if (snapshot_list_has_id(deleted, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - if (!bpos_eq(*last_pos, k.k->p) && - snapshot_list_has_id(equiv_seen, equiv)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - - *last_pos = k.k->p; - - ret = snapshot_list_add_nodup(c, equiv_seen, equiv); - if (ret) - return ret; - /* * When we have a linear chain of snapshot nodes, we consider * those to form an equivalence class: we're going to collapse @@ -1473,20 +1457,23 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, new->k.p.snapshot = equiv; - struct btree_iter new_iter; - bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_cached| - BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&new_iter) ?: - bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &new_iter); + struct btree_iter dst_iter; + struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, + iter->btree_id, new->k.p, + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); + ret = bkey_err(dst_k); if (ret) return ret; + + ret = (bkey_deleted(dst_k.k) + ? bch2_trans_update(trans, &dst_iter, new, + BTREE_UPDATE_internal_snapshot_node) + : 0) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; } return 0; @@ -1648,8 +1635,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; if (!btree_type_has_snapshots(btree)) @@ -1659,11 +1644,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, &deleted, - &equiv_seen, &last_pos)); + delete_dead_snapshots_process_key(trans, &iter, k, &deleted)); bch2_disk_reservation_put(c, &res); - darray_exit(&equiv_seen); bch_err_msg(c, ret, "deleting keys from dying snapshots"); if (ret) From 7c410e21d8bb472455e57fdbf174c6accb3d9a78 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 03:03:58 -0500 Subject: [PATCH 205/233] bcachefs: Snapshot deletion no longer uses snapshot_t->equiv Switch to generating a private list of interior nodes to delete, instead of using the equivalence class in the global data structure. This eliminates possible races with snapshot creation, and is much cleaner - it'll let us delete a lot of janky code for calculating and maintaining the equivalence classes. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 276 ++++++++++++++++++++--------------------- 1 file changed, 137 insertions(+), 139 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ca7e4e975a60..0d60251946f1 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1418,44 +1418,74 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted) -{ - int ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ +static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) +{ + darray_for_each(*l, i) + if (i->id == id) + return i->live_child; + return 0; +} + +static unsigned __live_child(struct snapshot_table *t, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + struct snapshot_t *s = __snapshot_t(t, id); + if (!s) return 0; - if (snapshot_list_has_id(deleted, k.k->p.snapshot)) + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) + if (s->children[i] && + !snapshot_list_has_id(delete_leaves, s->children[i]) && + !interior_delete_has_id(delete_interior, s->children[i])) + return s->children[i]; + + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { + u32 live_child = s->children[i] + ? __live_child(t, s->children[i], delete_leaves, delete_interior) + : 0; + if (live_child) + return live_child; + } + + return 0; +} + +static unsigned live_child(struct bch_fs *c, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + rcu_read_lock(); + u32 ret = __live_child(rcu_dereference(c->snapshots), id, + delete_leaves, delete_interior); + rcu_read_unlock(); + return ret; +} + +static int delete_dead_snapshots_process_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - /* - * When we have a linear chain of snapshot nodes, we consider - * those to form an equivalence class: we're going to collapse - * them all down to a single node, and keep the leaf-most node - - * which has the same id as the equivalence class id. - * - * If there are multiple keys in different snapshots at the same - * position, we're only going to keep the one in the newest - * snapshot (we delete the others above) - the rest have been - * overwritten and are redundant, and for the key we're going to keep we - * need to move it to the equivalance class ID if it's not there - * already. - */ - if (equiv != k.k->p.snapshot) { + u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; - new->k.p.snapshot = equiv; + new->k.p.snapshot = live_child; struct btree_iter dst_iter; struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, @@ -1479,55 +1509,62 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, return 0; } -static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) - return 0; - - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); - - ret = bch2_snapshot_live(trans, children[0]) ?: - bch2_snapshot_live(trans, children[1]); - if (ret < 0) - return ret; - return !ret; -} - /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it * as deleted. */ -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) { - int ret = bch2_snapshot_needs_delete(trans, k); + if (k.k->type != KEY_TYPE_snapshot) + return 0; - return ret <= 0 - ? ret - : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + unsigned live_children = 0; + + if (BCH_SNAPSHOT_SUBVOL(s.v)) + return 0; + + for (unsigned i = 0; i < 2; i++) { + u32 child = le32_to_cpu(s.v->children[i]); + + live_children += child && + !snapshot_list_has_id(delete_leaves, child); + } + + if (live_children == 0) { + return snapshot_list_add(c, delete_leaves, s.k->p.offset); + } else if (live_children == 1) { + struct snapshot_interior_delete d = { + .id = s.k->p.offset, + .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + }; + + if (!d.live_child) { + bch_err(c, "error finding live child of snapshot %u", d.id); + return -EINVAL; + } + + return darray_push(delete_interior, d); + } else { + return 0; + } } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - snapshot_id_list *skip) + interior_delete_list *skip) { rcu_read_lock(); - while (snapshot_list_has_id(skip, id)) + while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); while (n--) { do { id = __bch2_snapshot_parent(c, id); - } while (snapshot_list_has_id(skip, id)); + } while (interior_delete_has_id(skip, id)); } rcu_read_unlock(); @@ -1536,7 +1573,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - snapshot_id_list *deleted) + interior_delete_list *deleted) { struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; @@ -1546,7 +1583,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - if (snapshot_list_has_id(deleted, k.k->p.offset)) + if (interior_delete_has_id(deleted, k.k->p.offset)) return 0; s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); @@ -1555,7 +1592,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return ret; darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); if (!nr_deleted_ancestors) return 0; @@ -1573,7 +1610,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { u32 id = le32_to_cpu(s->v.skip[j]); - if (snapshot_list_has_id(deleted, id)) { + if (interior_delete_has_id(deleted, id)) { id = bch2_snapshot_nth_parent_skip(c, parent, depth > 1 @@ -1592,48 +1629,27 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct btree_trans *trans; - snapshot_id_list deleted = { 0 }; - snapshot_id_list deleted_interior = { 0 }; - int ret = 0; - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - trans = bch2_trans_get(c); + struct btree_trans *trans = bch2_trans_get(c); + snapshot_id_list delete_leaves = {}; + interior_delete_list delete_interior = {}; + int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(trans, k)); - bch_err_msg(c, ret, "deleting redundant snapshots"); - if (ret) - goto err; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(trans, k)); - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); - if (ret) - goto err; - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - if (k.k->type != KEY_TYPE_snapshot) - continue; - - BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) - ? snapshot_list_add(c, &deleted, k.k->p.offset) - : 0; - })); + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, + check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; + if (!delete_leaves.nr && !delete_interior.nr) + goto err; + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { struct disk_reservation res = { 0 }; @@ -1644,7 +1660,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, &deleted)); + delete_dead_snapshots_process_key(trans, &iter, k, + &delete_leaves, + &delete_interior)); bch2_disk_reservation_put(c, &res); @@ -1653,22 +1671,13 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - bch2_trans_unlock(trans); - down_write(&c->snapshot_create_lock); - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - u32 snapshot = k.k->p.offset; - u32 equiv = bch2_snapshot_equiv(c, snapshot); - - equiv != snapshot - ? snapshot_list_add(c, &deleted_interior, snapshot) - : 0; - })); - - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err_create_lock; + darray_for_each(delete_leaves, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) + goto err; + } /* * Fixing children of deleted snapshots can't be done completely @@ -1678,30 +1687,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); if (ret) - goto err_create_lock; + goto err; - darray_for_each(deleted, i) { + darray_for_each(delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch2_snapshot_node_delete(trans, i->id)); + bch_err_msg(c, ret, "deleting snapshot %u", i->id); if (ret) - goto err_create_lock; + goto err; } - - darray_for_each(deleted_interior, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err_create_lock; - } -err_create_lock: - up_write(&c->snapshot_create_lock); err: - darray_exit(&deleted_interior); - darray_exit(&deleted); + darray_exit(&delete_interior); + darray_exit(&delete_leaves); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -1754,24 +1753,23 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } +static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) +{ + /* If there's one child, it's redundant and keys will be moved to the child */ + return !!snap.v->children[0] + !!snap.v->children[1] == 1; +} + static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) { - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot snap; - int ret = 0; - if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = bkey_s_c_to_snapshot(k); + struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); if (BCH_SNAPSHOT_DELETED(snap.v) || - bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || - (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return 0; - } + interior_snapshot_needs_delete(snap)) + set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - return ret; + return 0; } int bch2_snapshots_read(struct bch_fs *c) From 5d9b21a555e0df44048c16aa0b1b918883fb68c8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 04:03:32 -0500 Subject: [PATCH 206/233] bcachefs: Kill snapshot_t->equiv Now entirely dead code. Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 100 +++------------------------------- fs/bcachefs/snapshot.h | 15 ----- fs/bcachefs/subvolume_types.h | 1 - 3 files changed, 7 insertions(+), 109 deletions(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 0d60251946f1..f975f2cf3e35 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -280,23 +280,6 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, return ret; } -static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - struct snapshot_t *t = snapshot_t_mut(c, id); - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); -} - -static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - __set_is_ancestor_bitmap(c, id); - mutex_unlock(&c->snapshot_table_lock); -} - static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -337,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } - __set_is_ancestor_bitmap(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); @@ -367,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot, s); } -static int bch2_snapshot_live(struct btree_trans *trans, u32 id) -{ - struct bch_snapshot v; - int ret; - - if (!id) - return 0; - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot node %u not found", id); - if (ret) - return ret; - - return !BCH_SNAPSHOT_DELETED(&v); -} - -/* - * If @k is a snapshot with just one live child, it's part of a linear chain, - * which we consider to be an equivalence class: and then after snapshot - * deletion cleanup, there should only be a single key at a given position in - * this equivalence class. - * - * This sets the equivalence class of @k to be the child's equivalence class, if - * it's part of such a linear chain: this correctly sets equivalence classes on - * startup if we run leaf to root (i.e. in natural key order). - */ -static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - unsigned i, nr_live = 0, live_idx = 0; - struct bkey_s_c_snapshot snap; - u32 id = k.k->p.offset, child[2]; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - int ret = bch2_snapshot_live(trans, child[i]); - - if (ret < 0) - return ret; - - if (ret) - live_idx = i; - nr_live += ret; - } - - mutex_lock(&c->snapshot_table_lock); - - snapshot_t_mut(c, id)->equiv = nr_live == 1 - ? snapshot_t_mut(c, child[live_idx])->equiv - : id; - - mutex_unlock(&c->snapshot_table_lock); - - return 0; -} - /* fsck: */ static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) @@ -964,8 +887,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: - bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -1309,10 +1231,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; new_snapids[i] = iter.pos.offset; - - mutex_lock(&c->snapshot_table_lock); - snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; - mutex_unlock(&c->snapshot_table_lock); } err: bch2_trans_iter_exit(trans, &iter); @@ -1778,11 +1696,7 @@ int bch2_snapshots_read(struct bch_fs *c) for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(trans, k) ?: - bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); + bch2_check_snapshot_needs_deletion(trans, k))); bch_err_fn(c, ret); /* diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 3ff0ffa774f5..00373cf32e7b 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -134,21 +134,6 @@ static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) return ret; } -static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->equiv : 0; -} - -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -{ - rcu_read_lock(); - id = __bch2_snapshot_equiv(c, id); - rcu_read_unlock(); - - return id; -} - static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 8a7f7e87c381..1549d6daf7af 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -16,7 +16,6 @@ struct snapshot_t { u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; - u32 equiv; unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; }; From 50dd5a0edf33ff18f0672a3a2ab7b285161ec1ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 00:44:28 -0500 Subject: [PATCH 207/233] bcachefs: bch2_trans_log_msg() Export a helper for logging to the journal when we're already in a transaction context. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update.c | 13 ++++++++++--- fs/bcachefs/btree_update.h | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 06fd5aa62296..a4b70e3fe4c3 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -823,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) { + unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); + prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); + + int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; @@ -862,7 +869,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, - __bch2_trans_log_msg(trans, &buf, u64s)); + bch2_trans_log_msg(trans, &buf)); } err: printbuf_exit(&buf); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 58df20194306..8f22ef9a7651 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -159,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); From dd0d1ff378c1d32a9c2ccab19b98db720657f36d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 04:00:40 -0500 Subject: [PATCH 208/233] bcachefs: Log message in journal for snapshot deletion Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index f975f2cf3e35..0dafadc1bcf2 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1568,6 +1568,22 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) if (!delete_leaves.nr && !delete_interior.nr) goto err; + { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "deleting leaves"); + darray_for_each(delete_leaves, i) + prt_printf(&buf, " %u", *i); + + prt_printf(&buf, " interior"); + darray_for_each(delete_interior, i) + prt_printf(&buf, " %u->%u", i->id, i->live_child); + + ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); + printbuf_exit(&buf); + if (ret) + goto err; + } + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { struct disk_reservation res = { 0 }; From 0694b43ff91f13b81474ff335b00b560dcb94ea9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 05:43:00 -0500 Subject: [PATCH 209/233] bcachefs: trace_key_cache_fill Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 10 ++++++++++ fs/bcachefs/trace.h | 27 ++++++++++++++++++++++----- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 3bd40ea0fa3d..4eba2871f289 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -309,6 +309,16 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, ret = btree_key_cache_create(trans, ck_path, k); if (ret) goto err; + + if (trace_key_cache_fill_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); + } out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 11e6547f91d6..9d40b7d4ea29 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1338,6 +1338,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1374,10 +1380,21 @@ TRACE_EVENT(path_downgrade, __entry->pos_snapshot) ); -DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) +TRACE_EVENT(key_cache_fill, + TP_PROTO(struct btree_trans *trans, const char *key), + TP_ARGS(trans, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %s", __entry->trans_fn, __get_str(key)) ); TRACE_EVENT(write_buffer_flush, @@ -1443,7 +1460,7 @@ TRACE_EVENT(write_buffer_maybe_flush, TP_STRUCT__entry( __array(char, trans_fn, 32 ) __field(unsigned long, caller_ip ) - __string(key, key ) + __string(key, key ) ), TP_fast_assign( From 8062b348614c13b0be9ec57fc78effa59318baf9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 13 Dec 2024 05:58:34 -0500 Subject: [PATCH 210/233] bcachefs: bch2_btree_path_peek_slot() doesn't return errors Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b27944b62087..a1c5fcced24e 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2229,14 +2229,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (k.k && !bkey_err(k)) { - if ((iter->flags & BTREE_ITER_all_snapshots) && - !bpos_eq(pos, k.k->p)) - return bkey_s_c_null; + if (!k.k) + return k; - iter->k = u; - k.k = &iter->k; - } + if ((iter->flags & BTREE_ITER_all_snapshots) && + !bpos_eq(pos, k.k->p)) + return bkey_s_c_null; + + iter->k = u; + k.k = &iter->k; return k; } From ab7eb8e365936471733a4ea529818354132e4bd2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 16 Nov 2024 23:53:07 -0500 Subject: [PATCH 211/233] bcachefs: bcachefs_metadata_version_backpointer_bucket_gen New on disk format version: backpointers new include the generation number of the bucket they refer to, and the obsolete bucket_offset field (no longer needed because we no longer store backpointers in alloc keys) is gone. This is an expensive forced upgrade - hopefully the last; we have to run the extents_to_backpointers recovery pass to regenerate backpointers. It's a forced incompatible upgrade because the alternative would've been permamently making backpointers bigger, and as one of the biggest btrees (along with the extents btree) that's not an ideal option. It's worth it though, because this allows us to make the check_extents_to_backpointers pass drastically cheaper: an upcoming patch changes it to sum up backpointers in a bucket and check the sum against the sector counts for that bucket, only looking for missing backpointers if they don't match (and then only for specific buckets). Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 27 +++++---------------------- fs/bcachefs/backpointers.h | 2 +- fs/bcachefs/bcachefs_format.h | 6 ++++-- fs/bcachefs/sb-downgrade.c | 15 +++++++++++++-- 4 files changed, 23 insertions(+), 27 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 0e3b7b5d626e..b19719b02df8 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -28,23 +28,6 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, c, backpointer_dev_bad, "backpointer for BCH_SB_MEMBER_INVALID"); - - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (!ca) { - /* these will be caught by fsck */ - rcu_read_unlock(); - return 0; - } - - struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); - struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); - rcu_read_unlock(); - - bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bp_pos), - c, backpointer_bucket_offset_wrong, - "backpointer bucket_offset wrong (%llu)", (u64) bp.v->bucket_offset); fsck_err: return ret; } @@ -59,16 +42,17 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke u32 bucket_offset; struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); - prt_printf(out, "bucket=%llu:%llu:%u", bucket.inode, bucket.offset, bucket_offset); + prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); - prt_printf(out, "sector=%llu:%llu", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); + prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); - prt_printf(out, " suboffset=%u len=%u pos=", + prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp.v->bucket_len); + bp.v->bucket_len, + bp.v->bucket_gen); bch2_bpos_to_text(out, bp.v->pos); } @@ -76,7 +60,6 @@ void bch2_backpointer_swab(struct bkey_s k) { struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - bp.v->bucket_offset = swab40(bp.v->bucket_offset); bp.v->bucket_len = swab32(bp.v->bucket_len); bch2_bpos_swab(&bp.v->pos); } diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 95caeabb8978..caffc68407ab 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -158,7 +158,7 @@ static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, .btree_id = btree_id, .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_offset = bp_bucket_offset, + .bucket_gen = p.ptr.gen, .bucket_len = sectors, .pos = k.k->p, }; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index dc14bfe37e3b..e4bb74d6f439 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -463,7 +463,8 @@ struct bch_backpointer { __u8 btree_id; __u8 level; __u8 data_type; - __u64 bucket_offset:40; + __u8 bucket_gen; + __u32 pad; __u32 bucket_len; struct bpos pos; } __packed __aligned(8); @@ -677,7 +678,8 @@ struct bch_sb_field_ext { x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ - x(inode_has_child_snapshots, BCH_VERSION(1, 13)) + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ + x(backpointer_bucket_gen, BCH_VERSION(1, 14)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 8767c33c2b51..9879845413a6 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -81,7 +81,11 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(inode_has_child_snapshots, \ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_child_snapshots_wrong) + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -117,7 +121,14 @@ BCH_FSCK_ERR_bkey_version_in_future) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) struct upgrade_downgrade_entry { u64 recovery_passes; From c6b74e6733a136501032c13c3d762f8896b0cd24 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 29 Nov 2024 17:41:43 -0500 Subject: [PATCH 212/233] bcachefs: bcachefs_metadata_version_disk_accounting_big_endian Fix sort order for disk accounting keys, in order to fix a regression on mount times. The typetag is now the most significant byte of the key, meaning disk accounting keys of the same type now sort together. This lets us skip over disk accounting keys that aren't mirrored in memory when reading accounting at startup, instead of having them interleaved with other counter types. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/disk_accounting.c | 22 +++++++++++++++++----- fs/bcachefs/disk_accounting.h | 18 +++++++++++------- fs/bcachefs/sb-downgrade.c | 14 ++++++++++++-- fs/bcachefs/util.h | 9 +++++++++ 5 files changed, 51 insertions(+), 15 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index e4bb74d6f439..cef22c15c256 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -679,7 +679,8 @@ struct bch_sb_field_ext { x(disk_accounting_inum, BCH_VERSION(1, 11)) \ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ - x(backpointer_bucket_gen, BCH_VERSION(1, 14)) + x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ + x(disk_accounting_big_endian, BCH_VERSION(1, 15)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 22a7db63e50c..72c8dcb9226f 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -698,8 +698,11 @@ int bch2_accounting_read(struct bch_fs *c) percpu_memset(c->usage, 0, sizeof(*c->usage)); percpu_up_write(&c->mark_lock); - int ret = for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + iter.flags &= ~BTREE_ITER_with_journal; + int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); @@ -710,8 +713,14 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); - if (!bch2_accounting_is_mem(acc_k)) + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; + } accounting_read_key(trans, k); })); @@ -896,10 +905,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c) bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; + break; - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; + } bch2_accounting_mem_read(c, k.k->p, v, nr); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 2560de10b09d..fc1b673689c8 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -63,20 +63,24 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) { - acc->_pad = p; + BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - bch2_bpos_swab(&acc->_pad); + acc->_pad = p; +#else + memcpy_swab(acc, &p, sizeof(p)); #endif } -static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) +static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) { - struct bpos ret = k->_pad; - + struct bpos p; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - bch2_bpos_swab(&ret); + p = acc->_pad; +#else + memcpy_swab(&p, acc, sizeof(p)); #endif - return ret; + return p; } int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 9879845413a6..051214fdc735 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -85,7 +85,12 @@ x(backpointer_bucket_gen, \ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -128,7 +133,12 @@ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ - BCH_FSCK_ERR_ptr_to_missing_backpointer) + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) struct upgrade_downgrade_entry { u64 recovery_passes; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 5e4820c8fa44..c292b9ce8240 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -709,4 +709,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr) return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; } +static inline void memcpy_swab(void *_dst, void *_src, size_t len) +{ + u8 *dst = _dst + len; + u8 *src = _src; + + while (len--) + *--dst = *src++; +} + #endif /* _BCACHEFS_UTIL_H */ From 70e1e1af77787dbbfc559cdab323c44f7bc68ba5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 23:58:21 -0500 Subject: [PATCH 213/233] bcachefs: bch2_extent_ptr_to_bp() no longer depends on device bch_backpointer no longer contains the bucket_offset field, it's just a direct LBA mapping (with low bits to account for compressed extent splitting), so we don't need to refer to the device to construct it anymore. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 26 ++++---------------------- fs/bcachefs/backpointers.h | 17 ++++++----------- fs/bcachefs/buckets.c | 7 ++++--- 3 files changed, 14 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index b19719b02df8..98d89133fc75 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -73,26 +73,14 @@ static bool extent_matches_bp(struct bch_fs *c, const union bch_extent_entry *entry; struct extent_ptr_decoded p; - rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket2; struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); - if (p.ptr.cached) - continue; - - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (!ca) - continue; - - bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); if (bpos_eq(bp.k->p, bp2.k.p) && - !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) { - rcu_read_unlock(); + !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) return true; - } } - rcu_read_unlock(); return false; } @@ -586,21 +574,15 @@ static int check_extent_to_backpointers(struct btree_trans *trans, ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos; struct bkey_i_backpointer bp; if (p.ptr.cached) continue; - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (ca) - bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); - rcu_read_unlock(); - - if (!ca) + if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); ret = check_bp_exists(trans, s, &bp, k); if (ret) return ret; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index caffc68407ab..d126d40dda99 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -140,20 +140,15 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, +static inline void __bch2_extent_ptr_to_bp( enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket, struct bkey_i_backpointer *bp, + struct bkey_i_backpointer *bp, u64 sectors) { - u32 bucket_offset; - *bucket = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); - - u64 bp_bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; - bkey_backpointer_init(&bp->k_i); - bp->k.p = bucket_pos_to_bp(ca, *bucket, bp_bucket_offset); + bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, @@ -164,15 +159,15 @@ static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, }; } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bkey_i_backpointer *bp) + struct bkey_i_backpointer *bp) { u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); - __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); + __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, bp, sectors); } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index bbd37b1ed5d2..30b983cf9780 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -572,6 +572,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); *sectors = insert ? abs_sectors : -abs_sectors; + struct bkey_i_backpointer bp; + __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, &bp, abs_sectors); + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) @@ -579,9 +582,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, goto err; } - struct bpos bucket; - struct bkey_i_backpointer bp; - __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); From 78daf5eaab64b6d7adda12932102d22ea37f75e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 17 Nov 2024 18:37:41 -0500 Subject: [PATCH 214/233] bcachefs: kill __bch2_extent_ptr_to_bp() Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.h | 18 +++--------------- fs/bcachefs/buckets.c | 7 +++---- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index d126d40dda99..65ede8adbc36 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -140,12 +140,11 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void __bch2_extent_ptr_to_bp( +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bkey_i_backpointer *bp, - u64 sectors) + struct bkey_i_backpointer *bp) { bkey_backpointer_init(&bp->k_i); bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); @@ -154,22 +153,11 @@ static inline void __bch2_extent_ptr_to_bp( .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), .bucket_gen = p.ptr.gen, - .bucket_len = sectors, + .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p), .pos = k.k->p, }; } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - struct bkey_i_backpointer *bp) -{ - u64 sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); - - __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, bp, sectors); -} - struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, struct btree_iter *, unsigned); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 30b983cf9780..56d3e3800a89 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -569,11 +569,10 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); - *sectors = insert ? abs_sectors : -abs_sectors; - struct bkey_i_backpointer bp; - __bch2_extent_ptr_to_bp(btree_id, level, k, p, entry, &bp, abs_sectors); + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); + + *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { From 9364e11cb32472226fe34188bc443ae38ec2963d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 22:13:29 -0500 Subject: [PATCH 215/233] bcachefs: Add write buffer flush param to backpointer_get_key() In an upcoming patch bch2_backpointer_get_key() will be repairing when it finds a dangling backpointer; it will need to flush the btree write buffer before it can definitively say there's an error. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 12 +++++++----- fs/bcachefs/backpointers.h | 5 +++-- fs/bcachefs/ec.c | 14 ++++++++++---- fs/bcachefs/move.c | 8 ++++++-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 98d89133fc75..d2f0b3140983 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -218,7 +218,8 @@ static void backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, - unsigned iter_flags) + unsigned iter_flags, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -245,7 +246,7 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, backpointer_target_not_found(trans, bp, k); return bkey_s_c_null; } else { - struct btree *b = bch2_backpointer_get_node(trans, bp, iter); + struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); if (IS_ERR_OR_NULL(b)) { bch2_trans_iter_exit(trans, iter); @@ -257,7 +258,8 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bkey_s_c_backpointer bp, - struct btree_iter *iter) + struct btree_iter *iter, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -486,7 +488,7 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, &s->last_flushed); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -866,7 +868,7 @@ static int check_one_backpointer(struct btree_trans *trans, return 0; struct btree_iter iter; - struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0); + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); int ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 65ede8adbc36..060dad1521ee 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -158,10 +158,11 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, }; } +struct bkey_buf; struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, unsigned); + struct btree_iter *, unsigned, struct bkey_buf *); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *); + struct btree_iter *, struct bkey_buf *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 45541a101344..b211e90ac54e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -1274,7 +1274,8 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - struct bkey_s_c_backpointer bp) + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; @@ -1291,7 +1292,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, bp, &node_iter); + b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); bch2_trans_iter_exit(trans, &node_iter); if (!b) @@ -1305,7 +1306,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, return -EIO; } - k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent); + k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); ret = bkey_err(k); if (ret) return ret; @@ -1372,6 +1373,10 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket_pos), bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, @@ -1385,9 +1390,10 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b continue; ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - bkey_s_c_to_backpointer(bp_k)); + bkey_s_c_to_backpointer(bp_k), &last_flushed); })); + bch2_bkey_buf_exit(&last_flushed, c); bch2_dev_put(ca); return ret; } diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 6d38afcaaaab..184620d5a3b4 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -677,6 +677,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c k; struct data_update_opts data_opts; unsigned sectors_moved = 0; + struct bkey_buf last_flushed; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); @@ -685,6 +686,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, trace_bucket_evacuate(c, &bucket); + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); bch2_bkey_buf_init(&sk); /* @@ -726,7 +729,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); if (!bp.v->level) { - k = bch2_backpointer_get_key(trans, bp, &iter, 0); + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -784,7 +787,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, } else { struct btree *b; - b = bch2_backpointer_get_node(trans, bp, &iter); + b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) goto next; @@ -822,6 +825,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_iter_exit(trans, &bp_iter); bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); + bch2_bkey_buf_exit(&last_flushed, c); return ret; } From cbe8afdbcda6994f7fec5d0d019c7c30c4c18d75 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 15 Nov 2024 16:31:54 -0500 Subject: [PATCH 216/233] bcachefs: check_extents_to_backpointers() now only checks buckets with mismatches Instead of walking every extent and every backpointer it points to, first sum up backpointers in each bucket and check for mismatches, and only look for missing backpointers if mismatches were detected, and only check extents in those buckets. This is a major fsck scalability improvement, since the two backpointers passes (backpointers -> extents and extents -> backpointers) are the most expensive fsck passes by far. Additionally, to speed up the upgrade for backpointer bucket gens, or in situations when we have to rebuild alloc info, add a special case for when no backpointers are found in a bucket - don't check each individual backpointer (in particular, avoiding the write buffer flushes), just recreate them. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 339 +++++++++++++++++++++++++++++++++++-- fs/bcachefs/bcachefs.h | 3 + fs/bcachefs/btree_cache.c | 1 - fs/bcachefs/errcode.h | 1 + 4 files changed, 325 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index d2f0b3140983..892939763c3a 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -569,25 +569,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - int ret; - ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bkey_i_backpointer bp; - if (p.ptr.cached) continue; if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - ret = check_bp_exists(trans, s, &bp, k); - if (ret) - return ret; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); + bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + rcu_read_unlock(); + + if (check || empty) { + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + + int ret = check + ? check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; + } } return 0; @@ -796,26 +804,295 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } +enum alloc_sector_counter { + ALLOC_dirty, + ALLOC_cached, + ALLOC_stripe, + ALLOC_SECTORS_NR +}; + +static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +{ + switch (t) { + case BCH_DATA_btree: + case BCH_DATA_user: + return ALLOC_dirty; + case BCH_DATA_cached: + return ALLOC_cached; + case BCH_DATA_stripe: + return ALLOC_stripe; + default: + BUG(); + } +} + +static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); + +static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + bool need_commit = false; + + if (a->data_type == BCH_DATA_sb || + a->data_type == BCH_DATA_journal || + a->data_type == BCH_DATA_parity) + return 0; + + u32 sectors[ALLOC_SECTORS_NR]; + memset(sectors, 0, sizeof(sectors)); + + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); + if (!ca) + return 0; + + struct btree_iter iter; + struct bkey_s_c bp_k; + int ret = 0; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, alloc_k.k->p), + bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && + (bp.v->bucket_gen != a->gen || + bp.v->pad)) { + ret = bch2_backpointer_del(trans, bp_k.k->p); + if (ret) + break; + + need_commit = true; + continue; + } + + if (bp.v->bucket_gen != a->gen) + continue; + + sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + }; + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (need_commit) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + } + + /* Cached pointers don't have backpointers: */ + + if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_stripe] != a->stripe_sectors) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { + ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); + if (ret) + goto err; + } + + if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_stripe] > a->stripe_sectors) { + ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: + -BCH_ERR_transaction_restart_nested; + goto err; + } + + if (!sectors[ALLOC_dirty] && + !sectors[ALLOC_stripe]) + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); + else + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + } +err: + bch2_dev_put(ca); + return ret; +} + +static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr_v2: { + bool ret = false; + + rcu_read_lock(); + struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; + while (pos.inode <= k.k->p.inode) { + if (pos.inode >= c->sb.nr_devices) + break; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); + if (!ca) + goto next; + + struct bpos bucket = bp_pos_to_bucket(ca, pos); + bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, + ca->mi.nbuckets, bucket.offset); + if (bucket.offset == ca->mi.nbuckets) + goto next; + + ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); + if (ret) + break; +next: + pos = SPOS(pos.inode + 1, 0, 0); + } + rcu_read_unlock(); + + return ret; + } + case KEY_TYPE_btree_ptr: + return true; + default: + return false; + } +} + +static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, + enum btree_id btree, unsigned level) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + if (b) + bch2_node_pin(trans->c, b); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + struct bkey_buf tmp; + bch2_bkey_buf_init(&tmp); + + bch2_btree_cache_unpin(c); + + *end = SPOS_MAX; + + s64 mem_may_pin = mem_may_pin_bytes(c); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); + })); + if (ret) + return ret; + + struct bpos pinned = SPOS_MAX; + mem_may_pin = mem_may_pin_bytes(c); + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) { + *end = pinned; + break; + } + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); + + if (!ret2) + pinned = tmp.k->k.p; + + ret; + })); + if (ret) + return ret; + + return ret; +} + int bch2_check_extents_to_backpointers(struct bch_fs *c) { + int ret = 0; + + /* + * Can't allow devices to come/go/resize while we have bucket bitmaps + * allocated + */ + lockdep_assert_held(&c->state_lock); + + for_each_member_device(c, ca) { + BUG_ON(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + if (!ca->bucket_backpointer_mismatches || + !ca->bucket_backpointer_empty) { + bch2_dev_put(ca); + ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + goto err_free_bitmaps; + } + } + struct btree_trans *trans = bch2_trans_get(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; - int ret; bch2_bkey_buf_init(&s.last_flushed); bkey_init(&s.last_flushed.k->k); + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_prefetch, k, ({ + check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + })); + if (ret) + goto err; + + u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; + for_each_member_device(c, ca) { + nr_buckets += ca->mi.nbuckets; + nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); + nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + } + + if (!nr_mismatches && !nr_empty) + goto err; + + bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", + nr_mismatches + nr_empty, nr_buckets); + while (1) { - struct bbpos end; - ret = bch2_get_btree_in_memory_pos(trans, - BIT_ULL(BTREE_ID_backpointers), - BIT_ULL(BTREE_ID_backpointers), - BBPOS(BTREE_ID_backpointers, s.bp_start), &end); + ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); if (ret) break; - s.bp_end = end.pos; - if ( bpos_eq(s.bp_start, POS_MIN) && !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -840,10 +1117,17 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) s.bp_start = bpos_successor(s.bp_end); } +err: bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); +err_free_bitmaps: + for_each_member_device(c, ca) { + kvfree(ca->bucket_backpointer_empty); + ca->bucket_backpointer_empty = NULL; + kvfree(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = NULL; + } bch_err_fn(c, ret); return ret; @@ -895,6 +1179,25 @@ static int check_one_backpointer(struct btree_trans *trans, return ret; } +static int check_bucket_backpointers_to_extents(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket) +{ + u32 restart_count = trans->restart_count; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket), + bucket_pos_to_bp_end(ca, bucket), + 0, k, + check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) + ); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret ?: trans_was_restarted(trans, restart_count); +} + static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 3a3cb79d8518..7b0959fb35dd 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -555,6 +555,9 @@ struct bch_dev { u8 *oldest_gen; unsigned long *buckets_nouse; + unsigned long *bucket_backpointer_mismatches; + unsigned long *bucket_backpointer_empty; + struct bch_dev_usage __percpu *usage; /* Allocator: */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 1117be901cf0..b00c6a20be27 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -222,7 +222,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) struct btree_cache *bc = &c->btree_cache; mutex_lock(&bc->lock); - BUG_ON(!__btree_node_pinned(bc, b)); if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { set_btree_node_pinned(b); list_move(&b->list, &bc->live[1].list); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 5d17ceb1e83a..c0df2587a580 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,6 +54,7 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ x(EIO, compression_workspace_not_initialized) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ From 19b148fc65a3eb2d583738f9f4390400c80f2419 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 12 Nov 2024 03:46:31 -0500 Subject: [PATCH 217/233] bcachefs: bch2_backpointer_get_key() now repairs dangling backpointers Continuing on with the self healing theme, we should be running any check and repair code at runtime that we can - instead of declaring the filesystemt inconsistent. This will also let us skip running the backpointers -> extents fsck pass except in debug mode. Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 70 +++++++++++++++----------------------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 892939763c3a..5cc4aaa3a325 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -172,9 +172,10 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - return likely(!bch2_backpointers_no_use_write_buffer) - ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) - : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0); + return (likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, @@ -186,20 +187,25 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, : 0; } -static void backpointer_target_not_found(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct bkey_s_c target_k) +static int backpointer_target_not_found(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_s_c target_k, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + int ret = 0; /* * If we're using the btree write buffer, the backpointer we were * looking at may have already been deleted - failure to find what it * pointed to is not an error: */ - if (likely(!bch2_backpointers_no_use_write_buffer)) - return; + ret = last_flushed + ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) + : 0; + if (ret) + return ret; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.v->level ? "btree node" : "extent"); @@ -207,12 +213,13 @@ static void backpointer_target_not_found(struct btree_trans *trans, prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, target_k); - if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) - bch_err_ratelimited(c, "%s", buf.buf); - else - bch2_trans_inconsistent(trans, "%s", buf.buf); + if (fsck_err(trans, backpointer_to_missing_ptr, + "%s", buf.buf)) + ret = bch2_backpointer_del(trans, bp.k->p); +fsck_err: printbuf_exit(&buf); + return ret; } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, @@ -243,15 +250,13 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return k; bch2_trans_iter_exit(trans, iter); - backpointer_target_not_found(trans, bp, k); - return bkey_s_c_null; + int ret = backpointer_target_not_found(trans, bp, k, last_flushed); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + if (IS_ERR_OR_NULL(b)) + return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - if (IS_ERR_OR_NULL(b)) { - bch2_trans_iter_exit(trans, iter); - return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; - } return bkey_i_to_s_c(&b->key); } } @@ -284,8 +289,8 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key)); - b = NULL; + int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); + b = ret ? ERR_PTR(ret) : NULL; } err: bch2_trans_iter_exit(trans, iter); @@ -488,7 +493,7 @@ static int check_bp_exists(struct btree_trans *trans, struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, &s->last_flushed); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -1143,9 +1148,7 @@ static int check_one_backpointer(struct btree_trans *trans, return 0; struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - struct bch_fs *c = trans->c; struct bbpos pos = bp_to_bbpos(*bp.v); - struct printbuf buf = PRINTBUF; if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) @@ -1159,23 +1162,7 @@ static int check_one_backpointer(struct btree_trans *trans, if (ret) return ret; - if (!k.k) { - ret = bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? "btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_backpointer_del(trans, bp.k->p); - goto out; - } - } -out: -fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } @@ -1210,9 +1197,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, bkey_init(&last_flushed.k->k); progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_prefetch, k, ({ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); check_one_backpointer(trans, start, end, k, &last_flushed); })); From ae7a39471902965e7b18dcc675e1741d9c7e9a95 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 14:04:39 -0500 Subject: [PATCH 218/233] bcachefs: better backpointer_target_not_found() error message Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 5cc4aaa3a325..b93ddfa00fdd 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -210,10 +210,21 @@ static int backpointer_target_not_found(struct btree_trans *trans, prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", bp.v->level ? "btree node" : "extent"); bch2_bkey_val_to_text(&buf, c, bp.s_c); - prt_printf(&buf, "\n "); + prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, target_k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) + if (p.ptr.dev == bp.k->p.inode) { + prt_printf(&buf, "\n "); + struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); + } + if (fsck_err(trans, backpointer_to_missing_ptr, "%s", buf.buf)) ret = bch2_backpointer_del(trans, bp.k->p); From a33c661174e055fb13192e13fd70a6a2eb047e49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Nov 2024 20:47:32 -0500 Subject: [PATCH 219/233] bcachefs: Only run check_backpointers_to_extents in debug mode The backpointers passes, check_backpointers_to_extents() and check_extents_to_backpointers() are the most expensive fsck passes. Now that we're running the same check and repair code when using a backpointer at runtime (via bch2_backpointer_get_key()) that fsck does, there's no reason fsck needs to - except to verify that the filesystem really has no errors in debug mode. Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes_types.h | 92 +++++++++++++++-------------- fs/bcachefs/sb-errors_format.h | 4 +- 2 files changed, 51 insertions(+), 45 deletions(-) diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 2b3ef3980fc3..71baad41d8c5 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -8,53 +8,59 @@ #define PASS_ALWAYS BIT(3) #define PASS_ONLINE BIT(4) +#ifdef CONFIG_BCACHEFS_DEBUG +#define PASS_FSCK_DEBUG BIT(1) +#else +#define PASS_FSCK_DEBUG 0 +#endif + /* * Passes may be reordered, but the second field is a persistent identifier and * must never change: */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ +#define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ + x(scan_for_btree_nodes, 37, 0) \ + x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, PASS_ALWAYS) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_allocations, 5, PASS_FSCK) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ + x(fix_reflink_p, 33, 0) \ + x(set_fs_needs_rebalance, 34, 0) /* We normally enumerate recovery passes in the order we run them: */ enum bch_recovery_pass { diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 0bc4cec2926c..806486635075 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -140,8 +140,8 @@ enum bch_fsck_flags { x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ x(backpointer_dev_bad, 297, 0) \ - x(backpointer_to_missing_device, 126, 0) \ - x(backpointer_to_missing_alloc, 127, 0) \ + x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ + x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ From a06f09e44c8c4e82680b58ed6c7c8048283a0466 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Nov 2024 21:50:29 -0500 Subject: [PATCH 220/233] bcachefs: BCH_SB_VERSION_INCOMPAT We've been getting away from feature bits: they don't have any kind of ordering, and thus it's possible for people to enable weird combinations of features that were never tested or intended to be run. Much better to just give every new feature, compatible or incompatible, a version number. Additionally, we probably won't ever rev the major version number: major version numbers represent incompatible versions, but that doesn't really fit with how we actually roll out incompatible features - we need a better way of rolling out incompatible features. So, this patch adds two new superblock fields: - BCH_SB_VERSION_INCOMPAT - BCH_SB_VERSION_INCOMPAT_ALLOWED BCH_SB_VERSION_INCOMPAT_ALLOWED indicates that incompatible features up to version number x are allowed to be used without user prompting, but it does not by itself deny old versions from mounting. BCH_SB_VERSION_INCOMPAT does deny old versions from mounting, and must be <= BCH_SB_VERSION_INCOMPAT_ALLOWED. BCH_SB_VERSION_INCOMPAT will only be set when a codepath attempts to use an incompatible feature, so as to not unnecessarily break compatibility with old versions. bch2_request_incompat_feature() is the new interface to check if an incompatible feature may be used. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 ++ fs/bcachefs/bcachefs_format.h | 24 +++++++++------- fs/bcachefs/recovery.c | 31 ++++++++++++++++---- fs/bcachefs/super-io.c | 54 +++++++++++++++++++++++++++++++++-- fs/bcachefs/super-io.h | 19 ++++++++++-- 5 files changed, 108 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7b0959fb35dd..b749c4ecad1b 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -760,6 +760,8 @@ struct bch_fs { __uuid_t user_uuid; u16 version; + u16 version_incompat; + u16 version_incompat_allowed; u16 version_min; u16 version_upgrade_complete; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index cef22c15c256..0c6dfc4c1743 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -845,6 +845,9 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, struct bch_sb, flags[5], 16, 32); +LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); +LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, + struct bch_sb, flags[5], 48, 64); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -897,21 +900,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(new_varint, 15) \ x(journal_no_flush, 16) \ x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) + x(extents_across_btree_nodes, 18) \ + x(incompat_version_field, 19) #define BCH_SB_FEATURES_ALWAYS \ - ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ - (1ULL << BCH_FEATURE_btree_updates_journalled)|\ - (1ULL << BCH_FEATURE_alloc_v2)|\ - (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ + BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ + BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ + BIT_ULL(BCH_FEATURE_alloc_v2)|\ + BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ (BCH_SB_FEATURES_ALWAYS| \ - (1ULL << BCH_FEATURE_new_siphash)| \ - (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_new_siphash)| \ + BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ + BIT_ULL(BCH_FEATURE_new_varint)| \ + BIT_ULL(BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index fbef6579d884..383e03606d6e 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -612,6 +612,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; + bool ret = false; if (old_version < bcachefs_metadata_required_upgrade_below) { if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || @@ -667,14 +668,32 @@ static bool check_version_upgrade(struct bch_fs *c) } bch_info(c, "%s", buf.buf); - - bch2_sb_upgrade(c, new_version); - printbuf_exit(&buf); - return true; + + ret = true; } - return false; + if (new_version > c->sb.version_incompat && + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, new_version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + + ret = true; + } + + if (ret) + bch2_sb_upgrade(c, new_version, + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); + + return ret; } int bch2_fs_recovery(struct bch_fs *c) @@ -1074,7 +1093,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_check_version_downgrade(c); if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current); + bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6a086c1c4b14..b0d52b6ccad4 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -42,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = { #undef x }; -void bch2_version_to_text(struct printbuf *out, unsigned v) +void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) { const char *str = "(unknown version)"; @@ -55,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v) prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); } -unsigned bch2_latest_compatible_version(unsigned v) +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) { if (!BCH_VERSION_MAJOR(v)) return v; @@ -69,6 +69,16 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } +void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +{ + mutex_lock(&c->sb_lock); + SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -369,6 +379,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_features; } + if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || + BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { + prt_printf(out, "Filesystem has incompatible version"); + return -BCH_ERR_invalid_sb_features; + } + block_size = le16_to_cpu(sb->block_size); if (block_size > PAGE_SECTORS) { @@ -407,6 +423,21 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_time_precision; } + /* old versions didn't know to downgrade this field */ + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version)) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version)); + + if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) { + prt_printf(out, "Invalid version_incompat "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_str(out, " > incompat_allowed "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + if (flags & BCH_VALIDATE_write) + return -BCH_ERR_invalid_sb_version; + else + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); + } + if (!flags) { /* * Been seeing a bug where these are getting inexplicably @@ -520,6 +551,9 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); + c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); + c->sb.version_incompat_allowed + = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); c->sb.version_min = le16_to_cpu(src->version_min); c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); c->sb.nr_devices = src->nr_devices; @@ -1152,6 +1186,8 @@ bool bch2_check_version_downgrade(struct bch_fs *c) */ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); if (c->sb.version_min > bcachefs_metadata_version_current) @@ -1160,7 +1196,7 @@ bool bch2_check_version_downgrade(struct bch_fs *c) return ret; } -void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) { lockdep_assert_held(&c->sb_lock); @@ -1170,6 +1206,10 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) c->disk_sb.sb->version = cpu_to_le16(new_version); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + + if (incompat) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, @@ -1334,6 +1374,14 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); + prt_printf(out, "Incompatible features allowed:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + prt_newline(out); + + prt_printf(out, "Incompatible features in use:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_newline(out); + prt_printf(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 90e7b176cdd0..f1ab4f943720 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -18,8 +18,21 @@ static inline bool bch2_version_compatible(u16 version) version >= bcachefs_metadata_version_min; } -void bch2_version_to_text(struct printbuf *, unsigned); -unsigned bch2_latest_compatible_version(unsigned); +void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); + +void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); + +static inline bool bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) +{ + if (unlikely(version > c->sb.version_incompat)) { + if (version > c->sb.version_incompat_allowed) + return false; + bch2_set_version_incompat(c, version); + } + return true; +} static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) { @@ -94,7 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) } bool bch2_check_version_downgrade(struct bch_fs *); -void bch2_sb_upgrade(struct bch_fs *, unsigned); +void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); From 710fb4e0abfff57d297a14f08abb64ab56a2cfe1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Nov 2024 23:16:24 -0500 Subject: [PATCH 221/233] bcachefs: bcachefs_metadata_version_reflink_p_may_update_opts Previously, io path option changes on a file would be picked up automatically and applied to existing data - but not for reflinked data, as we had no way of doing this safely. A user may have had permission to copy (and reflink) a given file, but not write to it, and if so they shouldn't be allowed to change e.g. nr_replicas or other options. This uses the incompat feature mechanism in the previous patch to add a new incompatible flag to bch_reflink_p, indicating whether a given reflink pointer may propagate io path option changes back to the indirect extent. In this initial patch we're only setting it for the source extents. We'd like to set it for the destination in a reflink copy, when the user has write access to the source, but that requires mnt_idmap which is not curretly plumbed up to remap_file_range. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/fs-io.c | 9 ++++++++- fs/bcachefs/reflink.c | 18 +++++++++++++++--- fs/bcachefs/reflink.h | 3 ++- fs/bcachefs/reflink_format.h | 2 ++ 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0c6dfc4c1743..c6cc2690aa26 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -680,7 +680,8 @@ struct bch_sb_field_ext { x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ - x(disk_accounting_big_endian, BCH_VERSION(1, 15)) + x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ + x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 33d0e7080bf6..94bf34b9b65f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -906,11 +906,18 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); + /* + * XXX: we'd like to be telling bch2_remap_range() if we have + * permission to write to the source file, and thus if io path option + * changes should be propagated through the copy, but we need mnt_idmap + * from the pathwalk, awkward + */ ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, - pos_dst + len, &i_sectors_delta); + pos_dst + len, &i_sectors_delta, + false); if (ret < 0) goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index e1911b9beb61..93ba4f4e47ca 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -482,7 +482,8 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *trans, static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, - struct bkey_i *orig) + struct bkey_i *orig, + bool reflink_p_may_update_opts_field) { struct bch_fs *c = trans->c; struct btree_iter reflink_iter = { NULL }; @@ -548,6 +549,9 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); + if (reflink_p_may_update_opts_field) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, BTREE_UPDATE_internal_snapshot_node); err: @@ -578,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c, subvol_inum dst_inum, u64 dst_offset, subvol_inum src_inum, u64 src_offset, u64 remap_sectors, - u64 new_i_size, s64 *i_sectors_delta) + u64 new_i_size, s64 *i_sectors_delta, + bool may_change_src_io_path_opts) { struct btree_trans *trans; struct btree_iter dst_iter, src_iter; @@ -591,6 +596,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos src_want; u64 dst_done = 0; u32 dst_snapshot, src_snapshot; + bool reflink_p_may_update_opts_field = + bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) @@ -672,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c, src_k = bkey_i_to_s_c(new_src.k); ret = bch2_make_extent_indirect(trans, &src_iter, - new_src.k); + new_src.k, + reflink_p_may_update_opts_field); if (ret) continue; @@ -690,6 +698,10 @@ s64 bch2_remap_range(struct bch_fs *c, bkey_start_offset(src_k.k)); SET_REFLINK_P_IDX(&dst_p->v, offset); + + if (reflink_p_may_update_opts_field && + may_change_src_io_path_opts) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); } diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index f119316adc81..1632780bdf18 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -78,7 +78,8 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_i bool, unsigned); s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64, s64 *); + subvol_inum, u64, u64, u64, s64 *, + bool); int bch2_gc_reflink_done(struct bch_fs *); int bch2_gc_reflink_start(struct bch_fs *); diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 53502627b2c5..92995e4f898e 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -19,6 +19,8 @@ struct bch_reflink_p { LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); +LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, + struct bch_reflink_p, idx_flags, 57, 58); struct bch_reflink_v { struct bch_val v; From 7fa06b998c9143dbfc64150d4297f740ad9f250c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Oct 2024 02:12:21 -0400 Subject: [PATCH 222/233] bcachefs: Option changes now get propagated to reflinked data Now that bch2_move_get_io_opts() re-propagates changed inode io options to bch_extent_rebalance, we can properly suport changing IO path options for reflinked data. Changing a per-file IO path option, either via the xattr interface or via the BCHFS_IOC_REINHERIT_ATTRS ioctl, will now trigger a scan (the inode number is marked as needing a scan, via bch2_set_rebalance_needs_scan()), and rebalance will use bch2_move_data(), which will walk the inode number and pick up the new options. Signed-off-by: Kent Overstreet --- fs/bcachefs/move.c | 51 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 184620d5a3b4..c493ea625553 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -22,6 +22,7 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" +#include "reflink.h" #include "replicas.h" #include "snapshot.h" #include "super-io.h" @@ -389,6 +390,7 @@ int bch2_move_extent(struct moving_context *ctxt, static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ struct btree_iter *extent_iter, struct bkey_s_c extent_k) { @@ -400,12 +402,12 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (extent_k.k->type == KEY_TYPE_reflink_v) goto out; - if (io_opts->cur_inum != extent_k.k->p.inode) { + if (io_opts->cur_inum != extent_pos.inode) { io_opts->d.nr = 0; - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_k.k->p.inode) + if (k.k->p.offset != extent_pos.inode) break; if (!bkey_is_inode(k.k)) @@ -421,7 +423,7 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_k.k->p.inode; + io_opts->cur_inum = extent_pos.inode; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -527,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, struct per_snapshot_io_opts snapshot_io_opts; struct bch_io_opts *io_opts; struct bkey_buf sk; - struct btree_iter iter; + struct btree_iter iter, reflink_iter = {}; struct bkey_s_c k; struct data_update_opts data_opts; + /* + * If we're moving a single file, also process reflinked data it points + * to (this includes propagating changed io_opts from the inode to the + * extent): + */ + bool walk_indirect = start.inode == end.inode; int ret = 0, ret2; per_snapshot_io_opts_init(&snapshot_io_opts, c); @@ -549,6 +557,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_ratelimit_reset(ctxt->rate); while (!bch2_move_ratelimit(ctxt)) { + struct btree_iter *extent_iter = &iter; + bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -567,10 +577,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ctxt->stats) ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + if (walk_indirect && + k.k->type == KEY_TYPE_reflink_p && + REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + + bch2_trans_iter_exit(trans, &reflink_iter); + k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_deleted(k.k)) + goto next_nondata; + + /* + * XXX: reflink pointers may point to multiple indirect + * extents, so don't advance past the entire reflink + * pointer - need to fixup iter->k + */ + extent_iter = &reflink_iter; + } + if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, &iter, k); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + iter.pos, extent_iter, k); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; @@ -586,7 +622,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -607,6 +643,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); per_snapshot_io_opts_exit(&snapshot_io_opts); From b4f1b7e26ce16f9fc85d1f6f9ff96242b3e053b4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Aug 2023 20:27:38 -0400 Subject: [PATCH 223/233] bcachefs: bcachefs_metadata_version_inode_depth This adds a new inode field, bi_depth, for directory inodes: this allows us to make the check_directory_structure pass much more efficient. Currently, to ensure the filesystem is fully connect and has no loops, for every directory we follow backpointers until we find the root. But by adding a depth counter, it sufficies to only check the parent of each directory, and check that the parent's bi_depth is smaller. (fsck doesn't require that bi_depth = parent->bi_depth + 1; if a rename causes bi_depth off, but the chain to the root is still strictly decreasing, then the algorithm still works and there's no need for fsck to fixup the bi_depth fields). We've already checked backpointers, so we know that every directory (excluding the root)has a valid parent: if bi_depth is always decreasing, every chain must terminate, and terminate at the root directory. bi_depth will not necessarily be correct when fsck runs, due to directory renames - we can't change bi_depth on every child directory when renaming a directory. That's ok; fsck will silently fix the bi_depth field as needed, and future fsck runs will be much faster. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 +- fs/bcachefs/fs-common.c | 13 +++++ fs/bcachefs/fsck.c | 96 +++++++++++++++++++++++++++-------- fs/bcachefs/inode.h | 14 +++++ fs/bcachefs/inode_format.h | 3 +- 5 files changed, 106 insertions(+), 23 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index c6cc2690aa26..f140c3366e65 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -681,7 +681,8 @@ struct bch_sb_field_ext { x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ - x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) + x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ + x(inode_depth, BCH_VERSION(1, 17)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index f8d27244e1d6..7d279f211312 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -170,6 +170,10 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } + if (S_ISDIR(mode) && + !new_inode->bi_subvol) + new_inode->bi_depth = dir_u->bi_depth + 1; + inode_iter.flags &= ~BTREE_ITER_all_snapshots; bch2_btree_iter_set_snapshot(&inode_iter, snapshot); @@ -510,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans, dst_dir_u->bi_nlink++; } + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b8ced64cce2c..ea8c8ed06940 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -2597,6 +2597,48 @@ struct pathbuf_entry { typedef DARRAY(struct pathbuf_entry) pathbuf; +static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, + u32 new_depth) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, p->inum, p->snapshot), 0); + + struct bch_inode_unpacked inode; + int ret = bkey_err(k) ?: + !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + if (inode.bi_depth != new_depth) { + inode.bi_depth = new_depth; + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + darray_for_each_reverse(*path, i) { + ret = nested_lockrestart_do(trans, + bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); + bch_err_fn(trans->c, ret); + if (ret) + break; + + new_bi_depth++; + } + + return ret ?: trans_was_restarted(trans, restart_count); +} + static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { darray_for_each(*p, i) @@ -2606,24 +2648,22 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) +static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; struct btree_iter inode_iter = {}; + pathbuf path = {}; struct printbuf buf = PRINTBUF; u32 snapshot = inode_k.k->p.snapshot; + bool redo_bi_depth = false; + u32 min_bi_depth = U32_MAX; int ret = 0; - p->nr = 0; - struct bch_inode_unpacked inode; ret = bch2_inode_unpack(inode_k, &inode); if (ret) return ret; - if (!S_ISDIR(inode.bi_mode)) - return 0; - while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; @@ -2632,7 +2672,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - break; + goto out; if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); @@ -2647,7 +2687,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); - ret = darray_push(p, ((struct pathbuf_entry) { + ret = darray_push(&path, ((struct pathbuf_entry) { .inum = inode.bi_inum, .snapshot = snapshot, })); @@ -2659,22 +2699,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &inode_iter); inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inode.bi_dir, snapshot), 0); + + struct bch_inode_unpacked parent_inode; ret = bkey_err(inode_k) ?: !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode - : bch2_inode_unpack(inode_k, &inode); + : bch2_inode_unpack(inode_k, &parent_inode); if (ret) { /* Should have been caught in dirents pass */ bch_err_msg(c, ret, "error looking up parent directory"); - break; + goto out; } - snapshot = inode_k.k->p.snapshot; + min_bi_depth = parent_inode.bi_depth; - if (path_is_dup(p, inode.bi_inum, snapshot)) { + if (parent_inode.bi_depth < inode.bi_depth && + min_bi_depth < U16_MAX) + break; + + inode = parent_inode; + snapshot = inode_k.k->p.snapshot; + redo_bi_depth = true; + + if (path_is_dup(&path, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); - darray_for_each(*p, i) + darray_for_each(path, i) pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); @@ -2687,12 +2737,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino ret = reattach_inode(trans, &inode); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); } - break; + + goto out; } } + + if (inode.bi_subvol) + min_bi_depth = 0; + + if (redo_bi_depth) + ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + darray_exit(&path); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -2704,24 +2762,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino */ int bch2_check_directory_structure(struct bch_fs *c) { - pathbuf path = { 0, }; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_intent| BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (!bkey_is_inode(k.k)) + if (!S_ISDIR(bkey_inode_mode(k))) continue; if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, k); + check_path_loop(trans, k); }))); - darray_exit(&path); bch_err_fn(c, ret); return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 927c875976da..5bca6950f20e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -219,6 +219,20 @@ static inline u32 bch2_inode_flags(struct bkey_s_c k) } } +static inline unsigned bkey_inode_mode(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); + case KEY_TYPE_inode_v2: + return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); + case KEY_TYPE_inode_v3: + return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 7928d0c6954f..be1e747629d2 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -101,7 +101,8 @@ struct bch_inode_generation { x(bi_dir_offset, 64) \ x(bi_subvol, 32) \ x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) + x(bi_nocow, 8) \ + x(bi_depth, 32) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ From 7dbe48b9636827e97961a904f10e6f0ae1129b4f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 1 Dec 2024 21:44:38 -0500 Subject: [PATCH 224/233] bcachefs: bcachefs_metadata_version_persistent_inode_cursors Persistent cursors for inode allocation. A free inodes btree would add substantial overhead to inode allocation and freeing - a "next num to allocate" cursor is always going to be faster. We just need it to be persistent, to avoid scanning the inodes btree from the start on startup. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 3 - fs/bcachefs/bcachefs_format.h | 10 ++- fs/bcachefs/btree_update.c | 2 +- fs/bcachefs/inode.c | 120 ++++++++++++++++++++++---------- fs/bcachefs/inode.h | 10 +++ fs/bcachefs/inode_format.h | 14 +++- fs/bcachefs/journal_io.c | 4 +- fs/bcachefs/logged_ops.c | 5 +- fs/bcachefs/logged_ops_format.h | 5 +- fs/bcachefs/opts.h | 10 +-- fs/bcachefs/sb-errors_format.h | 3 +- fs/bcachefs/super-io.c | 5 ++ fs/bcachefs/super.c | 7 +- 13 files changed, 138 insertions(+), 60 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index b749c4ecad1b..161cf2f05d2a 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -1063,9 +1063,6 @@ struct bch_fs { struct btree_node *verify_ondisk; struct mutex verify_lock; - u64 *unused_inode_hints; - unsigned inode_shard_bits; - /* * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f140c3366e65..09e53bef1a30 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ x(logged_op_finsert, 33) \ - x(accounting, 34) + x(accounting, 34) \ + x(inode_alloc_cursor, 35) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -682,7 +683,8 @@ struct bch_sb_field_ext { x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ - x(inode_depth, BCH_VERSION(1, 17)) + x(inode_depth, BCH_VERSION(1, 17)) \ + x(persistent_inode_cursors, BCH_VERSION(1, 18)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -850,6 +852,7 @@ LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); +LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -1347,7 +1350,8 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + BIT_ULL(KEY_TYPE_logged_op_finsert)| \ + BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index a4b70e3fe4c3..13d794f201a5 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -588,7 +588,7 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); + bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); int ret = bkey_err(k); if (ret) diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f6245b78eb78..04ec05206f8c 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -799,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } +int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, + c, inode_alloc_cursor_inode_bad, + "k.p.inode bad"); +fsck_err: + return ret; +} + +void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); + + prt_printf(out, "idx %llu generation %llu", + le64_to_cpu(i.v->idx), + le64_to_cpu(i.v->gen)); +} + void bch2_inode_init_early(struct bch_fs *c, struct bch_inode_unpacked *inode_u) { @@ -859,6 +881,56 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } +static struct bkey_i_inode_alloc_cursor * +bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) +{ + struct bch_fs *c = trans->c; + + u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1; + + cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); + + struct bkey_i_inode_alloc_cursor *cursor = + k.k->type == KEY_TYPE_inode_alloc_cursor + ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) + : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); + ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + goto err; + + if (c->opts.inodes_32bit) { + *min = BLOCKDEV_INODE_MAX; + *max = INT_MAX; + } else { + cursor->v.bits = c->opts.shard_inode_numbers_bits; + + unsigned bits = 63 - c->opts.shard_inode_numbers_bits; + + *min = max(cpu << bits, (u64) INT_MAX + 1); + *max = (cpu << bits) | ~(ULLONG_MAX << bits); + } + + if (le64_to_cpu(cursor->v.idx) < *min) + cursor->v.idx = cpu_to_le64(*min); + + if (le64_to_cpu(cursor->v.idx) >= *max) { + cursor->v.idx = cpu_to_le64(*min); + le32_add_cpu(&cursor->v.gen, 1); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret ? ERR_PTR(ret) : cursor; +} + /* * This just finds an empty slot: */ @@ -867,35 +939,20 @@ int bch2_inode_create(struct btree_trans *trans, struct bch_inode_unpacked *inode_u, u32 snapshot, u64 cpu) { - struct bch_fs *c = trans->c; - struct bkey_s_c k; - u64 min, max, start, pos, *hint; - int ret = 0; - unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + u64 min, max; + struct bkey_i_inode_alloc_cursor *cursor = + bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); + int ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + return ret; - if (c->opts.shard_inode_numbers) { - bits -= c->inode_shard_bits; + u64 start = le64_to_cpu(cursor->v.idx); + u64 pos = start; - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); - - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; - } else { - min = BLOCKDEV_INODE_MAX; - max = ~(ULLONG_MAX << bits); - hint = c->unused_inode_hints; - } - - start = READ_ONCE(*hint); - - if (start >= max || start < min) - start = min; - - pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), BTREE_ITER_all_snapshots| BTREE_ITER_intent); + struct bkey_s_c k; again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -925,6 +982,7 @@ int bch2_inode_create(struct btree_trans *trans, /* Retry from start */ pos = start = min; bch2_btree_iter_set_pos(iter, POS(0, pos)); + le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); @@ -935,9 +993,9 @@ int bch2_inode_create(struct btree_trans *trans, return ret; } - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = bkey_generation(k); + inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -999,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; struct bkey_s_c k; u32 snapshot; int ret; @@ -1040,13 +1096,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) goto err; } - bch2_inode_unpack(k, &inode_u); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + ret = bch2_btree_delete_at(trans, &iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 5bca6950f20e..d2e134528f0e 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -68,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk .min_val_size = 8, \ }) +int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ + .key_validate = bch2_inode_alloc_cursor_validate, \ + .val_to_text = bch2_inode_alloc_cursor_to_text, \ + .min_val_size = 16, \ +}) + #if 0 typedef struct { u64 lo; diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index be1e747629d2..b99a5bf1a75e 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -102,7 +102,8 @@ struct bch_inode_generation { x(bi_subvol, 32) \ x(bi_parent_subvol, 32) \ x(bi_nocow, 8) \ - x(bi_depth, 32) + x(bi_depth, 32) \ + x(bi_inodes_32bit, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -115,7 +116,8 @@ struct bch_inode_generation { x(foreground_target, 16) \ x(background_target, 16) \ x(erasure_code, 16) \ - x(nocow, 8) + x(nocow, 8) \ + x(inodes_32bit, 8) enum inode_opt_id { #define x(name, ...) \ @@ -165,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, struct bch_inode_v3, bi_flags, 31, 36); LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); +struct bch_inode_alloc_cursor { + struct bch_val v; + __u8 bits; + __u8 pad; + __le32 gen; + __le64 idx; +}; + #endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7f2efe85a805..e1773ac27824 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -1114,8 +1114,10 @@ static int journal_read_bucket(struct bch_dev *ca, (printbuf_reset(&err), prt_str(&err, "journal "), bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf))) + err.buf))) { saw_bad = true; + bch2_fatal_error(c); + } ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 1ac51af16299..75f27ec26f85 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -65,7 +65,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_max(trans, iter, BTREE_ID_logged_ops, - POS(LOGGED_OPS_INUM, 0), POS(LOGGED_OPS_INUM, U64_MAX), + POS(LOGGED_OPS_INUM_logged_ops, 0), + POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -76,7 +77,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; int ret = bch2_bkey_get_empty_slot(trans, &iter, - BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM, U64_MAX)); + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); if (ret) return ret; diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 0b370a963ac6..cfb67c95d4c8 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,7 +2,10 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H -#define LOGGED_OPS_INUM 0 +enum logged_ops_inums { + LOGGED_OPS_INUM_logged_ops, + LOGGED_OPS_INUM_inode_cursors, +}; struct bch_logged_op_truncate { struct bch_val v; diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index ea69099e681d..e763d52e0f38 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -222,14 +222,14 @@ enum fsck_err_opts { BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, true, \ + x(shard_inode_numbers_bits, u8, \ + OPT_FS|OPT_FORMAT, \ + OPT_UINT(0, 8), \ + BCH_SB_SHARD_INUMS_NBITS, 0, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 806486635075..e26317c367f7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -211,6 +211,7 @@ enum bch_fsck_flags { x(bkey_in_missing_snapshot, 190, 0) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ + x(inode_alloc_cursor_inode_bad, 301, 0) \ x(inode_unpack_error, 193, 0) \ x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ @@ -311,7 +312,7 @@ enum bch_fsck_flags { x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ - x(MAX, 301, 0) + x(MAX, 302, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index b0d52b6ccad4..dbc09e305c27 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -460,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } +#ifdef __KERNEL__ + if (!BCH_SB_SHARD_INUMS_NBITS(sb)) + SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); +#endif + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 2b2e0835c8fe..7e97c198efe2 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -586,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c) #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); - kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); @@ -872,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", @@ -900,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || - !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL))) { + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } From 10e485bba03a3f473cb91ba9fe23979631f34f8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 10 Dec 2024 14:19:30 -0500 Subject: [PATCH 225/233] bcachefs: bcachefs_metadata_version_autofix_errors It's time to make self healing the default: change the error action for old filesystems to fix_safe, matching the default for current filesystems. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 3 ++- fs/bcachefs/recovery.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 09e53bef1a30..b0fac8b7915b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -684,7 +684,8 @@ struct bch_sb_field_ext { x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ x(inode_depth, BCH_VERSION(1, 17)) \ - x(persistent_inode_cursors, BCH_VERSION(1, 18)) + x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ + x(autofix_errors, BCH_VERSION(1, 19)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 383e03606d6e..98825437381c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -781,6 +781,11 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { + SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); + write_sb = true; + } + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); From 92b9e40732257b619052ef57d6cba9a7d071eb38 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Tue, 12 Nov 2024 16:15:47 +0800 Subject: [PATCH 226/233] bcachefs: add counter_flags for counters In bcachefs, io_read and io_write counter record the amount of data which has been read and written. They increase in unit of sector, so to display correctly, they need to be shifted to the left by the size of a sector. Other counters like io_move, move_extent_{read, write, finish} also have this problem. In order to support different unit, we add extra column to mark the counter type by using TYPE_COUNTER and TYPE_SECTORS in BCH_PERSISTENT_COUNTERS(). Fixes: 1c6fdbd8f246 ("bcachefs: Initial commit") Signed-off-by: Hongbo Li Signed-off-by: Kent Overstreet --- fs/bcachefs/sb-counters_format.h | 165 ++++++++++++++++--------------- fs/bcachefs/sysfs.c | 9 +- 2 files changed, 93 insertions(+), 81 deletions(-) diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 62ea478215d0..fdcf598f08b1 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -2,86 +2,91 @@ #ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H #define _BCACHEFS_SB_COUNTERS_FORMAT_H -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) +enum counters_flags { + TYPE_COUNTER = BIT(0), /* event counters */ + TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ +}; + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0, TYPE_SECTORS) \ + x(io_write, 1, TYPE_SECTORS) \ + x(io_move, 2, TYPE_SECTORS) \ + x(bucket_invalidate, 3, TYPE_COUNTER) \ + x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_alloc, 5, TYPE_COUNTER) \ + x(bucket_alloc_fail, 6, TYPE_COUNTER) \ + x(btree_cache_scan, 7, TYPE_COUNTER) \ + x(btree_cache_reap, 8, TYPE_COUNTER) \ + x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ + x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ + x(btree_node_write, 13, TYPE_COUNTER) \ + x(btree_node_read, 14, TYPE_COUNTER) \ + x(btree_node_compact, 15, TYPE_COUNTER) \ + x(btree_node_merge, 16, TYPE_COUNTER) \ + x(btree_node_split, 17, TYPE_COUNTER) \ + x(btree_node_rewrite, 18, TYPE_COUNTER) \ + x(btree_node_alloc, 19, TYPE_COUNTER) \ + x(btree_node_free, 20, TYPE_COUNTER) \ + x(btree_node_set_root, 21, TYPE_COUNTER) \ + x(btree_path_relock_fail, 22, TYPE_COUNTER) \ + x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ + x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ + x(journal_entry_full, 25, TYPE_COUNTER) \ + x(journal_full, 26, TYPE_COUNTER) \ + x(journal_reclaim_finish, 27, TYPE_COUNTER) \ + x(journal_reclaim_start, 28, TYPE_COUNTER) \ + x(journal_write, 29, TYPE_COUNTER) \ + x(read_promote, 30, TYPE_COUNTER) \ + x(read_bounce, 31, TYPE_COUNTER) \ + x(read_split, 33, TYPE_COUNTER) \ + x(read_retry, 32, TYPE_COUNTER) \ + x(read_reuse_race, 34, TYPE_COUNTER) \ + x(move_extent_read, 35, TYPE_SECTORS) \ + x(move_extent_write, 36, TYPE_SECTORS) \ + x(move_extent_finish, 37, TYPE_SECTORS) \ + x(move_extent_fail, 38, TYPE_COUNTER) \ + x(move_extent_start_fail, 39, TYPE_COUNTER) \ + x(copygc, 40, TYPE_COUNTER) \ + x(copygc_wait, 41, TYPE_COUNTER) \ + x(gc_gens_end, 42, TYPE_COUNTER) \ + x(gc_gens_start, 43, TYPE_COUNTER) \ + x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ + x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ + x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ + x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ + x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ + x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ + x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ + x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ + x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ + x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ + x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ + x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ + x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ + x(trans_restart_relock, 57, TYPE_COUNTER) \ + x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ + x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ + x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ + x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ + x(trans_restart_relock_path, 62, TYPE_COUNTER) \ + x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ + x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ + x(trans_restart_traverse, 65, TYPE_COUNTER) \ + x(trans_restart_upgrade, 66, TYPE_COUNTER) \ + x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ + x(trans_restart_injected, 69, TYPE_COUNTER) \ + x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ + x(trans_traverse_all, 71, TYPE_COUNTER) \ + x(transaction_commit, 72, TYPE_COUNTER) \ + x(write_super, 73, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ + x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ + x(trans_restart_split_race, 76, TYPE_COUNTER) \ + x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ + x(write_buffer_flush_sync, 78, TYPE_COUNTER) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 48bc6ad03f09..a7eb1f511484 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -505,15 +505,22 @@ SHOW(bch2_fs_counters) printbuf_tabstop_push(out, 32); - #define x(t, ...) \ + #define x(t, n, f, ...) \ if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + if (f & TYPE_SECTORS) { \ + counter <<= 9; \ + counter_since_mount <<= 9; \ + } \ + \ prt_printf(out, "since mount:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ prt_printf(out, "since filesystem creation:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter) : \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } From 959dbb09ed74b73d67fdb3f375fe5de40b839ba4 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 12 Dec 2024 00:55:48 -0500 Subject: [PATCH 227/233] bcachefs: better check_bp_exists() error message Signed-off-by: Kent Overstreet --- fs/bcachefs/backpointers.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index b93ddfa00fdd..ebeb6a5ff9d2 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -566,11 +566,11 @@ static int check_bp_exists(struct btree_trans *trans, goto err; missing: printbuf_reset(&buf); - prt_str(&buf, "missing backpointer "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); - prt_newline(&buf); + prt_str(&buf, "missing backpointer\n for: "); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\n got: "); + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); + prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) From 52084849f3d623be3897d085f58d62c773b2e33e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Dec 2024 01:52:54 -0500 Subject: [PATCH 228/233] bcachefs: Drop racy warning Checking for writing past i_size after unlocking the folio and clearing the dirty bit is racy, and we already check it at the start. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io-buffered.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index ff8b8df50bf3..ab1d5db2fa56 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -625,15 +625,6 @@ static int __bch2_writepage(struct folio *folio, BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_emergency_ro, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; From 83fa58a3704b538c240b1ae6e8e91ff08eb88269 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Dec 2024 02:03:11 -0500 Subject: [PATCH 229/233] bcachefs: Drop redundant "read error" call from btree_gc The btree node read error path already calls topology error, so this is entirely redundant, and we're not specific enough about our error codes - this was triggering for bucket_ref_update() errors. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_gc.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 5aa11ca08c94..721dca551720 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -733,16 +733,8 @@ static int bch2_gc_btrees(struct bch_fs *c) continue; ret = bch2_gc_btree(trans, btree, true); - - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "btree node read error for %s", - (printbuf_reset(&buf), - bch2_btree_id_to_text(&buf, btree), - buf.buf))) - ret = bch2_btree_lost_data(c, btree); } -fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); From 989229db3f0720a5f16f5753356a4ce51124c072 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 15 Dec 2024 02:24:30 -0500 Subject: [PATCH 230/233] bcachefs: kill __bch2_btree_iter_flags() bch2_btree_iter_flags() now takes a level parameter; this fixes a bug where using a node iterator on a leaf wouldn't set BTREE_ITER_with_key_cache, leading to fun cache coherency bugs. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 7 +++++-- fs/bcachefs/btree_iter.h | 28 +++++++++++----------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index a1c5fcced24e..291eb5eb0203 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -3032,7 +3032,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _RET_IP_); } @@ -3048,8 +3048,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, flags |= BTREE_ITER_snapshot_field; flags |= BTREE_ITER_all_snapshots; + if (!depth && btree_id_cached(trans->c, btree_id)) + flags |= BTREE_ITER_with_key_cache; + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - __bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, depth, flags), _RET_IP_); iter->min_depth = depth; diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 3477fc8c0396..e23608d2a26d 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -446,10 +446,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned level, + unsigned flags) { + if (level || !btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; + if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) flags |= BTREE_ITER_is_extents; @@ -468,19 +475,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, return flags; } -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) -{ - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_cached; - flags &= ~BTREE_ITER_with_key_cache; - } else if (!(flags & BTREE_ITER_cached)) - flags |= BTREE_ITER_with_key_cache; - - return __bch2_btree_iter_flags(trans, btree_id, flags); -} - static inline void bch2_trans_iter_init_common(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -517,7 +511,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, if (__builtin_constant_p(btree_id) && __builtin_constant_p(flags)) bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _THIS_IP_); else bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); From 7b5ddd26bcf1f72e16e121c02533ee6192a3eea0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 8 Jun 2024 17:01:31 -0400 Subject: [PATCH 231/233] bcachefs: Write lock btree node in key cache fills this addresses a key cache coherency bug Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_key_cache.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 4eba2871f289..382f99b774b8 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -197,7 +197,9 @@ bkey_cached_reuse(struct btree_key_cache *c) return ck; } -static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, +static int btree_key_cache_create(struct btree_trans *trans, + struct btree_path *path, + struct btree_path *ck_path, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * key_u64s = min(256U, (key_u64s * 3) / 2); key_u64s = roundup_pow_of_two(key_u64s); - struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); + struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); int ret = PTR_ERR_OR_ZERO(ck); if (ret) return ret; @@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(path->btree_id)); + bch2_btree_id_str(ck_path->btree_id)); return -BCH_ERR_ENOMEM_btree_key_cache_create; } } ck->c.level = 0; - ck->c.btree_id = path->btree_id; - ck->key.btree_id = path->btree_id; - ck->key.pos = path->pos; + ck->c.btree_id = ck_path->btree_id; + ck->key.btree_id = ck_path->btree_id; + ck->key.pos = ck_path->pos; ck->flags = 1U << BKEY_CACHED_ACCESSED; if (unlikely(key_u64s > ck->u64s)) { - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); struct bkey_i *new_k = allocate_dropping_locks(trans, ret, kmalloc(key_u64s * sizeof(u64), _gfp)); @@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * bkey_reassemble(ck->k, k); + ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); + if (unlikely(ret)) + goto err; + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + + bch2_btree_node_unlock_write(trans, path, path_l(path)->b); + if (unlikely(ret)) /* raced with another fill? */ goto err; atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - enum six_lock_type lock_want = __btree_lock_want(path, 0); + enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); if (lock_want == SIX_LOCK_read) six_lock_downgrade(&ck->c.lock); - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - path->uptodate = BTREE_ITER_UPTODATE; + btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); + ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: bkey_cached_free(bc, ck); - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; } @@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, + BTREE_ITER_intent| BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; @@ -306,7 +316,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (unlikely(ret)) goto out; - ret = btree_key_cache_create(trans, ck_path, k); + ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); if (ret) goto err; From b5677d4d8d295e42a91b437a1de8caa2791a4277 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2024 13:58:02 -0500 Subject: [PATCH 232/233] bcachefs: Handle -BCH_ERR_need_mark_replicas in gc Locking considerations (possibly no longer relevant?) mean that when an accounting update needs a new superblock replicas entry to be created, it's deferred to the transaction commit error path. But accounting updates for gc/fcsk aren't done from the transaction commit path - so we need to handle -BCH_ERR_btree_insert_need_mark_replicas locally. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 72c8dcb9226f..b32e91ba8be8 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_ memcpy_u64s_small(acc->v.d, d, nr); } +static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); + int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) @@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, accounting_key_init(&k_i.k, k, d, nr); - return likely(!gc) - ? bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) - : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (unlikely(gc)) { + int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = drop_locks_do(trans, + bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: + bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + return ret; + } else { + return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); + } } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, From ca2e7a3de895c703d2cbbd9b63c10d8adfba8228 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 16 Dec 2024 16:41:25 -0500 Subject: [PATCH 233/233] bcachefs: Fix assert for online fsck We can't check if we're racing with fsck ending until mark_lock is held. Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index fc1b673689c8..5360cbb3ec29 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -138,7 +138,8 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, bpos_to_disk_accounting_pos(&acc_k, a.k->p); bool gc = mode == BCH_ACCOUNTING_gc; - EBUG_ON(gc && !acc->gc_running); + if (gc && !acc->gc_running) + return 0; if (!bch2_accounting_is_mem(acc_k)) return 0;