bcachefs fixes for 6.11-rc1

Assorted minor syzbot fixes, and for bigger stuff:
 
 - Fix two disk accounting rewrite bugs
  - Disk accounting keys use the version field of bkey so that journal
    replay can tell which updates have been applied to the btree. This is
    set in the transaction commit path, after we've gotten our journal
    reservation (and our time ordering), but the
    BCH_TRANS_COMMIT_skip_accounting_apply flag that journal replay uses
    was incorrectly skipping this for new updates generated prior to
    journal replay.
 
    This fixes the underlying cause of an assertion pop in
    disk_accounting_read.
 
  - A couple fixes for disk accounting + device removal. Checking if
    accounting replicas entries were marked in the superblock was being
    done at the wrong point, when deltas in the journal could still zero
    them out, and then additionally we'd try to add a missing replicas
    entry to the superblock without checking if it referred to an invalid
    (removed) device.
 
 - A whole slew of repair fixes
  - fix infinite loop in propagate_key_to_snapshot_leaves(), this fixes
    an infinite loop when repairing a filesystem with many snapshots
  - fix incorrect transaction restart handling leading to occasional
    "fsck counted ..." warnings
  - fix warning in __bch2_fsck_err() for bkey fsck errors
  - check_inode() in fsck now correctly checks if the filesystem was
    clean
  - there shouldn't be pending logged ops if the fs was clean, we now
    check for this
  - remove_backpointer() doesn't remove a dirent that doesn't actually
    point to the inode
  - many more fsck errors are AUTOFIX
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmb4QtsACgkQE6szbY3K
 bnYx4A//bhGgZYgP55FxduuxUH8XjX2eOnXwuPv/MmYO/4oCok5VBa9bRDTVXhIK
 PtY4pP2IJZ3+u963mwbwJAawsPA01AEEty9tE+AdXbltDRQ03I33OEuIy0HFIso2
 s8VBkVPbru6yU4RCCvYNIVvRG/9GOL+J0GgrR1t05zHVyKXe1FuS00Yq5+z3niNP
 HtuGTsD273Nnhikz47bqyD+M6VizU+uzSUFLgnB3zrzpb+gPSGETSwgc4ggajlM4
 2P10Vc4L/Nb3KYV9RW+C3WpRfUR/o8BZA3wjJfNo0JeA4iDaUbltSjpCA07EcAnA
 3D6Omzqkm4aobL2WlvioT0UhZx4t8X/8x5t5F9HyX52i1k+g87oMT9/KIKec1Dzd
 8vQCwCdXFfWaLSZoOJsHyIljip7BuRLKhWwKosdzzLIAnRQy5StxAhsG99fNStu6
 JOWICPNCn1b6SkktnoKou1unL+K5RczeNfAxMAjcJjTD7IIAmytLe4mdRbP9q+Oa
 x8no7pttbb4JnoRvfo42GVz8KWQR07oN/Zy7mH3K4Y0Ix+xDOrLqlfLIDLGpxMNv
 HZz+UPchdlfpYJO+nTLoAOGXZWnKDqg70SAEcWKDc82Ri4vNOhraYDZvXrzl9qE+
 63RPzqDbg3uXGxLYMvujjPe610QkPxS9zKKyDvUZZx0ZiUX4CjI=
 =cdrz
 -----END PGP SIGNATURE-----

Merge tag 'bcachefs-2024-09-28' of git://evilpiepirate.org/bcachefs

Pull more bcachefs updates from Kent Overstreet:
 "Assorted minor syzbot fixes, and for bigger stuff:

  Fix two disk accounting rewrite bugs:

   - Disk accounting keys use the version field of bkey so that journal
     replay can tell which updates have been applied to the btree.

     This is set in the transaction commit path, after we've gotten our
     journal reservation (and our time ordering), but the
     BCH_TRANS_COMMIT_skip_accounting_apply flag that journal replay
     uses was incorrectly skipping this for new updates generated prior
     to journal replay.

     This fixes the underlying cause of an assertion pop in
     disk_accounting_read.

   - A couple of fixes for disk accounting + device removal.

     Checking if accounting replicas entries were marked in the
     superblock was being done at the wrong point, when deltas in the
     journal could still zero them out, and then additionally we'd try
     to add a missing replicas entry to the superblock without checking
     if it referred to an invalid (removed) device.

  A whole slew of repair fixes:

   - fix infinite loop in propagate_key_to_snapshot_leaves(), this fixes
     an infinite loop when repairing a filesystem with many snapshots

   - fix incorrect transaction restart handling leading to occasional
     "fsck counted ..." warnings

   - fix warning in __bch2_fsck_err() for bkey fsck errors

   - check_inode() in fsck now correctly checks if the filesystem was
     clean

   - there shouldn't be pending logged ops if the fs was clean, we now
     check for this

   - remove_backpointer() doesn't remove a dirent that doesn't actually
     point to the inode

   - many more fsck errors are AUTOFIX"

* tag 'bcachefs-2024-09-28' of git://evilpiepirate.org/bcachefs: (35 commits)
  bcachefs: check_subvol_path() now prints subvol root inode
  bcachefs: remove_backpointer() now checks if dirent points to inode
  bcachefs: dirent_points_to_inode() now warns on mismatch
  bcachefs: Fix lost wake up
  bcachefs: Check for logged ops when clean
  bcachefs: BCH_FS_clean_recovery
  bcachefs: Convert disk accounting BUG_ON() to WARN_ON()
  bcachefs: Fix BCH_TRANS_COMMIT_skip_accounting_apply
  bcachefs: Check for accounting keys with bversion=0
  bcachefs: rename version -> bversion
  bcachefs: Don't delete unlinked inodes before logged op resume
  bcachefs: Fix BCH_SB_ERRS() so we can reorder
  bcachefs: Fix fsck warnings from bkey validation
  bcachefs: Move transaction commit path validation to as late as possible
  bcachefs: Fix disk accounting attempting to mark invalid replicas entry
  bcachefs: Fix unlocked access to c->disk_sb.sb in bch2_replicas_entry_validate()
  bcachefs: Fix accounting read + device removal
  bcachefs: bch_accounting_mode
  bcachefs: fix transaction restart handling in check_extents(), check_dirents()
  bcachefs: kill inode_walker_entry.seen_this_pos
  ...
This commit is contained in:
Linus Torvalds 2024-09-29 09:17:44 -07:00
commit 9f9a534724
39 changed files with 470 additions and 310 deletions

View File

@ -501,7 +501,7 @@ static int check_extent_checksum(struct btree_trans *trans,
prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree));
bch2_bkey_val_to_text(&buf, c, extent2); bch2_bkey_val_to_text(&buf, c, extent2);
struct nonce nonce = extent_nonce(extent.k->version, p.crc); struct nonce nonce = extent_nonce(extent.k->bversion, p.crc);
struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
trans, dup_backpointer_to_bad_csum_extent, trans, dup_backpointer_to_bad_csum_extent,

View File

@ -594,6 +594,7 @@ struct bch_dev {
#define BCH_FS_FLAGS() \ #define BCH_FS_FLAGS() \
x(new_fs) \ x(new_fs) \
x(started) \ x(started) \
x(clean_recovery) \
x(btree_running) \ x(btree_running) \
x(accounting_replay_done) \ x(accounting_replay_done) \
x(may_go_rw) \ x(may_go_rw) \
@ -776,7 +777,7 @@ struct bch_fs {
unsigned nsec_per_time_unit; unsigned nsec_per_time_unit;
u64 features; u64 features;
u64 compat; u64 compat;
unsigned long errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)]; unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data; u64 btrees_lost_data;
} sb; } sb;

View File

@ -217,7 +217,7 @@ struct bkey {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__u8 pad[1]; __u8 pad[1];
struct bversion version; struct bversion bversion;
__u32 size; /* extent size, in sectors */ __u32 size; /* extent size, in sectors */
struct bpos p; struct bpos p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
@ -328,8 +328,8 @@ enum bch_bkey_fields {
bkey_format_field(OFFSET, p.offset), \ bkey_format_field(OFFSET, p.offset), \
bkey_format_field(SNAPSHOT, p.snapshot), \ bkey_format_field(SNAPSHOT, p.snapshot), \
bkey_format_field(SIZE, size), \ bkey_format_field(SIZE, size), \
bkey_format_field(VERSION_HI, version.hi), \ bkey_format_field(VERSION_HI, bversion.hi), \
bkey_format_field(VERSION_LO, version.lo), \ bkey_format_field(VERSION_LO, bversion.lo), \
}, \ }, \
}) })

View File

@ -214,9 +214,9 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) #define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
static __always_inline int bversion_zero(struct bversion v) static __always_inline bool bversion_zero(struct bversion v)
{ {
return !bversion_cmp(v, ZERO_VERSION); return bversion_cmp(v, ZERO_VERSION) == 0;
} }
#ifdef CONFIG_BCACHEFS_DEBUG #ifdef CONFIG_BCACHEFS_DEBUG
@ -554,8 +554,8 @@ static inline void bch2_bkey_pack_test(void) {}
x(BKEY_FIELD_OFFSET, p.offset) \ x(BKEY_FIELD_OFFSET, p.offset) \
x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
x(BKEY_FIELD_SIZE, size) \ x(BKEY_FIELD_SIZE, size) \
x(BKEY_FIELD_VERSION_HI, version.hi) \ x(BKEY_FIELD_VERSION_HI, bversion.hi) \
x(BKEY_FIELD_VERSION_LO, version.lo) x(BKEY_FIELD_VERSION_LO, bversion.lo)
struct bkey_format_state { struct bkey_format_state {
u64 field_min[BKEY_NR_FIELDS]; u64 field_min[BKEY_NR_FIELDS];

View File

@ -289,7 +289,7 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
bch2_bpos_to_text(out, k->p); bch2_bpos_to_text(out, k->p);
prt_printf(out, " len %u ver %llu", k->size, k->version.lo); prt_printf(out, " len %u ver %llu", k->size, k->bversion.lo);
} else { } else {
prt_printf(out, "(null)"); prt_printf(out, "(null)");
} }

View File

@ -70,7 +70,7 @@ bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r)
{ {
return l->type == r->type && return l->type == r->type &&
!bversion_cmp(l->version, r->version) && !bversion_cmp(l->bversion, r->bversion) &&
bpos_eq(l->p, bkey_start_pos(r)); bpos_eq(l->p, bkey_start_pos(r));
} }

View File

@ -513,6 +513,8 @@ int bch2_check_topology(struct bch_fs *c)
struct bpos pulled_from_scan = POS_MIN; struct bpos pulled_from_scan = POS_MIN;
int ret = 0; int ret = 0;
bch2_trans_srcu_unlock(trans);
for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
struct btree_root *r = bch2_btree_id_root(c, i); struct btree_root *r = bch2_btree_id_root(c, i);
bool reconstructed_root = false; bool reconstructed_root = false;
@ -599,15 +601,15 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (initial) { if (initial) {
BUG_ON(bch2_journal_seq_verify && BUG_ON(bch2_journal_seq_verify &&
k.k->version.lo > atomic64_read(&c->journal.seq)); k.k->bversion.lo > atomic64_read(&c->journal.seq));
if (fsck_err_on(btree_id != BTREE_ID_accounting && if (fsck_err_on(btree_id != BTREE_ID_accounting &&
k.k->version.lo > atomic64_read(&c->key_version), k.k->bversion.lo > atomic64_read(&c->key_version),
trans, bkey_version_in_future, trans, bkey_version_in_future,
"key version number higher than recorded %llu\n %s", "key version number higher than recorded %llu\n %s",
atomic64_read(&c->key_version), atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
atomic64_set(&c->key_version, k.k->version.lo); atomic64_set(&c->key_version, k.k->bversion.lo);
} }
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),

View File

@ -1195,6 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
set_btree_bset(b, b->set, &b->data->keys); set_btree_bset(b, b->set, &b->data->keys);
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
btree_buf_bytes(b) -
sizeof(struct btree_node) -
b->nr.live_u64s * sizeof(u64));
u64s = le16_to_cpu(sorted->keys.u64s); u64s = le16_to_cpu(sorted->keys.u64s);
*sorted = *b->data; *sorted = *b->data;
@ -1219,7 +1223,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bkey_val_validate(c, u.s_c, READ); ret = bch2_bkey_val_validate(c, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey || if (ret == -BCH_ERR_fsck_delete_bkey ||
(bch2_inject_invalid_keys && (bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) { !bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k); btree_keys_account_key_drop(&b->nr, 0, k);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);

View File

@ -275,7 +275,7 @@ static int read_btree_nodes(struct find_btree_nodes *f)
w->ca = ca; w->ca = ca;
t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = IS_ERR_OR_NULL(t); ret = PTR_ERR_OR_ZERO(t);
if (ret) { if (ret) {
percpu_ref_put(&ca->io_ref); percpu_ref_put(&ca->io_ref);
closure_put(&cl); closure_put(&cl);

View File

@ -684,10 +684,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
!(flags & BCH_TRANS_COMMIT_no_journal_res)) { !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify) if (bch2_journal_seq_verify)
trans_for_each_update(trans, i) trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq; i->k->k.bversion.lo = trans->journal_res.seq;
else if (bch2_inject_invalid_keys) else if (bch2_inject_invalid_keys)
trans_for_each_update(trans, i) trans_for_each_update(trans, i)
i->k->k.version = MAX_VERSION; i->k->k.bversion = MAX_VERSION;
} }
h = trans->hooks; h = trans->hooks;
@ -700,27 +700,31 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct jset_entry *entry = trans->journal_entries; struct jset_entry *entry = trans->journal_entries;
if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { percpu_down_read(&c->mark_lock);
percpu_down_read(&c->mark_lock);
for (entry = trans->journal_entries; for (entry = trans->journal_entries;
entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
entry = vstruct_next(entry)) entry = vstruct_next(entry))
if (jset_entry_is_key(entry) && entry->start->k.type == KEY_TYPE_accounting) { if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); entry->start->k.type == KEY_TYPE_accounting) {
BUG_ON(!trans->journal_res.ref);
a->k.version = journal_pos_to_bversion(&trans->journal_res, struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start);
(u64 *) entry - (u64 *) trans->journal_entries);
BUG_ON(bversion_zero(a->k.version)); a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), false, false); (u64 *) entry - (u64 *) trans->journal_entries);
BUG_ON(bversion_zero(a->k.bversion));
if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) {
ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal);
if (ret) if (ret)
goto revert_fs_usage; goto revert_fs_usage;
} }
percpu_up_read(&c->mark_lock); }
percpu_up_read(&c->mark_lock);
/* XXX: we only want to run this if deltas are nonzero */ /* XXX: we only want to run this if deltas are nonzero */
bch2_trans_account_disk_usage_change(trans); bch2_trans_account_disk_usage_change(trans);
}
trans_for_each_update(trans, i) trans_for_each_update(trans, i)
if (btree_node_type_has_atomic_triggers(i->bkey_type)) { if (btree_node_type_has_atomic_triggers(i->bkey_type)) {
@ -735,6 +739,40 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err; goto fatal_err;
} }
trans_for_each_update(trans, i) {
enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
i->bkey_type, invalid_flags);
if (unlikely(ret)){
bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
trans->fn, (void *) i->ip_allocated);
goto fatal_err;
}
btree_insert_entry_checks(trans, i);
}
for (struct jset_entry *i = trans->journal_entries;
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
i = vstruct_next(i)) {
enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
ret = bch2_journal_entry_validate(c, NULL, i,
bcachefs_metadata_version_current,
CPU_BIG_ENDIAN, invalid_flags);
if (unlikely(ret)) {
bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
trans->fn);
goto fatal_err;
}
}
if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal; struct journal *j = &c->journal;
struct jset_entry *entry; struct jset_entry *entry;
@ -798,7 +836,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
bch2_accounting_neg(a); bch2_accounting_neg(a);
bch2_accounting_mem_mod_locked(trans, a.c, false, false); bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
bch2_accounting_neg(a); bch2_accounting_neg(a);
} }
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
@ -1019,40 +1057,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (ret) if (ret)
goto out_reset; goto out_reset;
trans_for_each_update(trans, i) {
enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k),
i->bkey_type, invalid_flags);
if (unlikely(ret)){
bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n",
trans->fn, (void *) i->ip_allocated);
return ret;
}
btree_insert_entry_checks(trans, i);
}
for (struct jset_entry *i = trans->journal_entries;
i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
i = vstruct_next(i)) {
enum bch_validate_flags invalid_flags = 0;
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit;
ret = bch2_journal_entry_validate(c, NULL, i,
bcachefs_metadata_version_current,
CPU_BIG_ENDIAN, invalid_flags);
if (unlikely(ret)) {
bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n",
trans->fn);
return ret;
}
}
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
ret = do_bch2_trans_commit_to_journal_replay(trans); ret = do_bch2_trans_commit_to_journal_replay(trans);
goto out_reset; goto out_reset;

View File

@ -220,7 +220,8 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
if (type && k.k->type != type) if (type && k.k->type != type)
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
mut = bch2_trans_kmalloc_nomemzero(trans, bytes); /* extra padding for varint_decode_fast... */
mut = bch2_trans_kmalloc_nomemzero(trans, bytes + 8);
if (!IS_ERR(mut)) { if (!IS_ERR(mut)) {
bkey_reassemble(mut, k); bkey_reassemble(mut, k);

View File

@ -639,7 +639,7 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_write_op_init(&m->op, c, io_opts); bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k); m->op.pos = bkey_start_pos(k.k);
m->op.version = k.k->version; m->op.version = k.k->bversion;
m->op.target = data_opts.target; m->op.target = data_opts.target;
m->op.write_point = wp; m->op.write_point = wp;
m->op.nr_replicas = 0; m->op.nr_replicas = 0;

View File

@ -134,6 +134,10 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
void *end = &acc_k + 1; void *end = &acc_k + 1;
int ret = 0; int ret = 0;
bkey_fsck_err_on(bversion_zero(k.k->bversion),
c, accounting_key_version_0,
"accounting key with version=0");
switch (acc_k.type) { switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_nr_inodes: case BCH_DISK_ACCOUNTING_nr_inodes:
end = field_end(acc_k, nr_inodes); end = field_end(acc_k, nr_inodes);
@ -291,7 +295,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
struct accounting_mem_entry n = { struct accounting_mem_entry n = {
.pos = a.k->p, .pos = a.k->p,
.version = a.k->version, .bversion = a.k->bversion,
.nr_counters = bch2_accounting_counters(a.k), .nr_counters = bch2_accounting_counters(a.k),
.v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64), .v[0] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
sizeof(u64), GFP_KERNEL), sizeof(u64), GFP_KERNEL),
@ -319,11 +323,13 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
return -BCH_ERR_ENOMEM_disk_accounting; return -BCH_ERR_ENOMEM_disk_accounting;
} }
int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, bool gc) int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
enum bch_accounting_mode mode)
{ {
struct bch_replicas_padded r; struct bch_replicas_padded r;
if (accounting_to_replicas(&r.e, a.k->p) && if (mode != BCH_ACCOUNTING_read &&
accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e)) !bch2_replicas_marked_locked(c, &r.e))
return -BCH_ERR_btree_insert_need_mark_replicas; return -BCH_ERR_btree_insert_need_mark_replicas;
@ -566,7 +572,9 @@ int bch2_gc_accounting_done(struct bch_fs *c)
struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
accounting_key_init(&k_i.k, &acc_k, src_v, nr); accounting_key_init(&k_i.k, &acc_k, src_v, nr);
bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), false, false); bch2_accounting_mem_mod_locked(trans,
bkey_i_to_s_c_accounting(&k_i.k),
BCH_ACCOUNTING_normal);
preempt_disable(); preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@ -589,30 +597,14 @@ int bch2_gc_accounting_done(struct bch_fs *c)
static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
if (k.k->type != KEY_TYPE_accounting) if (k.k->type != KEY_TYPE_accounting)
return 0; return 0;
percpu_down_read(&c->mark_lock); percpu_down_read(&c->mark_lock);
int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), false, true); int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
BCH_ACCOUNTING_read);
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
if (bch2_accounting_key_is_zero(bkey_s_c_to_accounting(k)) &&
ret == -BCH_ERR_btree_insert_need_mark_replicas)
ret = 0;
struct disk_accounting_pos acc;
bpos_to_disk_accounting_pos(&acc, k.k->p);
if (fsck_err_on(ret == -BCH_ERR_btree_insert_need_mark_replicas,
trans, accounting_replicas_not_marked,
"accounting not marked in superblock replicas\n %s",
(bch2_accounting_key_to_text(&buf, &acc),
buf.buf)))
ret = bch2_accounting_update_sb_one(c, k.k->p);
fsck_err:
printbuf_exit(&buf);
return ret; return ret;
} }
@ -624,6 +616,7 @@ int bch2_accounting_read(struct bch_fs *c)
{ {
struct bch_accounting_mem *acc = &c->accounting; struct bch_accounting_mem *acc = &c->accounting;
struct btree_trans *trans = bch2_trans_get(c); struct btree_trans *trans = bch2_trans_get(c);
struct printbuf buf = PRINTBUF;
int ret = for_each_btree_key(trans, iter, int ret = for_each_btree_key(trans, iter,
BTREE_ID_accounting, POS_MIN, BTREE_ID_accounting, POS_MIN,
@ -647,7 +640,7 @@ int bch2_accounting_read(struct bch_fs *c)
accounting_pos_cmp, &k.k->p); accounting_pos_cmp, &k.k->p);
bool applied = idx < acc->k.nr && bool applied = idx < acc->k.nr &&
bversion_cmp(acc->k.data[idx].version, k.k->version) >= 0; bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;
if (applied) if (applied)
continue; continue;
@ -655,7 +648,7 @@ int bch2_accounting_read(struct bch_fs *c)
if (i + 1 < &darray_top(*keys) && if (i + 1 < &darray_top(*keys) &&
i[1].k->k.type == KEY_TYPE_accounting && i[1].k->k.type == KEY_TYPE_accounting &&
!journal_key_cmp(i, i + 1)) { !journal_key_cmp(i, i + 1)) {
BUG_ON(bversion_cmp(i[0].k->k.version, i[1].k->k.version) >= 0); WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);
i[1].journal_seq = i[0].journal_seq; i[1].journal_seq = i[0].journal_seq;
@ -674,6 +667,45 @@ int bch2_accounting_read(struct bch_fs *c)
keys->gap = keys->nr = dst - keys->data; keys->gap = keys->nr = dst - keys->data;
percpu_down_read(&c->mark_lock); percpu_down_read(&c->mark_lock);
for (unsigned i = 0; i < acc->k.nr; i++) {
u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
continue;
struct bch_replicas_padded r;
if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
continue;
/*
* If the replicas entry is invalid it'll get cleaned up by
* check_allocations:
*/
if (bch2_replicas_entry_validate(&r.e, c, &buf))
continue;
struct disk_accounting_pos k;
bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
trans, accounting_replicas_not_marked,
"accounting not marked in superblock replicas\n %s",
(printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, &k),
buf.buf))) {
/*
* We're not RW yet and still single threaded, dropping
* and retaking lock is ok:
*/
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, &r.e);
if (ret)
goto fsck_err;
percpu_down_read(&c->mark_lock);
}
}
preempt_disable(); preempt_disable();
struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);
@ -709,8 +741,10 @@ int bch2_accounting_read(struct bch_fs *c)
} }
} }
preempt_enable(); preempt_enable();
fsck_err:
percpu_up_read(&c->mark_lock); percpu_up_read(&c->mark_lock);
err: err:
printbuf_exit(&buf);
bch2_trans_put(trans); bch2_trans_put(trans);
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;

View File

@ -36,8 +36,8 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
dst->v.d[i] += src.v->d[i]; dst->v.d[i] += src.v->d[i];
if (bversion_cmp(dst->k.version, src.k->version) < 0) if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
dst->k.version = src.k->version; dst->k.bversion = src.k->bversion;
} }
static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage, static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
@ -103,23 +103,35 @@ static inline int accounting_pos_cmp(const void *_l, const void *_r)
return bpos_cmp(*l, *r); return bpos_cmp(*l, *r);
} }
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, bool); enum bch_accounting_mode {
BCH_ACCOUNTING_normal,
BCH_ACCOUNTING_gc,
BCH_ACCOUNTING_read,
};
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *); void bch2_accounting_mem_gc(struct bch_fs *);
/* /*
* Update in memory counters so they match the btree update we're doing; called * Update in memory counters so they match the btree update we're doing; called
* from transaction commit path * from transaction commit path
*/ */
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc, bool read) static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
struct bkey_s_c_accounting a,
enum bch_accounting_mode mode)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct bch_accounting_mem *acc = &c->accounting;
struct disk_accounting_pos acc_k; struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, a.k->p); bpos_to_disk_accounting_pos(&acc_k, a.k->p);
bool gc = mode == BCH_ACCOUNTING_gc;
EBUG_ON(gc && !acc->gc_running);
if (acc_k.type == BCH_DISK_ACCOUNTING_inum) if (acc_k.type == BCH_DISK_ACCOUNTING_inum)
return 0; return 0;
if (!gc && !read) { if (mode == BCH_ACCOUNTING_normal) {
switch (acc_k.type) { switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_persistent_reserved: case BCH_DISK_ACCOUNTING_persistent_reserved:
trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; trans->fs_usage_delta.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
@ -140,14 +152,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
} }
} }
struct bch_accounting_mem *acc = &c->accounting;
unsigned idx; unsigned idx;
EBUG_ON(gc && !acc->gc_running);
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
int ret = bch2_accounting_mem_insert(c, a, gc); int ret = bch2_accounting_mem_insert(c, a, mode);
if (ret) if (ret)
return ret; return ret;
} }
@ -164,7 +173,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, stru
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{ {
percpu_down_read(&trans->c->mark_lock); percpu_down_read(&trans->c->mark_lock);
int ret = bch2_accounting_mem_mod_locked(trans, a, gc, false); int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
percpu_up_read(&trans->c->mark_lock); percpu_up_read(&trans->c->mark_lock);
return ret; return ret;
} }

View File

@ -6,7 +6,7 @@
struct accounting_mem_entry { struct accounting_mem_entry {
struct bpos pos; struct bpos pos;
struct bversion version; struct bversion bversion;
unsigned nr_counters; unsigned nr_counters;
u64 __percpu *v[2]; u64 __percpu *v[2];
}; };

View File

@ -239,7 +239,19 @@ int __bch2_fsck_err(struct bch_fs *c,
if (!c) if (!c)
c = trans->c; c = trans->c;
WARN_ON(!trans && bch2_current_has_btree_trans(c)); /*
* Ugly: if there's a transaction in the current task it has to be
* passed in to unlock if we prompt for user input.
*
* But, plumbing a transaction and transaction restarts into
* bkey_validate() is problematic.
*
* So:
* - make all bkey errors AUTOFIX, they're simple anyways (we just
* delete the key)
* - and we don't need to warn if we're not prompting
*/
WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c));
if ((flags & FSCK_CAN_FIX) && if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent)) test_bit(err, c->sb.errors_silent))

View File

@ -184,7 +184,7 @@ do { \
ret = -BCH_ERR_fsck_delete_bkey; \ ret = -BCH_ERR_fsck_delete_bkey; \
goto fsck_err; \ goto fsck_err; \
} \ } \
int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX, \ int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\
BCH_FSCK_ERR_##_err_type, \ BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \ _err_msg, ##__VA_ARGS__); \
if (_ret != -BCH_ERR_fsck_fix && \ if (_ret != -BCH_ERR_fsck_fix && \

View File

@ -21,6 +21,49 @@
#include <linux/bsearch.h> #include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */ #include <linux/dcache.h> /* struct qstr */
/*
 * Returns true when the inode's backpointer fields (bi_dir, bi_dir_offset)
 * equal the position (p.inode, p.offset) of dirent @d.
 */
static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
				   struct bkey_s_c_dirent d)
{
	if (inode->bi_dir != d.k->p.inode)
		return false;

	return inode->bi_dir_offset == d.k->p.offset;
}
/*
 * Check whether dirent @d refers to @inode, without emitting any message.
 *
 * For DT_SUBVOL dirents the dirent's d_child_subvol is compared against the
 * inode's bi_subvol; for every other type d_inum is compared against bi_inum.
 *
 * Returns 0 on a match, -BCH_ERR_ENOENT_dirent_doesnt_match_inode otherwise.
 *
 * The return type is int, not bool: the function returns an error code, and a
 * bool return would collapse the negative errcode to 1 before it reaches
 * callers that store and propagate it as an int.
 */
static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
					 struct bch_inode_unpacked *inode)
{
	bool match = d.v->d_type == DT_SUBVOL
		? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
		: le64_to_cpu(d.v->d_inum) == inode->bi_inum;

	return match ? 0 : -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
}
/*
 * Format the "dirent does not point back to inode" diagnostic into @out.
 *
 * Writes a fixed heading, then passes @dirent to bch2_bkey_val_to_text() and
 * @inode to bch2_inode_unpacked_to_text(), with a newline emitted before each
 * of the two. Nothing is logged here; the caller decides what to do with the
 * resulting buffer.
 */
static void dirent_inode_mismatch_msg(struct printbuf *out,
struct bch_fs *c,
struct bkey_s_c_dirent dirent,
struct bch_inode_unpacked *inode)
{
prt_str(out, "inode points to dirent that does not point back:");
prt_newline(out);
bch2_bkey_val_to_text(out, c, dirent.s_c);
prt_newline(out);
bch2_inode_unpacked_to_text(out, inode);
}
static int dirent_points_to_inode(struct bch_fs *c,
struct bkey_s_c_dirent dirent,
struct bch_inode_unpacked *inode)
{
int ret = dirent_points_to_inode_nowarn(dirent, inode);
if (ret) {
struct printbuf buf = PRINTBUF;
dirent_inode_mismatch_msg(&buf, c, dirent, inode);
bch_warn(c, "%s", buf.buf);
printbuf_exit(&buf);
}
return ret;
}
/* /*
* XXX: this is handling transaction restarts without returning * XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@ -346,14 +389,17 @@ static int reattach_inode(struct btree_trans *trans,
static int remove_backpointer(struct btree_trans *trans, static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode) struct bch_inode_unpacked *inode)
{ {
struct btree_iter iter; if (!inode->bi_dir)
struct bkey_s_c_dirent d; return 0;
int ret;
d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, struct bch_fs *c = trans->c;
POS(inode->bi_dir, inode->bi_dir_offset), 0, struct btree_iter iter;
struct bkey_s_c_dirent d =
bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents,
SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0,
dirent); dirent);
ret = bkey_err(d) ?: int ret = bkey_err(d) ?:
dirent_points_to_inode(c, d, inode) ?:
__remove_dirent(trans, d.k->p); __remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
return ret; return ret;
@ -371,7 +417,8 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
return ret; return ret;
ret = remove_backpointer(trans, &inode); ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent"); if (!bch2_err_matches(ret, ENOENT))
bch_err_msg(c, ret, "removing dirent");
if (ret) if (ret)
return ret; return ret;
@ -626,12 +673,12 @@ static int ref_visible2(struct bch_fs *c,
struct inode_walker_entry { struct inode_walker_entry {
struct bch_inode_unpacked inode; struct bch_inode_unpacked inode;
u32 snapshot; u32 snapshot;
bool seen_this_pos;
u64 count; u64 count;
}; };
struct inode_walker { struct inode_walker {
bool first_this_inode; bool first_this_inode;
bool have_inodes;
bool recalculate_sums; bool recalculate_sums;
struct bpos last_pos; struct bpos last_pos;
@ -669,6 +716,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bkey_s_c k; struct bkey_s_c k;
int ret; int ret;
/*
* We no longer have inodes for w->last_pos; clear this to avoid
* screwing up check_i_sectors/check_subdir_count if we take a
* transaction restart here:
*/
w->have_inodes = false;
w->recalculate_sums = false; w->recalculate_sums = false;
w->inodes.nr = 0; w->inodes.nr = 0;
@ -686,6 +739,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret; return ret;
w->first_this_inode = true; w->first_this_inode = true;
w->have_inodes = true;
return 0; return 0;
} }
@ -740,9 +794,6 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret) if (ret)
return ERR_PTR(ret); return ERR_PTR(ret);
} else if (bkey_cmp(w->last_pos, k.k->p)) {
darray_for_each(w->inodes, i)
i->seen_this_pos = false;
} }
w->last_pos = k.k->p; w->last_pos = k.k->p;
@ -896,21 +947,6 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
} }
static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
struct bkey_s_c_dirent d)
{
return inode->bi_dir == d.k->p.inode &&
inode->bi_dir_offset == d.k->p.offset;
}
static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
return d.v->d_type == DT_SUBVOL
? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
: le64_to_cpu(d.v->d_inum) == inode->bi_inum;
}
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{ {
struct btree_iter iter; struct btree_iter iter;
@ -920,13 +956,14 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
return ret; return ret;
} }
static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, static int check_inode_dirent_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode, struct bch_inode_unpacked *inode,
u32 inode_snapshot, bool *write_inode) bool *write_inode)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
u32 inode_snapshot = inode->bi_snapshot;
struct btree_iter dirent_iter = {}; struct btree_iter dirent_iter = {};
struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
int ret = bkey_err(d); int ret = bkey_err(d);
@ -936,13 +973,13 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i
if (fsck_err_on(ret, if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent, trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s", "inode points to missing dirent\n%s",
(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
fsck_err_on(!ret && !dirent_points_to_inode(d, inode), fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
trans, inode_points_to_wrong_dirent, trans, inode_points_to_wrong_dirent,
"inode points to dirent that does not point back:\n%s", "%s",
(bch2_bkey_val_to_text(&buf, c, inode_k), (printbuf_reset(&buf),
prt_newline(&buf), dirent_inode_mismatch_msg(&buf, c, d, inode),
bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { buf.buf))) {
/* /*
* We just clear the backpointer fields for now. If we find a * We just clear the backpointer fields for now. If we find a
* dirent that points to this inode in check_dirents(), we'll * dirent that points to this inode in check_dirents(), we'll
@ -963,7 +1000,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c i
return ret; return ret;
} }
static bool bch2_inode_open(struct bch_fs *c, struct bpos p) static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{ {
subvol_inum inum = { subvol_inum inum = {
.subvol = snapshot_t(c, p.snapshot)->subvol, .subvol = snapshot_t(c, p.snapshot)->subvol,
@ -972,7 +1009,7 @@ static bool bch2_inode_open(struct bch_fs *c, struct bpos p)
/* snapshot tree corruption, can't safely delete */ /* snapshot tree corruption, can't safely delete */
if (!inum.subvol) { if (!inum.subvol) {
bch_err_ratelimited(c, "%s(): snapshot %u has no subvol", __func__, p.snapshot); bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
return true; return true;
} }
@ -1045,30 +1082,44 @@ static int check_inode(struct btree_trans *trans,
} }
if (u.bi_flags & BCH_INODE_unlinked) { if (u.bi_flags & BCH_INODE_unlinked) {
ret = check_inode_deleted_list(trans, k.k->p); if (!test_bit(BCH_FS_started, &c->flags)) {
if (ret < 0) /*
return ret; * If we're not in online fsck, don't delete unlinked
* inodes, just make sure they're on the deleted list.
*
* They might be referred to by a logged operation -
* i.e. we might have crashed in the middle of a
* truncate on an unlinked but open file - so we want to
* let the delete_dead_inodes kill it after resuming
* logged ops.
*/
ret = check_inode_deleted_list(trans, k.k->p);
if (ret < 0)
return ret;
fsck_err_on(!ret, fsck_err_on(!ret,
trans, unlinked_inode_not_on_deleted_list, trans, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list", "inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot); u.bi_inum, k.k->p.snapshot);
ret = 0;
} ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, k.k->p, 1);
if (ret)
if (u.bi_flags & BCH_INODE_unlinked && goto err;
!bch2_inode_open(c, k.k->p) && } else {
(!c->sb.clean || if (fsck_err_on(bch2_inode_is_open(c, k.k->p),
fsck_err(trans, inode_unlinked_but_clean, trans, inode_unlinked_and_not_open,
"filesystem marked clean, but inode %llu unlinked", "inode %llu%u unlinked and not open",
u.bi_inum))) { u.bi_inum, u.bi_snapshot)) {
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck deleting inode"); bch_err_msg(c, ret, "in fsck deleting inode");
return ret; return ret;
}
}
} }
/* i_size_dirty is vestigial, since we now have logged ops for truncate */
if (u.bi_flags & BCH_INODE_i_size_dirty && if (u.bi_flags & BCH_INODE_i_size_dirty &&
(!c->sb.clean || (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_size_dirty_but_clean, fsck_err(trans, inode_i_size_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_size dirty", "filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) { u.bi_inum))) {
@ -1097,8 +1148,9 @@ static int check_inode(struct btree_trans *trans,
do_update = true; do_update = true;
} }
/* i_sectors_dirty is vestigial, i_sectors is always updated transactionally */
if (u.bi_flags & BCH_INODE_i_sectors_dirty && if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
(!c->sb.clean || (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
fsck_err(trans, inode_i_sectors_dirty_but_clean, fsck_err(trans, inode_i_sectors_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_sectors dirty", "filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) { u.bi_inum))) {
@ -1126,7 +1178,7 @@ static int check_inode(struct btree_trans *trans,
} }
if (u.bi_dir || u.bi_dir_offset) { if (u.bi_dir || u.bi_dir_offset) {
ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret) if (ret)
goto err; goto err;
} }
@ -1555,10 +1607,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k, struct bkey_s_c k,
struct inode_walker *inode, struct inode_walker *inode,
struct snapshots_seen *s, struct snapshots_seen *s,
struct extent_ends *extent_ends) struct extent_ends *extent_ends,
struct disk_reservation *res)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
int ret = 0; int ret = 0;
@ -1568,7 +1620,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto out; goto out;
} }
if (inode->last_pos.inode != k.k->p.inode) { if (inode->last_pos.inode != k.k->p.inode && inode->have_inodes) {
ret = check_i_sectors(trans, inode); ret = check_i_sectors(trans, inode);
if (ret) if (ret)
goto err; goto err;
@ -1578,12 +1630,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (ret) if (ret)
goto err; goto err;
i = walk_inode(trans, inode, k); struct inode_walker_entry *extent_i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i); ret = PTR_ERR_OR_ZERO(extent_i);
if (ret) if (ret)
goto err; goto err;
ret = check_key_has_inode(trans, iter, inode, i, k); ret = check_key_has_inode(trans, iter, inode, extent_i, k);
if (ret) if (ret)
goto err; goto err;
@ -1592,24 +1644,19 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
&inode->recalculate_sums); &inode->recalculate_sums);
if (ret) if (ret)
goto err; goto err;
}
/* /*
* Check inodes in reverse order, from oldest snapshots to newest, * Check inodes in reverse order, from oldest snapshots to
* starting from the inode that matches this extent's snapshot. If we * newest, starting from the inode that matches this extent's
* didn't have one, iterate over all inodes: * snapshot. If we didn't have one, iterate over all inodes:
*/ */
if (!i) for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
i = &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data;
--i) {
if (i->snapshot > k.k->p.snapshot ||
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
continue;
for (;
inode->inodes.data && i >= inode->inodes.data;
--i) {
if (i->snapshot > k.k->p.snapshot ||
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
continue;
if (k.k->type != KEY_TYPE_whiteout) {
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) && if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k), !bkey_extent_is_reservation(k),
@ -1629,13 +1676,25 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err; goto err;
iter->k.type = KEY_TYPE_whiteout; iter->k.type = KEY_TYPE_whiteout;
break;
} }
if (bkey_extent_is_allocation(k.k))
i->count += k.k->size;
} }
}
i->seen_this_pos = true; ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;
if (bkey_extent_is_allocation(k.k)) {
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
inode->inodes.data && i >= inode->inodes.data;
--i) {
if (i->snapshot > k.k->p.snapshot ||
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
continue;
i->count += k.k->size;
}
} }
if (k.k->type != KEY_TYPE_whiteout) { if (k.k->type != KEY_TYPE_whiteout) {
@ -1666,13 +1725,11 @@ int bch2_check_extents(struct bch_fs *c)
extent_ends_init(&extent_ends); extent_ends_init(&extent_ends);
int ret = bch2_trans_run(c, int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_extents, for_each_btree_key(trans, iter, BTREE_ID_extents,
POS(BCACHEFS_ROOT_INO, 0), POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
&res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res); bch2_disk_reservation_put(c, &res);
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?: check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?:
check_extent_overbig(trans, &iter, k); check_extent_overbig(trans, &iter, k);
})) ?: })) ?:
check_i_sectors_notnested(trans, &w)); check_i_sectors_notnested(trans, &w));
@ -1758,6 +1815,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF; struct printbuf buf = PRINTBUF;
struct btree_iter bp_iter = { NULL };
int ret = 0; int ret = 0;
if (inode_points_to_dirent(target, d)) if (inode_points_to_dirent(target, d))
@ -1770,7 +1828,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
prt_printf(&buf, "\n "), prt_printf(&buf, "\n "),
bch2_inode_unpacked_to_text(&buf, target), bch2_inode_unpacked_to_text(&buf, target),
buf.buf))) buf.buf)))
goto out_noiter; goto err;
if (!target->bi_dir && if (!target->bi_dir &&
!target->bi_dir_offset) { !target->bi_dir_offset) {
@ -1779,7 +1837,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
return __bch2_fsck_write_inode(trans, target, target_snapshot); return __bch2_fsck_write_inode(trans, target, target_snapshot);
} }
struct btree_iter bp_iter = { NULL };
struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
ret = bkey_err(bp_dirent); ret = bkey_err(bp_dirent);
@ -1840,7 +1897,6 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
err: err:
fsck_err: fsck_err:
bch2_trans_iter_exit(trans, &bp_iter); bch2_trans_iter_exit(trans, &bp_iter);
out_noiter:
printbuf_exit(&buf); printbuf_exit(&buf);
bch_err_fn(c, ret); bch_err_fn(c, ret);
return ret; return ret;
@ -2075,7 +2131,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type == KEY_TYPE_whiteout) if (k.k->type == KEY_TYPE_whiteout)
goto out; goto out;
if (dir->last_pos.inode != k.k->p.inode) { if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
ret = check_subdir_count(trans, dir); ret = check_subdir_count(trans, dir);
if (ret) if (ret)
goto err; goto err;
@ -2137,11 +2193,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret) if (ret)
goto err; goto err;
} }
if (d.v->d_type == DT_DIR)
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
i->count++;
} }
ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto err;
if (d.v->d_type == DT_DIR)
for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
i->count++;
out: out:
err: err:
fsck_err: fsck_err:
@ -2164,12 +2224,9 @@ int bch2_check_dirents(struct bch_fs *c)
snapshots_seen_init(&s); snapshots_seen_init(&s);
int ret = bch2_trans_run(c, int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, for_each_btree_key(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0), POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
k,
NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
check_subdir_count_notnested(trans, &dir)); check_subdir_count_notnested(trans, &dir));
@ -2314,22 +2371,6 @@ static bool darray_u32_has(darray_u32 *d, u32 v)
return false; return false;
} }
/*
* We've checked that inode backpointers point to valid dirents; here, it's
* sufficient to check that the subvolume root has a dirent:
*/
static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
{
struct bch_inode_unpacked inode;
int ret = bch2_inode_find_by_inum_trans(trans,
(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
&inode);
if (ret)
return ret;
return inode.bi_dir != 0;
}
static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
{ {
struct bch_fs *c = trans->c; struct bch_fs *c = trans->c;
@ -2348,14 +2389,24 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
ret = subvol_has_dirent(trans, s); struct bch_inode_unpacked subvol_root;
if (ret < 0) ret = bch2_inode_find_by_inum_trans(trans,
(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
&subvol_root);
if (ret)
break; break;
if (fsck_err_on(!ret, /*
* We've checked that inode backpointers point to valid dirents;
* here, it's sufficient to check that the subvolume root has a
* dirent:
*/
if (fsck_err_on(!subvol_root.bi_dir,
trans, subvol_unreachable, trans, subvol_unreachable,
"unreachable subvolume %s", "unreachable subvolume %s",
(bch2_bkey_val_to_text(&buf, c, s.s_c), (bch2_bkey_val_to_text(&buf, c, s.s_c),
prt_newline(&buf),
bch2_inode_unpacked_to_text(&buf, &subvol_root),
buf.buf))) { buf.buf))) {
ret = reattach_subvol(trans, s); ret = reattach_subvol(trans, s);
break; break;
@ -2450,10 +2501,8 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
if (ret && !bch2_err_matches(ret, ENOENT)) if (ret && !bch2_err_matches(ret, ENOENT))
break; break;
if (!ret && !dirent_points_to_inode(d, &inode)) { if (!ret && (ret = dirent_points_to_inode(c, d, &inode)))
bch2_trans_iter_exit(trans, &dirent_iter); bch2_trans_iter_exit(trans, &dirent_iter);
ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
}
if (bch2_err_matches(ret, ENOENT)) { if (bch2_err_matches(ret, ENOENT)) {
ret = 0; ret = 0;

View File

@ -320,9 +320,11 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked) struct bch_inode_unpacked *unpacked)
{ {
if (likely(k.k->type == KEY_TYPE_inode_v3)) unpacked->bi_snapshot = k.k->p.snapshot;
return bch2_inode_unpack_v3(k, unpacked);
return bch2_inode_unpack_slowpath(k, unpacked); return likely(k.k->type == KEY_TYPE_inode_v3)
? bch2_inode_unpack_v3(k, unpacked)
: bch2_inode_unpack_slowpath(k, unpacked);
} }
int bch2_inode_peek_nowarn(struct btree_trans *trans, int bch2_inode_peek_nowarn(struct btree_trans *trans,
@ -557,7 +559,7 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
{ {
prt_printf(out, "inum: %llu ", inode->bi_inum); prt_printf(out, "inum: %llu:%u ", inode->bi_inum, inode->bi_snapshot);
__bch2_inode_unpacked_to_text(out, inode); __bch2_inode_unpacked_to_text(out, inode);
} }
@ -1111,7 +1113,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
pos.offset, pos.snapshot)) pos.offset, pos.snapshot))
goto delete; goto delete;
if (c->sb.clean && if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
!fsck_err(trans, deleted_inode_but_clean, !fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u", "filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) { pos.offset, pos.snapshot)) {

View File

@ -69,6 +69,7 @@ typedef u64 u96;
struct bch_inode_unpacked { struct bch_inode_unpacked {
u64 bi_inum; u64 bi_inum;
u32 bi_snapshot;
u64 bi_journal_seq; u64 bi_journal_seq;
__le64 bi_hash_seed; __le64 bi_hash_seed;
u64 bi_size; u64 bi_size;

View File

@ -517,7 +517,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
if ((ret = bkey_err(k))) if ((ret = bkey_err(k)))
goto out; goto out;
if (bversion_cmp(k.k->version, rbio->version) || if (bversion_cmp(k.k->bversion, rbio->version) ||
!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
goto out; goto out;
@ -1031,7 +1031,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
rbio->read_pos = read_pos; rbio->read_pos = read_pos;
rbio->data_btree = data_btree; rbio->data_btree = data_btree;
rbio->data_pos = data_pos; rbio->data_pos = data_pos;
rbio->version = k.k->version; rbio->version = k.k->bversion;
rbio->promote = promote; rbio->promote = promote;
INIT_WORK(&rbio->work, NULL); INIT_WORK(&rbio->work, NULL);

View File

@ -697,7 +697,7 @@ static void init_append_extent(struct bch_write_op *op,
e = bkey_extent_init(op->insert_keys.top); e = bkey_extent_init(op->insert_keys.top);
e->k.p = op->pos; e->k.p = op->pos;
e->k.size = crc.uncompressed_size; e->k.size = crc.uncompressed_size;
e->k.version = version; e->k.bversion = version;
if (crc.csum_type || if (crc.csum_type ||
crc.compression_type || crc.compression_type ||
@ -1544,7 +1544,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
id = bkey_inline_data_init(op->insert_keys.top); id = bkey_inline_data_init(op->insert_keys.top);
id->k.p = op->pos; id->k.p = op->pos;
id->k.version = op->version; id->k.bversion = op->version;
id->k.size = sectors; id->k.size = sectors;
iter = bio->bi_iter; iter = bio->bi_iter;

View File

@ -605,7 +605,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
goto out; goto out;
} }
if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
c, version, jset, entry, c, version, jset, entry,
journal_entry_data_usage_bad_size, journal_entry_data_usage_bad_size,
"invalid journal entry usage: %s", err.buf)) { "invalid journal entry usage: %s", err.buf)) {

View File

@ -37,6 +37,14 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
struct bkey_buf sk; struct bkey_buf sk;
u32 restart_count = trans->restart_count; u32 restart_count = trans->restart_count;
struct printbuf buf = PRINTBUF;
int ret = 0;
fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags),
trans, logged_op_but_clean,
"filesystem marked as clean but have logged op\n%s",
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf));
if (!fn) if (!fn)
return 0; return 0;
@ -47,8 +55,9 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
fn->resume(trans, sk.k); fn->resume(trans, sk.k);
bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&sk, c);
fsck_err:
return trans_was_restarted(trans, restart_count); printbuf_exit(&buf);
return ret ?: trans_was_restarted(trans, restart_count);
} }
int bch2_resume_logged_ops(struct bch_fs *c) int bch2_resume_logged_ops(struct bch_fs *c)

View File

@ -151,7 +151,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u); struct bkey_s_c old = bch2_btree_path_peek_slot(btree_iter_path(trans, &iter), &u);
/* Has this delta already been applied to the btree? */ /* Has this delta already been applied to the btree? */
if (bversion_cmp(old.k->version, k->k->k.version) >= 0) { if (bversion_cmp(old.k->bversion, k->k->k.bversion) >= 0) {
ret = 0; ret = 0;
goto out; goto out;
} }
@ -717,6 +717,8 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.fsck) if (c->opts.fsck)
set_bit(BCH_FS_fsck_running, &c->flags); set_bit(BCH_FS_fsck_running, &c->flags);
if (c->sb.clean)
set_bit(BCH_FS_clean_recovery, &c->flags);
ret = bch2_blacklist_table_initialize(c); ret = bch2_blacklist_table_initialize(c);
if (ret) { if (ret) {
@ -862,6 +864,9 @@ int bch2_fs_recovery(struct bch_fs *c)
clear_bit(BCH_FS_fsck_running, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags);
/* in case we don't run journal replay, i.e. norecovery mode */
set_bit(BCH_FS_accounting_replay_done, &c->flags);
/* fsync if we fixed errors */ /* fsync if we fixed errors */
if (test_bit(BCH_FS_errors_fixed, &c->flags) && if (test_bit(BCH_FS_errors_fixed, &c->flags) &&
bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) {

View File

@ -50,7 +50,7 @@
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \
x(resume_logged_ops, 23, PASS_ALWAYS) \ x(resume_logged_ops, 23, PASS_ALWAYS) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ x(delete_dead_inodes, 32, PASS_ALWAYS) \
x(fix_reflink_p, 33, 0) \ x(fix_reflink_p, 33, 0) \
x(set_fs_needs_rebalance, 34, 0) \ x(set_fs_needs_rebalance, 34, 0) \

View File

@ -367,7 +367,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
r_v->k.type = bkey_type_to_indirect(&orig->k); r_v->k.type = bkey_type_to_indirect(&orig->k);
r_v->k.p = reflink_iter.pos; r_v->k.p = reflink_iter.pos;
bch2_key_resize(&r_v->k, orig->k.size); bch2_key_resize(&r_v->k, orig->k.size);
r_v->k.version = orig->k.version; r_v->k.bversion = orig->k.bversion;
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));

View File

@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]"); prt_printf(out, "]");
} }
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
struct bch_sb *sb, struct bch_sb *sb,
struct printbuf *err) struct printbuf *err)
{ {
if (!r->nr_devs) { if (!r->nr_devs) {
prt_printf(err, "no devices in entry "); prt_printf(err, "no devices in entry ");
@ -94,6 +94,16 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
return -BCH_ERR_invalid_replicas_entry; return -BCH_ERR_invalid_replicas_entry;
} }
/*
 * Validate replicas entry @r against the filesystem's superblock.
 *
 * Takes c->sb_lock around the call to bch2_replicas_entry_validate_locked(),
 * which is given c->disk_sb.sb, and returns that function's result. Any
 * error description is written to @err by the callee.
 */
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
				 struct bch_fs *c,
				 struct printbuf *err)
{
	int ret;

	mutex_lock(&c->sb_lock);
	ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
	mutex_unlock(&c->sb_lock);

	return ret;
}
void bch2_cpu_replicas_to_text(struct printbuf *out, void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r) struct bch_replicas_cpu *r)
{ {
@ -676,7 +686,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_replicas_entry_v1 *e = struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i); cpu_replicas_entry(cpu_r, i);
int ret = bch2_replicas_entry_validate(e, sb, err); int ret = bch2_replicas_entry_validate_locked(e, sb, err);
if (ret) if (ret)
return ret; return ret;

View File

@ -10,7 +10,7 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *, void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry_v1 *); struct bch_replicas_entry_v1 *);
int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *, int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
struct bch_sb *, struct printbuf *); struct bch_fs *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry_v1 * static inline struct bch_replicas_entry_v1 *

View File

@ -167,6 +167,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
ret = bch2_sb_clean_validate_late(c, clean, READ); ret = bch2_sb_clean_validate_late(c, clean, READ);
if (ret) { if (ret) {
kfree(clean);
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
return ERR_PTR(ret); return ERR_PTR(ret);
} }

View File

@ -312,8 +312,7 @@ static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
if (!first) if (!first)
prt_char(out, ','); prt_char(out, ',');
first = false; first = false;
unsigned e = le16_to_cpu(i->errors[j]); bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
prt_str(out, e < BCH_SB_ERR_MAX ? bch2_sb_error_strs[e] : "(unknown)");
} }
prt_newline(out); prt_newline(out);
} }
@ -353,7 +352,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
for (unsigned i = 0; i < src->nr_errors; i++) for (unsigned i = 0; i < src->nr_errors; i++)
dst->errors[i] = cpu_to_le16(src->errors[i]); dst->errors[i] = cpu_to_le16(src->errors[i]);
downgrade_table_extra(c, &table); ret = downgrade_table_extra(c, &table);
if (ret)
goto out;
if (!dst->recovery_passes[0] && if (!dst->recovery_passes[0] &&
!dst->recovery_passes[1] && !dst->recovery_passes[1] &&
@ -399,7 +400,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) { for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
unsigned e = le16_to_cpu(i->errors[j]); unsigned e = le16_to_cpu(i->errors[j]);
if (e < BCH_SB_ERR_MAX) if (e < BCH_FSCK_ERR_MAX)
__set_bit(e, c->sb.errors_silent); __set_bit(e, c->sb.errors_silent);
if (e < sizeof(ext->errors_silent) * 8) if (e < sizeof(ext->errors_silent) * 8)
__set_bit_le64(e, ext->errors_silent); __set_bit_le64(e, ext->errors_silent);

View File

@ -7,12 +7,12 @@
const char * const bch2_sb_error_strs[] = { const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t, #define x(t, n, ...) [n] = #t,
BCH_SB_ERRS() BCH_SB_ERRS()
NULL #undef x
}; };
static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
{ {
if (id < BCH_SB_ERR_MAX) if (id < BCH_FSCK_ERR_MAX)
prt_str(out, bch2_sb_error_strs[id]); prt_str(out, bch2_sb_error_strs[id]);
else else
prt_printf(out, "(unknown error %u)", id); prt_printf(out, "(unknown error %u)", id);

View File

@ -6,6 +6,8 @@
extern const char * const bch2_sb_error_strs[]; extern const char * const bch2_sb_error_strs[];
void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
extern const struct bch_sb_field_ops bch_sb_field_ops_errors; extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);

View File

@ -210,22 +210,23 @@ enum bch_fsck_flags {
x(inode_snapshot_mismatch, 196, 0) \ x(inode_snapshot_mismatch, 196, 0) \
x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \
x(inode_unlinked_and_not_open, 281, 0) \
x(inode_checksum_type_invalid, 199, 0) \ x(inode_checksum_type_invalid, 199, 0) \
x(inode_compression_type_invalid, 200, 0) \ x(inode_compression_type_invalid, 200, 0) \
x(inode_subvol_root_but_not_dir, 201, 0) \ x(inode_subvol_root_but_not_dir, 201, 0) \
x(inode_i_size_dirty_but_clean, 202, 0) \ x(inode_i_size_dirty_but_clean, 202, FSCK_AUTOFIX) \
x(inode_i_sectors_dirty_but_clean, 203, 0) \ x(inode_i_sectors_dirty_but_clean, 203, FSCK_AUTOFIX) \
x(inode_i_sectors_wrong, 204, 0) \ x(inode_i_sectors_wrong, 204, FSCK_AUTOFIX) \
x(inode_dir_wrong_nlink, 205, 0) \ x(inode_dir_wrong_nlink, 205, FSCK_AUTOFIX) \
x(inode_dir_multiple_links, 206, 0) \ x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
x(inode_multiple_links_but_nlink_0, 207, 0) \ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
x(inode_wrong_backpointer, 208, 0) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
x(inode_wrong_nlink, 209, 0) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
x(inode_unreachable, 210, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(deleted_inode_but_clean, 211, 0) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, 0) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, 0) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
x(deleted_inode_not_unlinked, 214, 0) \ x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \ x(extent_overlapping, 215, 0) \
x(key_in_missing_inode, 216, 0) \ x(key_in_missing_inode, 216, 0) \
x(key_in_wrong_inode_type, 217, 0) \ x(key_in_wrong_inode_type, 217, 0) \
@ -255,7 +256,7 @@ enum bch_fsck_flags {
x(dir_loop, 241, 0) \ x(dir_loop, 241, 0) \
x(hash_table_key_duplicate, 242, 0) \ x(hash_table_key_duplicate, 242, 0) \
x(hash_table_key_wrong_offset, 243, 0) \ x(hash_table_key_wrong_offset, 243, 0) \
x(unlinked_inode_not_on_deleted_list, 244, 0) \ x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
x(reflink_p_front_pad_bad, 245, 0) \ x(reflink_p_front_pad_bad, 245, 0) \
x(journal_entry_dup_same_device, 246, 0) \ x(journal_entry_dup_same_device, 246, 0) \
x(inode_bi_subvol_missing, 247, 0) \ x(inode_bi_subvol_missing, 247, 0) \
@ -270,7 +271,7 @@ enum bch_fsck_flags {
x(subvol_children_not_set, 256, 0) \ x(subvol_children_not_set, 256, 0) \
x(subvol_children_bad, 257, 0) \ x(subvol_children_bad, 257, 0) \
x(subvol_loop, 258, 0) \ x(subvol_loop, 258, 0) \
x(subvol_unreachable, 259, 0) \ x(subvol_unreachable, 259, FSCK_AUTOFIX) \
x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_bkey_bad_u64s, 260, 0) \
x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \
@ -282,8 +283,8 @@ enum bch_fsck_flags {
x(btree_ptr_v2_written_0, 268, 0) \ x(btree_ptr_v2_written_0, 268, 0) \
x(subvol_snapshot_bad, 269, 0) \ x(subvol_snapshot_bad, 269, 0) \
x(subvol_inode_bad, 270, 0) \ x(subvol_inode_bad, 270, 0) \
x(alloc_key_stripe_sectors_wrong, 271, 0) \ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
x(accounting_mismatch, 272, 0) \ x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \ x(accounting_replicas_not_marked, 273, 0) \
x(invalid_btree_id, 274, 0) \ x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \ x(alloc_key_io_time_bad, 275, 0) \
@ -292,12 +293,14 @@ enum bch_fsck_flags {
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(MAX, 284, 0)
enum bch_sb_error_id { enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n, #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS() BCH_SB_ERRS()
#undef x #undef x
BCH_SB_ERR_MAX
}; };
struct bch_sb_field_errors { struct bch_sb_field_errors {

View File

@ -169,11 +169,17 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
ret = -1 - SIX_LOCK_write; ret = -1 - SIX_LOCK_write;
} }
} else if (type == SIX_LOCK_write && lock->readers) { } else if (type == SIX_LOCK_write && lock->readers) {
if (try) { if (try)
atomic_add(SIX_LOCK_HELD_write, &lock->state); atomic_add(SIX_LOCK_HELD_write, &lock->state);
smp_mb__after_atomic();
}
/*
* Make sure atomic_add happens before pcpu_read_count and
* six_set_bitmask in slow path happens before pcpu_read_count.
*
* Paired with the smp_mb() in read lock fast path (per-cpu mode)
* and the one before atomic_read in read unlock path.
*/
smp_mb();
ret = !pcpu_read_count(lock); ret = !pcpu_read_count(lock);
if (try && !ret) { if (try && !ret) {

View File

@ -469,6 +469,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 id = snapshot_root; u32 id = snapshot_root;
u32 subvol = 0, s; u32 subvol = 0, s;
rcu_read_lock();
while (id) { while (id) {
s = snapshot_t(c, id)->subvol; s = snapshot_t(c, id)->subvol;
@ -477,6 +478,7 @@ static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
id = bch2_snapshot_tree_next(c, id); id = bch2_snapshot_tree_next(c, id);
} }
rcu_read_unlock();
return subvol; return subvol;
} }
@ -1782,6 +1784,7 @@ static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
new->k.p.snapshot = leaf_id; new->k.p.snapshot = leaf_id;
ret = bch2_trans_update(trans, &iter, new, 0); ret = bch2_trans_update(trans, &iter, new, 0);
out: out:
bch2_set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
return ret; return ret;
} }

View File

@ -92,34 +92,32 @@ static int check_subvol(struct btree_trans *trans,
} }
struct bch_inode_unpacked inode; struct bch_inode_unpacked inode;
struct btree_iter inode_iter = {}; ret = bch2_inode_find_by_inum_nowarn_trans(trans,
ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
(subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
0); &inode);
bch2_trans_iter_exit(trans, &inode_iter); if (!ret) {
if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
if (ret && !bch2_err_matches(ret, ENOENT)) trans, subvol_root_wrong_bi_subvol,
return ret; "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
inode.bi_inum, inode.bi_snapshot,
if (fsck_err_on(ret, inode.bi_subvol, subvol.k->p.offset)) {
trans, subvol_to_missing_root, inode.bi_subvol = subvol.k->p.offset;
"subvolume %llu points to missing subvolume root %llu:%u", ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
k.k->p.offset, le64_to_cpu(subvol.v->inode), if (ret)
le32_to_cpu(subvol.v->snapshot))) { goto err;
ret = bch2_subvolume_delete(trans, iter->pos.offset); }
bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); } else if (bch2_err_matches(ret, ENOENT)) {
return ret ?: -BCH_ERR_transaction_restart_nested; if (fsck_err(trans, subvol_to_missing_root,
} "subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, le32_to_cpu(subvol.v->snapshot))) {
trans, subvol_root_wrong_bi_subvol, ret = bch2_subvolume_delete(trans, iter->pos.offset);
"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
inode.bi_inum, inode_iter.k.p.snapshot, ret = ret ?: -BCH_ERR_transaction_restart_nested;
inode.bi_subvol, subvol.k->p.offset)) {
inode.bi_subvol = subvol.k->p.offset;
ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
if (ret)
goto err; goto err;
}
} else {
goto err;
} }
if (!BCH_SUBVOLUME_SNAP(subvol.v)) { if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
@ -137,7 +135,7 @@ static int check_subvol(struct btree_trans *trans,
"%s: snapshot tree %u not found", __func__, snapshot_tree); "%s: snapshot tree %u not found", __func__, snapshot_tree);
if (ret) if (ret)
return ret; goto err;
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
trans, subvol_not_master_and_not_snapshot, trans, subvol_not_master_and_not_snapshot,
@ -147,7 +145,7 @@ static int check_subvol(struct btree_trans *trans,
bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
ret = PTR_ERR_OR_ZERO(s); ret = PTR_ERR_OR_ZERO(s);
if (ret) if (ret)
return ret; goto err;
SET_BCH_SUBVOLUME_SNAP(&s->v, true); SET_BCH_SUBVOLUME_SNAP(&s->v, true);
} }

View File

@ -799,8 +799,10 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts,
i < layout.sb_offset + layout.nr_superblocks; i++) { i < layout.sb_offset + layout.nr_superblocks; i++) {
offset = le64_to_cpu(*i); offset = le64_to_cpu(*i);
if (offset == opt_get(*opts, sb)) if (offset == opt_get(*opts, sb)) {
ret = -BCH_ERR_invalid;
continue; continue;
}
ret = read_one_super(sb, offset, &err); ret = read_one_super(sb, offset, &err);
if (!ret) if (!ret)
@ -1188,7 +1190,8 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8); le_bitvector_to_cpu(errors_silent, (void *) e->errors_silent, sizeof(e->errors_silent) * 8);
prt_printf(out, "Errors to silently fix:\t"); prt_printf(out, "Errors to silently fix:\t");
prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent, sizeof(e->errors_silent) * 8); prt_bitflags_vector(out, bch2_sb_error_strs, errors_silent,
min(BCH_FSCK_ERR_MAX, sizeof(e->errors_silent) * 8));
prt_newline(out); prt_newline(out);
kfree(errors_silent); kfree(errors_silent);

View File

@ -394,7 +394,7 @@ static int insert_test_extent(struct bch_fs *c,
k.k_i.k.p.offset = end; k.k_i.k.p.offset = end;
k.k_i.k.p.snapshot = U32_MAX; k.k_i.k.p.snapshot = U32_MAX;
k.k_i.k.size = end - start; k.k_i.k.size = end - start;
k.k_i.k.version.lo = test_version++; k.k_i.k.bversion.lo = test_version++;
ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0);
bch_err_fn(c, ret); bch_err_fn(c, ret);