mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-10 07:10:27 +00:00
9b23fdbd5d
There's an inherent race in taking a snapshot while an unlinked file is open, and then reattaching it in the child snapshot. In the interior snapshot node the file will appear unlinked, as though it should be deleted - it's not referenced by anything in that snapshot - but we can't delete it, because the file data is referenced by the child snapshot. This was being handled incorrectly with propagate_key_to_snapshot_leaves() - but that doesn't resolve the fundamental inconsistency of "this file looks like it should be deleted according to normal rules, but - ". To fix this, we need to fix the rule for when an inode is deleted. The previous rule, ignoring snapshots (there was no well-defined rule for with snapshots) was: Unlinked, non open files are deleted, either at recovery time or during online fsck The new rule is: Unlinked, non open files, that do not exist in child snapshots, are deleted. To make this work transactionally, we add a new inode flag, BCH_INODE_has_child_snapshot; it overrides BCH_INODE_unlinked when considering whether to delete an inode, or put it on the deleted list. For transactional consistency, clearing it handled by the inode trigger: when deleting an inode we check if there are parent inodes which can now have the BCH_INODE_has_child_snapshot flag cleared. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
414 lines
13 KiB
C
414 lines
13 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
* Superblock section that contains a list of recovery passes to run when
|
|
* downgrading past a given version
|
|
*/
|
|
|
|
#include "bcachefs.h"
|
|
#include "darray.h"
|
|
#include "recovery_passes.h"
|
|
#include "sb-downgrade.h"
|
|
#include "sb-errors.h"
|
|
#include "super-io.h"
|
|
|
|
#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
|
|
|
|
/*
|
|
* Upgrade, downgrade tables - run certain recovery passes, fix certain errors
|
|
*
|
|
* x(version, recovery_passes, errors...)
|
|
*/
|
|
#define UPGRADE_TABLE() \
|
|
x(backpointers, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(inode_v3, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(unwritten_extents, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(bucket_gens, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(lru_v2, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(fragmentation_lru, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(no_bps_in_alloc_keys, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(snapshot_trees, \
|
|
RECOVERY_PASS_ALL_FSCK) \
|
|
x(snapshot_skiplists, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
|
|
BCH_FSCK_ERR_snapshot_bad_depth, \
|
|
BCH_FSCK_ERR_snapshot_bad_skiplist) \
|
|
x(deleted_inodes, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
|
|
BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
|
|
x(rebalance_work, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
|
|
x(subvolume_fs_parent, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \
|
|
BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \
|
|
x(btree_subvolume_children, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \
|
|
BCH_FSCK_ERR_subvol_children_not_set) \
|
|
x(mi_btree_bitmap, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_btree_bitmap_not_marked) \
|
|
x(disk_accounting_v2, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_bkey_version_in_future, \
|
|
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
|
|
BCH_FSCK_ERR_accounting_mismatch) \
|
|
x(disk_accounting_v3, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_bkey_version_in_future, \
|
|
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
|
|
BCH_FSCK_ERR_accounting_mismatch, \
|
|
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
|
|
BCH_FSCK_ERR_accounting_key_replicas_nr_required_bad, \
|
|
BCH_FSCK_ERR_accounting_key_replicas_devs_unsorted, \
|
|
BCH_FSCK_ERR_accounting_key_junk_at_end) \
|
|
x(disk_accounting_inum, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_accounting_mismatch) \
|
|
x(rebalance_work_acct_fix, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_accounting_mismatch) \
|
|
x(inode_has_child_snapshots, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
|
|
BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
|
|
|
|
#define DOWNGRADE_TABLE() \
|
|
x(bucket_stripe_sectors, \
|
|
0) \
|
|
x(disk_accounting_v2, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_hidden_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_btree_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_data_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_cached_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_reserved_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
|
|
BCH_FSCK_ERR_bkey_version_in_future) \
|
|
x(disk_accounting_v3, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_dev_usage_buckets_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_sectors_wrong, \
|
|
BCH_FSCK_ERR_dev_usage_fragmented_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_hidden_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_btree_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_data_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_cached_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_reserved_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_nr_inodes_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, \
|
|
BCH_FSCK_ERR_fs_usage_replicas_wrong, \
|
|
BCH_FSCK_ERR_accounting_replicas_not_marked, \
|
|
BCH_FSCK_ERR_bkey_version_in_future) \
|
|
x(rebalance_work_acct_fix, \
|
|
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
|
|
BCH_FSCK_ERR_accounting_mismatch)
|
|
|
|
struct upgrade_downgrade_entry {
|
|
u64 recovery_passes;
|
|
u16 version;
|
|
u16 nr_errors;
|
|
const u16 *errors;
|
|
};
|
|
|
|
#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
|
|
UPGRADE_TABLE()
|
|
#undef x
|
|
|
|
static const struct upgrade_downgrade_entry upgrade_table[] = {
|
|
#define x(ver, passes, ...) { \
|
|
.recovery_passes = passes, \
|
|
.version = bcachefs_metadata_version_##ver,\
|
|
.nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
|
|
.errors = upgrade_##ver##_errors, \
|
|
},
|
|
UPGRADE_TABLE()
|
|
#undef x
|
|
};
|
|
|
|
static int have_stripes(struct bch_fs *c)
|
|
{
|
|
return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
|
|
}
|
|
|
|
int bch2_sb_set_upgrade_extra(struct bch_fs *c)
|
|
{
|
|
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
|
|
unsigned new_version = c->sb.version;
|
|
bool write_sb = false;
|
|
int ret = 0;
|
|
|
|
mutex_lock(&c->sb_lock);
|
|
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
|
|
if (old_version < bcachefs_metadata_version_bucket_stripe_sectors &&
|
|
new_version >= bcachefs_metadata_version_bucket_stripe_sectors &&
|
|
(ret = have_stripes(c) > 0)) {
|
|
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
|
|
__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
|
|
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_sectors_wrong, ext->errors_silent);
|
|
write_sb = true;
|
|
}
|
|
|
|
if (write_sb)
|
|
bch2_write_super(c);
|
|
mutex_unlock(&c->sb_lock);
|
|
|
|
return ret < 0 ? ret : 0;
|
|
}
|
|
|
|
void bch2_sb_set_upgrade(struct bch_fs *c,
|
|
unsigned old_version,
|
|
unsigned new_version)
|
|
{
|
|
lockdep_assert_held(&c->sb_lock);
|
|
|
|
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
|
|
for (const struct upgrade_downgrade_entry *i = upgrade_table;
|
|
i < upgrade_table + ARRAY_SIZE(upgrade_table);
|
|
i++)
|
|
if (i->version > old_version && i->version <= new_version) {
|
|
u64 passes = i->recovery_passes;
|
|
|
|
if (passes & RECOVERY_PASS_ALL_FSCK)
|
|
passes |= bch2_fsck_recovery_passes();
|
|
passes &= ~RECOVERY_PASS_ALL_FSCK;
|
|
|
|
ext->recovery_passes_required[0] |=
|
|
cpu_to_le64(bch2_recovery_passes_to_stable(passes));
|
|
|
|
for (const u16 *e = i->errors; e < i->errors + i->nr_errors; e++)
|
|
__set_bit_le64(*e, ext->errors_silent);
|
|
}
|
|
}
|
|
|
|
#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
|
|
DOWNGRADE_TABLE()
|
|
#undef x
|
|
|
|
static const struct upgrade_downgrade_entry downgrade_table[] = {
|
|
#define x(ver, passes, ...) { \
|
|
.recovery_passes = passes, \
|
|
.version = bcachefs_metadata_version_##ver,\
|
|
.nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
|
|
.errors = downgrade_##ver##_errors, \
|
|
},
|
|
DOWNGRADE_TABLE()
|
|
#undef x
|
|
};
|
|
|
|
static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
|
|
{
|
|
struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
|
|
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
|
|
int ret = 0;
|
|
|
|
unsigned nr_errors = le16_to_cpu(dst->nr_errors);
|
|
|
|
switch (le16_to_cpu(dst->version)) {
|
|
case bcachefs_metadata_version_bucket_stripe_sectors:
|
|
if (have_stripes(c)) {
|
|
bytes += sizeof(dst->errors[0]) * 2;
|
|
|
|
ret = darray_make_room(table, bytes);
|
|
if (ret)
|
|
return ret;
|
|
|
|
/* open coded __set_bit_le64, as dst is packed and
|
|
* dst->recovery_passes is misaligned */
|
|
unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
|
|
dst->recovery_passes[b / 64] |= cpu_to_le64(BIT_ULL(b % 64));
|
|
|
|
dst->errors[nr_errors++] = cpu_to_le16(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong);
|
|
}
|
|
break;
|
|
}
|
|
|
|
dst->nr_errors = cpu_to_le16(nr_errors);
|
|
return ret;
|
|
}
|
|
|
|
static inline const struct bch_sb_field_downgrade_entry *
|
|
downgrade_entry_next_c(const struct bch_sb_field_downgrade_entry *e)
|
|
{
|
|
return (void *) &e->errors[le16_to_cpu(e->nr_errors)];
|
|
}
|
|
|
|
#define for_each_downgrade_entry(_d, _i) \
|
|
for (const struct bch_sb_field_downgrade_entry *_i = (_d)->entries; \
|
|
(void *) _i < vstruct_end(&(_d)->field) && \
|
|
(void *) &_i->errors[0] <= vstruct_end(&(_d)->field) && \
|
|
(void *) downgrade_entry_next_c(_i) <= vstruct_end(&(_d)->field); \
|
|
_i = downgrade_entry_next_c(_i))
|
|
|
|
static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
|
|
enum bch_validate_flags flags, struct printbuf *err)
|
|
{
|
|
struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
|
|
|
|
for (const struct bch_sb_field_downgrade_entry *i = e->entries;
|
|
(void *) i < vstruct_end(&e->field);
|
|
i = downgrade_entry_next_c(i)) {
|
|
/*
|
|
* Careful: sb_field_downgrade_entry is only 2 byte aligned, but
|
|
* section sizes are 8 byte aligned - an empty entry spanning
|
|
* the end of the section is allowed (and ignored):
|
|
*/
|
|
if ((void *) &i->errors[0] > vstruct_end(&e->field))
|
|
break;
|
|
|
|
if (flags & BCH_VALIDATE_write &&
|
|
(void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
|
|
prt_printf(err, "downgrade entry overruns end of superblock section");
|
|
return -BCH_ERR_invalid_sb_downgrade;
|
|
}
|
|
|
|
if (BCH_VERSION_MAJOR(le16_to_cpu(i->version)) !=
|
|
BCH_VERSION_MAJOR(le16_to_cpu(sb->version))) {
|
|
prt_printf(err, "downgrade entry with mismatched major version (%u != %u)",
|
|
BCH_VERSION_MAJOR(le16_to_cpu(i->version)),
|
|
BCH_VERSION_MAJOR(le16_to_cpu(sb->version)));
|
|
return -BCH_ERR_invalid_sb_downgrade;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void bch2_sb_downgrade_to_text(struct printbuf *out, struct bch_sb *sb,
|
|
struct bch_sb_field *f)
|
|
{
|
|
struct bch_sb_field_downgrade *e = field_to_type(f, downgrade);
|
|
|
|
if (out->nr_tabstops <= 1)
|
|
printbuf_tabstop_push(out, 16);
|
|
|
|
for_each_downgrade_entry(e, i) {
|
|
prt_str(out, "version:\t");
|
|
bch2_version_to_text(out, le16_to_cpu(i->version));
|
|
prt_newline(out);
|
|
|
|
prt_str(out, "recovery passes:\t");
|
|
prt_bitflags(out, bch2_recovery_passes,
|
|
bch2_recovery_passes_from_stable(le64_to_cpu(i->recovery_passes[0])));
|
|
prt_newline(out);
|
|
|
|
prt_str(out, "errors:\t");
|
|
bool first = true;
|
|
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
|
|
if (!first)
|
|
prt_char(out, ',');
|
|
first = false;
|
|
bch2_sb_error_id_to_text(out, le16_to_cpu(i->errors[j]));
|
|
}
|
|
prt_newline(out);
|
|
}
|
|
}
|
|
|
|
const struct bch_sb_field_ops bch_sb_field_ops_downgrade = {
|
|
.validate = bch2_sb_downgrade_validate,
|
|
.to_text = bch2_sb_downgrade_to_text,
|
|
};
|
|
|
|
int bch2_sb_downgrade_update(struct bch_fs *c)
|
|
{
|
|
if (!test_bit(BCH_FS_btree_running, &c->flags))
|
|
return 0;
|
|
|
|
darray_char table = {};
|
|
int ret = 0;
|
|
|
|
for (const struct upgrade_downgrade_entry *src = downgrade_table;
|
|
src < downgrade_table + ARRAY_SIZE(downgrade_table);
|
|
src++) {
|
|
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
|
|
continue;
|
|
|
|
struct bch_sb_field_downgrade_entry *dst;
|
|
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
|
|
|
|
ret = darray_make_room(&table, bytes);
|
|
if (ret)
|
|
goto out;
|
|
|
|
dst = (void *) &darray_top(table);
|
|
dst->version = cpu_to_le16(src->version);
|
|
dst->recovery_passes[0] = cpu_to_le64(bch2_recovery_passes_to_stable(src->recovery_passes));
|
|
dst->recovery_passes[1] = 0;
|
|
dst->nr_errors = cpu_to_le16(src->nr_errors);
|
|
for (unsigned i = 0; i < src->nr_errors; i++)
|
|
dst->errors[i] = cpu_to_le16(src->errors[i]);
|
|
|
|
ret = downgrade_table_extra(c, &table);
|
|
if (ret)
|
|
goto out;
|
|
|
|
if (!dst->recovery_passes[0] &&
|
|
!dst->recovery_passes[1] &&
|
|
!dst->nr_errors)
|
|
continue;
|
|
|
|
table.nr += sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
|
|
}
|
|
|
|
struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
|
|
|
|
unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64));
|
|
|
|
if (d && le32_to_cpu(d->field.u64s) > sb_u64s)
|
|
goto out;
|
|
|
|
d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
|
|
if (!d) {
|
|
ret = -BCH_ERR_ENOSPC_sb_downgrade;
|
|
goto out;
|
|
}
|
|
|
|
memcpy(d->entries, table.data, table.nr);
|
|
memset_u64s_tail(d->entries, 0, table.nr);
|
|
out:
|
|
darray_exit(&table);
|
|
return ret;
|
|
}
|
|
|
|
void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_minor)
|
|
{
|
|
struct bch_sb_field_downgrade *d = bch2_sb_field_get(c->disk_sb.sb, downgrade);
|
|
if (!d)
|
|
return;
|
|
|
|
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
|
|
|
|
for_each_downgrade_entry(d, i) {
|
|
unsigned minor = BCH_VERSION_MINOR(le16_to_cpu(i->version));
|
|
if (new_minor < minor && minor <= old_minor) {
|
|
ext->recovery_passes_required[0] |= i->recovery_passes[0];
|
|
ext->recovery_passes_required[1] |= i->recovery_passes[1];
|
|
|
|
for (unsigned j = 0; j < le16_to_cpu(i->nr_errors); j++) {
|
|
unsigned e = le16_to_cpu(i->errors[j]);
|
|
if (e < BCH_FSCK_ERR_MAX)
|
|
__set_bit(e, c->sb.errors_silent);
|
|
if (e < sizeof(ext->errors_silent) * 8)
|
|
__set_bit_le64(e, ext->errors_silent);
|
|
}
|
|
}
|
|
}
|
|
}
|