Second bcachefs pull request for 6.7-rc1

Here's the second big bcachefs pull request. This brings your tree up to
 date with my master branch, which is what existing bcachefs users are
 currently running.
 
 All but the last few patches have been in linux-next, those being small
 fixes. Test results from my dashboard:
   https://evilpiepirate.org/~testdashboard/ci?commit=c7046ed0cf9bb33599aa7e72e7b67bba4be42d64
 
 New features:
  - rebalance_work btree (and metadata version 1.3): the rebalance thread
    no longer has to scan to find extents that need processing - big
    scalability improvement.
  - sb_errors superblock section: this adds counters for each fsck error
    type, since filesystem creation, along with the date of the most
    recent error. It'll get us better bug reports (since users do not
    typically report errors that fsck was able to fix), and I might add
    telemetry for this in the future.
 
 Fixes include:
  - multiple snapshot deletion fixes
  - members_v2 fixups
  - deleted_inodes btree fixes
  - copygc thread no longer spins when a device is full but has no
    fragmented buckets (i.e. rebalance needs to move data around instead)
  - a fix for a memory reclaim issue with the btree key cache: we're now
    careful not to hold the srcu read lock that blocks key cache reclaim
    for too long
  - an early allocator locking fix, from Brian
  - endianness fixes, from Brian
  - CONFIG_BCACHEFS_DEBUG_TRANSACTIONS no longer defaults to y, a big
    performance improvement on multithreaded workloads
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmVH9xYACgkQE6szbY3K
 bnahLRAAiNRZL73SQ+MW79o4yPqGwt0Eyy/mvoiGpZf1B8uXp0oZ55j2w3l887Uf
 LeM03mInAYCPdyp/d4vxqIr96j9BODmRRl8sEkkGdJDzokLG+22F0ovOe45KWTxL
 kBoNdng/O/oeOe/1K7taP3KzBvMx2nOF6oA+xfgyCjECMArAIXek0iocyEUR4Ywd
 vGKhLNn1k2c+94wacnDYwjjdcLBxoqxsFXlpu6V0BcaY+DX4J3aBaGmj75KEoCI0
 VbBOzxrOO4QzJrzW2+hxZZWgGyvReCkBJvqfORfuPxiSbFobTim10MdfZOAMQA1U
 Xr1FTEpK1wMX0/pPVgZRqaOsttC+yc/SsfPNgSxybgHPbDlMLaakDHjvYssbKOYG
 urDWSMG5yCsktSLj95SXsvUFKZaZFD72SKBNdgdt/nZjwTHuNQ7IkdrMwIrCQ/PT
 Ifn50UrR/Ahd8RAd5tyNCPw6U9VfwnxACSNl2KA7ONKpvHb+gSt1JsJTDyz1+gN9
 nFVrw1SHKQ6EIV6XhVon/5DEuRTzqoYGWoN08FHEUq9fBlvnVpmbJErCQMplOjz9
 OQnAfpJH4YqkpXyjFAjP1V0An+RUn8QvDgXNqC9TyvCYuOliVFuil4y7/c+7oIQU
 NEoz+jVLenqsGOGAbduI4/Q567COojRgwEvbebSIxSImXuhCNj4=
 =Lo4N
 -----END PGP SIGNATURE-----

Merge tag 'bcachefs-2023-11-5' of https://evilpiepirate.org/git/bcachefs

Pull more bcachefs updates from Kent Overstreet:
 "Here's the second big bcachefs pull request. This brings your tree up
  to date with my master branch, which is what existing bcachefs users
  are currently running.

  New features:
   - rebalance_work btree (and metadata version 1.3): the rebalance
     thread no longer has to scan to find extents that need processing -
     big scalability improvement.
   - sb_errors superblock section: this adds counters for each fsck
     error type, since filesystem creation, along with the date of the
     most recent error. It'll get us better bug reports (since users do
     not typically report errors that fsck was able to fix), and I might
     add telemetry for this in the future.

  Fixes include:
   - multiple snapshot deletion fixes
   - members_v2 fixups
   - deleted_inodes btree fixes
   - copygc thread no longer spins when a device is full but has no
     fragmented buckets (i.e. rebalance needs to move data around
     instead)
   - a fix for a memory reclaim issue with the btree key cache: we're
     now careful not to hold the srcu read lock that blocks key cache
     reclaim for too long
   - an early allocator locking fix, from Brian
   - endianness fixes, from Brian
   - CONFIG_BCACHEFS_DEBUG_TRANSACTIONS no longer defaults to y, a big
     performance improvement on multithreaded workloads"

* tag 'bcachefs-2023-11-5' of https://evilpiepirate.org/git/bcachefs: (70 commits)
  bcachefs: Improve stripe checksum error message
  bcachefs: Simplify, fix bch2_backpointer_get_key()
  bcachefs: kill thing_it_points_to arg to backpointer_not_found()
  bcachefs: bch2_ec_read_extent() now takes btree_trans
  bcachefs: bch2_stripe_to_text() now prints ptr gens
  bcachefs: Don't iterate over journal entries just for btree roots
  bcachefs: Break up bch2_journal_write()
  bcachefs: Replace ERANGE with private error codes
  bcachefs: bkey_copy() is no longer a macro
  bcachefs: x-macro-ify inode flags enum
  bcachefs: Convert bch2_fs_open() to darray
  bcachefs: Move __bch2_members_v2_get_mut to sb-members.h
  bcachefs: bch2_prt_datetime()
  bcachefs: CONFIG_BCACHEFS_DEBUG_TRANSACTIONS no longer defaults to y
  bcachefs: Add a comment for BTREE_INSERT_NOJOURNAL usage
  bcachefs: rebalance_work btree is not a snapshots btree
  bcachefs: Add missing printk newlines
  bcachefs: Fix recovery when forced to use JSET_NO_FLUSH journal entry
  bcachefs: .get_parent() should return an error pointer
  bcachefs: Fix bch2_delete_dead_inodes()
  ...
This commit is contained in:
Linus Torvalds 2023-11-07 11:38:38 -08:00
commit c9d01179e1
109 changed files with 3988 additions and 2302 deletions

View File

@ -24,7 +24,6 @@ config BCACHEFS_FS
select XXHASH
select SRCU
select SYMBOLIC_ERRNAME
select MEAN_AND_VARIANCE
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@ -42,7 +41,6 @@ config BCACHEFS_POSIX_ACL
config BCACHEFS_DEBUG_TRANSACTIONS
bool "bcachefs runtime info"
depends on BCACHEFS_FS
default y
help
This makes the list of running btree transactions available in debugfs.
@ -78,7 +76,7 @@ config BCACHEFS_NO_LATENCY_ACCT
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
select MEAN_AND_VARIANCE
depends on BCACHEFS_FS
default KUNIT_ALL_TESTS
help
This option enables the kunit tests for mean_and_variance module.

View File

@ -70,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
sb-errors.o \
sb-members.o \
siphash.o \
six.o \

View File

@ -192,123 +192,109 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
int ret = 0;
/* allow for unknown fields */
if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
prt_printf(err, "incorrect value size (%zu < %u)",
bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
alloc_v1_val_size_bad,
"incorrect value size (%zu < %u)",
bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
return -BCH_ERR_invalid_bkey;
fsck_err:
return ret;
}
return 0;
}
int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
int ret = 0;
if (bch2_alloc_unpack_v2(&u, k)) {
prt_printf(err, "unpack error");
return -BCH_ERR_invalid_bkey;
bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
alloc_v2_unpack_error,
"unpack error");
fsck_err:
return ret;
}
return 0;
}
int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_alloc_unpacked u;
int ret = 0;
if (bch2_alloc_unpack_v3(&u, k)) {
prt_printf(err, "unpack error");
return -BCH_ERR_invalid_bkey;
bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
alloc_v2_unpack_error,
"unpack error");
fsck_err:
return ret;
}
return 0;
}
int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
int ret = 0;
if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
prt_printf(err, "bad val size (%u > %zu)",
bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
alloc_v4_val_size_bad,
"bad val size (%u > %zu)",
alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
return -BCH_ERR_invalid_bkey;
}
if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
prt_printf(err, "invalid backpointers_start");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
alloc_v4_backpointers_start_bad,
"invalid backpointers_start");
if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
prt_printf(err, "invalid data type (got %u should be %u)",
bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
alloc_key_data_type_bad,
"invalid data type (got %u should be %u)",
a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
return -BCH_ERR_invalid_bkey;
}
switch (a.v->data_type) {
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
if (a.v->dirty_sectors ||
bkey_fsck_err_on(a.v->dirty_sectors ||
a.v->cached_sectors ||
a.v->stripe) {
prt_printf(err, "empty data type free but have data");
return -BCH_ERR_invalid_bkey;
}
a.v->stripe, c, err,
alloc_key_empty_but_have_data,
"empty data type free but have data");
break;
case BCH_DATA_sb:
case BCH_DATA_journal:
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
if (!a.v->dirty_sectors) {
prt_printf(err, "data_type %s but dirty_sectors==0",
bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
bch2_data_types[a.v->data_type]);
return -BCH_ERR_invalid_bkey;
}
break;
case BCH_DATA_cached:
if (!a.v->cached_sectors ||
bkey_fsck_err_on(!a.v->cached_sectors ||
a.v->dirty_sectors ||
a.v->stripe) {
prt_printf(err, "data type inconsistency");
return -BCH_ERR_invalid_bkey;
}
a.v->stripe, c, err,
alloc_key_cached_inconsistency,
"data type inconsistency");
if (!a.v->io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {
prt_printf(err, "cached bucket with read_time == 0");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(!a.v->io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
c, err,
alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
case BCH_DATA_stripe:
break;
}
return 0;
}
static inline u64 swab40(u64 x)
{
return (((x & 0x00000000ffULL) << 32)|
((x & 0x000000ff00ULL) << 16)|
((x & 0x0000ff0000ULL) >> 0)|
((x & 0x00ff000000ULL) >> 16)|
((x & 0xff00000000ULL) >> 32));
fsck_err:
return ret;
}
void bch2_alloc_v4_swab(struct bkey_s k)
@ -324,6 +310,7 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->io_time[1] = swab64(a->io_time[1]);
a->stripe = swab32(a->stripe);
a->nr_external_backpointers = swab32(a->nr_external_backpointers);
a->fragmentation_lru = swab64(a->fragmentation_lru);
bps = alloc_v4_backpointers(a);
for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
@ -521,17 +508,18 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
: 0;
}
int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
prt_printf(err, "bad val size (%zu != %zu)",
bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return 0;
bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
bucket_gens_val_size_bad,
"bad val size (%zu != %zu)",
bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
fsck_err:
return ret;
}
void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
@ -727,7 +715,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans,
"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
" for %s",
set ? "setting" : "clearing",
bch2_btree_ids[btree],
bch2_btree_id_str(btree),
iter.pos.inode,
iter.pos.offset,
bch2_bkey_types[old.k->type],
@ -986,6 +974,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
int ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
alloc_key_to_missing_dev_bucket,
"alloc key for invalid device:bucket %llu:%llu",
alloc_k.k->p.inode, alloc_k.k->p.offset))
return bch2_btree_delete_at(trans, alloc_iter, 0);
@ -1005,7 +994,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (k.k->type != discard_key_type &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
fsck_err(c, need_discard_key_wrong,
"incorrect key in need_discard btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[discard_key_type],
@ -1035,7 +1025,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (k.k->type != freespace_key_type &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
fsck_err(c, freespace_key_wrong,
"incorrect key in freespace btree (got %s should be %s)\n"
" %s",
bch2_bkey_types[k.k->type],
bch2_bkey_types[freespace_key_type],
@ -1066,7 +1057,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
if (a->gen != alloc_gen(k, gens_offset) &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
fsck_err(c, bucket_gens_key_wrong,
"incorrect gen in bucket_gens btree (got %u should be %u)\n"
" %s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
@ -1124,7 +1116,8 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_set &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "hole in alloc btree missing in freespace btree\n"
fsck_err(c, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
" device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
@ -1187,6 +1180,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
for (i = gens_offset; i < gens_end_offset; i++) {
if (fsck_err_on(g.v.gens[i], c,
bucket_gens_hole_wrong,
"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
bucket_gens_pos_to_alloc(k.k->p, i).inode,
bucket_gens_pos_to_alloc(k.k->p, i).offset,
@ -1244,8 +1238,9 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr
return ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
need_discard_freespace_key_to_invalid_dev_bucket,
"entry in %s btree for nonexistant dev:bucket %llu:%llu",
bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
goto delete;
a = bch2_alloc_to_v4(alloc_k, &a_convert);
@ -1253,9 +1248,10 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr
if (fsck_err_on(a->data_type != state ||
(state == BCH_DATA_free &&
genbits != alloc_freespace_genbits(*a)), c,
need_discard_freespace_key_bad,
"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
bch2_btree_ids[iter->btree_id],
bch2_btree_id_str(iter->btree_id),
iter->pos.inode,
iter->pos.offset,
a->data_type == state,
@ -1320,6 +1316,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
dev_exists = bch2_dev_exists2(c, k.k->p.inode);
if (!dev_exists) {
if (fsck_err_on(!dev_exists, c,
bucket_gens_to_invalid_dev,
"bucket_gens key for invalid device:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@ -1330,6 +1327,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
ca = bch_dev_bkey_exists(c, k.k->p.inode);
if (fsck_err_on(end <= ca->mi.first_bucket ||
start >= ca->mi.nbuckets, c,
bucket_gens_to_invalid_buckets,
"bucket_gens key for invalid buckets:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@ -1338,6 +1336,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
for (b = start; b < ca->mi.first_bucket; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
@ -1345,6 +1344,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
for (b = ca->mi.nbuckets; b < end; b++)
if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
bucket_gens_nonzero_for_invalid_buckets,
"bucket_gens key has nonzero gen for invalid bucket")) {
g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
need_update = true;
@ -1495,11 +1495,13 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
return ret;
if (fsck_err_on(!a->io_time[READ], c,
alloc_key_cached_but_read_time_zero,
"cached bucket with read_time 0\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
alloc_key_to_missing_lru_entry,
"missing lru entry\n"
" %s",
(printbuf_reset(&buf),
@ -2075,6 +2077,17 @@ void bch2_recalc_capacity(struct bch_fs *c)
closure_wake_up(&c->freelist_wait);
}
u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
u64 ret = U64_MAX;
for_each_rw_member(ca, c, i)
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
return ret;
}
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
struct open_bucket *ob;

View File

@ -149,13 +149,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@ -193,7 +193,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.min_val_size = 48, \
})
int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@ -249,6 +249,7 @@ int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
int bch2_fs_freespace_init(struct bch_fs *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);

View File

@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
struct bucket_alloc_state *s,
struct closure *cl)
{
struct btree_iter iter;
struct bkey_s_c k;
struct btree_iter iter, citer;
struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
/*
* Scan with an uncached iterator to avoid polluting the key cache. An
* uncached iter will return a cached key if one exists, but if not
* there is no other underlying protection for the associated key cache
* slot. To avoid racing bucket allocations, look up the cached key slot
* of any likely allocation candidate before attempting to proceed with
* the allocation. This provides proper exclusion on the associated
* bucket.
*/
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
BTREE_ITER_SLOTS, k, ret) {
@ -419,25 +430,38 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
continue;
a = bch2_alloc_to_v4(k, &a_convert);
if (a->data_type != BCH_DATA_free)
continue;
/* now check the cached key to serialize concurrent allocs of the bucket */
ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
ret = bkey_err(ck);
if (ret)
break;
a = bch2_alloc_to_v4(ck, &a_convert);
if (a->data_type != BCH_DATA_free)
goto next;
s->buckets_seen++;
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
citer.path->preserve = false;
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
alloc_cursor = iter.pos.offset;
ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
if (!ob && alloc_cursor > alloc_start) {
alloc_cursor = alloc_start;
if (!ob && alloc_start > first_bucket) {
alloc_cursor = alloc_start = first_bucket;
goto again;
}

View File

@ -5,6 +5,7 @@
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
@ -37,25 +38,26 @@ static bool extent_matches_bp(struct bch_fs *c,
return false;
}
int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
int ret = 0;
if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
prt_str(err, "backpointer at wrong pos");
return -BCH_ERR_invalid_bkey;
}
return 0;
bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
c, err,
backpointer_pos_wrong,
"backpointer at wrong pos");
fsck_err:
return ret;
}
void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
{
prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
bch2_btree_ids[bp->btree_id],
bch2_btree_id_str(bp->btree_id),
bp->level,
(u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
(u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
@ -76,7 +78,7 @@ void bch2_backpointer_swab(struct bkey_s k)
{
struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
bp.v->bucket_offset = swab32(bp.v->bucket_offset);
bp.v->bucket_offset = swab40(bp.v->bucket_offset);
bp.v->bucket_len = swab32(bp.v->bucket_len);
bch2_bpos_swab(&bp.v->pos);
}
@ -219,18 +221,22 @@ int bch2_get_next_backpointer(struct btree_trans *trans,
static void backpointer_not_found(struct btree_trans *trans,
struct bpos bp_pos,
struct bch_backpointer bp,
struct bkey_s_c k,
const char *thing_it_points_to)
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
/*
* If we're using the btree write buffer, the backpointer we were
* looking at may have already been deleted - failure to find what it
* pointed to is not an error:
*/
if (likely(!bch2_backpointers_no_use_write_buffer))
return;
prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
thing_it_points_to);
bp.level ? "btree node" : "extent");
prt_printf(&buf, "bucket: ");
bch2_bpos_to_text(&buf, bucket);
prt_printf(&buf, "\n ");
@ -256,16 +262,15 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
struct bch_backpointer bp,
unsigned iter_flags)
{
if (likely(!bp.level)) {
struct bch_fs *c = trans->c;
struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
struct bkey_s_c k;
bch2_trans_node_iter_init(trans, iter,
bp.btree_id,
bp.pos,
0,
min(bp.level, r->level),
0, 0,
iter_flags);
k = bch2_btree_iter_peek_slot(iter);
if (bkey_err(k)) {
@ -273,39 +278,21 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
return k;
}
if (bp.level == r->level + 1)
k = bkey_i_to_s_c(&r->key);
if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
return k;
bch2_trans_iter_exit(trans, iter);
backpointer_not_found(trans, bp_pos, bp, k);
return bkey_s_c_null;
} else {
struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
if (unlikely(bch2_backpointers_no_use_write_buffer)) {
if (bp.level) {
struct btree *b;
/*
* If a backpointer for a btree node wasn't found, it may be
* because it was overwritten by a new btree node that hasn't
* been written out yet - backpointer_get_node() checks for
* this:
*/
b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
if (!IS_ERR_OR_NULL(b))
return bkey_i_to_s_c(&b->key);
if (IS_ERR_OR_NULL(b)) {
bch2_trans_iter_exit(trans, iter);
if (IS_ERR(b))
return bkey_s_c_err(PTR_ERR(b));
return bkey_s_c_null;
return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
}
backpointer_not_found(trans, bp_pos, bp, k, "extent");
return bkey_i_to_s_c(&b->key);
}
return bkey_s_c_null;
}
struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
@ -329,6 +316,8 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
if (IS_ERR(b))
goto err;
BUG_ON(b->c.level != bp.level - 1);
if (b && extent_matches_bp(c, bp.btree_id, bp.level,
bkey_i_to_s_c(&b->key),
bucket, bp))
@ -337,8 +326,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
if (b && btree_node_will_make_reachable(b)) {
b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
} else {
backpointer_not_found(trans, bp_pos, bp,
bkey_i_to_s_c(&b->key), "btree node");
backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
b = NULL;
}
err:
@ -356,6 +344,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
int ret = 0;
if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
backpointer_to_missing_device,
"backpointer for missing device:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, bp_iter, 0);
@ -369,6 +358,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
goto out;
if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
backpointer_to_missing_alloc,
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
alloc_iter.pos.inode, alloc_iter.pos.offset,
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
@ -453,14 +443,14 @@ static int check_bp_exists(struct btree_trans *trans,
return ret;
missing:
prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
bch2_btree_ids[bp.btree_id], bp.level);
bch2_btree_id_str(bp.btree_id), bp.level);
bch2_bkey_val_to_text(&buf, c, orig_k);
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
c->opts.reconstruct_alloc ||
fsck_err(c, "%s", buf.buf))
fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
goto out;
@ -793,7 +783,9 @@ static int check_one_backpointer(struct btree_trans *trans,
}
if (fsck_err_on(!k.k, c,
"backpointer for missing extent\n %s",
backpointer_to_missing_ptr,
"backpointer for missing %s\n %s",
bp.v->level ? "btree node" : "extent",
(bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
goto out;

View File

@ -7,7 +7,16 @@
#include "buckets.h"
#include "super.h"
int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
/*
 * Reverse the byte order of the low 40 bits (5 bytes) of @x.
 *
 * Bits above bit 39 of @x are ignored, so the result always fits in
 * 40 bits.
 */
static inline u64 swab40(u64 x)
{
	u64 swapped = 0;
	unsigned int byte;

	/* Peel bytes off the low end of x and push them onto swapped. */
	for (byte = 0; byte < 5; byte++) {
		swapped = (swapped << 8) | (x & 0xff);
		x >>= 8;
	}

	return swapped;
}
int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
enum bkey_invalid_flags, struct printbuf *);
void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

View File

@ -2,20 +2,9 @@
#ifndef _BCACHEFS_BBPOS_H
#define _BCACHEFS_BBPOS_H
#include "bbpos_types.h"
#include "bkey_methods.h"
struct bbpos {
enum btree_id btree;
struct bpos pos;
};
static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
{
return (struct bbpos) { btree, pos };
}
#define BBPOS_MIN BBPOS(0, POS_MIN)
#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
#include "btree_cache.h"
static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
{
@ -40,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos)
static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
{
prt_str(out, bch2_btree_ids[pos.btree]);
prt_str(out, bch2_btree_id_str(pos.btree));
prt_char(out, ':');
bch2_bpos_to_text(out, pos.pos);
}

18
fs/bcachefs/bbpos_types.h Normal file
View File

@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BBPOS_TYPES_H
#define _BCACHEFS_BBPOS_TYPES_H
/*
 * A position qualified by the btree it belongs to: a btree id together
 * with a struct bpos.
 */
struct bbpos {
/* which btree the position refers to */
enum btree_id btree;
/* position within that btree */
struct bpos pos;
};
static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
{
return (struct bbpos) { btree, pos };
}
/* btree id 0 at POS_MIN */
#define BBPOS_MIN BBPOS(0, POS_MIN)
/* btree id BTREE_ID_NR - 1 at POS_MAX */
#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX)
#endif /* _BCACHEFS_BBPOS_TYPES_H */

View File

@ -209,6 +209,7 @@
#include "nocow_locking_types.h"
#include "opts.h"
#include "recovery_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
#include "util.h"
@ -418,6 +419,7 @@ enum bch_time_stats {
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
#include "disk_groups_types.h"
#include "ec_types.h"
#include "journal_types.h"
#include "keylist_types.h"
@ -463,6 +465,7 @@ enum gc_phase {
GC_PHASE_BTREE_snapshot_trees,
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
GC_PHASE_BTREE_rebalance_work,
GC_PHASE_PENDING_DELETE,
};
@ -500,6 +503,8 @@ struct bch_dev {
* Committed by bch2_write_super() -> bch_fs_mi_update()
*/
struct bch_member_cpu mi;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
__uuid_t uuid;
char name[BDEVNAME_SIZE];
@ -578,7 +583,7 @@ enum {
BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
BCH_FS_NEED_ANOTHER_GC,
BCH_FS_HAVE_DELETED_SNAPSHOTS,
BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
/* errors: */
BCH_FS_ERROR,
@ -938,9 +943,6 @@ struct bch_fs {
struct list_head moving_context_list;
struct mutex moving_context_lock;
struct list_head data_progress_list;
struct mutex data_progress_lock;
/* REBALANCE */
struct bch_fs_rebalance rebalance;
@ -991,11 +993,6 @@ struct bch_fs {
struct bio_set dio_read_bioset;
struct bio_set nocow_flush_bioset;
/* ERRORS */
struct list_head fsck_errors;
struct mutex fsck_error_lock;
bool fsck_alloc_err;
/* QUOTAS */
struct bch_memquota_type quotas[QTYP_NR];
@ -1044,6 +1041,14 @@ struct bch_fs {
struct bch2_time_stats times[BCH_TIME_STAT_NR];
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
/* ERRORS */
struct list_head fsck_error_msgs;
struct mutex fsck_error_msgs_lock;
bool fsck_alloc_msgs_err;
bch_sb_errors_cpu fsck_error_counts;
struct mutex fsck_error_counts_lock;
};
extern struct wait_queue_head bch2_read_only_wait;

View File

@ -613,31 +613,17 @@ struct bch_extent_stripe_ptr {
#endif
};
struct bch_extent_reservation {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
unused:22,
replicas:4,
generation:32;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 generation:32,
replicas:4,
unused:22,
type:6;
#endif
};
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:7,
unused:33,
compression:8,
__u64 type:6,
unused:34,
compression:8, /* enum bch_compression_opt */
target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 target:16,
compression:8,
unused:33,
type:7;
unused:34,
type:6;
#endif
};
@ -838,34 +824,30 @@ enum inode_opt_id {
Inode_opt_nr,
};
enum {
/*
* User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
* flags)
*/
__BCH_INODE_SYNC = 0,
__BCH_INODE_IMMUTABLE = 1,
__BCH_INODE_APPEND = 2,
__BCH_INODE_NODUMP = 3,
__BCH_INODE_NOATIME = 4,
__BCH_INODE_I_SIZE_DIRTY = 5, /* obsolete */
__BCH_INODE_I_SECTORS_DIRTY = 6, /* obsolete */
__BCH_INODE_UNLINKED = 7,
__BCH_INODE_BACKPTR_UNTRUSTED = 8,
#define BCH_INODE_FLAGS() \
x(sync, 0) \
x(immutable, 1) \
x(append, 2) \
x(nodump, 3) \
x(noatime, 4) \
x(i_size_dirty, 5) \
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8)
/* bits 20+ reserved for packed fields below: */
enum bch_inode_flags {
#define x(t, n) BCH_INODE_##t = 1U << n,
BCH_INODE_FLAGS()
#undef x
};
#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC)
#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE)
#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND)
#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP)
#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME)
#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
enum __bch_inode_flags {
#define x(t, n) __BCH_INODE_##t = n,
BCH_INODE_FLAGS()
#undef x
};
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
@ -1232,7 +1214,8 @@ struct bch_sb_field {
x(journal_seq_blacklist, 8) \
x(journal_v2, 9) \
x(counters, 10) \
x(members_v2, 11)
x(members_v2, 11) \
x(errors, 12)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@ -1282,6 +1265,18 @@ enum bch_iops_measurement {
BCH_IOPS_NR
};
/*
 * X-macro table of per-member (per-device) error classes: x(name, index).
 * Expand it with a local definition of x() to generate code for every class.
 */
#define BCH_MEMBER_ERROR_TYPES() \
x(read, 0) \
x(write, 1) \
x(checksum, 2)
/*
 * Enum generated from the table above: BCH_MEMBER_ERROR_<name> = <index>.
 */
enum bch_member_error_type {
#define x(t, n) BCH_MEMBER_ERROR_##t = n,
BCH_MEMBER_ERROR_TYPES()
#undef x
/* one past the last entry; used as the size of the errors[] arrays */
BCH_MEMBER_ERROR_NR
};
struct bch_member {
__uuid_t uuid;
__le64 nbuckets; /* device size */
@ -1292,6 +1287,9 @@ struct bch_member {
__le64 flags;
__le32 iops[4];
__le64 errors[BCH_MEMBER_ERROR_NR];
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
};
#define BCH_MEMBER_V1_BYTES 56
@ -1615,11 +1613,20 @@ struct journal_seq_blacklist_entry {
struct bch_sb_field_journal_seq_blacklist {
struct bch_sb_field field;
struct journal_seq_blacklist_entry start[0];
__u64 _data[];
struct journal_seq_blacklist_entry start[];
};
/*
 * Superblock section holding one entry per fsck error type, trailing the
 * common bch_sb_field header as a flexible array.
 */
struct bch_sb_field_errors {
struct bch_sb_field field;
struct bch_sb_field_error_entry {
/*
 * Packed word: bits 0-15 hold the error ID and bits 16-63 hold NR
 * (the counter for that error type); use the BCH_SB_ERROR_ENTRY_ID
 * and BCH_SB_ERROR_ENTRY_NR bitmask accessors rather than reading v
 * directly.
 */
__le64 v;
/* time of the most recent occurrence of this error */
__le64 last_error_time;
} entries[];
};
LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
/* Superblock: */
/*
@ -1682,7 +1689,9 @@ struct bch_sb_field_journal_seq_blacklist {
x(snapshot_skiplists, BCH_VERSION(1, 1), \
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
x(deleted_inodes, BCH_VERSION(1, 2), \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
x(rebalance_work, BCH_VERSION(1, 3), \
BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -1693,7 +1702,7 @@ enum bcachefs_metadata_version {
};
static const __maybe_unused
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
@ -2247,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
enum btree_id_flags {
BTREE_ID_EXTENTS = BIT(0),
BTREE_ID_SNAPSHOTS = BIT(1),
BTREE_ID_DATA = BIT(2),
BTREE_ID_SNAPSHOT_FIELD = BIT(2),
BTREE_ID_DATA = BIT(3),
};
#define BCH_BTREE_IDS() \
@ -2302,11 +2312,13 @@ enum btree_id_flags {
BIT_ULL(KEY_TYPE_bucket_gens)) \
x(snapshot_trees, 15, 0, \
BIT_ULL(KEY_TYPE_snapshot_tree)) \
x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \
x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)) \
x(logged_ops, 17, 0, \
BIT_ULL(KEY_TYPE_logged_op_truncate)| \
BIT_ULL(KEY_TYPE_logged_op_finsert))
BIT_ULL(KEY_TYPE_logged_op_finsert)) \
x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \
BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,

View File

@ -92,19 +92,15 @@ enum bkey_lr_packed {
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
#define bkey_copy(_dst, _src) \
do { \
BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
!type_is(_dst, struct bkey_packed *)); \
BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
!type_is(_src, struct bkey_packed *)); \
EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
(u64 *) (_dst) < (u64 *) (_src) + \
((struct bkey *) (_src))->u64s); \
\
memcpy_u64s_small((_dst), (_src), \
((struct bkey *) (_src))->u64s); \
} while (0)
/*
 * Copy a packed key from @src to @dst.
 *
 * The amount copied is src->u64s, handed to memcpy_u64s_small(). No check
 * is made here for @dst overlapping @src.
 */
static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
{
	const unsigned nr_u64s = src->u64s;

	memcpy_u64s_small(dst, src, nr_u64s);
}
/*
 * Copy a struct bkey_i from @src to @dst.
 *
 * The amount copied is src->k.u64s, handed to memcpy_u64s_small(). No check
 * is made here for @dst overlapping @src.
 */
static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
{
	const unsigned nr_u64s = src->k.u64s;

	memcpy_u64s_small(dst, src, nr_u64s);
}
struct btree;

View File

@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "backpointers.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_types.h"
#include "alloc_background.h"
#include "dirent.h"
@ -25,7 +26,7 @@ const char * const bch2_bkey_types[] = {
NULL
};
static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
@ -39,23 +40,24 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
.key_invalid = deleted_key_invalid, \
})
static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
if (bkey_val_bytes(k.k)) {
prt_printf(err, "incorrect value size (%zu != 0)",
bkey_val_bytes(k.k));
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return 0;
bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
bkey_val_size_nonzero,
"incorrect value size (%zu != 0)",
bkey_val_bytes(k.k));
fsck_err:
return ret;
}
#define bch2_bkey_ops_error ((struct bkey_ops) { \
.key_invalid = empty_val_key_invalid, \
})
static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
@ -70,7 +72,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
.key_invalid = empty_val_key_invalid, \
})
static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
return 0;
@ -91,18 +93,6 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
.val_to_text = key_type_inline_data_to_text, \
})
static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
if (bkey_val_bytes(k.k)) {
prt_printf(err, "incorrect value size (%zu != %zu)",
bkey_val_bytes(k.k), sizeof(struct bch_cookie));
return -BCH_ERR_invalid_bkey;
}
return 0;
}
static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
{
bch2_key_resize(l.k, l.k->size + r.k->size);
@ -110,7 +100,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
}
#define bch2_bkey_ops_set ((struct bkey_ops) { \
.key_invalid = key_type_set_invalid, \
.key_invalid = empty_val_key_invalid, \
.key_merge = key_type_set_merge, \
})
@ -128,84 +118,95 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
int ret = 0;
if (bkey_val_bytes(k.k) < ops->min_val_size) {
prt_printf(err, "bad val size (%zu < %u)",
bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
bkey_val_size_too_small,
"bad val size (%zu < %u)",
bkey_val_bytes(k.k), ops->min_val_size);
return -BCH_ERR_invalid_bkey;
}
if (!ops->key_invalid)
return 0;
return ops->key_invalid(c, k, flags, err);
ret = ops->key_invalid(c, k, flags, err);
fsck_err:
return ret;
}
static u64 bch2_key_types_allowed[] = {
#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
BCH_BTREE_IDS()
#undef x
[BKEY_TYPE_btree] =
BIT_ULL(KEY_TYPE_deleted)|
BIT_ULL(KEY_TYPE_btree_ptr)|
BIT_ULL(KEY_TYPE_btree_ptr_v2),
#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
BCH_BTREE_IDS()
#undef x
};
/*
 * Return a printable name for a btree node type.
 *
 * BKEY_TYPE_btree gets a fixed label; every other node type is treated as
 * btree id (type - 1) and named via bch2_btree_id_str().
 */
const char *bch2_btree_node_type_str(enum btree_node_type type)
{
	if (type == BKEY_TYPE_btree)
		return "internal btree node";

	return bch2_btree_id_str(type - 1);
}
int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
enum btree_node_type type,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (k.k->u64s < BKEY_U64s) {
prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
if (flags & BKEY_INVALID_COMMIT &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) {
prt_printf(err, "invalid key type for btree %s (%s)",
bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
bkey_u64s_too_small,
"u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
if (type >= BKEY_TYPE_NR)
return 0;
bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
if (k.k->size == 0) {
prt_printf(err, "size == 0");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(k.k->size == 0, c, err,
bkey_extent_size_zero,
"size == 0");
if (k.k->size > k.k->p.offset) {
prt_printf(err, "size greater than offset (%u > %llu)",
bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
bkey_extent_size_greater_than_offset,
"size greater than offset (%u > %llu)",
k.k->size, k.k->p.offset);
return -BCH_ERR_invalid_bkey;
}
} else {
if (k.k->size) {
prt_printf(err, "size != 0");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(k.k->size, c, err,
bkey_size_nonzero,
"size != 0");
}
if (type != BKEY_TYPE_btree) {
if (!btree_type_has_snapshots((enum btree_id) type) &&
k.k->p.snapshot) {
prt_printf(err, "nonzero snapshot");
return -BCH_ERR_invalid_bkey;
enum btree_id btree = type - 1;
if (btree_type_has_snapshots(btree)) {
bkey_fsck_err_on(!k.k->p.snapshot, c, err,
bkey_snapshot_zero,
"snapshot == 0");
} else if (!btree_type_has_snapshot_field(btree)) {
bkey_fsck_err_on(k.k->p.snapshot, c, err,
bkey_snapshot_nonzero,
"nonzero snapshot");
} else {
/*
* btree uses snapshot field but it's not required to be
* nonzero
*/
}
if (btree_type_has_snapshots((enum btree_id) type) &&
!k.k->p.snapshot) {
prt_printf(err, "snapshot == 0");
return -BCH_ERR_invalid_bkey;
bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
bkey_at_pos_max,
"key at POS_MAX");
}
if (bkey_eq(k.k->p, POS_MAX)) {
prt_printf(err, "key at POS_MAX");
return -BCH_ERR_invalid_bkey;
}
}
return 0;
fsck_err:
return ret;
}
int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
@ -217,20 +218,20 @@ int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
bch2_bkey_val_invalid(c, k, flags, err);
}
int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
struct printbuf *err)
int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
struct bkey_s_c k, struct printbuf *err)
{
if (bpos_lt(k.k->p, b->data->min_key)) {
prt_printf(err, "key before start of btree node");
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
if (bpos_gt(k.k->p, b->data->max_key)) {
prt_printf(err, "key past end of btree node");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
bkey_before_start_of_btree_node,
"key before start of btree node");
return 0;
bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
bkey_after_end_of_btree_node,
"key past end of btree node");
fsck_err:
return ret;
}
void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)

View File

@ -21,7 +21,7 @@ extern const struct bkey_ops bch2_bkey_null_ops;
* being read or written; more aggressive checks can be enabled when rw == WRITE.
*/
struct bkey_ops {
int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
int (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@ -55,7 +55,8 @@ int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
enum bkey_invalid_flags, struct printbuf *);
int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
enum bkey_invalid_flags, struct printbuf *);
int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
struct bkey_s_c, struct printbuf *);
void bch2_bpos_to_text(struct printbuf *, struct bpos);
void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
@ -119,16 +120,6 @@ enum btree_update_flags {
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \
((1U << KEY_TYPE_alloc)| \
(1U << KEY_TYPE_alloc_v2)| \
(1U << KEY_TYPE_alloc_v3)| \
(1U << KEY_TYPE_alloc_v4)| \
(1U << KEY_TYPE_stripe)| \
(1U << KEY_TYPE_inode)| \
(1U << KEY_TYPE_inode_v2)| \
(1U << KEY_TYPE_snapshot))
static inline int bch2_trans_mark_key(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,

View File

@ -106,7 +106,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
while ((k = sort_iter_peek(iter))) {
if (!bkey_deleted(k) &&
!should_drop_next_key(iter)) {
bkey_copy(out, k);
bkey_p_copy(out, k);
btree_keys_account_key_add(&nr, 0, out);
out = bkey_p_next(out);
}
@ -137,7 +137,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
continue;
if (!transform)
bkey_copy(out, in);
bkey_p_copy(out, in);
else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
? in_f : &bch2_bkey_format_current, in))
out->format = KEY_FORMAT_LOCAL_BTREE;
@ -191,7 +191,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
set_bkeyp_val_u64s(f, out, 0);
} else {
bkey_copy(out, in);
bkey_p_copy(out, in);
}
out->needs_whiteout |= needs_whiteout;
out = bkey_p_next(out);

View File

@ -472,7 +472,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
mutex_init(&c->verify_lock);
shrink = shrinker_alloc(0, "%s/btree_cache", c->name);
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
bc->shrink = shrink;
@ -785,12 +785,12 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
"btree node header doesn't match ptr\n"
"btree %s level %u\n"
"ptr: ",
bch2_btree_ids[b->c.btree_id], b->c.level);
bch2_btree_id_str(b->c.btree_id), b->c.level);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
prt_printf(&buf, "\nheader: btree %s level %llu\n"
"min ",
bch2_btree_ids[BTREE_NODE_ID(b->data)],
bch2_btree_id_str(BTREE_NODE_ID(b->data)),
BTREE_NODE_LEVEL(b->data));
bch2_bpos_to_text(&buf, b->data->min_key);
@ -1153,8 +1153,21 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
six_unlock_intent(&b->c.lock);
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
const struct btree *b)
/*
 * Return the name of btree @btree, or "(unknown)" when the id is not below
 * BTREE_ID_NR, so an out-of-range id never indexes past the name table.
 */
const char *bch2_btree_id_str(enum btree_id btree)
{
	if (btree >= BTREE_ID_NR)
		return "(unknown)";

	return __bch2_btree_ids[btree];
}
/*
 * Print where node @b sits in its btree: the btree name, the node's level
 * over the level of that btree's root, then the node's key on a new line.
 */
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
	const char *btree_name = bch2_btree_id_str(b->c.btree_id);

	prt_printf(out, "%s level %u/%u\n ",
		   btree_name,
		   b->c.level,
		   bch2_btree_id_root(c, b->c.btree_id)->level);

	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
{
struct bset_stats stats;

View File

@ -123,8 +123,9 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
return bch2_btree_id_root(c, b->c.btree_id)->b;
}
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
const struct btree *);
const char *bch2_btree_id_str(enum btree_id);
void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *);
#endif /* _BCACHEFS_BTREE_CACHE_H */

View File

@ -98,12 +98,12 @@ static int bch2_gc_check_topology(struct bch_fs *c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
FSCK_NO_RATELIMIT,
btree_node_topology_bad_min_key,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" cur %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
buf1.buf, buf2.buf) &&
should_restart_for_topology_repair(c)) {
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
@ -122,14 +122,12 @@ static int bch2_gc_check_topology(struct bch_fs *c,
bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
bch2_bpos_to_text(&buf2, node_end);
if (__fsck_err(c,
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
FSCK_NO_RATELIMIT,
if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf) &&
should_restart_for_topology_repair(c)) {
bch_info(c, "Halting mark and sweep to start topology repair pass");
@ -287,10 +285,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
cur->data->min_key), c,
btree_node_topology_overwritten_by_next_node,
"btree node overwritten by next node at btree %s level %u:\n"
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_PREV_NODE;
goto out;
@ -298,10 +297,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
bpos_predecessor(cur->data->min_key)), c,
btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" node %s\n"
" next %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_max(c, prev,
bpos_predecessor(cur->data->min_key));
@ -310,20 +310,22 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
if (mustfix_fsck_err_on(bpos_ge(expected_start,
cur->data->max_key), c,
btree_node_topology_overwritten_by_prev_node,
"btree node overwritten by prev node at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = DROP_THIS_NODE;
goto out;
}
if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
btree_node_topology_bad_min_key,
"btree node with incorrect min_key at btree %s level %u:\n"
" prev %s\n"
" node %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf))
ret = set_node_min(c, cur, expected_start);
}
@ -344,10 +346,11 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
bch2_bpos_to_text(&buf2, b->key.k.p);
if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
btree_node_topology_bad_max_key,
"btree node with incorrect max_key at btree %s level %u:\n"
" %s\n"
" expected %s",
bch2_btree_ids[b->c.btree_id], b->c.level,
bch2_btree_id_str(b->c.btree_id), b->c.level,
buf1.buf, buf2.buf)) {
ret = set_node_max(c, child, b->key.k.p);
if (ret)
@ -396,9 +399,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
if (mustfix_fsck_err_on(ret == -EIO, c,
btree_node_unreadable,
"Topology repair: unreadable btree node at btree %s level %u:\n"
" %s",
bch2_btree_ids[b->c.btree_id],
bch2_btree_id_str(b->c.btree_id),
b->c.level - 1,
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
@ -504,9 +508,10 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
if (mustfix_fsck_err_on(!have_child, c,
btree_node_topology_interior_node_empty,
"empty interior btree node at btree %s level %u\n"
" %s",
bch2_btree_ids[b->c.btree_id],
bch2_btree_id_str(b->c.btree_id),
b->c.level, buf.buf))
ret = DROP_THIS_NODE;
err:
@ -582,7 +587,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (!g->gen_valid &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
fsck_err(c, ptr_to_missing_alloc_key,
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@ -599,7 +605,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
fsck_err(c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@ -620,7 +627,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
fsck_err(c, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@ -631,7 +639,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
(c->opts.reconstruct_alloc ||
fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
fsck_err(c, stale_dirty_ptr,
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@ -645,6 +654,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
if (fsck_err_on(bucket_data_type(g->data_type) &&
bucket_data_type(g->data_type) != data_type, c,
ptr_bucket_data_type_mismatch,
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
@ -664,6 +674,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
if (fsck_err_on(!m || !m->alive, c,
ptr_to_missing_stripe,
"pointer to nonexistent stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
@ -672,6 +683,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
do_update = true;
if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
ptr_to_incorrect_stripe,
"pointer does not match stripe %llu\n"
"while marking %s",
(u64) p.ec.idx,
@ -811,6 +823,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
goto err;
if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
k->k->version.lo,
atomic64_read(&c->key_version)))
@ -968,9 +981,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
FSCK_CAN_FIX|
FSCK_CAN_IGNORE|
FSCK_NO_RATELIMIT,
btree_node_read_error,
"Unreadable btree node at btree %s level %u:\n"
" %s",
bch2_btree_ids[b->c.btree_id],
bch2_btree_id_str(b->c.btree_id),
b->c.level - 1,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
@ -1025,6 +1039,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->min_key);
if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
btree_root_bad_min_key,
"btree root with incorrect min_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = -BCH_ERR_fsck_repair_unimplemented;
@ -1034,6 +1049,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->data->max_key);
if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
btree_root_bad_max_key,
"btree root with incorrect max_key: %s", buf.buf)) {
bch_err(c, "repair unimplemented");
ret = -BCH_ERR_fsck_repair_unimplemented;
@ -1207,16 +1223,16 @@ static int bch2_gc_done(struct bch_fs *c,
percpu_down_write(&c->mark_lock);
#define copy_field(_f, _msg, ...) \
#define copy_field(_err, _f, _msg, ...) \
if (dst->_f != src->_f && \
(!verify || \
fsck_err(c, _msg ": got %llu, should be %llu" \
fsck_err(c, _err, _msg ": got %llu, should be %llu" \
, ##__VA_ARGS__, dst->_f, src->_f))) \
dst->_f = src->_f
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
#define copy_dev_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
#define copy_fs_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
@ -1227,13 +1243,17 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
dev_usage_u64s());
copy_dev_field(buckets_ec, "buckets_ec");
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
copy_dev_field(dev_usage_buckets_wrong,
d[i].buckets, "%s buckets", bch2_data_types[i]);
copy_dev_field(dev_usage_sectors_wrong,
d[i].sectors, "%s sectors", bch2_data_types[i]);
copy_dev_field(dev_usage_fragmented_wrong,
d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
copy_dev_field(dev_usage_buckets_ec_wrong,
buckets_ec, "buckets_ec");
}
{
@ -1242,17 +1262,24 @@ static int bch2_gc_done(struct bch_fs *c,
struct bch_fs_usage *src = (void *)
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
copy_fs_field(hidden, "hidden");
copy_fs_field(btree, "btree");
copy_fs_field(fs_usage_hidden_wrong,
hidden, "hidden");
copy_fs_field(fs_usage_btree_wrong,
btree, "btree");
if (!metadata_only) {
copy_fs_field(data, "data");
copy_fs_field(cached, "cached");
copy_fs_field(reserved, "reserved");
copy_fs_field(nr_inodes,"nr_inodes");
copy_fs_field(fs_usage_data_wrong,
data, "data");
copy_fs_field(fs_usage_cached_wrong,
cached, "cached");
copy_fs_field(fs_usage_reserved_wrong,
reserved, "reserved");
copy_fs_field(fs_usage_nr_inodes_wrong,
nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
copy_fs_field(fs_usage_persistent_reserved_wrong,
persistent_reserved[i],
"persistent_reserved[%i]", i);
}
@ -1268,7 +1295,8 @@ static int bch2_gc_done(struct bch_fs *c,
printbuf_reset(&buf);
bch2_replicas_entry_to_text(&buf, e);
copy_fs_field(replicas[i], "%s", buf.buf);
copy_fs_field(fs_usage_replicas_wrong,
replicas[i], "%s", buf.buf);
}
}
@ -1404,6 +1432,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
if (c->opts.reconstruct_alloc ||
fsck_err_on(new.data_type != gc.data_type, c,
alloc_key_data_type_wrong,
"bucket %llu:%llu gen %u has wrong data_type"
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
@ -1412,9 +1441,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
bch2_data_types[gc.data_type]))
new.data_type = gc.data_type;
#define copy_bucket_field(_f) \
#define copy_bucket_field(_errtype, _f) \
if (c->opts.reconstruct_alloc || \
fsck_err_on(new._f != gc._f, c, \
fsck_err_on(new._f != gc._f, c, _errtype, \
"bucket %llu:%llu gen %u data type %s has wrong " #_f \
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
@ -1423,11 +1452,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
new._f, gc._f)) \
new._f = gc._f; \
copy_bucket_field(gen);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
copy_bucket_field(stripe_redundancy);
copy_bucket_field(stripe);
copy_bucket_field(alloc_key_gen_wrong,
gen);
copy_bucket_field(alloc_key_dirty_sectors_wrong,
dirty_sectors);
copy_bucket_field(alloc_key_cached_sectors_wrong,
cached_sectors);
copy_bucket_field(alloc_key_stripe_wrong,
stripe);
copy_bucket_field(alloc_key_stripe_redundancy_wrong,
stripe_redundancy);
#undef copy_bucket_field
if (!bch2_alloc_v4_cmp(*old, new))
@ -1584,6 +1618,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
}
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
reflink_v_refcount_wrong,
"reflink key has wrong refcount:\n"
" %s\n"
" should be %u",
@ -1709,7 +1744,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
if (bad)
bch2_bkey_val_to_text(&buf, c, k);
if (fsck_err_on(bad, c, "%s", buf.buf)) {
if (fsck_err_on(bad, c, stripe_sector_count_wrong,
"%s", buf.buf)) {
struct bkey_i_stripe *new;
new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
@ -1954,19 +1990,17 @@ int bch2_gc_gens(struct bch_fs *c)
trans = bch2_trans_get(c);
for_each_member_device(ca, c, i) {
struct bucket_gens *gens;
struct bucket_gens *gens = bucket_gens(ca);
BUG_ON(ca->oldest_gen);
ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
percpu_ref_put(&ca->ref);
ret = -BCH_ERR_ENOMEM_gc_gens;
goto err;
}
gens = bucket_gens(ca);
for (b = gens->first_bucket;
b < gens->nbuckets; b++)
ca->oldest_gen[b] = gens->b[b];

View File

@ -184,7 +184,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
k = new_whiteouts;
while (ptrs != ptrs_end) {
bkey_copy(k, *ptrs);
bkey_p_copy(k, *ptrs);
k = bkey_p_next(k);
ptrs++;
}
@ -260,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
n = bkey_p_next(k);
if (!bkey_deleted(k)) {
bkey_copy(out, k);
bkey_p_copy(out, k);
out = bkey_p_next(out);
} else {
BUG_ON(k->needs_whiteout);
@ -510,16 +510,6 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bch2_trans_node_reinit_iter(trans, b);
}
static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
struct btree *b)
{
prt_printf(out, "%s level %u/%u\n ",
bch2_btree_ids[b->c.btree_id],
b->c.level,
bch2_btree_id_root(c, b->c.btree_id)->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
}
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
struct btree *b, struct bset *i,
@ -532,7 +522,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
if (ca)
prt_printf(out, "on %s ", ca->name);
prt_printf(out, "at btree ");
btree_pos_to_text(out, c, b);
bch2_btree_pos_to_text(out, c, b);
prt_printf(out, "\n node offset %u", b->written);
if (i)
@ -540,7 +530,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_str(out, ": ");
}
__printf(8, 9)
__printf(9, 10)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
@ -548,6 +538,7 @@ static int __btree_err(int ret,
struct bset *i,
int write,
bool have_retry,
enum bch_sb_error_id err_type,
const char *fmt, ...)
{
struct printbuf out = PRINTBUF;
@ -572,9 +563,15 @@ static int __btree_err(int ret,
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
ret = -BCH_ERR_btree_node_read_err_bad_node;
if (ret != -BCH_ERR_btree_node_read_err_fixable)
bch2_sb_error_count(c, err_type);
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
mustfix_fsck_err(c, "%s", out.buf);
ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
goto fsck_err;
ret = -BCH_ERR_fsck_fix;
break;
case -BCH_ERR_btree_node_read_err_want_retry:
@ -599,9 +596,11 @@ static int __btree_err(int ret,
return ret;
}
#define btree_err(type, c, ca, b, i, msg, ...) \
#define btree_err(type, c, ca, b, i, _err_type, msg, ...) \
({ \
int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
int _ret = __btree_err(type, c, ca, b, i, write, have_retry, \
BCH_FSCK_ERR_##_err_type, \
msg, ##__VA_ARGS__); \
\
if (_ret != -BCH_ERR_fsck_fix) { \
ret = _ret; \
@ -676,13 +675,17 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
int ret = 0;
btree_err_on(!bch2_version_compatible(version),
-BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
-BCH_ERR_btree_node_read_err_incompatible,
c, ca, b, i,
btree_node_unsupported_version,
"unsupported bset version %u.%u",
BCH_VERSION_MAJOR(version),
BCH_VERSION_MINOR(version));
if (btree_err_on(version < c->sb.version_min,
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
mutex_lock(&c->sb_lock);
@ -693,7 +696,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
if (btree_err_on(BCH_VERSION_MAJOR(version) >
BCH_VERSION_MAJOR(c->sb.version),
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bset_newer_than_sb,
"bset version %u newer than superblock version %u",
version, c->sb.version)) {
mutex_lock(&c->sb_lock);
@ -703,11 +708,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i,
-BCH_ERR_btree_node_read_err_incompatible,
c, ca, b, i,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
if (btree_err_on(offset + sectors > btree_sectors(c),
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i,
bset_past_end_of_btree_node,
"bset past end of btree node")) {
i->u64s = 0;
ret = 0;
@ -715,12 +724,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i,
bset_empty,
"empty bset");
btree_err_on(BSET_OFFSET(i) &&
BSET_OFFSET(i) != offset,
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_wrong_sector_offset,
"bset at wrong sector offset");
if (!offset) {
@ -734,16 +746,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
/* XXX endianness */
btree_err_on(bp->seq != bn->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
bset_bad_seq,
"incorrect sequence number (wrong btree node)");
}
btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, i,
btree_node_bad_btree,
"incorrect btree id");
btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, i,
btree_node_bad_level,
"incorrect level");
if (!write)
@ -760,7 +778,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_min_key,
"incorrect min_key: got %s should be %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
@ -769,7 +789,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
}
btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, i,
btree_node_bad_max_key,
"incorrect max key %s",
(printbuf_reset(&buf1),
bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
@ -779,7 +801,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BSET_BIG_ENDIAN(i), write, bn);
btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
-BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i,
-BCH_ERR_btree_node_read_err_bad_node,
c, ca, b, i,
btree_node_bad_format,
"invalid bkey format: %s\n %s", buf1.buf,
(printbuf_reset(&buf2),
bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
@ -802,7 +826,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
struct printbuf *err)
{
return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
(!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
(!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
@ -823,14 +847,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bkey tmp;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_past_bset_end,
"key extends past end of bset")) {
i->u64s = cpu_to_le16((u64 *) k - i->_data);
break;
}
if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_format,
"invalid bkey format %u", k->format)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
@ -849,12 +877,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
printbuf_reset(&buf);
if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
printbuf_reset(&buf);
prt_printf(&buf, "invalid bkey: ");
bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
prt_printf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, u.s_c);
btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
btree_err(-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bad_bkey,
"invalid bkey: %s", buf.buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
@ -878,7 +908,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
bch2_dump_bset(c, b, i, 0);
if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) {
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_out_of_order,
"%s", buf.buf)) {
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_p_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
@ -919,47 +952,62 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
if (bch2_meta_read_fault("btree"))
btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
btree_err(-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_fault_injected,
"dynamic fault");
btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_magic,
"bad magic: want %llx, got %llx",
bset_magic(c), le64_to_cpu(b->data->magic));
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
"bad btree header: seq 0");
if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_seq,
"got wrong btree node (seq %llx want %llx)",
b->data->keys.seq, bp->seq);
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_seq,
"bad btree header: seq 0");
}
while (b->written < (ptr_written ?: btree_sectors(c))) {
unsigned sectors;
struct nonce nonce;
struct bch_csum csum;
bool first = !b->written;
bool csum_bad;
if (!b->written) {
i = &b->data->keys;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
"unknown checksum type %llu",
BSET_CSUM_TYPE(i));
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_unknown_csum,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
btree_err_on(bch2_crc_cmp(csum, b->data->csum),
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
csum_bad = bch2_crc_cmp(b->data->csum,
csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
btree_err_on(csum_bad,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
"invalid checksum");
ret = bset_encrypt(c, i, b->written << 9);
@ -969,7 +1017,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
!BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL,
-BCH_ERR_btree_node_read_err_incompatible,
c, NULL, b, NULL,
btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
sectors = vstruct_sectors(b->data, c->block_bits);
@ -981,15 +1031,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
break;
btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
"unknown checksum type %llu",
BSET_CSUM_TYPE(i));
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_unknown_csum,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
csum_bad = bch2_crc_cmp(bne->csum,
csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
btree_err_on(bch2_crc_cmp(csum, bne->csum),
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i,
btree_err_on(csum_bad,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
"invalid checksum");
ret = bset_encrypt(c, i, b->written << 9);
@ -1022,12 +1078,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
true);
btree_err_on(blacklisted && first,
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i,
bset_blacklisted_journal_seq,
"first btree node bset has blacklisted journal seq (%llu)",
le64_to_cpu(i->journal_seq));
btree_err_on(blacklisted && ptr_written,
-BCH_ERR_btree_node_read_err_fixable, c, ca, b, i,
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i,
first_bset_blacklisted_journal_seq,
"found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
@ -1044,7 +1104,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (ptr_written) {
btree_err_on(b->written < ptr_written,
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, NULL,
btree_node_data_missing,
"btree node data missing: expected %u sectors, found %u",
ptr_written, b->written);
} else {
@ -1055,7 +1117,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
!bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(bne->keys.journal_seq),
true),
-BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, NULL,
btree_node_bset_after_end,
"found bset signature after last bset");
}
@ -1097,7 +1161,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
prt_printf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, u.s_c);
btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf);
btree_err(-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bad_bkey,
"%s", buf.buf);
btree_keys_account_key_drop(&b->nr, 0, k);
@ -1177,8 +1244,9 @@ static void btree_node_read_work(struct work_struct *work)
}
start:
printbuf_reset(&buf);
btree_pos_to_text(&buf, c, b);
bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
bch2_btree_pos_to_text(&buf, c, b);
bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
@ -1213,7 +1281,7 @@ static void btree_node_read_work(struct work_struct *work)
printbuf_reset(&buf);
bch2_bpos_to_text(&buf, b->key.k.p);
bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
__func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
__func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
bch2_btree_node_rewrite_async(c, b);
}
@ -1322,14 +1390,20 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
}
written2 = btree_node_sectors_written(c, ra->buf[i]);
if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, NULL,
btree_node_replicas_sectors_written_mismatch,
"btree node sectors written mismatch: %u != %u",
written, written2) ||
btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, NULL,
btree_node_bset_after_end,
"found bset signature after last bset") ||
btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, NULL,
btree_node_replicas_data_mismatch,
"btree node replicas content mismatch"))
dump_bset_maps = true;
@ -1524,7 +1598,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
struct printbuf buf = PRINTBUF;
prt_str(&buf, "btree node read error: no device to read from\n at ");
btree_pos_to_text(&buf, c, b);
bch2_btree_pos_to_text(&buf, c, b);
bch_err(c, "%s", buf.buf);
if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
@ -1759,7 +1833,8 @@ static void btree_node_write_endio(struct bio *bio)
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"btree write error: %s",
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);

View File

@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(iter->btree_id));
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
bch2_btree_path_verify(trans, iter->update_path);
@ -362,7 +362,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
bch2_bpos_to_text(&buf, pos);
panic("not locked: %s %s%s\n",
bch2_btree_ids[id], buf.buf,
bch2_btree_id_str(id), buf.buf,
key_cache ? " cached" : "");
}
@ -1109,6 +1109,9 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
if (unlikely(ret))
goto out;
if (unlikely(!trans->srcu_held))
bch2_trans_srcu_lock(trans);
/*
* Ensure we obey path->should_be_locked: if it's set, we can't unlock
* and re-traverse the path without a transaction restart:
@ -1371,7 +1374,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
struct bkey_s_c old = { &i->old_k, i->old_v };
prt_printf(buf, "update: btree=%s cached=%u %pS",
bch2_btree_ids[i->btree_id],
bch2_btree_id_str(i->btree_id),
i->cached,
(void *) i->ip_allocated);
prt_newline(buf);
@ -1387,7 +1390,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
trans_for_each_wb_update(trans, wb) {
prt_printf(buf, "update: btree=%s wb=1 %pS",
bch2_btree_ids[wb->btree],
bch2_btree_id_str(wb->btree),
(void *) i->ip_allocated);
prt_newline(buf);
@ -1416,7 +1419,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
path->idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
bch2_btree_ids[path->btree_id],
bch2_btree_id_str(path->btree_id),
path->level);
bch2_bpos_to_text(out, path->pos);
@ -1523,6 +1526,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
path->alloc_seq++;
btree_path_list_add(trans, pos, path);
trans->paths_sorted = false;
@ -1598,7 +1602,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
locks_want = min(locks_want, BTREE_MAX_DEPTH);
if (locks_want > path->locks_want)
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
return path;
}
@ -2829,8 +2833,16 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
return p;
}
static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
static inline void check_srcu_held_too_long(struct btree_trans *trans)
{
WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
"btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
(jiffies - trans->srcu_lock_time) / HZ);
}
void bch2_trans_srcu_unlock(struct btree_trans *trans)
{
if (trans->srcu_held) {
struct bch_fs *c = trans->c;
struct btree_path *path;
@ -2838,9 +2850,19 @@ static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
if (path->cached && !btree_node_locked(path, 0))
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
check_srcu_held_too_long(trans);
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_held = false;
}
}
void bch2_trans_srcu_lock(struct btree_trans *trans)
{
if (!trans->srcu_held) {
trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
}
}
/**
@ -2894,8 +2916,9 @@ u32 bch2_trans_begin(struct btree_trans *trans)
}
trans->last_begin_time = now;
if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
bch2_trans_reset_srcu_lock(trans);
if (unlikely(trans->srcu_held &&
time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
bch2_trans_srcu_unlock(trans);
trans->last_begin_ip = _RET_IP_;
if (trans->restarted) {
@ -2982,6 +3005,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;
trans->srcu_held = true;
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
struct btree_trans *pos;
@ -3025,7 +3049,7 @@ static void check_btree_paths_leaked(struct btree_trans *trans)
trans_for_each_path(trans, path)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
bch2_btree_ids[path->btree_id],
bch2_btree_id_str(path->btree_id),
(void *) path->ip_allocated);
/* Be noisy about this: */
bch2_fatal_error(c);
@ -3058,7 +3082,10 @@ void bch2_trans_put(struct btree_trans *trans)
check_btree_paths_leaked(trans);
if (trans->srcu_held) {
check_srcu_held_too_long(trans);
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
@ -3100,7 +3127,7 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
prt_tab(out);
prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
b->level, bch2_btree_ids[b->btree_id]);
b->level, bch2_btree_id_str(b->btree_id));
bch2_bpos_to_text(out, btree_node_pos(b));
prt_tab(out);
@ -3130,7 +3157,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
path->idx,
path->cached ? 'c' : 'b',
path->level,
bch2_btree_ids[path->btree_id]);
bch2_btree_id_str(path->btree_id));
bch2_bpos_to_text(out, path->pos);
prt_newline(out);

View File

@ -274,6 +274,7 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
void bch2_trans_unlock(struct btree_trans *);
void bch2_trans_unlock_long(struct btree_trans *);
bool bch2_trans_locked(struct btree_trans *);
static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
@ -411,11 +412,11 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
btree_node_type_is_extents(btree_id))
btree_id_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
!btree_type_has_snapshots(btree_id))
!btree_type_has_snapshot_field(btree_id))
flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
@ -579,6 +580,9 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
__bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \
KEY_TYPE_##_type, sizeof(*_val), _val)
void bch2_trans_srcu_unlock(struct btree_trans *);
void bch2_trans_srcu_lock(struct btree_trans *);
u32 bch2_trans_begin(struct btree_trans *);
/*

View File

@ -324,7 +324,7 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
ck = bkey_cached_reuse(bc);
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
bch2_btree_ids[path->btree_id]);
bch2_btree_id_str(path->btree_id));
return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
}
@ -407,7 +407,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_ids[ck->key.btree_id], new_u64s);
bch2_btree_id_str(ck->key.btree_id), new_u64s);
ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
goto err;
}
@ -509,7 +509,7 @@ bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree
* path->uptodate yet:
*/
if (!path->locks_want &&
!__bch2_btree_path_upgrade(trans, path, 1)) {
!__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
goto err;
@ -1038,7 +1038,7 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->table_init_done = true;
shrink = shrinker_alloc(0, "%s/btree_key_cache", c->name);
shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
bc->shrink = shrink;

View File

@ -431,7 +431,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
static inline bool btree_path_get_locks(struct btree_trans *trans,
struct btree_path *path,
bool upgrade)
bool upgrade,
struct get_locks_fail *f)
{
unsigned l = path->level;
int fail_idx = -1;
@ -442,9 +443,15 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
: bch2_btree_node_relock(trans, path, l)))
: bch2_btree_node_relock(trans, path, l))) {
fail_idx = l;
if (f) {
f->l = l;
f->b = path->l[l].b;
}
}
l++;
} while (l < path->locks_want);
@ -584,7 +591,9 @@ __flatten
/*
 * Try to retake the locks @path wants, using relock (not upgrade) at each
 * level, without restarting the transaction.
 *
 * @trace_ip is accepted for the caller's convenience but not used here.
 *
 * Returns whatever btree_path_get_locks() returns.
 */
bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
			struct btree_path *path, unsigned long trace_ip)
{
	/* Failure details are collected but not reported by this caller. */
	struct get_locks_fail f;

	return btree_path_get_locks(trans, path, false, &f);
}
int __bch2_btree_path_relock(struct btree_trans *trans,
@ -600,22 +609,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
unsigned new_locks_want,
struct get_locks_fail *f)
{
EBUG_ON(path->locks_want >= new_locks_want);
path->locks_want = new_locks_want;
return btree_path_get_locks(trans, path, true);
return btree_path_get_locks(trans, path, true, f);
}
bool __bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
unsigned new_locks_want,
struct get_locks_fail *f)
{
struct btree_path *linked;
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
/*
@ -644,7 +655,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true);
btree_path_get_locks(trans, linked, true, NULL);
}
return false;
@ -656,6 +667,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
{
unsigned l;
if (trans->restarted)
return;
EBUG_ON(path->locks_want < new_locks_want);
path->locks_want = new_locks_want;
@ -674,6 +688,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
}
bch2_btree_path_verify_locks(path);
path->downgrade_seq++;
trace_path_downgrade(trans, _RET_IP_, path);
}
/* Btree transaction locking: */
@ -682,6 +699,9 @@ void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
if (trans->restarted)
return;
trans_for_each_path(trans, path)
bch2_btree_path_downgrade(trans, path);
}
@ -733,6 +753,12 @@ void bch2_trans_unlock(struct btree_trans *trans)
__bch2_btree_path_unlock(trans, path);
}
/*
 * Release everything @trans holds that this file knows how to drop: first the
 * transaction's locks via bch2_trans_unlock(), then the SRCU read lock via
 * bch2_trans_srcu_unlock().
 */
void bch2_trans_unlock_long(struct btree_trans *trans)
{
bch2_trans_unlock(trans);
/* no-op if the transaction does not currently hold the SRCU read lock */
bch2_trans_srcu_unlock(trans);
}
bool bch2_trans_locked(struct btree_trans *trans)
{
struct btree_path *path;

View File

@ -355,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
/*
 * Out-parameter for btree_path_get_locks(): written only when the caller
 * passes a non-NULL pointer and a relock/upgrade at some level fails.  The
 * loop keeps going after a failure, so the fields describe the last failing
 * level visited.
 */
struct get_locks_fail {
/* level at which taking the lock failed */
unsigned l;
/* value of path->l[l].b at that level when the failure was recorded */
struct btree *b;
};
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *, unsigned,
struct get_locks_fail *);
bool __bch2_btree_path_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
struct btree_path *, unsigned,
struct get_locks_fail *);
static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
struct get_locks_fail f;
unsigned old_locks_want = path->locks_want;
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
if (path->locks_want < new_locks_want
? __bch2_btree_path_upgrade(trans, path, new_locks_want)
? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
: path->uptodate == BTREE_ITER_UPTODATE)
return 0;
trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
old_locks_want, new_locks_want);
old_locks_want, new_locks_want, &f);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
}

View File

@ -269,6 +269,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->level != i->path->level);
BUG_ON(i->btree_id != i->path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
i->k->k.p.snapshot &&
@ -349,7 +350,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
if (!new_k) {
bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_ids[path->btree_id], new_u64s);
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
@ -379,11 +380,10 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id))
if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
return 0;
if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
ret = bch2_mark_key(trans, i->btree_id, i->level,
old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
@ -425,8 +425,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
old_ops->trans_trigger == new_ops->trans_trigger &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
old_ops->trans_trigger == new_ops->trans_trigger) {
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
@ -683,7 +682,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
BCH_JSET_ENTRY_overwrite,
i->btree_id, i->level,
i->old_k.u64s);
bkey_reassemble(&entry->start[0],
bkey_reassemble((struct bkey_i *) entry->start,
(struct bkey_s_c) { &i->old_k, i->old_v });
}
@ -691,7 +690,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
BCH_JSET_ENTRY_btree_keys,
i->btree_id, i->level,
i->k->k.u64s);
bkey_copy(&entry->start[0], i->k);
bkey_copy((struct bkey_i *) entry->start, i->k);
}
trans_for_each_wb_update(trans, wb) {
@ -699,7 +698,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
BCH_JSET_ENTRY_btree_keys,
wb->btree, 0,
wb->k.k.u64s);
bkey_copy(&entry->start[0], &wb->k);
bkey_copy((struct bkey_i *) entry->start, &wb->k);
}
if (trans->journal_seq)
@ -776,12 +775,12 @@ static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans
bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
enum bkey_invalid_flags flags,
struct btree_insert_entry *i,
struct printbuf *err)
{
struct bch_fs *c = trans->c;
int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
printbuf_reset(err);
prt_printf(err, "invalid bkey on insert from %s -> %ps",
@ -792,8 +791,7 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un
bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
prt_newline(err);
bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, rw, err);
bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
bch2_print_string_as_lines(KERN_ERR, err->buf);
bch2_inconsistent_error(c);
@ -864,12 +862,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
*/
bch2_journal_res_put(&c->journal, &trans->journal_res);
if (unlikely(ret))
return ret;
bch2_trans_downgrade(trans);
return 0;
}
static int journal_reclaim_wait_done(struct bch_fs *c)
@ -1034,7 +1027,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
i->bkey_type, invalid_flags, &buf)))
ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
btree_insert_entry_checks(trans, i);
printbuf_exit(&buf);
@ -1138,6 +1131,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
bch2_trans_downgrade(trans);
bch2_trans_reset_updates(trans);
return ret;

View File

@ -228,6 +228,8 @@ struct btree_path {
u8 sorted_idx;
u8 ref;
u8 intent_ref;
u32 alloc_seq;
u32 downgrade_seq;
/* btree_iter_copy starts here: */
struct bpos pos;
@ -424,6 +426,7 @@ struct btree_trans {
u8 nr_updates;
u8 nr_wb_updates;
u8 wb_updates_size;
bool srcu_held:1;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
@ -636,16 +639,17 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i)
}
enum btree_node_type {
#define x(kwd, val, ...) BKEY_TYPE_##kwd = val,
BKEY_TYPE_btree,
#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
BCH_BTREE_IDS()
#undef x
BKEY_TYPE_btree,
BKEY_TYPE_NR
};
/* Type of a key in btree @id at level @level: */
static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
{
return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
return level ? BKEY_TYPE_btree : (unsigned) id + 1;
}
/* Type of keys @b contains: */
@ -654,19 +658,21 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
return __btree_node_type(b->c.level, b->c.btree_id);
}
const char *bch2_btree_node_type_str(enum btree_node_type);
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
(BIT(BKEY_TYPE_extents)| \
BIT(BKEY_TYPE_alloc)| \
BIT(BKEY_TYPE_inodes)| \
BIT(BKEY_TYPE_stripes)| \
BIT(BKEY_TYPE_reflink)| \
BIT(BKEY_TYPE_btree))
(BIT_ULL(BKEY_TYPE_extents)| \
BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_btree))
#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
(BIT(BKEY_TYPE_alloc)| \
BIT(BKEY_TYPE_inodes)| \
BIT(BKEY_TYPE_stripes)| \
BIT(BKEY_TYPE_snapshots))
(BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
BIT_ULL(BKEY_TYPE_snapshots))
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
@ -674,13 +680,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
}
static inline bool btree_node_type_is_extents(enum btree_node_type type)
{
const unsigned mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr)
#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
BCH_BTREE_IDS()
#undef x
;
@ -690,7 +696,7 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type)
static inline bool btree_id_is_extents(enum btree_id btree)
{
return btree_node_type_is_extents((enum btree_node_type) btree);
return btree_node_type_is_extents(__btree_node_type(0, btree));
}
static inline bool btree_type_has_snapshots(enum btree_id id)
@ -704,6 +710,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
return (1U << id) & mask;
}
static inline bool btree_type_has_snapshot_field(enum btree_id id)
{
const unsigned mask = 0
#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
BCH_BTREE_IDS()
#undef x
;
return (1U << id) & mask;
}
static inline bool btree_type_has_ptrs(enum btree_id id)
{
const unsigned mask = 0

View File

@ -1274,14 +1274,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
btree_node_type(b), WRITE, &buf) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
printbuf_reset(&buf);
prt_printf(&buf, "inserting invalid bkey\n ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_printf(&buf, "\n ");
bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
btree_node_type(b), WRITE, &buf);
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
bch2_fs_inconsistent(c, "%s", buf.buf);
dump_stack();
@ -1987,7 +1987,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
out:
if (new_path)
bch2_path_put(trans, new_path, true);
bch2_btree_path_downgrade(trans, iter->path);
bch2_trans_downgrade(trans);
return ret;
err:
bch2_btree_node_free_never_used(as, trans, n);
@ -2411,30 +2411,24 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry
r->level = entry->level;
r->alive = true;
bkey_copy(&r->key, &entry->start[0]);
bkey_copy(&r->key, (struct bkey_i *) entry->start);
mutex_unlock(&c->btree_root_lock);
}
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
struct jset_entry *start,
struct jset_entry *end)
struct jset_entry *end,
unsigned long skip)
{
struct jset_entry *entry;
unsigned long have = 0;
unsigned i;
for (entry = start; entry < end; entry = vstruct_next(entry))
if (entry->type == BCH_JSET_ENTRY_btree_root)
__set_bit(entry->btree_id, &have);
mutex_lock(&c->btree_root_lock);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
if (r->alive && !test_bit(i, &have)) {
if (r->alive && !test_bit(i, &skip)) {
journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
i, r->level, &r->key, r->key.k.u64s);
end = vstruct_next(end);

View File

@ -271,7 +271,7 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
__bch_btree_u64s_remaining(c, b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@ -303,7 +303,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
bkey_copy(unwritten_whiteouts_start(c, b), &k);
bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
}
/*
@ -325,7 +325,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *);
void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
struct jset_entry *, struct jset_entry *);
struct jset_entry *, unsigned long);
void bch2_do_pending_node_rewrites(struct bch_fs *);
void bch2_free_pending_node_rewrites(struct bch_fs *);

View File

@ -370,8 +370,8 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
idx = bch2_replicas_entry_idx(c, r);
if (idx < 0 &&
fsck_err(c, "no replicas entry\n"
" while marking %s",
fsck_err(c, ptr_to_missing_replicas_entry,
"no replicas entry\n while marking %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, r);
@ -695,6 +695,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if (gen_after(ptr->gen, b_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@ -707,6 +708,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_ptr_too_stale,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@ -720,6 +722,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if (b_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_stale_dirty_ptr,
"bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@ -741,6 +744,7 @@ static int check_bucket_ref(struct btree_trans *trans,
ptr_data_type &&
bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@ -754,6 +758,7 @@ static int check_bucket_ref(struct btree_trans *trans,
if ((u64) bucket_sectors + sectors > U32_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_bucket_sector_count_overflow,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
@ -935,14 +940,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
return 0;
}
int bch2_mark_extent(struct btree_trans *trans,
static int __mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
struct bkey_s_c k, unsigned flags)
{
u64 journal_seq = trans->journal_res.seq;
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@ -1018,6 +1021,14 @@ int bch2_mark_extent(struct btree_trans *trans,
return 0;
}
/*
 * Trigger entry point for extent keys.
 *
 * Forwards @old and @new to __mark_extent() through
 * mem_trigger_run_overwrite_then_insert(), which calls the per-key helper
 * once for each of the two keys that has a nonzero type (@old first), and
 * returns that macro's result.
 */
int bch2_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
}
int bch2_mark_stripe(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@ -1124,13 +1135,11 @@ int bch2_mark_stripe(struct btree_trans *trans,
return 0;
}
int bch2_mark_reservation(struct btree_trans *trans,
static int __mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bch_fs_usage *fs_usage;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
@ -1157,6 +1166,14 @@ int bch2_mark_reservation(struct btree_trans *trans,
return 0;
}
/*
 * Trigger entry point for reservation keys.
 *
 * Forwards @old and @new to __mark_reservation() through
 * mem_trigger_run_overwrite_then_insert(), which calls the per-key helper
 * once for each of the two keys that has a nonzero type (@old first), and
 * returns that macro's result.
 */
int bch2_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
}
static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 start, u64 end,
@ -1183,7 +1200,8 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
*idx = r->offset;
return 0;
not_found:
if (fsck_err(c, "pointer to missing indirect extent\n"
if (fsck_err(c, reflink_p_to_missing_reflink_v,
"pointer to missing indirect extent\n"
" %s\n"
" missing range %llu-%llu",
(bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
@ -1211,13 +1229,11 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
int bch2_mark_reflink_p(struct btree_trans *trans,
static int __mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
struct reflink_gc *ref;
size_t l, r, m;
@ -1251,6 +1267,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
/*
 * Trigger entry point for reflink pointer keys.
 *
 * Forwards @old and @new to __mark_reflink_p() through
 * mem_trigger_run_overwrite_then_insert(), which calls the per-key helper
 * once for each of the two keys that has a nonzero type (@old first), and
 * returns that macro's result.
 */
int bch2_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
}
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@ -1298,7 +1322,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
struct bch_fs *c = trans->c;
static int warned_disk_usage = 0;
bool warn = false;
unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
struct replicas_delta *d, *d2;
struct replicas_delta *top = (void *) deltas->d + deltas->used;
struct bch_fs_usage *dst;
@ -1357,7 +1381,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
bch2_trans_inconsistent(trans,
"disk usage increased %lli more than %u sectors reserved)",
"disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
return 0;
need_mark:
@ -1452,15 +1476,11 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
return ret;
}
int bch2_trans_mark_extent(struct btree_trans *trans,
static int __trans_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
struct bkey_s_c k, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
@ -1517,6 +1537,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
return ret;
}
/*
 * Transactional trigger entry point for extent keys.
 *
 * Before running the per-key trigger, compares what
 * bch2_bkey_needs_rebalance() reports for @new against what it reports for
 * @old.  When the two differ, the bit at @new's position in the
 * rebalance_work btree is updated via bch2_btree_bit_mod(): set when the
 * difference is positive, cleared otherwise.
 *
 * Returns the error from bch2_btree_bit_mod() if that fails (the per-key
 * trigger is then not run), otherwise the result of running
 * __trans_mark_extent() on @old and @new via
 * trigger_run_overwrite_then_insert().
 */
int bch2_trans_mark_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
struct bch_fs *c = trans->c;
/* > 0: only @new needs rebalance; < 0: only @old did; 0: no change */
int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
(int) bch2_bkey_needs_rebalance(c, old);
if (mod) {
int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
if (ret)
return ret;
}
return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
}
static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
@ -1670,15 +1708,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
return ret;
}
int bch2_trans_mark_reservation(struct btree_trans *trans,
static int __trans_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
struct bkey_s_c k, unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
s64 sectors = (s64) k.k->size;
struct replicas_delta_list *d;
@ -1700,7 +1733,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans,
return 0;
}
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
/*
 * Transactional trigger entry point for reservation keys.
 *
 * Forwards @old and @new to __trans_mark_reservation() through
 * trigger_run_overwrite_then_insert(), which converts @new with
 * bkey_i_to_s_c() and then runs the per-key helper on @old followed by @new
 * (skipping a key whose type is zero).  Returns that macro's result.
 */
int bch2_trans_mark_reservation(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
{
return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
}
static int trans_mark_reflink_p_segment(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 *idx, unsigned flags)
{
@ -1767,35 +1809,38 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
return ret;
}
int bch2_trans_mark_reflink_p(struct btree_trans *trans,
static int __trans_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
struct bkey_s_c k, unsigned flags)
{
struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
? old
: bkey_i_to_s_c(new);
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
u64 idx, end_idx;
int ret = 0;
if (flags & BTREE_TRIGGER_INSERT) {
struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
v->front_pad = v->back_pad = 0;
}
idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
end_idx = le64_to_cpu(p.v->idx) + p.k->size +
le32_to_cpu(p.v->back_pad);
while (idx < end_idx && !ret)
ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
return ret;
}
/*
 * Transactional trigger entry point for reflink pointer keys.
 *
 * When @flags contains BTREE_TRIGGER_INSERT, @new is modified in place
 * first: its front_pad and back_pad fields are reset to zero.  The per-key
 * helper __trans_mark_reflink_p() is then run on @old and @new via
 * trigger_run_overwrite_then_insert(), and that macro's result is returned.
 */
int bch2_trans_mark_reflink_p(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old,
struct bkey_i *new,
unsigned flags)
{
if (flags & BTREE_TRIGGER_INSERT) {
struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
/* must happen before the helper below reads the pads from @new */
v->front_pad = v->back_pad = 0;
}
return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
}
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,
@ -1818,6 +1863,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
if (a->v.data_type && type && a->v.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
BCH_FSCK_ERR_bucket_metadata_type_mismatch,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
@ -1825,16 +1871,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bch2_data_types[type],
bch2_data_types[type]);
ret = -EIO;
goto out;
goto err;
}
if (a->v.data_type != type ||
a->v.dirty_sectors != sectors) {
a->v.data_type = type;
a->v.dirty_sectors = sectors;
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
}
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@ -1929,6 +1975,22 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
/*
 * Run bch2_trans_mark_dev_sb() on every online member device of @c.
 *
 * Returns 0 when every device was processed, otherwise the first error
 * returned by bch2_trans_mark_dev_sb(); devices after the failing one are
 * not visited.
 */
int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	for_each_online_member(ca, c, i) {
		int ret = bch2_trans_mark_dev_sb(c, ca);

		if (ret) {
			/*
			 * Returning from inside the loop skips the iterator's
			 * own put, so the reference it holds on @ca has to be
			 * dropped by hand.  for_each_online_member() pins each
			 * device through ->io_ref (not ->ref), so that is the
			 * counter to release; putting ->ref here would leak
			 * io_ref and unbalance ref.
			 */
			percpu_ref_put(&ca->io_ref);
			return ret;
		}
	}

	return 0;
}
/* Disk reservations: */
#define SECTORS_CACHE 1024

View File

@ -339,12 +339,27 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
/*
 * Run the single-key trigger @_fn for an update that replaces @_old with
 * @_new (both struct bkey_s_c-like values with a ->k->type field).
 *
 * @_fn is called as _fn(_trans, _btree_id, _level, key, flags):
 *  - first on @_old, with BTREE_TRIGGER_INSERT cleared from @_flags, if
 *    @_old has a nonzero key type;
 *  - then on @_new, with BTREE_TRIGGER_OVERWRITE cleared from @_flags, if
 *    the first call did not fail and @_new has a nonzero key type.
 *
 * Evaluates to 0, or to the first nonzero value returned by @_fn.
 *
 * @_old, @_new and @_flags are parenthesized where they are combined with
 * other operators so that an argument such as "a | b" keeps its meaning,
 * and the result variable is named _ret so it cannot capture a caller's
 * "ret".  Note that @_old, @_new and @_flags may be evaluated more than
 * once; do not pass expressions with side effects.
 */
#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({												\
	int _ret = 0;										\
												\
	if ((_old).k->type)									\
		_ret = _fn(_trans, _btree_id, _level, _old,					\
			   (_flags) & ~BTREE_TRIGGER_INSERT);					\
	if (!_ret && (_new).k->type)								\
		_ret = _fn(_trans, _btree_id, _level, _new,					\
			   (_flags) & ~BTREE_TRIGGER_OVERWRITE);				\
	_ret;											\
})
/*
 * Same as mem_trigger_run_overwrite_then_insert(), for callers whose @_new is
 * a struct bkey_i pointer: @_new is converted with bkey_i_to_s_c() before
 * being handed on, every other argument is passed through unchanged.
 */
#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
size_t, enum bch_data_type, unsigned);
int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
int bch2_trans_mark_dev_sbs(struct bch_fs *);
static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
{

View File

@ -332,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
.p.btree_id = ctx->stats.btree_id,
.p.pos = ctx->stats.pos,
.p.btree_id = ctx->stats.pos.btree,
.p.pos = ctx->stats.pos.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_usage_read_short(c).used,
};

View File

@ -697,14 +697,32 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
return ret;
}
/*
 * Print the compression option encoded in @v to @out.
 *
 * The option's name is printed when its type indexes bch2_compression_opts;
 * otherwise a "(unknown compression opt N)" placeholder is printed.  A
 * nonzero level is appended as ":<level>".
 */
void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
{
	const struct bch_compression_opt decoded = bch2_compression_decode(v);

	if (decoded.type >= BCH_COMPRESSION_OPT_NR)
		prt_printf(out, "(unknown compression opt %u)", decoded.type);
	else
		prt_str(out, bch2_compression_opts[decoded.type]);

	if (!decoded.level)
		return;

	prt_printf(out, ":%u", decoded.level);
}
void bch2_opt_compression_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
{
struct bch_compression_opt opt = bch2_compression_decode(v);
prt_str(out, bch2_compression_opts[opt.type]);
if (opt.level)
prt_printf(out, ":%u", opt.level);
return bch2_compression_opt_to_text(out, v);
}
/*
 * Validate an encoded compression option.
 *
 * Returns 0 when bch2_compression_opt_valid() accepts @v; otherwise writes a
 * message to @err and returns -BCH_ERR_invalid_sb_opt_compression.
 */
int bch2_opt_compression_validate(u64 v, struct printbuf *err)
{
	if (bch2_compression_opt_valid(v))
		return 0;

	prt_printf(err, "invalid compression opt %llu", v);
	return -BCH_ERR_invalid_sb_opt_compression;
}

View File

@ -4,12 +4,18 @@
#include "extents_types.h"
static const unsigned __bch2_compression_opt_to_type[] = {
#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
BCH_COMPRESSION_OPTS()
#undef x
};
struct bch_compression_opt {
u8 type:4,
level:4;
};
static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
{
return (struct bch_compression_opt) {
.type = v & 15,
@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
};
}
static inline bool bch2_compression_opt_valid(unsigned v)
{
struct bch_compression_opt opt = __bch2_compression_decode(v);
return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
}
/*
 * Decode @v into its type/level pair.  Values rejected by
 * bch2_compression_opt_valid() decode to an all-zero option instead of being
 * passed through.
 */
static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
{
	if (!bch2_compression_opt_valid(v))
		return (struct bch_compression_opt) { 0 };

	return __bch2_compression_decode(v);
}
/*
 * Pack @opt into a single integer: the type occupies the low four bits and
 * the level the four bits above it.
 */
static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
{
return opt.type|(opt.level << 4);
}
static const unsigned __bch2_compression_opt_to_type[] = {
#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
BCH_COMPRESSION_OPTS()
#undef x
};
static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
{
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
void bch2_compression_opt_to_text(struct printbuf *, u64);
int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int bch2_opt_compression_validate(u64, struct printbuf *);
/*
 * struct bch_opt_fn initializer for the compression option: wires up the
 * parse, to_text and validate callbacks declared above.
 */
#define bch2_opt_compression (struct bch_opt_fn) { \
.parse = bch2_opt_compression_parse, \
.to_text = bch2_opt_compression_to_text, \
.validate = bch2_opt_compression_validate, \
}
#endif /* _BCACHEFS_COMPRESS_H */

View File

@ -69,9 +69,15 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
_ret; \
})
/*
 * Remove the element that @_pos points at from darray @_d (given as a
 * pointer): hands the backing array, the element count and the element's
 * index (@_pos - data) to array_remove_item().
 */
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
/*
 * Iterate over the elements of darray @_d (given as an lvalue, not a
 * pointer) from first to last; @_i is a caller-declared element pointer that
 * points at the current element.
 */
#define darray_for_each(_d, _i) \
for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
/*
 * Iterate over the elements of darray @_d (given as an lvalue, not a
 * pointer) from last to first; @_i is a caller-declared element pointer that
 * points at the current element.
 *
 * The cursor starts one past the end and is stepped back only after checking
 * that it has not yet reached the start, so it never points before ->data.
 * An empty darray therefore iterates zero times even when ->data is still
 * NULL (the state darray_init() leaves it in).  Starting from
 * "data + nr - 1" and testing "_i >= data" would instead compute NULL - 1
 * for such a darray, which compares >= NULL and runs the body on a wild
 * pointer.
 */
#define darray_for_each_reverse(_d, _i)					\
	for (_i = (_d).data + (_d).nr;					\
	     _i != (_d).data && (--_i, 1);				\
	     )
#define darray_init(_d) \
do { \
(_d)->data = NULL; \

View File

@ -13,6 +13,7 @@
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "trace.h"
@ -161,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
/*
* See comment below:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
*/
rewrites_found |= 1U << i;
}
i++;
@ -211,14 +208,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (!p.ptr.cached &&
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
/*
* Currently, we're dropping unneeded replicas
* instead of marking them as cached, since
* cached data in stripe buckets prevents them
* from being reused:
bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
*/
goto restart_drop_extra_replicas;
}
}
@ -251,11 +242,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p);
if (ret)
goto err;
ret = bch2_trans_update(trans, &iter, insert,
k.k->p, insert->k.p) ?:
bch2_bkey_set_needs_rebalance(c, insert,
op->opts.background_target,
op->opts.background_compression) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
@ -281,11 +272,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
}
continue;
nowork:
if (m->ctxt && m->ctxt->stats) {
if (m->stats && m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->ctxt->stats->keys_raced);
atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->stats->sectors_raced);
&m->stats->sectors_raced);
}
this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@ -439,6 +430,8 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&m->k, c, k);
m->btree_id = btree_id;
m->data_opts = data_opts;
m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL;
bch2_write_op_init(&m->op, c, io_opts);
m->op.pos = bkey_start_pos(k.k);
@ -487,7 +480,7 @@ int bch2_data_update_init(struct btree_trans *trans,
if (c->opts.nocow_enabled) {
if (ctxt) {
move_ctxt_wait_event(ctxt, trans,
move_ctxt_wait_event(ctxt,
(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
PTR_BUCKET_POS(c, &p.ptr), 0)) ||
!atomic_read(&ctxt->read_sectors));

View File

@ -23,6 +23,7 @@ struct data_update {
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
struct bch_move_stats *stats;
struct bch_write_op op;
};

View File

@ -517,7 +517,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
prt_printf(out, "%px btree=%s l=%u ",
b,
bch2_btree_ids[b->c.btree_id],
bch2_btree_id_str(b->c.btree_id),
b->c.level);
prt_newline(out);
@ -919,18 +919,18 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
debugfs_create_file(bch2_btree_ids[bd->id],
debugfs_create_file(bch2_btree_id_str(bd->id),
0400, c->btree_debug_dir, bd,
&btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
bch2_btree_ids[bd->id]);
bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
bch2_btree_ids[bd->id]);
bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&bfloat_failed_debug_ops);

View File

@ -97,61 +97,51 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.is_visible = dirent_is_visible,
};
int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
int ret = 0;
if (!d_name.len) {
prt_printf(err, "empty name");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(!d_name.len, c, err,
dirent_empty_name,
"empty name");
if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) {
prt_printf(err, "value too big (%zu > %u)",
bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
dirent_val_too_big,
"value too big (%zu > %u)",
bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
return -BCH_ERR_invalid_bkey;
}
/*
* Check new keys don't exceed the max length
* (older keys may be larger.)
*/
if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) {
prt_printf(err, "dirent name too big (%u > %u)",
bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
dirent_name_too_long,
"dirent name too big (%u > %u)",
d_name.len, BCH_NAME_MAX);
return -BCH_ERR_invalid_bkey;
}
if (d_name.len != strnlen(d_name.name, d_name.len)) {
prt_printf(err, "dirent has stray data after name's NUL");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
dirent_name_embedded_nul,
"dirent has stray data after name's NUL");
if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) {
prt_printf(err, "invalid name");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
(d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
dirent_name_dot_or_dotdot,
"invalid name");
if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) {
prt_printf(err, "invalid name");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
dirent_name_has_slash,
"name with /");
if (memchr(d_name.name, '/', d_name.len)) {
prt_printf(err, "invalid name");
return -BCH_ERR_invalid_bkey;
}
if (d.v->d_type != DT_SUBVOL &&
le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
prt_printf(err, "dirent points to own directory");
return -BCH_ERR_invalid_bkey;
}
return 0;
bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
dirent_to_itself,
"dirent points to own directory");
fsck_err:
return ret;
}
void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,

View File

@ -7,7 +7,7 @@
enum bkey_invalid_flags;
extern const struct bch_hash_desc bch2_dirent_hash_desc;
int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

View File

@ -175,6 +175,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
memcpy(dst->label, src->label, sizeof(dst->label));
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
@ -382,7 +383,57 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
/*
 * Print the dotted label path of disk group @v ("root.child.leaf") to @out,
 * using the filesystem's in-memory disk group table (c->disk_groups).
 *
 * The table is read under rcu_read_lock(), and out->atomic is raised for the
 * duration of the function.  If the table is missing, an index on the way up
 * is out of range, a group on the way up is marked deleted, or the chain of
 * parents is longer than the local path buffer, "invalid label N" is printed
 * instead.
 */
void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct bch_disk_groups_cpu *groups;
struct bch_disk_group_cpu *g;
unsigned nr = 0;
/* group indices from @v up to its topmost ancestor, leaf first */
u16 path[32];
out->atomic++;
rcu_read_lock();
groups = rcu_dereference(c->disk_groups);
if (!groups)
goto invalid;
/* Walk from @v towards the root, recording each group index. */
while (1) {
if (nr == ARRAY_SIZE(path))
goto invalid;
if (v >= groups->nr)
goto invalid;
g = groups->entries + v;
if (g->deleted)
goto invalid;
path[nr++] = v;
/* parent is stored as index + 1; 0 means there is no parent */
if (!g->parent)
break;
v = g->parent - 1;
}
/* Emit the labels in reverse (root first), separated by '.'. */
while (nr) {
v = path[--nr];
g = groups->entries + v;
/* bounded by the field size, so the label need not be NUL-terminated */
prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
if (nr)
prt_printf(out, ".");
}
out:
rcu_read_unlock();
out->atomic--;
return;
invalid:
/*
 * v is whichever index was being examined when the walk gave up, which may
 * be an ancestor of the group originally asked for.
 */
prt_printf(out, "invalid label %u", v);
goto out;
}
void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_field_get(sb, disk_groups);
@ -493,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
return -EINVAL;
}
void bch2_opt_target_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_sb *sb,
u64 v)
void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
{
struct target t = target_decode(v);
@ -504,8 +552,7 @@ void bch2_opt_target_to_text(struct printbuf *out,
case TARGET_NULL:
prt_printf(out, "none");
break;
case TARGET_DEV:
if (c) {
case TARGET_DEV: {
struct bch_dev *ca;
rcu_read_lock();
@ -523,7 +570,25 @@ void bch2_opt_target_to_text(struct printbuf *out,
}
rcu_read_unlock();
} else {
break;
}
case TARGET_GROUP:
bch2_disk_path_to_text(out, c, t.group);
break;
default:
BUG();
}
}
void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
{
struct target t = target_decode(v);
switch (t.type) {
case TARGET_NULL:
prt_printf(out, "none");
break;
case TARGET_DEV: {
struct bch_member m = bch2_sb_member_get(sb, t.dev);
if (bch2_dev_exists(sb, t.dev)) {
@ -533,18 +598,23 @@ void bch2_opt_target_to_text(struct printbuf *out,
} else {
prt_printf(out, "Bad device %u", t.dev);
}
}
break;
case TARGET_GROUP:
if (c) {
mutex_lock(&c->sb_lock);
bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
mutex_unlock(&c->sb_lock);
} else {
bch2_disk_path_to_text(out, sb, t.group);
}
case TARGET_GROUP:
bch2_disk_path_to_text_sb(out, sb, t.group);
break;
default:
BUG();
}
}
/*
 * Print target @v, using whichever source of device/group information the
 * caller has: a running filesystem @c if one is given, otherwise the raw
 * superblock @sb.
 *
 * @v is passed on to helpers that take an unsigned, so it is narrowed at the
 * call.
 */
void bch2_opt_target_to_text(struct printbuf *out,
			     struct bch_fs *c,
			     struct bch_sb *sb,
			     u64 v)
{
	if (!c) {
		/* No filesystem available: decode against the superblock */
		bch2_target_to_text_sb(out, sb, v);
		return;
	}

	bch2_target_to_text(out, c, v);
}

View File

@ -2,6 +2,8 @@
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
#include "disk_groups_types.h"
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
/* Exported for userspace bcachefs-tools: */
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);

View File

@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
#define _BCACHEFS_DISK_GROUPS_TYPES_H
/*
 * One disk group entry as held in struct bch_disk_groups_cpu.
 * NOTE(review): the _cpu suffix suggests this is the in-memory form of the
 * superblock disk_groups section - confirm against the code that builds it.
 */
struct bch_disk_group_cpu {
/* Set for unused slots; bch2_disk_path_to_text() reports such entries as invalid */
bool deleted;
/* Index of the parent entry plus one; 0 means this entry has no parent */
u16 parent;
/* Printed with an explicit sizeof() length bound, so it need not be NUL terminated */
u8 label[BCH_SB_LABEL_SIZE];
/* NOTE(review): presumably the member devices belonging to this group - confirm */
struct bch_devs_mask devs;
};
/*
 * Table of disk groups. Readers obtain it with rcu_dereference() of
 * c->disk_groups while holding rcu_read_lock(), and must cope with it being
 * NULL.
 */
struct bch_disk_groups_cpu {
/* NOTE(review): presumably used to free the table after a grace period - confirm */
struct rcu_head rcu;
/* Number of elements in entries[]; indices >= nr are invalid */
unsigned nr;
struct bch_disk_group_cpu entries[] __counted_by(nr);
};
#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */

View File

@ -105,29 +105,26 @@ struct ec_bio {
/* Stripes btree keys: */
int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
int ret = 0;
if (bkey_eq(k.k->p, POS_MIN)) {
prt_printf(err, "stripe at POS_MIN");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
stripe_pos_bad,
"stripe at bad pos");
if (k.k->p.inode) {
prt_printf(err, "nonzero inode field");
return -BCH_ERR_invalid_bkey;
}
if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
prt_printf(err, "incorrect value size (%zu < %u)",
bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
stripe_val_size_bad,
"incorrect value size (%zu < %u)",
bkey_val_u64s(k.k), stripe_val_u64s(s));
return -BCH_ERR_invalid_bkey;
}
return bch2_bkey_ptrs_invalid(c, k, flags, err);
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
}
void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@ -153,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
if (i < nr_data)
prt_printf(out, "#%u", stripe_blockcount_get(s, i));
prt_printf(out, " gen %u", ptr->gen);
if (ptr_stale(ca, ptr))
prt_printf(out, " stale");
}
@ -306,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
struct printbuf buf2 = PRINTBUF;
struct printbuf err = PRINTBUF;
struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
want.hi, want.lo,
got.hi, got.lo,
bch2_csum_types[v->csum_type]);
prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i);
bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
bch_err_ratelimited(ca, "%s", err.buf);
printbuf_exit(&err);
bch_err_ratelimited(c,
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
want.lo, got.lo, buf2.buf);
printbuf_exit(&buf2);
clear_bit(i, buf->valid);
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
break;
}
@ -373,7 +376,11 @@ static void ec_block_endio(struct bio *bio)
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca,
bio_data_dir(bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"erasure coding %s error: %s",
bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
@ -474,14 +481,10 @@ static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
return ret;
}
static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
{
return bch2_trans_run(c, get_stripe_key_trans(trans, idx, stripe));
}
/* recovery read path: */
int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
{
struct bch_fs *c = trans->c;
struct ec_stripe_buf *buf;
struct closure cl;
struct bch_stripe *v;
@ -496,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
if (!buf)
return -BCH_ERR_ENOMEM_ec_read_extent;
ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
bch_err_ratelimited(c,
"error doing reconstruct read: error %i looking up stripe", ret);

View File

@ -8,7 +8,7 @@
enum bkey_invalid_flags;
int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@ -199,7 +199,7 @@ struct ec_stripe_head {
struct ec_stripe_new *s;
};
int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);

View File

@ -3,6 +3,8 @@
#define _BCACHEFS_ERRCODE_H
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
x(ENOMEM, ENOMEM_stripe_buf) \
x(ENOMEM, ENOMEM_replicas_table) \
x(ENOMEM, ENOMEM_cpu_replicas) \
@ -213,6 +215,8 @@
x(BCH_ERR_invalid_sb, invalid_sb_crypt) \
x(BCH_ERR_invalid_sb, invalid_sb_clean) \
x(BCH_ERR_invalid_sb, invalid_sb_quota) \
x(BCH_ERR_invalid_sb, invalid_sb_errors) \
x(BCH_ERR_invalid_sb, invalid_sb_opt_compression) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \

View File

@ -56,8 +56,9 @@ void bch2_io_error_work(struct work_struct *work)
up_write(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca)
/*
 * Account one I/O error of the given @type against device @ca by atomically
 * incrementing the matching per-device counter. No message is logged here.
 */
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
atomic64_inc(&ca->errors[type]);
/* Queuing of the device's io_error_work is currently disabled: */
//queue_work(system_long_wq, &ca->io_error_work);
}
@ -116,31 +117,34 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
return NULL;
list_for_each_entry(s, &c->fsck_errors, list)
list_for_each_entry(s, &c->fsck_error_msgs, list)
if (s->fmt == fmt) {
/*
* move it to the head of the list: repeated fsck errors
* are common
*/
list_move(&s->list, &c->fsck_errors);
list_move(&s->list, &c->fsck_error_msgs);
return s;
}
s = kzalloc(sizeof(*s), GFP_NOFS);
if (!s) {
if (!c->fsck_alloc_err)
if (!c->fsck_alloc_msgs_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
c->fsck_alloc_err = true;
c->fsck_alloc_msgs_err = true;
return NULL;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
list_add(&s->list, &c->fsck_errors);
list_add(&s->list, &c->fsck_error_msgs);
return s;
}
int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
int bch2_fsck_err(struct bch_fs *c,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
const char *fmt, ...)
{
struct fsck_err_state *s = NULL;
va_list args;
@ -148,11 +152,13 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
bch2_sb_error_count(c, err);
va_start(args, fmt);
prt_vprintf(out, fmt, args);
va_end(args);
mutex_lock(&c->fsck_error_lock);
mutex_lock(&c->fsck_error_msgs_lock);
s = fsck_err_get(c, fmt);
if (s) {
/*
@ -162,7 +168,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
*/
if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
ret = s->ret;
mutex_unlock(&c->fsck_error_lock);
mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
return ret;
}
@ -257,7 +263,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
if (s)
s->ret = ret;
mutex_unlock(&c->fsck_error_lock);
mutex_unlock(&c->fsck_error_msgs_lock);
printbuf_exit(&buf);
@ -278,9 +284,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
{
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_lock);
mutex_lock(&c->fsck_error_msgs_lock);
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
if (s->ratelimited && s->last_msg)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
@ -289,5 +295,5 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
kfree(s);
}
mutex_unlock(&c->fsck_error_lock);
mutex_unlock(&c->fsck_error_msgs_lock);
}

View File

@ -4,6 +4,7 @@
#include <linux/list.h>
#include <linux/printk.h>
#include "sb-errors.h"
struct bch_dev;
struct bch_fs;
@ -101,18 +102,26 @@ struct fsck_err_state {
char *last_msg;
};
#define FSCK_CAN_FIX (1 << 0)
#define FSCK_CAN_IGNORE (1 << 1)
#define FSCK_NEED_FSCK (1 << 2)
#define FSCK_NO_RATELIMIT (1 << 3)
/*
 * Flags for the second argument of bch2_fsck_err(). Each value is a distinct
 * bit so that they can be ORed together; the wrapper macros in this header
 * use these combinations:
 *
 *   fsck_err(), fsck_err_on():                 FSCK_CAN_FIX|FSCK_CAN_IGNORE
 *   mustfix_fsck_err(), mustfix_fsck_err_on(): FSCK_CAN_FIX
 *   need_fsck_err(), need_fsck_err_on():       FSCK_CAN_IGNORE|FSCK_NEED_FSCK
 */
enum bch_fsck_flags {
FSCK_CAN_FIX = 1 << 0,
FSCK_CAN_IGNORE = 1 << 1,
FSCK_NEED_FSCK = 1 << 2,
FSCK_NO_RATELIMIT = 1 << 3,
};
__printf(3, 4) __cold
int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
__printf(4, 5) __cold
int bch2_fsck_err(struct bch_fs *,
enum bch_fsck_flags,
enum bch_sb_error_id,
const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);
#define __fsck_err(c, _flags, msg, ...) \
#define __fsck_err(c, _flags, _err_type, ...) \
({ \
int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \
int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type, \
__VA_ARGS__); \
\
if (_ret != -BCH_ERR_fsck_fix && \
_ret != -BCH_ERR_fsck_ignore) { \
@ -127,26 +136,53 @@ void bch2_flush_fsck_errs(struct bch_fs *);
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
#define __fsck_err_on(cond, c, _flags, ...) \
(unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
#define __fsck_err_on(cond, c, _flags, _err_type, ...) \
(unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
#define need_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
#define need_fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
#define need_fsck_err(c, ...) \
__fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
#define need_fsck_err(c, _err_type, ...) \
__fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
#define mustfix_fsck_err(c, ...) \
__fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
#define mustfix_fsck_err(c, _err_type, ...) \
__fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
#define mustfix_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
#define mustfix_fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
#define fsck_err(c, ...) \
__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
#define fsck_err(c, _err_type, ...) \
__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
#define fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
#define fsck_err_on(cond, c, _err_type, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
/*
 * Append a printf-style formatted message to @err_msg.
 *
 * @c and @err_type are accepted but not used by this body; in particular no
 * error counter is updated here (the bkey_fsck_err() macro does its own
 * counting).
 */
static inline void bch2_bkey_fsck_err(struct bch_fs *c,
struct printbuf *err_msg,
enum bch_sb_error_id err_type,
const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
prt_vprintf(err_msg, fmt, args);
va_end(args);
}
/*
 * Report an invalid key from within a key validation function:
 *  - formats the message (remaining arguments) into @_err_msg,
 *  - counts the error BCH_FSCK_ERR_<_err_type> via bch2_sb_error_count(),
 *  - sets the caller's local variable 'ret' to -BCH_ERR_invalid_bkey,
 *  - jumps to the caller's 'fsck_err' label.
 *
 * The enclosing function must therefore declare 'ret' and provide a
 * 'fsck_err:' label.
 */
#define bkey_fsck_err(c, _err_msg, _err_type, ...) \
do { \
prt_printf(_err_msg, __VA_ARGS__); \
bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type); \
ret = -BCH_ERR_invalid_bkey; \
goto fsck_err; \
} while (0)
/*
 * Conditional form: when @cond is true, behaves as bkey_fsck_err() invoked
 * with the remaining arguments (c, err_msg, err_type, fmt, ...); otherwise
 * does nothing. The condition is hinted as unlikely.
 */
#define bkey_fsck_err_on(cond, ...) \
do { \
if (unlikely(cond)) \
bkey_fsck_err(__VA_ARGS__); \
} while (0)
/*
* Fatal errors: these don't indicate a bug, but we can't continue running in RW
@ -179,26 +215,26 @@ do { \
void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *);
void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
#define bch2_dev_io_err_on(cond, ca, ...) \
#define bch2_dev_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_dev_ratelimited(ca, __VA_ARGS__); \
bch2_io_error(ca); \
bch2_io_error(ca, _type); \
} \
_ret; \
})
#define bch2_dev_inum_io_err_on(cond, ca, ...) \
#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) { \
bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
bch2_io_error(ca); \
bch2_io_error(ca, _type); \
} \
_ret; \
})

View File

@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "checksum.h"
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
@ -162,17 +163,19 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
prt_printf(err, "value too big (%zu > %u)",
bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return bch2_bkey_ptrs_invalid(c, k, flags, err);
bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
btree_ptr_val_too_big,
"value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
}
void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@ -181,17 +184,20 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
bch2_bkey_ptrs_to_text(out, c, k);
}
int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
prt_printf(err, "value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return bch2_bkey_ptrs_invalid(c, k, flags, err);
bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
btree_ptr_v2_val_too_big,
"value too big (%zu > %zu)",
bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
fsck_err:
return ret;
}
void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
@ -372,19 +378,18 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
/* KEY_TYPE_reservation: */
int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
int ret = 0;
if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
prt_printf(err, "invalid nr_replicas (%u)",
r.v->nr_replicas);
return -BCH_ERR_invalid_bkey;
}
return 0;
bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
reservation_key_nr_replicas_invalid,
"invalid nr_replicas (%u)", r.v->nr_replicas);
fsck_err:
return ret;
}
void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
@ -757,18 +762,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
return i;
}
static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
{
union bch_extent_entry *next = extent_entry_next(entry);
/* stripes have ptrs, but their layout doesn't work with this code */
BUG_ON(k.k->type == KEY_TYPE_stripe);
memmove_u64s_down(entry, next,
(u64 *) bkey_val_end(k) - (u64 *) next);
k.k->u64s -= (u64 *) next - (u64 *) entry;
}
/*
* Returns pointer to the next entry after the one being dropped:
*/
@ -992,10 +985,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct bch_extent_crc_unpacked crc;
const struct bch_extent_ptr *ptr;
const struct bch_extent_stripe_ptr *ec;
struct bch_dev *ca;
bool first = true;
if (c)
@ -1006,9 +995,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " ");
switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
ptr = entry_to_ptr(entry);
ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
case BCH_EXTENT_ENTRY_ptr: {
const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
? bch_dev_bkey_exists(c, ptr->dev)
: NULL;
@ -1030,10 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " stale");
}
break;
}
case BCH_EXTENT_ENTRY_crc32:
case BCH_EXTENT_ENTRY_crc64:
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
case BCH_EXTENT_ENTRY_crc128: {
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
crc.compressed_size,
@ -1042,12 +1033,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_csum_types[crc.csum_type],
bch2_compression_types[crc.compression_type]);
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
ec = &entry->stripe_ptr;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
prt_printf(out, "ec: idx %llu block %u",
(u64) ec->idx, ec->block);
break;
}
case BCH_EXTENT_ENTRY_rebalance: {
const struct bch_extent_rebalance *r = &entry->rebalance;
prt_str(out, "rebalance: target ");
if (c)
bch2_target_to_text(out, c, r->target);
else
prt_printf(out, "%u", r->target);
prt_str(out, " compression ");
bch2_compression_opt_to_text(out, r->compression);
break;
}
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@ -1057,7 +1062,7 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
}
}
static int extent_ptr_invalid(const struct bch_fs *c,
static int extent_ptr_invalid(struct bch_fs *c,
struct bkey_s_c k,
enum bkey_invalid_flags flags,
const struct bch_extent_ptr *ptr,
@ -1070,6 +1075,7 @@ static int extent_ptr_invalid(const struct bch_fs *c,
u64 bucket;
u32 bucket_offset;
struct bch_dev *ca;
int ret = 0;
if (!bch2_dev_exists2(c, ptr->dev)) {
/*
@ -1080,41 +1086,33 @@ static int extent_ptr_invalid(const struct bch_fs *c,
if (flags & BKEY_INVALID_WRITE)
return 0;
prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
return -BCH_ERR_invalid_bkey;
bkey_fsck_err(c, err, ptr_to_invalid_device,
"pointer to invalid device (%u)", ptr->dev);
}
ca = bch_dev_bkey_exists(c, ptr->dev);
bkey_for_each_ptr(ptrs, ptr2)
if (ptr != ptr2 && ptr->dev == ptr2->dev) {
prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
ptr_to_duplicate_device,
"multiple pointers to same device (%u)", ptr->dev);
bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
if (bucket >= ca->mi.nbuckets) {
prt_printf(err, "pointer past last bucket (%llu > %llu)",
bucket, ca->mi.nbuckets);
return -BCH_ERR_invalid_bkey;
}
if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
prt_printf(err, "pointer before first bucket (%llu < %u)",
bucket, ca->mi.first_bucket);
return -BCH_ERR_invalid_bkey;
}
if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
ptr_after_last_bucket,
"pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
ptr_before_first_bucket,
"pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
ptr_spans_multiple_buckets,
"pointer spans multiple buckets (%u + %u > %u)",
bucket_offset, size_ondisk, ca->mi.bucket_size);
return -BCH_ERR_invalid_bkey;
fsck_err:
return ret;
}
return 0;
}
int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@ -1124,24 +1122,22 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
unsigned size_ondisk = k.k->size;
unsigned nonce = UINT_MAX;
unsigned nr_ptrs = 0;
bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
int ret;
bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
int ret = 0;
if (bkey_is_btree_ptr(k.k))
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
prt_printf(err, "invalid extent entry type (got %u, max %u)",
bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
extent_ptrs_invalid_entry,
"invalid extent entry type (got %u, max %u)",
__extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
return -BCH_ERR_invalid_bkey;
}
if (bkey_is_btree_ptr(k.k) &&
!extent_entry_is_ptr(entry)) {
prt_printf(err, "has non ptr field");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
!extent_entry_is_ptr(entry), c, err,
btree_ptr_has_non_ptr,
"has non ptr field");
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
@ -1150,22 +1146,15 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
if (ret)
return ret;
if (nr_ptrs && unwritten != entry->ptr.unwritten) {
prt_printf(err, "extent with unwritten and written ptrs");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
ptr_cached_and_erasure_coded,
"cached, erasure coded ptr");
if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
prt_printf(err, "has unwritten ptrs");
return -BCH_ERR_invalid_bkey;
}
if (!entry->ptr.unwritten)
have_written = true;
else
have_unwritten = true;
if (entry->ptr.cached && have_ec) {
prt_printf(err, "cached, erasure coded ptr");
return -BCH_ERR_invalid_bkey;
}
unwritten = entry->ptr.unwritten;
have_ec = false;
crc_since_last_ptr = false;
nr_ptrs++;
@ -1175,72 +1164,77 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
case BCH_EXTENT_ENTRY_crc128:
crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
if (crc.offset + crc.live_size >
crc.uncompressed_size) {
prt_printf(err, "checksum offset + key size > uncompressed size");
return -BCH_ERR_invalid_bkey;
}
size_ondisk = crc.compressed_size;
if (!bch2_checksum_type_valid(c, crc.csum_type)) {
prt_printf(err, "invalid checksum type");
return -BCH_ERR_invalid_bkey;
}
if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
prt_printf(err, "invalid compression type");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
ptr_crc_uncompressed_size_too_small,
"checksum offset + key size > uncompressed size");
bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
ptr_crc_csum_type_unknown,
"invalid checksum type");
bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
ptr_crc_compression_type_unknown,
"invalid compression type");
if (bch2_csum_type_is_encryption(crc.csum_type)) {
if (nonce == UINT_MAX)
nonce = crc.offset + crc.nonce;
else if (nonce != crc.offset + crc.nonce) {
prt_printf(err, "incorrect nonce");
return -BCH_ERR_invalid_bkey;
}
else if (nonce != crc.offset + crc.nonce)
bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
"incorrect nonce");
}
if (crc_since_last_ptr) {
prt_printf(err, "redundant crc entry");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(crc_since_last_ptr, c, err,
ptr_crc_redundant,
"redundant crc entry");
crc_since_last_ptr = true;
bkey_fsck_err_on(crc_is_encoded(crc) &&
(crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
(flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
ptr_crc_uncompressed_size_too_big,
"too large encoded extent");
size_ondisk = crc.compressed_size;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
if (have_ec) {
prt_printf(err, "redundant stripe entry");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(have_ec, c, err,
ptr_stripe_redundant,
"redundant stripe entry");
have_ec = true;
break;
case BCH_EXTENT_ENTRY_rebalance:
case BCH_EXTENT_ENTRY_rebalance: {
const struct bch_extent_rebalance *r = &entry->rebalance;
if (!bch2_compression_opt_valid(r->compression)) {
struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
prt_printf(err, "invalid compression opt %u:%u",
opt.type, opt.level);
return -BCH_ERR_invalid_bkey;
}
break;
}
}
if (!nr_ptrs) {
prt_str(err, "no ptrs");
return -BCH_ERR_invalid_bkey;
}
if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
prt_str(err, "too many ptrs");
return -BCH_ERR_invalid_bkey;
}
if (crc_since_last_ptr) {
prt_printf(err, "redundant crc entry");
return -BCH_ERR_invalid_bkey;
}
if (have_ec) {
prt_printf(err, "redundant stripe entry");
return -BCH_ERR_invalid_bkey;
}
return 0;
bkey_fsck_err_on(!nr_ptrs, c, err,
extent_ptrs_no_ptrs,
"no ptrs");
bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
extent_ptrs_too_many_ptrs,
"too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
bkey_fsck_err_on(have_written && have_unwritten, c, err,
extent_ptrs_written_and_unwritten,
"extent with unwritten and written ptrs");
bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
extent_ptrs_unwritten,
"has unwritten ptrs");
bkey_fsck_err_on(crc_since_last_ptr, c, err,
extent_ptrs_redundant_crc,
"redundant crc entry");
bkey_fsck_err_on(have_ec, c, err,
extent_ptrs_redundant_stripe,
"redundant stripe entry");
fsck_err:
return ret;
}
void bch2_ptr_swab(struct bkey_s k)
@ -1281,6 +1275,125 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
return &entry->rebalance;
return NULL;
}
/*
 * Work out which pointers of @k would have to be rewritten to satisfy the
 * requested @target and @compression options.
 *
 * Returns a bitmask: bit i is set if the i-th pointer (in iteration order)
 * needs rewriting; 0 means nothing to do. A zero @compression or @target
 * disables the corresponding check.
 */
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
unsigned target, unsigned compression)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned rewrite_ptrs = 0;
/* Compression pass: flag non-cached pointers with a different compression type */
if (compression) {
unsigned compression_type = bch2_compression_opt_to_type(compression);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
 * Any pointer marked incompressible cancels the whole compression
 * pass: bits set so far are discarded and only the target pass
 * below can request a rewrite.
 */
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
rewrite_ptrs = 0;
goto incompressible;
}
if (!p.ptr.cached && p.crc.compression_type != compression_type)
rewrite_ptrs |= 1U << i;
i++;
}
}
incompressible:
/* Target pass: only if the target accepts user data at all */
if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
/* Flag non-cached pointers on devices outside the target */
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
rewrite_ptrs |= 1U << i;
i++;
}
}
return rewrite_ptrs;
}
bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
/*
* If it's an indirect extent, we don't delete the rebalance entry when
* done so that we know what options were applied - check if it still
* needs work done:
*/
if (r &&
k.k->type == KEY_TYPE_reflink_v &&
!bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
r = NULL;
return r != NULL;
}
/*
 * Bring the rebalance entry of key @_k in line with the desired @target and
 * @compression options, modifying the key in place.
 *
 * Keys for which bkey_extent_is_direct_data() is false are left untouched.
 * Otherwise:
 *  - an existing entry has its target/compression overwritten (for
 *    KEY_TYPE_reflink_v, nonzero values already in the entry take precedence
 *    over the arguments);
 *  - if some pointers need rewriting and there is no entry, one is appended
 *    at the end of the value;
 *  - if nothing needs rewriting and an entry exists, it is dropped - except
 *    on KEY_TYPE_reflink_v keys, which keep it.
 *
 * Always returns 0.
 *
 * NOTE(review): appending grows k.k->u64s with no capacity check in this
 * function - presumably callers guarantee room for one more entry; confirm.
 */
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
unsigned target, unsigned compression)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))
return 0;
/* get existing rebalance entry: */
/* (the lookup helper returns a const pointer; the key is writable here, so cast it away) */
r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
if (r) {
if (k.k->type == KEY_TYPE_reflink_v) {
/*
* indirect extents: existing options take precedence,
* so that we don't move extents back and forth if
* they're referenced by different inodes with different
* options:
*/
if (r->target)
target = r->target;
if (r->compression)
compression = r->compression;
}
r->target = target;
r->compression = compression;
}
/* The helper returns a bitmask of pointers; only zero/nonzero matters here */
needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
if (needs_rebalance && !r) {
union bch_extent_entry *new = bkey_val_end(k);
/*
 * The type is filled in before extent_entry_u64s(new) is evaluated below
 * (presumably the entry size is derived from the type field - confirm).
 */
new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance;
new->rebalance.compression = compression;
new->rebalance.target = target;
new->rebalance.unused = 0;
k.k->u64s += extent_entry_u64s(new);
} else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
/*
* For indirect extents, don't delete the rebalance entry when
* we're finished so that we know we specifically moved it or
* compressed it to its current location/compression type
*/
extent_entry_drop(k, (union bch_extent_entry *) r);
}
return 0;
}
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)

View File

@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k,
memcpy_u64s_small(dst, new, extent_entry_u64s(new));
}
static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
{
union bch_extent_entry *next = extent_entry_next(entry);
/* stripes have ptrs, but their layout doesn't work with this code */
BUG_ON(k.k->type == KEY_TYPE_stripe);
memmove_u64s_down(entry, next,
(u64 *) bkey_val_end(k) - (u64 *) next);
k.k->u64s -= (u64 *) next - (u64 *) entry;
}
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
}
static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
{
return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
}
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
@ -383,12 +400,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
/* KEY_TYPE_btree_ptr: */
int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
@ -428,7 +445,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* KEY_TYPE_reservation: */
int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@ -688,11 +705,19 @@ void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_ptr_swab(struct bkey_s);
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
unsigned, unsigned);
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
unsigned, unsigned);
/* Generic extent code: */
enum bch_extent_overlap {
@ -737,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
/*
 * Write the unpacked key @src back over @dst, which lives in btree node @b.
 *
 * Used by extent_sort_fix_overlapping(), insert_fixup_extent() and
 * extent_merge_inline(), which modify packed keys in place: they unpack the
 * key, modify the unpacked copy, and then call this to store the result.
 * If @dst is stored unpacked, @src is copied straight in; otherwise @src is
 * repacked with the node's format, and failing to pack is a BUG.
 */
static inline void extent_save(struct btree *b, struct bkey_packed *dst,
			       struct bkey *src)
{
	struct bkey_i *unpacked = packed_to_bkey(dst);

	if (unpacked) {
		unpacked->k = *src;
		return;
	}

	BUG_ON(!bch2_bkey_pack_key(dst, src, &b->format));
}
#endif /* _BCACHEFS_EXTENTS_H */

View File

@ -51,7 +51,7 @@ int bch2_create_trans(struct btree_trans *trans,
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
if (flags & BCH_CREATE_TMPFILE)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
new_inode->bi_flags |= BCH_INODE_unlinked;
ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
if (ret)

View File

@ -389,6 +389,21 @@ static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs
return ret;
}
/*
* Determine when a writepage io is full. We have to limit writepage bios to a
* single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
* what the bounce path in bch2_write_extent() can handle. In theory we could
* loosen this restriction for non-bounce I/O, but we don't have that context
* here. Ideally, we can up this limit and make it configurable in the future
* when the bounce path can be enhanced to accommodate larger source bios.
*/
static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
{
struct bio *bio = &io->op.wbio.bio;
return bio_full(bio, len) ||
(bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
}
static void bch2_writepage_io_done(struct bch_write_op *op)
{
struct bch_writepage_io *io =
@ -606,9 +621,7 @@ static int __bch2_writepage(struct folio *folio,
if (w->io &&
(w->io->op.res.nr_replicas != nr_replicas_this_write ||
bio_full(&w->io->op.wbio.bio, sectors << 9) ||
w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
(BIO_MAX_VECS * PAGE_SIZE) ||
bch_io_full(w->io, sectors << 9) ||
bio_end_sector(&w->io->op.wbio.bio) != sector))
bch2_writepage_do_io(w);

View File

@ -113,6 +113,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
} else {
atomic_set(&dio->cl.remaining,
CLOSURE_REMAINING_INITIALIZER + 1);
dio->cl.closure_get_happened = true;
}
dio->req = req;

View File

@ -45,13 +45,13 @@ static int bch2_inode_flags_set(struct btree_trans *trans,
unsigned newflags = s->flags;
unsigned oldflags = bi->bi_flags & s->mask;
if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
if (!S_ISREG(bi->bi_mode) &&
!S_ISDIR(bi->bi_mode) &&
(newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
(newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
return -EINVAL;
if (s->set_projinherit) {

View File

@ -6,28 +6,28 @@
/* bcachefs inode flags -> vfs inode flags: */
static const __maybe_unused unsigned bch_flags_to_vfs[] = {
[__BCH_INODE_SYNC] = S_SYNC,
[__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
[__BCH_INODE_APPEND] = S_APPEND,
[__BCH_INODE_NOATIME] = S_NOATIME,
[__BCH_INODE_sync] = S_SYNC,
[__BCH_INODE_immutable] = S_IMMUTABLE,
[__BCH_INODE_append] = S_APPEND,
[__BCH_INODE_noatime] = S_NOATIME,
};
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const __maybe_unused unsigned bch_flags_to_uflags[] = {
[__BCH_INODE_SYNC] = FS_SYNC_FL,
[__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
[__BCH_INODE_APPEND] = FS_APPEND_FL,
[__BCH_INODE_NODUMP] = FS_NODUMP_FL,
[__BCH_INODE_NOATIME] = FS_NOATIME_FL,
[__BCH_INODE_sync] = FS_SYNC_FL,
[__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
[__BCH_INODE_append] = FS_APPEND_FL,
[__BCH_INODE_nodump] = FS_NODUMP_FL,
[__BCH_INODE_noatime] = FS_NOATIME_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const __maybe_unused unsigned bch_flags_to_xflags[] = {
[__BCH_INODE_SYNC] = FS_XFLAG_SYNC,
[__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
[__BCH_INODE_APPEND] = FS_XFLAG_APPEND,
[__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP,
[__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME,
[__BCH_INODE_sync] = FS_XFLAG_SYNC,
[__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
[__BCH_INODE_append] = FS_XFLAG_APPEND,
[__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
[__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
};

View File

@ -764,15 +764,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
}
if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
stat->attributes |= STATX_ATTR_IMMUTABLE;
stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
if (inode->ei_inode.bi_flags & BCH_INODE_append)
stat->attributes |= STATX_ATTR_APPEND;
stat->attributes_mask |= STATX_ATTR_APPEND;
if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
stat->attributes |= STATX_ATTR_NODUMP;
stat->attributes_mask |= STATX_ATTR_NODUMP;
@ -1213,9 +1213,6 @@ static struct dentry *bch2_get_parent(struct dentry *child)
.inum = inode->ei_inode.bi_dir,
};
if (!parent_inum.inum)
return NULL;
return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
}

View File

@ -2,6 +2,7 @@
#include "bcachefs.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "buckets.h"
#include "darray.h"
@ -444,9 +445,10 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
if (i->equiv == n.equiv) {
bch_err(c, "snapshot deletion did not finish:\n"
" duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
bch2_btree_ids[btree_id],
bch2_btree_id_str(btree_id),
pos.inode, pos.offset,
i->id, n.id, n.equiv);
set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
}
}
@ -719,6 +721,7 @@ static int check_key_has_snapshot(struct btree_trans *trans,
int ret = 0;
if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
bkey_in_missing_snapshot,
"key in missing snapshot: %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
@ -789,6 +792,7 @@ static int hash_check_key(struct btree_trans *trans,
if (fsck_err_on(k.k->type == desc.key_type &&
!desc.cmp_bkey(k, hash_k), c,
hash_table_key_duplicate,
"duplicate hash table keys:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k),
@ -807,8 +811,9 @@ static int hash_check_key(struct btree_trans *trans,
printbuf_exit(&buf);
return ret;
bad_hash:
if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
if (fsck_err(c, hash_table_key_wrong_offset,
"hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
@ -849,22 +854,23 @@ static int check_inode(struct btree_trans *trans,
BUG_ON(bch2_inode_unpack(k, &u));
if (!full &&
!(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
BCH_INODE_I_SECTORS_DIRTY|
BCH_INODE_UNLINKED)))
!(u.bi_flags & (BCH_INODE_i_size_dirty|
BCH_INODE_i_sectors_dirty|
BCH_INODE_unlinked)))
return 0;
if (prev->bi_inum != u.bi_inum)
*prev = u;
if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
inode_d_type(prev) != inode_d_type(&u), c,
inode_d_type(prev) != inode_d_type(&u),
c, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
return -EINVAL;
}
if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) &&
if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
struct bpos new_min_pos;
@ -872,7 +878,7 @@ static int check_inode(struct btree_trans *trans,
if (ret)
goto err;
u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED;
u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
ret = __write_inode(trans, &u, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck updating inode");
@ -884,9 +890,10 @@ static int check_inode(struct btree_trans *trans,
return 0;
}
if (u.bi_flags & BCH_INODE_UNLINKED &&
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
fsck_err(c, inode_unlinked_but_clean,
"filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
bch2_trans_unlock(trans);
bch2_fs_lazy_rw(c);
@ -896,9 +903,10 @@ static int check_inode(struct btree_trans *trans,
return ret;
}
if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
if (u.bi_flags & BCH_INODE_i_size_dirty &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
fsck_err(c, inode_i_size_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_size dirty",
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
@ -922,15 +930,16 @@ static int check_inode(struct btree_trans *trans,
* We truncated without our normal sector accounting hook, just
* make sure we recalculate it:
*/
u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
u.bi_flags |= BCH_INODE_i_sectors_dirty;
u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
u.bi_flags &= ~BCH_INODE_i_size_dirty;
do_update = true;
}
if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
(!c->sb.clean ||
fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
fsck_err(c, inode_i_sectors_dirty_but_clean,
"filesystem marked clean, but inode %llu has i_sectors dirty",
u.bi_inum))) {
s64 sectors;
@ -944,14 +953,14 @@ static int check_inode(struct btree_trans *trans,
}
u.bi_sectors = sectors;
u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
do_update = true;
}
if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
if (u.bi_flags & BCH_INODE_backptr_untrusted) {
u.bi_dir = 0;
u.bi_dir_offset = 0;
u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
u.bi_flags &= ~BCH_INODE_backptr_untrusted;
do_update = true;
}
@ -1056,7 +1065,8 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
return -BCH_ERR_internal_fsck_err;
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
c, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
w->last_pos.inode, i->snapshot,
i->inode.bi_sectors, i->count)) {
@ -1200,7 +1210,8 @@ static int overlapping_extents_found(struct btree_trans *trans,
prt_printf(&buf, "\n overwriting %s extent",
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
if (fsck_err(c, "overlapping extents%s", buf.buf)) {
if (fsck_err(c, extent_overlapping,
"overlapping extents%s", buf.buf)) {
struct btree_iter *old_iter = &iter1;
struct disk_reservation res = { 0 };
@ -1297,6 +1308,28 @@ static int check_overlapping_extents(struct btree_trans *trans,
return ret;
}
static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
bkey_for_each_crc(k.k, ptrs, crc, i)
if (crc_is_encoded(crc) &&
crc.uncompressed_size > encoded_extent_max_sectors) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf);
printbuf_exit(&buf);
}
return 0;
}
static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k,
struct inode_walker *inode,
@ -1333,7 +1366,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
if (k.k->type != KEY_TYPE_whiteout) {
if (fsck_err_on(!i, c,
if (fsck_err_on(!i, c, extent_in_missing_inode,
"extent in missing inode:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
@ -1341,7 +1374,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
if (fsck_err_on(i &&
!S_ISREG(i->inode.bi_mode) &&
!S_ISLNK(i->inode.bi_mode), c,
!S_ISLNK(i->inode.bi_mode),
c, extent_in_non_reg_inode,
"extent in non regular inode mode %o:\n %s",
i->inode.bi_mode,
(printbuf_reset(&buf),
@ -1371,9 +1405,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
continue;
if (k.k->type != KEY_TYPE_whiteout) {
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k), c,
!bkey_extent_is_reservation(k),
c, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
i->inode.bi_inum, i->snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@ -1432,7 +1467,8 @@ int bch2_check_extents(struct bch_fs *c)
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
bch2_disk_reservation_put(c, &res);
check_extent(trans, &iter, k, &w, &s, &extent_ends);
check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
check_extent_overbig(trans, &iter, k);
})) ?:
check_i_sectors(trans, &w);
@ -1446,6 +1482,30 @@ int bch2_check_extents(struct bch_fs *c)
return ret;
}
/*
 * Walk the keys of the reflink btree (BTREE_ID_reflink), starting at POS_MIN,
 * and run check_extent_overbig() on each one.
 *
 * Returns the value produced by for_each_btree_key_commit().
 */
int bch2_check_indirect_extents(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct disk_reservation res = { 0 };
int ret = 0;
/*
 * The statement expression below is evaluated for each key: it first drops
 * whatever is held in @res, then performs the overbig check.
 */
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
POS_MIN,
BTREE_ITER_PREFETCH, k,
&res, NULL,
BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
bch2_disk_reservation_put(c, &res);
check_extent_overbig(trans, &iter, k);
}));
/* Release the reservation once more after the walk, then the transaction. */
bch2_disk_reservation_put(c, &res);
bch2_trans_put(trans);
/* NOTE(review): presumably logs only when ret is non-zero — confirm against bch_err_fn() */
bch_err_fn(c, ret);
return ret;
}
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
@ -1470,7 +1530,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
continue;
}
if (fsck_err_on(i->inode.bi_nlink != i->count, c,
if (fsck_err_on(i->inode.bi_nlink != i->count,
c, inode_dir_wrong_nlink,
"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
i->inode.bi_nlink = i->count;
@ -1514,27 +1575,28 @@ static int check_dirent_target(struct btree_trans *trans,
backpointer_exists = ret;
ret = 0;
if (fsck_err_on(S_ISDIR(target->bi_mode) &&
backpointer_exists, c,
if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
c, inode_dir_multiple_links,
"directory %llu with multiple links",
target->bi_inum)) {
ret = __remove_dirent(trans, d.k->p);
goto out;
}
if (fsck_err_on(backpointer_exists &&
!target->bi_nlink, c,
if (fsck_err_on(backpointer_exists && !target->bi_nlink,
c, inode_multiple_links_but_nlink_0,
"inode %llu type %s has multiple links but i_nlink 0",
target->bi_inum, bch2_d_types[d.v->d_type])) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_UNLINKED;
target->bi_flags &= ~BCH_INODE_unlinked;
ret = __write_inode(trans, target, target_snapshot);
if (ret)
goto err;
}
if (fsck_err_on(!backpointer_exists, c,
if (fsck_err_on(!backpointer_exists,
c, inode_wrong_backpointer,
"inode %llu:%u has wrong backpointer:\n"
"got %llu:%llu\n"
"should be %llu:%llu",
@ -1552,7 +1614,8 @@ static int check_dirent_target(struct btree_trans *trans,
}
}
if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
if (fsck_err_on(d.v->d_type != inode_d_type(target),
c, dirent_d_type_wrong,
"incorrect d_type: got %s, should be %s:\n%s",
bch2_d_type_str(d.v->d_type),
bch2_d_type_str(inode_d_type(target)),
@ -1576,7 +1639,8 @@ static int check_dirent_target(struct btree_trans *trans,
if (d.v->d_type == DT_SUBVOL &&
target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
(c->sb.version < bcachefs_metadata_version_subvol_dirent ||
fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
fsck_err(c, dirent_d_parent_subvol_wrong,
"dirent has wrong d_parent_subvol field: got %u, should be %u",
le32_to_cpu(d.v->d_parent_subvol),
target->bi_parent_subvol))) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
@ -1648,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
dir->first_this_inode = false;
if (fsck_err_on(!i, c,
if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
"dirent in nonexisting directory:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@ -1660,7 +1724,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (!i)
goto out;
if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
c, dirent_in_non_dir_inode,
"dirent in non directory inode type %s:\n%s",
bch2_d_type_str(inode_d_type(&i->inode)),
(printbuf_reset(&buf),
@ -1694,7 +1759,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (fsck_err_on(ret, c,
if (fsck_err_on(ret, c, dirent_to_missing_subvol,
"dirent points to missing subvolume %u",
le32_to_cpu(d.v->d_child_subvol))) {
ret = __remove_dirent(trans, d.k->p);
@ -1706,7 +1771,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (fsck_err_on(ret, c,
if (fsck_err_on(ret, c, subvol_to_missing_root,
"subvolume %u points to missing subvolume root %llu",
target_subvol,
target_inum)) {
@ -1715,7 +1780,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
c, subvol_root_wrong_bi_subvol,
"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
target_inum,
subvol_root.bi_subvol, target_subvol)) {
@ -1734,7 +1800,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
if (fsck_err_on(!target->inodes.nr, c,
if (fsck_err_on(!target->inodes.nr,
c, dirent_to_missing_inode,
"dirent points to missing inode: (equiv %u)\n%s",
equiv.snapshot,
(printbuf_reset(&buf),
@ -1820,7 +1887,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
inode->first_this_inode = false;
if (fsck_err_on(!i, c,
if (fsck_err_on(!i, c, xattr_in_missing_inode,
"xattr for missing inode %llu",
k.k->p.inode))
return bch2_btree_delete_at(trans, iter, 0);
@ -1869,7 +1936,8 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
"root subvol missing")) {
struct bkey_i_subvolume root_subvol;
snapshot = U32_MAX;
@ -1895,8 +1963,10 @@ static int check_root_trans(struct btree_trans *trans)
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
if (mustfix_fsck_err_on(ret, c, root_dir_missing,
"root directory missing") ||
mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
c, root_inode_not_dir,
"root inode not a directory")) {
bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
0, NULL);
@ -2000,7 +2070,8 @@ static int check_path(struct btree_trans *trans,
}
if (bch2_err_matches(ret, ENOENT)) {
if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
if (fsck_err(c, inode_unreachable,
"unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
inode->bi_inum, snapshot,
bch2_d_type_str(inode_d_type(inode)),
inode->bi_nlink,
@ -2040,7 +2111,8 @@ static int check_path(struct btree_trans *trans,
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
if (!fsck_err(c, "directory structure loop"))
if (!fsck_err(c, dir_loop,
"directory structure loop"))
return 0;
ret = commit_do(trans, NULL, NULL,
@ -2088,7 +2160,7 @@ int bch2_check_directory_structure(struct bch_fs *c)
break;
}
if (u.bi_flags & BCH_INODE_UNLINKED)
if (u.bi_flags & BCH_INODE_unlinked)
continue;
ret = check_path(trans, &path, &u, iter.pos.snapshot);
@ -2300,7 +2372,8 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
link = &links->d[++*idx];
}
if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
c, inode_wrong_nlink,
"inode %llu type %s has wrong i_nlink (%u, should be %u)",
u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
bch2_inode_nlink_get(&u), link->count)) {

View File

@ -4,6 +4,7 @@
int bch2_check_inodes(struct bch_fs *);
int bch2_check_extents(struct bch_fs *);
int bch2_check_indirect_extents(struct bch_fs *);
int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);

View File

@ -6,6 +6,7 @@
#include "bkey_methods.h"
#include "btree_update.h"
#include "buckets.h"
#include "compress.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@ -19,13 +20,18 @@
#include <asm/unaligned.h>
const char * const bch2_inode_opts[] = {
#define x(name, ...) #name,
const char * const bch2_inode_opts[] = {
BCH_INODE_OPTS()
#undef x
NULL,
};
static const char * const bch2_inode_flag_strs[] = {
BCH_INODE_FLAGS()
NULL
};
#undef x
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static int inode_decode_field(const u8 *in, const u8 *end,
@ -361,9 +367,10 @@ int bch2_inode_peek(struct btree_trans *trans,
return ret;
}
int bch2_inode_write(struct btree_trans *trans,
int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode)
struct bch_inode_unpacked *inode,
enum btree_update_flags flags)
{
struct bkey_inode_buf *inode_p;
@ -373,7 +380,7 @@ int bch2_inode_write(struct btree_trans *trans,
bch2_inode_pack_inlined(inode_p, inode);
inode_p->inode.k.p.snapshot = iter->snapshot;
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
}
struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
@ -397,117 +404,121 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
return &inode_p->inode.k_i;
}
static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
{
struct bch_inode_unpacked unpacked;
int ret = 0;
if (k.k->p.inode) {
prt_printf(err, "nonzero k.p.inode");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(k.k->p.inode, c, err,
inode_pos_inode_nonzero,
"nonzero k.p.inode");
if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
prt_printf(err, "fs inode in blockdev range");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
inode_pos_blockdev_range,
"fs inode in blockdev range");
if (bch2_inode_unpack(k, &unpacked)) {
prt_printf(err, "invalid variable length fields");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
inode_unpack_error,
"invalid variable length fields");
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) {
prt_printf(err, "invalid data checksum type (%u >= %u",
bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
inode_checksum_type_invalid,
"invalid data checksum type (%u >= %u",
unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
return -BCH_ERR_invalid_bkey;
bkey_fsck_err_on(unpacked.bi_compression &&
!bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
inode_compression_type_invalid,
"invalid compression opt %u", unpacked.bi_compression - 1);
bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
unpacked.bi_nlink != 0, c, err,
inode_unlinked_but_nlink_nonzero,
"flagged as unlinked but bi_nlink != 0");
bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
inode_subvol_root_but_not_dir,
"subvolume root but not a directory");
fsck_err:
return ret;
}
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
prt_printf(err, "invalid data checksum type (%u >= %u)",
unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
return -BCH_ERR_invalid_bkey;
}
if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
unpacked.bi_nlink != 0) {
prt_printf(err, "flagged as unlinked but bi_nlink != 0");
return -BCH_ERR_invalid_bkey;
}
if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
prt_printf(err, "subvolume root but not a directory");
return -BCH_ERR_invalid_bkey;
}
return 0;
}
int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
int ret = 0;
if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
prt_printf(err, "invalid str hash type (%llu >= %u)",
bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
return -BCH_ERR_invalid_bkey;
ret = __bch2_inode_invalid(c, k, err);
fsck_err:
return ret;
}
return __bch2_inode_invalid(k, err);
}
int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
int ret = 0;
if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
prt_printf(err, "invalid str hash type (%llu >= %u)",
bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
return -BCH_ERR_invalid_bkey;
ret = __bch2_inode_invalid(c, k, err);
fsck_err:
return ret;
}
return __bch2_inode_invalid(k, err);
}
int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
int ret = 0;
if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
inode_v3_fields_start_bad,
"invalid fields_start (got %llu, min %u max %zu)",
INODEv3_FIELDS_START(inode.v),
INODEv3_FIELDS_START_INITIAL,
bkey_val_u64s(inode.k));
return -BCH_ERR_invalid_bkey;
}
if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
prt_printf(err, "invalid str hash type (%llu >= %u)",
bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
return -BCH_ERR_invalid_bkey;
}
return __bch2_inode_invalid(k, err);
ret = __bch2_inode_invalid(c, k, err);
fsck_err:
return ret;
}
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
inode->bi_mode, inode->bi_flags,
prt_printf(out, "mode=%o ", inode->bi_mode);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
prt_printf(out, " (%x)", inode->bi_flags);
prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
inode->bi_journal_seq,
inode->bi_size,
inode->bi_sectors,
inode->bi_version);
#define x(_name, _bits) \
prt_printf(out, " "#_name " %llu", (u64) inode->_name);
prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
BCH_INODE_FIELDS_v3()
#undef x
}
@ -546,7 +557,7 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
{
return bkey_inode_flags(k) & BCH_INODE_UNLINKED;
return bkey_inode_flags(k) & BCH_INODE_unlinked;
}
int bch2_trans_mark_inode(struct btree_trans *trans,
@ -610,16 +621,17 @@ int bch2_mark_inode(struct btree_trans *trans,
return 0;
}
int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (k.k->p.inode) {
prt_printf(err, "nonzero k.p.inode");
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return 0;
bkey_fsck_err_on(k.k->p.inode, c, err,
inode_pos_inode_nonzero,
"nonzero k.p.inode");
fsck_err:
return ret;
}
void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
@ -926,8 +938,8 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
if (bi->bi_flags & BCH_INODE_UNLINKED)
bi->bi_flags &= ~BCH_INODE_UNLINKED;
if (bi->bi_flags & BCH_INODE_unlinked)
bi->bi_flags &= ~BCH_INODE_unlinked;
else {
if (bi->bi_nlink == U32_MAX)
return -EINVAL;
@ -940,13 +952,13 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
{
if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
bi->bi_inum);
return;
}
if (bi->bi_flags & BCH_INODE_UNLINKED) {
if (bi->bi_flags & BCH_INODE_unlinked) {
bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
return;
}
@ -954,7 +966,7 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *
if (bi->bi_nlink)
bi->bi_nlink--;
else
bi->bi_flags |= BCH_INODE_UNLINKED;
bi->bi_flags |= BCH_INODE_unlinked;
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
@ -979,6 +991,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
}
/*
 * bch2_inum_opts_get - fill @opts from the inode identified by @inum
 *
 * Looks the inode up with bch2_inode_find_by_inum_trans(), wrapped in
 * lockrestart_do(), and on success hands the unpacked inode to
 * bch2_inode_opts_get() together with trans->c.
 *
 * Returns 0 on success, or the nonzero value produced by the lookup; @opts is
 * not touched when the lookup fails.
 */
int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
{
	struct bch_inode_unpacked unpacked;
	int ret;

	ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &unpacked));
	if (!ret)
		bch2_inode_opts_get(opts, trans->c, &unpacked);

	return ret;
}
int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
@ -1042,53 +1066,85 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
return ret ?: -BCH_ERR_transaction_restart_nested;
}
static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
static int may_delete_deleted_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos pos,
bool *need_another_pass)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct btree_iter inode_iter;
struct bkey_s_c k;
struct bch_inode_unpacked inode;
int ret;
if (bch2_snapshot_is_internal_node(c, pos.snapshot))
return 0;
if (!fsck_err_on(c->sb.clean, c,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot))
return 0;
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
ret = bkey_err(k);
if (ret)
return ret;
ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
if (fsck_err_on(!bkey_is_inode(k.k), c,
deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
ret = bch2_inode_unpack(k, &inode);
if (ret)
goto err;
goto out;
if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
deleted_inode_is_dir,
"directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c,
if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
return 1;
err:
if (c->sb.clean &&
!fsck_err(c,
deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
pos.offset, pos.snapshot)) {
ret = 0;
goto out;
}
if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
struct bpos new_min_pos;
ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
if (ret)
goto out;
inode.bi_flags &= ~BCH_INODE_unlinked;
ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch_err_msg(c, ret, "clearing inode unlinked flag");
if (ret)
goto out;
/*
* We'll need another write buffer flush to pick up the new
* unlinked inodes in the snapshot leaves:
*/
*need_another_pass = true;
return 0;
}
ret = 1;
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
return ret;
delete:
return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
goto out;
}
int bch2_delete_dead_inodes(struct bch_fs *c)
@ -1096,7 +1152,10 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
bool need_another_pass;
int ret;
again:
need_another_pass = false;
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
@ -1110,7 +1169,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
*/
for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
ret = lockrestart_do(trans, may_delete_deleted_inode(trans, k.k->p));
ret = lockrestart_do(trans, may_delete_deleted_inode(trans, &iter, k.k->p,
&need_another_pass));
if (ret < 0)
break;
@ -1120,12 +1180,17 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
bch2_fs_lazy_rw(c);
}
bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
}
}
bch2_trans_iter_exit(trans, &iter);
if (!ret && need_another_pass)
goto again;
err:
bch2_trans_put(trans);

View File

@ -3,16 +3,17 @@
#define _BCACHEFS_INODE_H
#include "bkey.h"
#include "bkey_methods.h"
#include "opts.h"
enum bkey_invalid_flags;
extern const char * const bch2_inode_opts[];
int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@ -52,7 +53,7 @@ static inline bool bkey_is_inode(const struct bkey *k)
k->type == KEY_TYPE_inode_v3;
}
int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@ -101,8 +102,16 @@ void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *)
int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, subvol_inum, unsigned);
int bch2_inode_write(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *);
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, enum btree_update_flags);
/*
 * Thin wrapper around bch2_inode_write_flags(): forwards @trans, @iter and
 * @inode unchanged and passes 0 as the flags argument. Returns whatever
 * bch2_inode_write_flags() returns.
 */
static inline int bch2_inode_write(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode)
{
return bch2_inode_write_flags(trans, iter, inode, 0);
}
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
@ -177,7 +186,7 @@ static inline unsigned nlink_bias(umode_t mode)
static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
{
return bi->bi_flags & BCH_INODE_UNLINKED
return bi->bi_flags & BCH_INODE_unlinked
? 0
: bi->bi_nlink + nlink_bias(bi->bi_mode);
}
@ -187,10 +196,10 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
{
if (nlink) {
bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
bi->bi_flags &= ~BCH_INODE_UNLINKED;
bi->bi_flags &= ~BCH_INODE_unlinked;
} else {
bi->bi_nlink = 0;
bi->bi_flags |= BCH_INODE_UNLINKED;
bi->bi_flags |= BCH_INODE_unlinked;
}
}
@ -200,6 +209,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
int bch2_delete_dead_inodes(struct bch_fs *);

View File

@ -16,13 +16,14 @@
#include "io_misc.h"
#include "io_write.h"
#include "logged_ops.h"
#include "rebalance.h"
#include "subvolume.h"
/* Overwrites whatever was present with zeroes: */
int bch2_extent_fallocate(struct btree_trans *trans,
subvol_inum inum,
struct btree_iter *iter,
unsigned sectors,
u64 sectors,
struct bch_io_opts opts,
s64 *i_sectors_delta,
struct write_point_specifier write_point)
@ -104,7 +105,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
if (ret)
goto err;
sectors = min(sectors, wp->sectors_free);
sectors = min_t(u64, sectors, wp->sectors_free);
sectors_allocated = sectors;
bch2_key_resize(&e->k, sectors);
@ -355,6 +356,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
struct bch_io_opts opts;
u64 dst_offset = le64_to_cpu(op->v.dst_offset);
u64 src_offset = le64_to_cpu(op->v.src_offset);
s64 shift = dst_offset - src_offset;
@ -363,6 +365,10 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
bool insert = shift > 0;
int ret = 0;
ret = bch2_inum_opts_get(trans, inum, &opts);
if (ret)
return ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(inum.inum, 0),
BTREE_ITER_INTENT);
@ -443,7 +449,10 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
ret = bch2_bkey_set_needs_rebalance(c, copy,
opts.background_target,
opts.background_compression) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);

View File

@ -3,7 +3,7 @@
#define _BCACHEFS_IO_MISC_H
int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
unsigned, struct bch_io_opts, s64 *,
u64, struct bch_io_opts, s64 *,
struct write_point_specifier);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);

View File

@ -643,7 +643,7 @@ static void __bch2_read_endio(struct work_struct *work)
"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
bch2_io_error(ca);
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
decompression_err:
@ -677,7 +677,7 @@ static void bch2_read_endio(struct bio *bio)
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
rbio->read_pos.inode,
rbio->read_pos.offset,
"data read error: %s",
@ -1025,7 +1025,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
trans->notrace_relock_fail = true;
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(c, rbio)) {
if (bch2_ec_read_extent(trans, rbio)) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
}

View File

@ -202,6 +202,17 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_i *k;
struct bkey_i_inode_v3 *inode;
/*
* Crazy performance optimization:
* Every extent update needs to also update the inode: the inode trigger
* will set bi->journal_seq to the journal sequence number of this
* transaction - for fsync.
*
* But if that's the only reason we're updating the inode (we're not
* updating bi_size or bi_sectors), then we don't need the inode update
* to be journalled - if we crash, the bi_journal_seq update will be
* lost, but that's fine.
*/
unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
int ret;
@ -223,7 +234,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
inode = bkey_i_to_inode_v3(k);
if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
new_i_size > le64_to_cpu(inode->v.bi_size)) {
inode->v.bi_size = cpu_to_le64(new_i_size);
inode_update_flags = 0;
@ -351,7 +362,10 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
ret = bch2_extent_update(trans, inum, &iter, sk.k,
ret = bch2_bkey_set_needs_rebalance(c, sk.k,
op->opts.background_target,
op->opts.background_compression) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
op->flags & BCH_WRITE_CHECK_ENOSPC);
@ -495,7 +509,6 @@ static void __bch2_write_index(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_i *k;
unsigned dev;
int ret = 0;
@ -505,14 +518,6 @@ static void __bch2_write_index(struct bch_write_op *op)
goto err;
}
/*
* probably not the ideal place to hook this in, but I don't
* particularly want to plumb io_opts all the way through the btree
* update stack right now
*/
for_each_keylist_key(keys, k)
bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
@ -643,7 +648,7 @@ static void bch2_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
op->pos.inode,
wbio->inode_offset << 9,
"data write error: %s",
@ -816,6 +821,7 @@ static enum prep_encoded_ret {
/* Can we just write the entire extent as is? */
if (op->crc.uncompressed_size == op->crc.live_size &&
op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
op->crc.compressed_size <= wp->sectors_free &&
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
@ -1091,9 +1097,7 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_decode(e, p, entry) {
if (p.crc.csum_type ||
crc_is_compressed(p.crc) ||
p.has_ec)
if (crc_is_encoded(p.crc) || p.has_ec)
return false;
replicas += bch2_extent_ptr_durability(c, &p);

View File

@ -1019,6 +1019,25 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
return ret;
}
/*
 * bch2_fs_journal_alloc - run bch2_dev_journal_alloc() on every online member
 * whose ca->journal.nr is still zero
 *
 * Devices with a nonzero ca->journal.nr are left alone. Iteration stops at
 * the first device for which bch2_dev_journal_alloc() fails.
 *
 * Returns 0 if every visited device succeeded (or was skipped), otherwise the
 * nonzero value returned by the failing bch2_dev_journal_alloc() call.
 */
int bch2_fs_journal_alloc(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	for_each_online_member(ca, c, i) {
		int ret = ca->journal.nr
			? 0
			: bch2_dev_journal_alloc(ca);

		if (!ret)
			continue;

		/*
		 * Leaving the loop early: drop ca->io_ref by hand.
		 * NOTE(review): presumably the iterator takes this reference
		 * for the current device and only releases it when advancing.
		 */
		percpu_ref_put(&ca->io_ref);
		return ret;
	}

	return 0;
}
/* startup/shutdown: */
static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)

View File

@ -534,6 +534,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);

View File

@ -140,7 +140,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
if (!dup->csum_good)
goto replace;
fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
fsck_err(c, journal_entry_replicas_data_mismatch,
"found duplicate but non identical journal entries (seq %llu)",
le64_to_cpu(j->seq));
i = dup;
goto found;
@ -235,7 +236,7 @@ static void journal_entry_err_msg(struct printbuf *out,
prt_str(out, ": ");
}
#define journal_entry_err(c, version, jset, entry, msg, ...) \
#define journal_entry_err(c, version, jset, entry, _err, msg, ...) \
({ \
struct printbuf _buf = PRINTBUF; \
\
@ -244,9 +245,10 @@ static void journal_entry_err_msg(struct printbuf *out,
\
switch (flags & BKEY_INVALID_WRITE) { \
case READ: \
mustfix_fsck_err(c, "%s", _buf.buf); \
mustfix_fsck_err(c, _err, "%s", _buf.buf); \
break; \
case WRITE: \
bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
if (bch2_fs_inconsistent(c)) { \
ret = -BCH_ERR_fsck_errors_not_fixed; \
@ -259,8 +261,8 @@ static void journal_entry_err_msg(struct printbuf *out,
true; \
})
#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \
((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false)
#define journal_entry_err_on(cond, ...) \
((cond) ? journal_entry_err(__VA_ARGS__) : false)
#define FSCK_DELETED_KEY 5
@ -277,7 +279,10 @@ static int journal_validate_key(struct bch_fs *c,
struct printbuf buf = PRINTBUF;
int ret = 0;
if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) {
if (journal_entry_err_on(!k->k.u64s,
c, version, jset, entry,
journal_entry_bkey_u64s_0,
"k->u64s 0")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
return FSCK_DELETED_KEY;
@ -286,6 +291,7 @@ static int journal_validate_key(struct bch_fs *c,
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry),
c, version, jset, entry,
journal_entry_bkey_past_end,
"extends past end of journal entry")) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
journal_entry_null_range(vstruct_next(entry), next);
@ -294,6 +300,7 @@ static int journal_validate_key(struct bch_fs *c,
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
c, version, jset, entry,
journal_entry_bkey_bad_format,
"bad format %u", k->k.format)) {
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@ -317,7 +324,8 @@ static int journal_validate_key(struct bch_fs *c,
bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id), write, &buf);
mustfix_fsck_err(c, "%s", buf.buf);
mustfix_fsck_err(c, journal_entry_bkey_invalid,
"%s", buf.buf);
le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@ -369,7 +377,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
prt_newline(out);
prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
}
prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
first = false;
}
@ -387,6 +395,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
if (journal_entry_err_on(!entry->u64s ||
le16_to_cpu(entry->u64s) != k->k.u64s,
c, version, jset, entry,
journal_entry_btree_root_bad_size,
"invalid btree root journal entry: wrong number of keys")) {
void *next = vstruct_next(entry);
/*
@ -436,6 +445,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c,
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
c, version, jset, entry,
journal_entry_blacklist_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
@ -463,6 +473,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
c, version, jset, entry,
journal_entry_blacklist_v2_bad_size,
"invalid journal seq blacklist entry: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
goto out;
@ -473,6 +484,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
le64_to_cpu(bl_entry->end),
c, version, jset, entry,
journal_entry_blacklist_v2_start_past_end,
"invalid journal seq blacklist entry: start > end")) {
journal_entry_null_range(entry, vstruct_next(entry));
}
@ -505,6 +517,7 @@ static int journal_entry_usage_validate(struct bch_fs *c,
if (journal_entry_err_on(bytes < sizeof(*u),
c, version, jset, entry,
journal_entry_usage_bad_size,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@ -539,6 +552,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
if (journal_entry_err_on(bytes < sizeof(*u) ||
bytes < sizeof(*u) + u->r.nr_devs,
c, version, jset, entry,
journal_entry_data_usage_bad_size,
"invalid journal entry usage: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@ -570,13 +584,17 @@ static int journal_entry_clock_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
c, version, jset, entry, "bad size")) {
c, version, jset, entry,
journal_entry_clock_bad_size,
"bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
c, version, jset, entry, "bad rw")) {
c, version, jset, entry,
journal_entry_clock_bad_rw,
"bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
@ -608,7 +626,9 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
int ret = 0;
if (journal_entry_err_on(bytes < expected,
c, version, jset, entry, "bad size (%u < %u)",
c, version, jset, entry,
journal_entry_dev_usage_bad_size,
"bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
@ -617,13 +637,17 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
c, version, jset, entry, "bad dev")) {
c, version, jset, entry,
journal_entry_dev_usage_bad_dev,
"bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
c, version, jset, entry, "bad pad")) {
c, version, jset, entry,
journal_entry_dev_usage_bad_pad,
"bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
@ -739,6 +763,7 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
vstruct_for_each(jset, entry) {
if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
c, version, jset, entry,
journal_entry_past_jset_end,
"journal entry extends past end of jset")) {
jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
break;
@ -767,6 +792,7 @@ static int jset_validate(struct bch_fs *c,
version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
c, version, jset, NULL,
jset_unsupported_version,
"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
@ -778,6 +804,7 @@ static int jset_validate(struct bch_fs *c,
if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
c, version, jset, NULL,
jset_unknown_csum,
"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
@ -788,6 +815,7 @@ static int jset_validate(struct bch_fs *c,
if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
c, version, jset, NULL,
jset_last_seq_newer_than_seq,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(jset->last_seq),
le64_to_cpu(jset->seq))) {
@ -817,6 +845,7 @@ static int jset_validate_early(struct bch_fs *c,
version = le32_to_cpu(jset->version);
if (journal_entry_err_on(!bch2_version_compatible(version),
c, version, jset, NULL,
jset_unsupported_version,
"%s sector %llu seq %llu: unknown journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
@ -832,6 +861,7 @@ static int jset_validate_early(struct bch_fs *c,
if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
c, version, jset, NULL,
jset_past_bucket_end,
"%s sector %llu seq %llu: journal entry too big (%zu bytes)",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq), bytes))
@ -900,7 +930,7 @@ static int journal_read_bucket(struct bch_dev *ca,
ret = submit_bio_wait(bio);
kfree(bio);
if (bch2_dev_io_err_on(ret, ca,
if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
"journal read error: sector %llu",
offset) ||
bch2_meta_read_fault("journal")) {
@ -956,7 +986,8 @@ static int journal_read_bucket(struct bch_dev *ca,
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
csum_good = jset_csum_good(c, j);
if (!csum_good)
if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
"journal checksum error"))
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
@ -1172,6 +1203,7 @@ int bch2_journal_read(struct bch_fs *c,
if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
c, le32_to_cpu(i->j.version), &i->j, NULL,
jset_last_seq_newer_than_seq,
"invalid journal entry: last_seq > seq (%llu > %llu)",
le64_to_cpu(i->j.last_seq),
le64_to_cpu(i->j.seq)))
@ -1188,7 +1220,8 @@ int bch2_journal_read(struct bch_fs *c,
}
if (!*last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
"journal read done, but no entries found after dropping non-flushes");
return 0;
}
@ -1214,6 +1247,7 @@ int bch2_journal_read(struct bch_fs *c,
if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
jset_seq_blacklisted,
"found blacklisted journal entry %llu", seq);
i->ignore = true;
}
@ -1254,7 +1288,8 @@ int bch2_journal_read(struct bch_fs *c,
bch2_journal_ptrs_to_text(&buf2, c, i);
missing_end = seq - 1;
fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
fsck_err(c, journal_entries_missing,
"journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
" prev at %s\n"
" next at %s",
missing_start, missing_end,
@ -1309,7 +1344,8 @@ int bch2_journal_read(struct bch_fs *c,
if (!degraded &&
!bch2_replicas_marked(c, &replicas.e) &&
(le64_to_cpu(i->j.seq) == *last_seq ||
fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s",
fsck_err(c, journal_entry_replicas_not_marked,
"superblock not marked as containing replicas for journal entry %llu\n %s",
le64_to_cpu(i->j.seq), buf.buf))) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
@ -1581,7 +1617,8 @@ static void journal_write_endio(struct bio *bio)
struct journal_buf *w = journal_last_unwritten_buf(j);
unsigned long flags;
if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
bch2_blk_status_to_str(bio->bi_status)) ||
bch2_meta_write_fault("journal")) {
@ -1641,9 +1678,15 @@ static void do_journal_write(struct closure *cl)
continue_at(cl, journal_write_done, c->io_complete_wq);
}
static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct jset_entry *i, *next, *prev = NULL;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct jset_entry *start, *end, *i, *next, *prev = NULL;
struct jset *jset = w->data;
unsigned sectors, bytes, u64s;
bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
int ret;
/*
* Simple compaction, dropping empty jset_entries (from journal
@ -1660,8 +1703,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
if (!u64s)
continue;
if (i->type == BCH_JSET_ENTRY_btree_root)
/*
* New btree roots are set by journalling them; when the journal
* entry gets written we have to propagate them to
* c->btree_roots
*
* But, every journal entry we write has to contain all the
* btree roots (at least for now); so after we copy btree roots
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
if (i->type == BCH_JSET_ENTRY_btree_root) {
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
}
/* Can we merge with previous entry? */
if (prev &&
@ -1685,85 +1740,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
spin_lock(&j->lock);
/*
* If the journal is in an error state - we did an emergency shutdown -
* we prefer to continue doing journal writes. We just mark them as
* noflush so they'll never be used, but they'll still be visible by the
* list_journal tool - this helps in debugging.
*
* There's a caveat: the first journal write after marking the
* superblock dirty must always be a flush write, because on startup
* from a clean shutdown we didn't necessarily read the journal and the
* new journal write might overwrite whatever was in the journal
* previously - we can't leave the journal without any flush writes in
* it.
*
* So if we're in an error state, and we're still starting up, we don't
* write anything at all.
*/
if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
(bch2_journal_error(j) ||
w->noflush ||
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
} else if (!bch2_journal_error(j)) {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
} else {
spin_unlock(&j->lock);
goto err;
}
spin_unlock(&j->lock);
/*
* New btree roots are set by journalling them; when the journal entry
* gets written we have to propagate them to c->btree_roots
*
* But, every journal entry we write has to contain all the btree roots
* (at least for now); so after we copy btree roots to c->btree_roots we
* have to get any missing btree roots and add them to this journal
* entry:
*/
bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
@ -1779,7 +1759,7 @@ void bch2_journal_write(struct closure *cl)
bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
vstruct_bytes(jset), w->sectors << 9,
u64s, w->u64s_reserved, j->entry_u64s_reserved);
goto err;
return -EINVAL;
}
jset->magic = cpu_to_le64(jset_magic(c));
@ -1798,37 +1778,117 @@ void bch2_journal_write(struct closure *cl)
validate_before_checksum = true;
if (validate_before_checksum &&
jset_validate(c, NULL, jset, 0, WRITE))
goto err;
(ret = jset_validate(c, NULL, jset, 0, WRITE)))
return ret;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
if (bch2_fs_fatal_err_on(ret, c,
"error decrypting journal entry: %i", ret))
goto err;
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
journal_nonce(jset), jset);
if (!validate_before_checksum &&
jset_validate(c, NULL, jset, 0, WRITE))
goto err;
(ret = jset_validate(c, NULL, jset, 0, WRITE)))
return ret;
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
retry_alloc:
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
if (ret && j->can_discard) {
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
goto retry_alloc;
return 0;
}
/*
 * bch2_journal_write_pick_flush - decide whether the write of @w is a flush
 * or a noflush write, and update the matching counters/flags
 *
 * Returns -EIO when the journal is in an error state while
 * JOURNAL_NEED_FLUSH_WRITE is set (nothing is written in that case),
 * 0 otherwise.
 */
static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	int error = bch2_journal_error(j);

	/*
	 * After an emergency shutdown (journal in an error state) we keep
	 * issuing journal writes, but only as noflush writes: they will never
	 * be used, yet they stay visible to the list_journal tool, which
	 * helps debugging.
	 *
	 * The exception is the first write after the superblock was marked
	 * dirty. That one has to be a flush write: when starting from a clean
	 * shutdown the journal was not necessarily read, so the new write may
	 * overwrite what was there before, and the journal must never be left
	 * without any flush write in it.
	 *
	 * Hence: error state while a flush write is still required means we
	 * write nothing at all.
	 */
	if (error) {
		if (test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
			return -EIO;
	}

	/*
	 * Flush write: no error, not already marked noflush, and either a
	 * flush was explicitly requested, the flush delay has elapsed, or
	 * skipping flushes is not currently permitted.
	 */
	if (!error &&
	    !w->noflush &&
	    (w->must_flush ||
	     (jiffies - j->last_flush_write) >= msecs_to_jiffies(c->opts.journal_flush_delay) ||
	     !test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
		j->last_flush_write = jiffies;
		j->nr_flush_writes++;
		clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
		return 0;
	}

	/* Noflush write: tag both the buffer and the on-disk jset. */
	w->noflush = true;
	SET_JSET_NO_FLUSH(w->data, true);
	w->data->last_seq = 0;
	w->last_seq = 0;
	j->nr_noflush_writes++;

	return 0;
}
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
unsigned i, nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
j->write_start_time = local_clock();
spin_lock(&j->lock);
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
goto err;
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
if (ret)
goto err;
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
if (!ret || !j->can_discard)
break;
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
}
if (ret) {
__bch2_journal_debug_to_text(&journal_debug_buf, j);
spin_unlock(&j->lock);
bch_err(c, "Unable to allocate journal write:\n%s",
journal_debug_buf.buf);
printbuf_exit(&journal_debug_buf);
goto err;
}
/*
* write is allocated, no longer need to account for it in
@ -1843,13 +1903,6 @@ void bch2_journal_write(struct closure *cl)
bch2_journal_space_available(j);
spin_unlock(&j->lock);
if (ret) {
bch_err(c, "Unable to allocate journal write:\n%s",
journal_debug_buf.buf);
printbuf_exit(&journal_debug_buf);
goto err;
}
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges)
@ -1871,7 +1924,7 @@ void bch2_journal_write(struct closure *cl)
if (ret)
goto err;
if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);

View File

@ -10,17 +10,17 @@
#include "recovery.h"
/* KEY_TYPE_lru is obsolete: */
int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (!lru_pos_time(k.k->p)) {
prt_printf(err, "lru entry at time=0");
return -BCH_ERR_invalid_bkey;
int ret = 0;
}
return 0;
bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
lru_entry_at_time_0,
"lru entry at time=0");
fsck_err:
return ret;
}
void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
@ -95,6 +95,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
int ret;
if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
lru_entry_to_invalid_bucket,
"lru key points to nonexistent device:bucket %llu:%llu",
alloc_pos.inode, alloc_pos.offset))
return bch2_btree_delete_at(trans, lru_iter, 0);
@ -125,7 +126,8 @@ static int bch2_check_lru_key(struct btree_trans *trans,
}
if (c->opts.reconstruct_alloc ||
fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
fsck_err(c, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
" %s\n"
" for %s",
bch2_lru_types[type],

View File

@ -48,7 +48,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
return BCH_LRU_read;
}
int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

View File

@ -20,6 +20,7 @@
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "snapshot.h"
#include "super-io.h"
#include "trace.h"
@ -59,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c
}
}
/*
 * Link @stats into the filesystem's data progress list.
 *
 * c->data_progress_lock is taken around the list insertion.
 */
static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
{
mutex_lock(&c->data_progress_lock);
list_add(&stats->list, &c->data_progress_list);
mutex_unlock(&c->data_progress_lock);
}
/*
 * Unlink @stats from the list it was added to.
 *
 * c->data_progress_lock is taken around the list removal.
 */
static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
{
mutex_lock(&c->data_progress_lock);
list_del(&stats->list);
mutex_unlock(&c->data_progress_lock);
}
struct moving_io {
struct list_head read_list;
struct list_head io_list;
@ -156,35 +143,31 @@ static void move_read_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
struct btree_trans *trans)
void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
if (trans)
bch2_trans_unlock(trans);
while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
bch2_trans_unlock_long(ctxt->trans);
list_del(&io->read_list);
move_write(io);
}
}
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
struct btree_trans *trans)
void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
move_ctxt_wait_event(ctxt, trans,
move_ctxt_wait_event(ctxt,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
struct bch_fs *c = ctxt->c;
struct bch_fs *c = ctxt->trans->c;
move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
EBUG_ON(atomic_read(&ctxt->write_sectors));
@ -192,16 +175,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
EBUG_ON(atomic_read(&ctxt->read_sectors));
EBUG_ON(atomic_read(&ctxt->read_ios));
if (ctxt->stats) {
progress_list_del(c, ctxt->stats);
trace_move_data(c,
atomic64_read(&ctxt->stats->sectors_moved),
atomic64_read(&ctxt->stats->keys_moved));
}
mutex_lock(&c->moving_context_lock);
list_del(&ctxt->list);
mutex_unlock(&c->moving_context_lock);
bch2_trans_put(ctxt->trans);
memset(ctxt, 0, sizeof(*ctxt));
}
void bch2_moving_ctxt_init(struct moving_context *ctxt,
@ -213,7 +192,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
{
memset(ctxt, 0, sizeof(*ctxt));
ctxt->c = c;
ctxt->trans = bch2_trans_get(c);
ctxt->fn = (void *) _RET_IP_;
ctxt->rate = rate;
ctxt->stats = stats;
@ -230,16 +209,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
mutex_lock(&c->moving_context_lock);
list_add(&ctxt->list, &c->moving_context_list);
mutex_unlock(&c->moving_context_lock);
if (stats) {
progress_list_add(c, stats);
stats->data_type = BCH_DATA_user;
}
/*
 * Finish a move job's statistics: hands @stats to trace_move_data() for
 * filesystem @c. Nothing in @stats is modified or released here.
 */
void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
{
trace_move_data(c, stats);
}
/*
 * Initialise @stats for a new move job.
 *
 * All fields are zeroed, the data type defaults to BCH_DATA_user, and @name
 * is copied into stats->name, bounded by the size of that array.
 */
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
/* "%s" rather than passing @name as the format string itself */
scnprintf(stats->name, sizeof(stats->name), "%s", name);
}
@ -286,15 +266,14 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
}
static int bch2_move_extent(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
int bch2_move_extent(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bch_io_opts io_opts,
enum btree_id btree_id,
struct btree_iter *iter,
struct bkey_s_c k,
struct bch_io_opts io_opts,
struct data_update_opts data_opts)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct moving_io *io;
@ -303,6 +282,8 @@ static int bch2_move_extent(struct btree_trans *trans,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@ -355,7 +336,7 @@ static int bch2_move_extent(struct btree_trans *trans,
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
io_opts, data_opts, btree_id, k);
io_opts, data_opts, iter->btree_id, k);
if (ret && ret != -BCH_ERR_unwritten_extent_update)
goto err_free_pages;
@ -367,9 +348,11 @@ static int bch2_move_extent(struct btree_trans *trans,
BUG_ON(ret);
io->write.ctxt = ctxt;
io->write.op.end_io = move_write_done;
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats) {
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@ -399,7 +382,7 @@ static int bch2_move_extent(struct btree_trans *trans,
closure_get(&ctxt->cl);
bch2_read_extent(trans, &io->rbio,
bkey_start_pos(k.k),
btree_id, k, 0,
iter->btree_id, k, 0,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
@ -413,45 +396,96 @@ static int bch2_move_extent(struct btree_trans *trans,
return ret;
}
static int lookup_inode(struct btree_trans *trans, struct bpos pos,
struct bch_inode_unpacked *inode)
/*
 * Return the IO options that apply to @extent_k.
 *
 * @io_opts caches, for one inode number at a time, the IO options of every
 * version of that inode found in the inodes btree (one entry per snapshot).
 * When @extent_k belongs to a different inode than the cached one, the cache
 * is rebuilt by scanning all snapshots of that inode.
 *
 * Returns:
 *  - a pointer to the cached entry whose snapshot is matched by
 *    bch2_snapshot_is_ancestor() for the extent's snapshot, or
 *  - &io_opts->fs_io_opts when the extent has no snapshot or nothing matches,
 *  - ERR_PTR(err) if rebuilding the cache failed, or if the transaction was
 *    restarted since this function was entered.
 *
 * The returned pointer refers to storage inside @io_opts.
 */
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
			  struct per_snapshot_io_opts *io_opts,
			  struct bkey_s_c extent_k)
{
	struct bch_fs *c = trans->c;
	u32 restart_count = trans->restart_count;
	int ret = 0;

	if (io_opts->cur_inum != extent_k.k->p.inode) {
		struct btree_iter iter;
		struct bkey_s_c k;

		io_opts->d.nr = 0;

		for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
				   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
			/* Walked past the last version of this inode */
			if (k.k->p.offset != extent_k.k->p.inode)
				break;

			if (!bkey_is_inode(k.k))
				continue;

			struct bch_inode_unpacked inode;
			BUG_ON(bch2_inode_unpack(k, &inode));

			struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
			bch2_inode_opts_get(&e.io_opts, trans->c, &inode);

			ret = darray_push(&io_opts->d, e);
			if (ret)
				break;
		}
		bch2_trans_iter_exit(trans, &iter);

		if (ret) {
			/*
			 * The scan stopped early, so the list is incomplete.
			 * Do not record it as the cache for this inode:
			 * a later call for the same inode would otherwise
			 * skip the scan and pick options from a partial list.
			 * Go back to the state per_snapshot_io_opts_init()
			 * leaves behind (no entries, cur_inum == 0).
			 */
			io_opts->d.nr = 0;
			io_opts->cur_inum = 0;
		} else {
			io_opts->cur_inum = extent_k.k->p.inode;
		}
	}

	ret = ret ?: trans_was_restarted(trans, restart_count);
	if (ret)
		return ERR_PTR(ret);

	if (extent_k.k->p.snapshot) {
		struct snapshot_io_opts_entry *i;

		darray_for_each(io_opts->d, i)
			if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
				return &i->io_opts;
	}

	return &io_opts->fs_io_opts;
}
int bch2_move_get_io_opts_one(struct btree_trans *trans,
struct bch_io_opts *io_opts,
struct bkey_s_c extent_k)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
BTREE_ITER_ALL_SNAPSHOTS);
k = bch2_btree_iter_peek(&iter);
/* reflink btree? */
if (!extent_k.k->p.inode) {
*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
return 0;
}
k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
BTREE_ITER_CACHED);
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k || !bkey_eq(k.k->p, pos)) {
ret = -BCH_ERR_ENOENT_inode;
goto err;
}
ret = bkey_is_inode(k.k) ? 0 : -EIO;
if (ret)
goto err;
ret = bch2_inode_unpack(k, inode);
if (ret)
goto err;
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
if (!ret && bkey_is_inode(k.k)) {
struct bch_inode_unpacked inode;
bch2_inode_unpack(k, &inode);
bch2_inode_opts_get(io_opts, trans->c, &inode);
} else {
*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
}
static int move_ratelimit(struct btree_trans *trans,
struct moving_context *ctxt)
bch2_trans_iter_exit(trans, &iter);
return 0;
}
int bch2_move_ratelimit(struct moving_context *ctxt)
{
struct bch_fs *c = trans->c;
struct bch_fs *c = ctxt->trans->c;
u64 delay;
if (ctxt->wait_on_copygc) {
bch2_trans_unlock(trans);
if (ctxt->wait_on_copygc && !c->copygc_running) {
bch2_trans_unlock_long(ctxt->trans);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
@ -460,8 +494,12 @@ static int move_ratelimit(struct btree_trans *trans,
do {
delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
if (delay) {
bch2_trans_unlock(trans);
if (delay > HZ / 10)
bch2_trans_unlock_long(ctxt->trans);
else
bch2_trans_unlock(ctxt->trans);
set_current_state(TASK_INTERRUPTIBLE);
}
@ -474,7 +512,7 @@ static int move_ratelimit(struct btree_trans *trans,
schedule_timeout(delay);
if (unlikely(freezing(current))) {
move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
try_to_freeze();
}
} while (delay);
@ -483,7 +521,7 @@ static int move_ratelimit(struct btree_trans *trans,
* XXX: these limits really ought to be per device, SSDs and hard drives
* will want different limits
*/
move_ctxt_wait_event(ctxt, trans,
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
@ -492,52 +530,28 @@ static int move_ratelimit(struct btree_trans *trans,
return 0;
}
/*
 * Refresh *@io_opts for the inode that owns extent @k.
 *
 * *@cur_inum remembers which inode *@io_opts was last computed for; when @k
 * belongs to that same inode nothing is done. Otherwise the inode is looked
 * up at the extent's snapshot: on success its options are used, on any
 * failure other than a transaction restart the filesystem-wide options are
 * used instead.
 *
 * Returns 0, or the transaction-restart error from the inode lookup (in which
 * case neither *@io_opts nor *@cur_inum is touched).
 */
static int move_get_io_opts(struct btree_trans *trans,
			    struct bch_io_opts *io_opts,
			    struct bkey_s_c k, u64 *cur_inum)
{
	struct bch_inode_unpacked inode;
	u64 inum = k.k->p.inode;
	int ret;

	/* Options already computed for this inode */
	if (inum == *cur_inum)
		return 0;

	ret = lookup_inode(trans, SPOS(0, inum, k.k->p.snapshot), &inode);
	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
		return ret;

	if (ret)
		*io_opts = bch2_opts_to_inode_opts(trans->c->opts);
	else
		bch2_inode_opts_get(io_opts, trans->c, &inode);

	*cur_inum = inum;
	return 0;
}
static int __bch2_move_data(struct moving_context *ctxt,
static int bch2_move_data_btree(struct moving_context *ctxt,
struct bpos start,
struct bpos end,
move_pred_fn pred, void *arg,
enum btree_id btree_id)
{
struct bch_fs *c = ctxt->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct per_snapshot_io_opts snapshot_io_opts;
struct bch_io_opts *io_opts;
struct bkey_buf sk;
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
struct data_update_opts data_opts;
u64 cur_inum = U64_MAX;
int ret = 0, ret2;
per_snapshot_io_opts_init(&snapshot_io_opts, c);
bch2_bkey_buf_init(&sk);
if (ctxt->stats) {
ctxt->stats->data_type = BCH_DATA_user;
ctxt->stats->btree_id = btree_id;
ctxt->stats->pos = start;
ctxt->stats->pos = BBPOS(btree_id, start);
}
bch2_trans_iter_init(trans, &iter, btree_id, start,
@ -547,7 +561,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
while (!move_ratelimit(trans, ctxt)) {
while (!bch2_move_ratelimit(ctxt)) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
@ -564,17 +578,18 @@ static int __bch2_move_data(struct moving_context *ctxt,
break;
if (ctxt->stats)
ctxt->stats->pos = iter.pos;
ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!bkey_extent_is_direct_data(k.k))
goto next_nondata;
ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
ret = PTR_ERR_OR_ZERO(io_opts);
if (ret)
continue;
memset(&data_opts, 0, sizeof(data_opts));
if (!pred(c, arg, k, &io_opts, &data_opts))
if (!pred(c, arg, k, io_opts, &data_opts))
goto next;
/*
@ -584,24 +599,20 @@ static int __bch2_move_data(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
io_opts, btree_id, k, data_opts);
ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt, trans);
bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
/* XXX signal failure */
goto next;
}
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
@ -610,59 +621,68 @@ static int __bch2_move_data(struct moving_context *ctxt,
}
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
bch2_bkey_buf_exit(&sk, c);
per_snapshot_io_opts_exit(&snapshot_io_opts);
return ret;
}
/*
 * Run the move path over every btree in the range [@start.btree, @end.btree],
 * clamped to the btrees that currently exist (btree_id_nr_alive()).
 *
 * Btrees for which btree_type_has_ptrs() is false, or that have no root node,
 * are skipped. Within the first btree the walk begins at @start.pos, within
 * the last it ends at @end.pos; all btrees in between are walked from POS_MIN
 * to POS_MAX. @pred and @arg are passed through to bch2_move_data_btree().
 *
 * Returns 0, or the first nonzero value returned by bch2_move_data_btree().
 */
int __bch2_move_data(struct moving_context *ctxt,
		     struct bbpos start,
		     struct bbpos end,
		     move_pred_fn pred, void *arg)
{
	struct bch_fs *c = ctxt->trans->c;
	enum btree_id id;
	int ret = 0;

	for (id = start.btree;
	     id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
	     id++) {
		/*
		 * A moving_context may be set up without stats; the rest of
		 * the move path checks ctxt->stats before touching it, so do
		 * the same here instead of dereferencing a NULL pointer.
		 */
		if (ctxt->stats)
			ctxt->stats->pos = BBPOS(id, POS_MIN);

		if (!btree_type_has_ptrs(id) ||
		    !bch2_btree_id_root(c, id)->b)
			continue;

		ret = bch2_move_data_btree(ctxt,
				       id == start.btree ? start.pos : POS_MIN,
				       id == end.btree   ? end.pos : POS_MAX,
				       pred, arg, id);
		if (ret)
			break;
	}

	return ret;
}
int bch2_move_data(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
struct bbpos start,
struct bbpos end,
struct bch_ratelimit *rate,
struct bch_move_stats *stats,
struct write_point_specifier wp,
bool wait_on_copygc,
move_pred_fn pred, void *arg)
{
struct moving_context ctxt;
enum btree_id id;
int ret = 0;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
stats->btree_id = id;
if (id != BTREE_ID_extents &&
id != BTREE_ID_reflink)
continue;
if (!bch2_btree_id_root(c, id)->b)
continue;
ret = __bch2_move_data(&ctxt,
id == start_btree_id ? start_pos : POS_MIN,
id == end_btree_id ? end_pos : POS_MAX,
pred, arg, id);
if (ret)
break;
}
ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
int __bch2_evacuate_bucket(struct btree_trans *trans,
struct moving_context *ctxt,
int __bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
{
struct bch_fs *c = ctxt->c;
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_iter iter;
struct bkey_buf sk;
@ -673,7 +693,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
struct data_update_opts data_opts;
unsigned dirty_sectors, bucket_size;
u64 fragmentation;
u64 cur_inum = U64_MAX;
struct bpos bp_pos = POS_MIN;
int ret = 0;
@ -708,7 +727,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
goto err;
}
while (!(ret = move_ratelimit(trans, ctxt))) {
while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
ret = bch2_get_next_backpointer(trans, bucket, gen,
@ -737,7 +756,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
@ -758,23 +777,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
i++;
}
ret = bch2_move_extent(trans, &iter, ctxt,
bucket_in_flight,
io_opts, bp.btree_id, k, data_opts);
ret = bch2_move_extent(ctxt, bucket_in_flight,
&iter, k, io_opts, data_opts);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt, trans);
bch2_move_ctxt_wait_for_io(ctxt);
continue;
}
if (ret)
goto err;
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
} else {
@ -825,14 +841,12 @@ int bch2_evacuate_bucket(struct bch_fs *c,
struct write_point_specifier wp,
bool wait_on_copygc)
{
struct btree_trans *trans = bch2_trans_get(c);
struct moving_context ctxt;
int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
bch2_moving_ctxt_exit(&ctxt);
bch2_trans_put(trans);
return ret;
}
@ -849,21 +863,25 @@ static int bch2_move_btree(struct bch_fs *c,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree_trans *trans = bch2_trans_get(c);
struct moving_context ctxt;
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
struct data_update_opts data_opts;
int ret = 0;
progress_list_add(c, stats);
bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
writepoint_ptr(&c->btree_write_point),
true);
trans = ctxt.trans;
stats->data_type = BCH_DATA_btree;
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
stats->btree_id = id;
stats->pos = BBPOS(id, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
continue;
@ -882,7 +900,7 @@ static int bch2_move_btree(struct bch_fs *c,
bpos_cmp(b->key.k.p, end_pos)) > 0)
break;
stats->pos = iter.pos;
stats->pos = BBPOS(iter.btree_id, iter.pos);
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
@ -904,14 +922,10 @@ static int bch2_move_btree(struct bch_fs *c,
break;
}
bch2_trans_put(trans);
if (ret)
bch_err_fn(c, ret);
bch2_moving_ctxt_exit(&ctxt);
bch2_btree_interior_updates_flush(c);
progress_list_del(c, stats);
return ret;
}
@ -1032,7 +1046,6 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
mutex_unlock(&c->sb_lock);
}
if (ret)
bch_err_fn(c, ret);
return ret;
}
@ -1056,14 +1069,16 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
@ -1080,18 +1095,21 @@ int bch2_data_job(struct bch_fs *c,
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
ret = bch2_scan_old_btree_nodes(c, stats);
bch2_move_stats_exit(stats, c);
break;
default:
ret = -EINVAL;
@ -1100,19 +1118,43 @@ int bch2_data_job(struct bch_fs *c,
return ret;
}
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
prt_printf(out, "%s: data type=%s pos=",
stats->name,
bch2_data_types[stats->data_type]);
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
prt_str(out, "keys moved: ");
prt_u64(out, atomic64_read(&stats->keys_moved));
prt_newline(out);
prt_str(out, "keys raced: ");
prt_u64(out, atomic64_read(&stats->keys_raced));
prt_newline(out);
prt_str(out, "bytes seen: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
prt_str(out, "bytes moved: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
prt_str(out, "bytes raced: ");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
printbuf_indent_sub(out, 2);
}
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
struct bch_move_stats *stats = ctxt->stats;
struct moving_io *io;
prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
prt_newline(out);
prt_printf(out, " data type %s btree_id %s position: ",
bch2_data_types[stats->data_type],
bch2_btree_ids[stats->btree_id]);
bch2_bpos_to_text(out, stats->pos);
prt_newline(out);
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
prt_printf(out, "reads: ios %u/%u sectors %u/%u",
@ -1153,7 +1195,4 @@ void bch2_fs_move_init(struct bch_fs *c)
{
INIT_LIST_HEAD(&c->moving_context_list);
mutex_init(&c->moving_context_lock);
INIT_LIST_HEAD(&c->data_progress_list);
mutex_init(&c->data_progress_lock);
}

View File

@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
#include "bbpos.h"
#include "bcachefs_ioctl.h"
#include "btree_iter.h"
#include "buckets.h"
@ -11,7 +12,7 @@
struct bch_read_bio;
struct moving_context {
struct bch_fs *c;
struct btree_trans *trans;
struct list_head list;
void *fn;
@ -37,13 +38,14 @@ struct moving_context {
wait_queue_head_t wait;
};
#define move_ctxt_wait_event(_ctxt, _trans, _cond) \
#define move_ctxt_wait_event(_ctxt, _cond) \
do { \
bool cond_finished = false; \
bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \
bch2_moving_ctxt_do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
bch2_trans_unlock_long((_ctxt)->trans); \
__wait_event((_ctxt)->wait, \
bch2_moving_ctxt_next_pending_write(_ctxt) || \
(cond_finished = (_cond))); \
@ -59,22 +61,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
struct btree_trans *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
int bch2_move_ratelimit(struct moving_context *);
/* Inodes in different snapshots may have different IO options: */
/*
 * One cached set of IO options: the options read from one version of an
 * inode, together with the snapshot ID of the inode key they came from.
 */
struct snapshot_io_opts_entry {
u32 snapshot; /* snapshot field of the inode key's position */
struct bch_io_opts io_opts; /* options derived from that inode */
};
/*
 * Cache of IO options for a single inode number across its snapshots, used by
 * bch2_move_get_io_opts().
 */
struct per_snapshot_io_opts {
u64 cur_inum; /* inode number the entries in @d were collected for */
struct bch_io_opts fs_io_opts; /* filesystem-wide options, used when no entry in @d applies */
DARRAY(struct snapshot_io_opts_entry) d; /* one entry per inode version found */
};
/*
 * Initialise @io_opts: everything is zeroed (so cur_inum is 0 and the entry
 * array is empty), then the filesystem-wide options are taken from c->opts.
 */
static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
{
memset(io_opts, 0, sizeof(*io_opts));
io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
}
/*
 * Tear down @io_opts by passing its entry array to darray_exit().
 * NOTE(review): presumably this frees the array's backing storage - confirm
 * against the darray implementation.
 */
static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
{
darray_exit(&io_opts->d);
}
struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bkey_s_c);
int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
int bch2_move_extent(struct moving_context *,
struct move_bucket_in_flight *,
struct btree_iter *,
struct bkey_s_c,
struct bch_io_opts,
struct data_update_opts);
int __bch2_move_data(struct moving_context *,
struct bbpos,
struct bbpos,
move_pred_fn, void *);
int bch2_move_data(struct bch_fs *,
enum btree_id, struct bpos,
enum btree_id, struct bpos,
struct bbpos start,
struct bbpos end,
struct bch_ratelimit *,
struct bch_move_stats *,
struct write_point_specifier,
bool,
move_pred_fn, void *);
int __bch2_evacuate_bucket(struct btree_trans *,
struct moving_context *,
int __bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
@ -88,7 +128,10 @@ int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
void bch2_move_stats_init(struct bch_move_stats *, char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_move_init(struct bch_fs *);

View File

@ -2,17 +2,17 @@
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
#include "bbpos_types.h"
struct bch_move_stats {
enum bch_data_type data_type;
enum btree_id btree_id;
struct bpos pos;
struct list_head list;
struct bbpos pos;
char name[32];
atomic64_t keys_moved;
atomic64_t keys_raced;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_moved;
atomic64_t sectors_raced;
};

View File

@ -101,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
return ret;
}
static void move_buckets_wait(struct btree_trans *trans,
struct moving_context *ctxt,
static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
@ -111,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans,
while ((i = list->first)) {
if (flush)
move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
if (atomic_read(&i->count))
break;
@ -129,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans,
kfree(i);
}
bch2_trans_unlock(trans);
bch2_trans_unlock_long(ctxt->trans);
}
static bool bucket_in_flight(struct buckets_in_flight *list,
@ -140,11 +139,11 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
typedef DARRAY(struct move_bucket) move_buckets;
static int bch2_copygc_get_buckets(struct btree_trans *trans,
struct moving_context *ctxt,
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight,
move_buckets *buckets)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
@ -152,7 +151,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
move_buckets_wait(trans, ctxt, buckets_in_flight, false);
move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
@ -188,10 +187,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
}
noinline
static int bch2_copygc(struct btree_trans *trans,
struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight)
static int bch2_copygc(struct moving_context *ctxt,
struct buckets_in_flight *buckets_in_flight,
bool *did_work)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
@ -202,7 +202,7 @@ static int bch2_copygc(struct btree_trans *trans,
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
if (ret)
goto err;
@ -221,10 +221,12 @@ static int bch2_copygc(struct btree_trans *trans,
break;
}
ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
*did_work = true;
}
err:
darray_exit(&buckets);
@ -300,24 +302,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct btree_trans *trans;
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
struct buckets_in_flight buckets;
struct buckets_in_flight *buckets;
u64 last, wait;
int ret = 0;
memset(&buckets, 0, sizeof(buckets));
ret = rhashtable_init(&buckets.table, &bch_move_bucket_params);
buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
if (!buckets)
return -ENOMEM;
ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
if (ret) {
kfree(buckets);
bch_err_msg(c, ret, "allocating copygc buckets in flight");
return ret;
}
set_freezable();
trans = bch2_trans_get(c);
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@ -325,16 +327,18 @@ static int bch2_copygc_thread(void *arg)
false);
while (!ret && !kthread_should_stop()) {
bch2_trans_unlock(trans);
bool did_work = false;
bch2_trans_unlock_long(ctxt.trans);
cond_resched();
if (!c->copy_gc_enabled) {
move_buckets_wait(trans, &ctxt, &buckets, true);
move_buckets_wait(&ctxt, buckets, true);
kthread_wait_freezable(c->copy_gc_enabled);
}
if (unlikely(freezing(current))) {
move_buckets_wait(trans, &ctxt, &buckets, true);
move_buckets_wait(&ctxt, buckets, true);
__refrigerator(false);
continue;
}
@ -345,7 +349,7 @@ static int bch2_copygc_thread(void *arg)
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
move_buckets_wait(trans, &ctxt, &buckets, true);
move_buckets_wait(&ctxt, buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
@ -355,16 +359,29 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
c->copygc_running = true;
ret = bch2_copygc(trans, &ctxt, &buckets);
ret = bch2_copygc(&ctxt, buckets, &did_work);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
if (!wait && !did_work) {
u64 min_member_capacity = bch2_min_rw_member_capacity(c);
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
bch2_trans_unlock_long(ctxt.trans);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}
}
move_buckets_wait(trans, &ctxt, &buckets, true);
rhashtable_destroy(&buckets.table);
bch2_trans_put(trans);
move_buckets_wait(&ctxt, buckets, true);
rhashtable_destroy(&buckets->table);
kfree(buckets);
bch2_moving_ctxt_exit(&ctxt);
bch2_move_stats_exit(&move_stats, c);
return 0;
}

View File

@ -12,11 +12,6 @@
#define x(t, n, ...) [n] = #t,
const char * const bch2_iops_measurements[] = {
BCH_IOPS_MEASUREMENTS()
NULL
};
const char * const bch2_error_actions[] = {
BCH_ERROR_ACTIONS()
NULL
@ -42,9 +37,8 @@ const char * const bch2_sb_compat[] = {
NULL
};
const char * const bch2_btree_ids[] = {
const char * const __bch2_btree_ids[] = {
BCH_BTREE_IDS()
"interior btree node",
NULL
};
@ -271,14 +265,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
if (err)
prt_printf(err, "%s: too small (min %llu)",
opt->attr.name, opt->min);
return -ERANGE;
return -BCH_ERR_ERANGE_option_too_small;
}
if (opt->max && v >= opt->max) {
if (err)
prt_printf(err, "%s: too big (max %llu)",
opt->attr.name, opt->max);
return -ERANGE;
return -BCH_ERR_ERANGE_option_too_big;
}
if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
@ -295,6 +289,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
return -EINVAL;
}
if (opt->fn.validate)
return opt->fn.validate(v, err);
return 0;
}

View File

@ -10,13 +10,12 @@
struct bch_fs;
extern const char * const bch2_iops_measurements[];
extern const char * const bch2_error_actions[];
extern const char * const bch2_fsck_fix_opts[];
extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const bch2_btree_ids[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_types[];
@ -74,6 +73,7 @@ enum opt_type {
struct bch_opt_fn {
int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
int (*validate)(u64, struct printbuf *);
};
/**

View File

@ -415,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out,
while (list[nr])
nr++;
while (flags && (bit = __ffs(flags)) < nr) {
while (flags && (bit = __ffs64(flags)) < nr) {
if (!first)
bch2_prt_printf(out, ",");
first = false;
bch2_prt_printf(out, "%s", list[bit]);
flags ^= 1 << bit;
flags ^= BIT_ULL(bit);
}
}

View File

@ -59,17 +59,18 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
.to_text = bch2_sb_quota_to_text,
};
int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (k.k->p.inode >= QTYP_NR) {
prt_printf(err, "invalid quota type (%llu >= %u)",
k.k->p.inode, QTYP_NR);
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return 0;
bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
quota_type_invalid,
"invalid quota type (%llu >= %u)",
k.k->p.inode, QTYP_NR);
fsck_err:
return ret;
}
void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,

View File

@ -8,7 +8,7 @@
enum bkey_invalid_flags;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

View File

@ -1,15 +1,21 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "clock.h"
#include "compress.h"
#include "disk_groups.h"
#include "errcode.h"
#include "error.h"
#include "inode.h"
#include "move.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super-io.h"
#include "trace.h"
@ -17,302 +23,396 @@
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
/*
 * Key offset used, together with an inode number and snapshot U32_MAX, to
 * build the position in BTREE_ID_rebalance_work where the "needs scan"
 * cookie for that inode is looked up by the helpers in this file.
 */
#define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1)
/*
 * Printable names of the rebalance states: one stringified entry per
 * x(t) in BCH_REBALANCE_STATES(), followed by a terminating NULL.
 */
static const char * const bch2_rebalance_state_strs[] = {
#define x(t) #t,
BCH_REBALANCE_STATES()
NULL
#undef x
};
/*
 * Bump the "needs scan" cookie for @inum in the rebalance_work btree.
 *
 * Looks up the scan slot for @inum, treats anything other than a cookie key
 * as a counter value of 0, and queues an update in @trans that stores the
 * counter plus one at the same position.  Nothing is committed here; that
 * is left to the caller.
 *
 * Returns 0 on success, or the error from the lookup, the transaction
 * allocation, or the update.
 */
static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
{
	struct btree_iter scan_iter;
	struct bkey_i_cookie *new_cookie;
	struct bkey_s_c cur;
	u64 prev = 0;
	int ret;

	bch2_trans_iter_init(trans, &scan_iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_INTENT);

	cur = bch2_btree_iter_peek_slot(&scan_iter);
	ret = bkey_err(cur);
	if (ret)
		goto out;

	/* an existing cookie supplies the previous counter; otherwise start from 0 */
	if (cur.k->type == KEY_TYPE_cookie)
		prev = le64_to_cpu(bkey_s_c_to_cookie(cur).v->cookie);

	new_cookie = bch2_trans_kmalloc(trans, sizeof(*new_cookie));
	ret = PTR_ERR_OR_ZERO(new_cookie);
	if (ret)
		goto out;

	bkey_cookie_init(&new_cookie->k_i);
	new_cookie->k.p = scan_iter.pos;
	new_cookie->v.cookie = cpu_to_le64(prev + 1);

	ret = bch2_trans_update(trans, &scan_iter, &new_cookie->k_i, 0);
out:
	bch2_trans_iter_exit(trans, &scan_iter);
	return ret;
}
/*
 * Run __bch2_set_rebalance_needs_scan() for @inum inside bch2_trans_do()
 * with BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, then wake the rebalance
 * thread.
 *
 * The wakeup is issued whether or not the update succeeded; the update's
 * result is what gets returned.
 */
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
	int ret;

	ret = bch2_trans_do(c, NULL, NULL,
			    BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
			    __bch2_set_rebalance_needs_scan(trans, inum));

	rebalance_wakeup(c);

	return ret;
}
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
return bch2_set_rebalance_needs_scan(c, 0);
}
/*
 * Delete the "needs scan" key for @inum, but only if its counter still
 * equals @cookie.
 *
 * The slot is read the same way as when the cookie is set: a non-cookie key
 * counts as 0.  If the stored value differs from @cookie the key is left in
 * place and 0 is returned.
 *
 * Returns 0, or the error from the lookup or the delete.
 */
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
{
	struct btree_iter scan_iter;
	struct bkey_s_c cur;
	u64 seen = 0;
	int ret;

	bch2_trans_iter_init(trans, &scan_iter, BTREE_ID_rebalance_work,
			     SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
			     BTREE_ITER_INTENT);

	cur = bch2_btree_iter_peek_slot(&scan_iter);
	ret = bkey_err(cur);
	if (!ret) {
		if (cur.k->type == KEY_TYPE_cookie)
			seen = le64_to_cpu(bkey_s_c_to_cookie(cur).v->cookie);

		if (seen == cookie)
			ret = bch2_btree_delete_at(trans, &scan_iter, 0);
	}

	bch2_trans_iter_exit(trans, &scan_iter);
	return ret;
}
/*
 * Peek the next key from @work_iter, or return bkey_s_c_null without
 * touching the iterator once kthread_should_stop() is true.
 *
 * @trans is not used by the body.
 */
static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
					    struct btree_iter *work_iter)
{
	if (kthread_should_stop())
		return bkey_s_c_null;

	return bch2_btree_iter_peek(work_iter);
}
/*
 * Make a mutable copy of @k via @iter, drop the entry that
 * bch2_bkey_rebalance_opts() finds in it, and commit the transaction with
 * BTREE_INSERT_NOFAIL.
 *
 * Returns the error from making the key mutable, otherwise the commit
 * result.
 */
static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
					   struct btree_iter *iter,
					   struct bkey_s_c k)
{
	struct bkey_i *mut;
	int ret;

	mut = bch2_bkey_make_mut(trans, iter, &k, 0);
	ret = PTR_ERR_OR_ZERO(mut);
	if (!ret) {
		extent_entry_drop(bkey_i_to_s(mut),
				  (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(mut)));
		ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
	}

	return ret;
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
struct bpos work_pos,
struct btree_iter *extent_iter,
struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_s_c k;
bch2_trans_iter_exit(trans, extent_iter);
bch2_trans_iter_init(trans, extent_iter,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
BTREE_ITER_ALL_SNAPSHOTS);
k = bch2_btree_iter_peek_slot(extent_iter);
if (bkey_err(k))
return k;
const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
if (!r) {
/* raced due to btree write buffer, nothing to do */
return bkey_s_c_null;
}
memset(data_opts, 0, sizeof(*data_opts));
data_opts->rewrite_ptrs =
bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
data_opts->target = r->target;
if (!data_opts->rewrite_ptrs) {
/*
* Check if an extent should be moved:
* returns -1 if it should not be moved, or
* device of pointer that should be moved, if known, or INT_MAX if unknown
* device we would want to write to offline? devices in target
* changed?
*
* We'll now need a full scan before this extent is picked up
* again:
*/
int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
if (ret)
return bkey_s_c_err(ret);
return bkey_s_c_null;
}
return k;
}
/*
 * Process one rebalance_work entry that refers to an extent.
 *
 * Points the moving context's stats at the rebalance work_stats, marks the
 * rebalance state as working, fetches the extent for @work_pos via
 * next_rebalance_extent() and hands it to bch2_move_extent().
 *
 * Returns 0 when there was nothing to do, when the move was started, or
 * when the move failed with an error that is neither ENOMEM nor a
 * transaction restart (such errors are dropped).  Returns a transaction
 * restart error when one occurred (ENOMEM is converted into one after
 * waiting for IO), or the error from the extent/io-opts lookup.
 */
noinline_for_stack
static int do_rebalance_extent(struct moving_context *ctxt,
struct bpos work_pos,
struct btree_iter *extent_iter)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &trans->c->rebalance;
struct data_update_opts data_opts;
struct bch_io_opts io_opts;
struct bkey_s_c k;
struct bkey_buf sk;
int ret;
/* account this work against work_stats and publish the new state */
ctxt->stats = &r->work_stats;
r->state = BCH_REBALANCE_working;
bch2_bkey_buf_init(&sk);
/* k.k == NULL with ret == 0 means there is nothing to move for this entry */
ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
extent_iter, &data_opts));
if (ret || !k.k)
goto out;
ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
if (ret)
goto out;
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
/*
* The iterator gets unlocked by __bch2_read_extent - need to
* save a copy of @k elsewhere:
*/
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
if (ret) {
if (bch2_err_matches(ret, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
/* report it as a restart so it is returned by the check just below */
ret = -BCH_ERR_transaction_restart_nested;
}
/* transaction restarts are passed back to the caller unchanged */
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto out;
/* skip it and continue, XXX signal failure */
ret = 0;
}
out:
/* sk is released on every path, including the early exits above */
bch2_bkey_buf_exit(&sk, c);
return ret;
}
static bool rebalance_pred(struct bch_fs *c, void *arg,
struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned i;
unsigned target, compression;
data_opts->rewrite_ptrs = 0;
data_opts->target = io_opts->background_target;
data_opts->extra_replicas = 0;
data_opts->btree_insert_flags = 0;
if (k.k->p.inode) {
target = io_opts->background_target;
compression = io_opts->background_compression ?: io_opts->compression;
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
if (io_opts->background_compression &&
!bch2_bkey_is_incompressible(k)) {
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached &&
p.crc.compression_type !=
bch2_compression_opt_to_type(io_opts->background_compression))
data_opts->rewrite_ptrs |= 1U << i;
i++;
}
}
if (io_opts->background_target) {
const struct bch_extent_ptr *ptr;
i = 0;
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached &&
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
data_opts->rewrite_ptrs |= 1U << i;
i++;
}
target = r ? r->target : io_opts->background_target;
compression = r ? r->compression :
(io_opts->background_compression ?: io_opts->compression);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
data_opts->target = target;
return data_opts->rewrite_ptrs != 0;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
struct data_update_opts update_opts = { 0 };
struct bkey_ptrs_c ptrs;
const struct bch_extent_ptr *ptr;
unsigned i;
struct btree_trans *trans = ctxt->trans;
struct bch_fs_rebalance *r = &trans->c->rebalance;
int ret;
if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
return;
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
i = 0;
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
if ((1U << i) && update_opts.rewrite_ptrs)
if (atomic64_add_return(k.k->size,
&bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
k.k->size)
rebalance_wakeup(c);
i++;
}
if (!inum) {
r->scan_start = BBPOS_MIN;
r->scan_end = BBPOS_MAX;
} else {
r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
}
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
sectors)
rebalance_wakeup(c);
}
r->state = BCH_REBALANCE_scanning;
struct rebalance_work {
int dev_most_full_idx;
unsigned dev_most_full_percent;
u64 dev_most_full_work;
u64 dev_most_full_capacity;
u64 total_work;
};
static void rebalance_work_accumulate(struct rebalance_work *w,
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
{
unsigned percent_full;
u64 work = dev_work + unknown_dev;
/* avoid divide by 0 */
if (!capacity)
return;
if (work < dev_work || work < unknown_dev)
work = U64_MAX;
work = min(work, capacity);
percent_full = div64_u64(work * 100, capacity);
if (percent_full >= w->dev_most_full_percent) {
w->dev_most_full_idx = idx;
w->dev_most_full_percent = percent_full;
w->dev_most_full_work = work;
w->dev_most_full_capacity = capacity;
}
if (w->total_work + dev_work >= w->total_work &&
w->total_work + dev_work >= dev_work)
w->total_work += dev_work;
}
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
struct bch_dev *ca;
struct rebalance_work ret = { .dev_most_full_idx = -1 };
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
unsigned i;
for_each_online_member(ca, c, i)
rebalance_work_accumulate(&ret,
atomic64_read(&ca->rebalance_work),
unknown_dev,
bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket),
i);
rebalance_work_accumulate(&ret,
unknown_dev, 0, c->capacity, -1);
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c);
return ret;
}
static void rebalance_work_reset(struct bch_fs *c)
static void rebalance_wait(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
u64 now = atomic64_read(&clock->now);
u64 min_member_capacity = bch2_min_rw_member_capacity(c);
for_each_online_member(ca, c, i)
atomic64_set(&ca->rebalance_work, 0);
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
atomic64_set(&c->rebalance.work_unknown_dev, 0);
r->wait_iotime_end = now + (min_member_capacity >> 6);
if (r->state != BCH_REBALANCE_waiting) {
r->wait_iotime_start = now;
r->wait_wallclock_start = ktime_get_real_ns();
r->state = BCH_REBALANCE_waiting;
}
static unsigned long curr_cputime(void)
{
u64 utime, stime;
bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
}
task_cputime_adjusted(current, &utime, &stime);
return nsecs_to_jiffies(utime + stime);
static int do_rebalance(struct moving_context *ctxt)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
struct btree_iter rebalance_work_iter, extent_iter = { NULL };
struct bkey_s_c k;
int ret = 0;
bch2_move_stats_init(&r->work_stats, "rebalance_work");
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
bch2_trans_iter_init(trans, &rebalance_work_iter,
BTREE_ID_rebalance_work, POS_MIN,
BTREE_ITER_ALL_SNAPSHOTS);
while (!bch2_move_ratelimit(ctxt) &&
!kthread_wait_freezable(r->enabled)) {
bch2_trans_begin(trans);
ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret || !k.k)
break;
ret = k.k->type == KEY_TYPE_cookie
? do_rebalance_scan(ctxt, k.k->p.inode,
le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
: do_rebalance_extent(ctxt, k.k->p, &extent_iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
bch2_btree_iter_advance(&rebalance_work_iter);
}
bch2_trans_iter_exit(trans, &extent_iter);
bch2_trans_iter_exit(trans, &rebalance_work_iter);
bch2_move_stats_exit(&r->scan_stats, c);
if (!ret &&
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
!atomic64_read(&r->scan_stats.sectors_seen)) {
bch2_trans_unlock_long(trans);
rebalance_wait(c);
}
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
return ret;
}
static int bch2_rebalance_thread(void *arg)
{
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
struct bch_move_stats move_stats;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
u64 io_start;
long throttle;
struct moving_context ctxt;
int ret;
set_freezable();
io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
bch2_move_stats_init(&move_stats, "rebalance");
while (!kthread_wait_freezable(r->enabled)) {
cond_resched();
start = jiffies;
cputime = curr_cputime();
prev_run_time = start - prev_start;
prev_run_cputime = cputime - prev_cputime;
w = rebalance_work(c);
BUG_ON(!w.dev_most_full_capacity);
if (!w.total_work) {
r->state = REBALANCE_WAITING;
kthread_wait_freezable(rebalance_work(c).total_work);
continue;
}
/*
* If there isn't much work to do, throttle cpu usage:
*/
throttle = prev_run_cputime * 100 /
max(1U, w.dev_most_full_percent) -
prev_run_time;
if (w.dev_most_full_percent < 20 && throttle > 0) {
r->throttled_until_iotime = io_start +
div_u64(w.dev_most_full_capacity *
(20 - w.dev_most_full_percent),
50);
if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
bch2_kthread_io_clock_wait(clock,
r->throttled_until_iotime,
throttle);
continue;
}
}
/* minimum 1 mb/sec: */
r->pd.rate.rate =
max_t(u64, 1 << 11,
r->pd.rate.rate *
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
r->state = REBALANCE_RUNNING;
memset(&move_stats, 0, sizeof(move_stats));
rebalance_work_reset(c);
bch2_move_data(c,
0, POS_MIN,
BTREE_ID_NR, POS_MAX,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
&move_stats,
bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
writepoint_ptr(&c->rebalance_write_point),
true,
rebalance_pred, NULL);
}
true);
while (!kthread_should_stop() &&
!(ret = do_rebalance(&ctxt)))
;
bch2_moving_ctxt_exit(&ctxt);
return 0;
}
void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 20);
prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
prt_tab(out);
prt_human_readable_u64(out, w.dev_most_full_work << 9);
prt_printf(out, "/");
prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
prt_newline(out);
prt_printf(out, "total work:");
prt_tab(out);
prt_human_readable_u64(out, w.total_work << 9);
prt_printf(out, "/");
prt_human_readable_u64(out, c->capacity << 9);
prt_newline(out);
prt_printf(out, "rate:");
prt_tab(out);
prt_printf(out, "%u", r->pd.rate.rate);
prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
printbuf_indent_add(out, 2);
switch (r->state) {
case REBALANCE_WAITING:
prt_printf(out, "waiting");
case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now);
prt_str(out, "io wait duration: ");
bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
prt_newline(out);
prt_str(out, "io wait remaining: ");
bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
prt_newline(out);
prt_str(out, "duration waited: ");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out);
break;
case REBALANCE_THROTTLED:
prt_printf(out, "throttled for %lu sec or ",
(r->throttled_until_cputime - jiffies) / HZ);
prt_human_readable_u64(out,
(r->throttled_until_iotime -
atomic64_read(&c->io_clock[WRITE].now)) << 9);
prt_printf(out, " io");
}
case BCH_REBALANCE_working:
bch2_move_stats_to_text(out, &r->work_stats);
break;
case REBALANCE_RUNNING:
prt_printf(out, "running");
case BCH_REBALANCE_scanning:
bch2_move_stats_to_text(out, &r->scan_stats);
break;
}
prt_newline(out);
printbuf_indent_sub(out, 2);
}
void bch2_rebalance_stop(struct bch_fs *c)
@ -361,6 +461,4 @@ int bch2_rebalance_start(struct bch_fs *c)
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}

View File

@ -4,6 +4,9 @@
#include "rebalance_types.h"
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
rcu_read_unlock();
}
void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);
void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);

View File

@ -2,23 +2,34 @@
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
#include "bbpos_types.h"
#include "move_types.h"
enum rebalance_state {
REBALANCE_WAITING,
REBALANCE_THROTTLED,
REBALANCE_RUNNING,
#define BCH_REBALANCE_STATES() \
x(waiting) \
x(working) \
x(scanning)
enum bch_rebalance_states {
#define x(t) BCH_REBALANCE_##t,
BCH_REBALANCE_STATES()
#undef x
};
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
struct bch_pd_controller pd;
atomic64_t work_unknown_dev;
enum bch_rebalance_states state;
u64 wait_iotime_start;
u64 wait_iotime_end;
u64 wait_wallclock_start;
enum rebalance_state state;
u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats work_stats;
struct bbpos scan_start;
struct bbpos scan_end;
struct bch_move_stats scan_stats;
unsigned enabled:1;
};

View File

@ -23,6 +23,7 @@
#include "logged_ops.h"
#include "move.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
@ -182,7 +183,7 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_journal_replay_key(trans, k));
if (ret) {
bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
goto err;
}
}
@ -225,7 +226,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
if (entry->u64s) {
r->level = entry->level;
bkey_copy(&r->key, &entry->start[0]);
bkey_copy(&r->key, (struct bkey_i *) entry->start);
r->error = 0;
} else {
r->error = -EIO;
@ -364,10 +365,12 @@ static int read_btree_roots(struct bch_fs *c)
}
if (r->error) {
__fsck_err(c, btree_id_is_alloc(i)
__fsck_err(c,
btree_id_is_alloc(i)
? FSCK_CAN_IGNORE : 0,
btree_root_bkey_invalid,
"invalid btree root %s",
bch2_btree_ids[i]);
bch2_btree_id_str(i));
if (i == BTREE_ID_alloc)
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
}
@ -375,8 +378,9 @@ static int read_btree_roots(struct bch_fs *c)
ret = bch2_btree_root_read(c, i, &r->key, r->level);
if (ret) {
fsck_err(c,
btree_root_read_error,
"error reading btree root %s",
bch2_btree_ids[i]);
bch2_btree_id_str(i));
if (btree_id_is_alloc(i))
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
ret = 0;
@ -713,6 +717,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (mustfix_fsck_err_on(c->sb.clean &&
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
clean_but_journal_not_empty,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
@ -720,7 +725,9 @@ int bch2_fs_recovery(struct bch_fs *c)
}
if (!last_journal_entry) {
fsck_err_on(!c->sb.clean, c, "no journal entries found");
fsck_err_on(!c->sb.clean, c,
dirty_but_no_journal_entries,
"no journal entries found");
if (clean)
goto use_clean;
@ -728,6 +735,13 @@ int bch2_fs_recovery(struct bch_fs *c)
if (*i) {
last_journal_entry = &(*i)->j;
(*i)->ignore = false;
/*
* This was probably a NO_FLUSH entry,
* so last_seq was garbage - but we know
* we're only using a single journal
* entry, set it here:
*/
(*i)->j.last_seq = (*i)->j.seq;
break;
}
}
@ -901,7 +915,7 @@ int bch2_fs_recovery(struct bch_fs *c)
}
kfree(clean);
if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
bch2_fs_read_write_early(c);
bch2_delete_dead_snapshots_async(c);
}
@ -946,16 +960,12 @@ int bch2_fs_initialize(struct bch_fs *c)
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
for_each_online_member(ca, c, i)
for_each_member_device(ca, c, i)
bch2_dev_usage_init(ca);
for_each_online_member(ca, c, i) {
ret = bch2_dev_journal_alloc(ca);
if (ret) {
percpu_ref_put(&ca->io_ref);
ret = bch2_fs_journal_alloc(c);
if (ret)
goto err;
}
}
/*
* journal_res_get() will crash if called before this has
@ -973,15 +983,13 @@ int bch2_fs_initialize(struct bch_fs *c)
* btree updates
*/
bch_verbose(c, "marking superblocks");
for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
percpu_ref_put(&ca->ref);
ret = bch2_trans_mark_dev_sbs(c);
bch_err_msg(c, ret, "marking superblocks");
if (ret)
goto err;
}
for_each_online_member(ca, c, i)
ca->new_fs_bucket_idx = 0;
}
ret = bch2_fs_freespace_init(c);
if (ret)

View File

@ -14,6 +14,8 @@
x(snapshots_read, PASS_ALWAYS) \
x(check_topology, 0) \
x(check_allocations, PASS_FSCK) \
x(trans_mark_dev_sbs, PASS_ALWAYS|PASS_SILENT) \
x(fs_journal_alloc, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, PASS_ALWAYS) \
x(check_alloc_info, PASS_FSCK) \
@ -27,11 +29,12 @@
x(check_snapshot_trees, PASS_FSCK) \
x(check_snapshots, PASS_FSCK) \
x(check_subvols, PASS_FSCK) \
x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \
x(delete_dead_snapshots, PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 0) \
x(resume_logged_ops, PASS_ALWAYS) \
x(check_inodes, PASS_FSCK) \
x(check_extents, PASS_FSCK) \
x(check_indirect_extents, PASS_FSCK) \
x(check_dirents, PASS_FSCK) \
x(check_xattrs, PASS_FSCK) \
x(check_root, PASS_FSCK) \
@ -39,6 +42,7 @@
x(check_nlinks, PASS_FSCK) \
x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 0) \
x(set_fs_needs_rebalance, 0) \
enum bch_recovery_pass {
#define x(n, when) BCH_RECOVERY_PASS_##n,

View File

@ -7,6 +7,7 @@
#include "inode.h"
#include "io_misc.h"
#include "io_write.h"
#include "rebalance.h"
#include "reflink.h"
#include "subvolume.h"
#include "super-io.h"
@ -27,7 +28,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
/* reflink pointers */
int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@ -74,7 +75,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
/* indirect extents */
int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@ -103,28 +104,29 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
/*
 * Turn an insert of an indirect extent with no remaining references into a
 * deletion.
 *
 * When *@flags has BTREE_TRIGGER_INSERT set and the value bkey_refcount()
 * points at for @new is zero, @new is rewritten in place as a
 * KEY_TYPE_deleted key with size 0 and an empty value, and
 * BTREE_TRIGGER_INSERT is cleared from *@flags.  Otherwise neither @new nor
 * *@flags is modified.
 */
static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
{
	if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
		new->k.type = KEY_TYPE_deleted;
		new->k.size = 0;
		set_bkey_val_u64s(&new->k, 0);
		*flags &= ~BTREE_TRIGGER_INSERT;
	}
}
int bch2_trans_mark_reflink_v(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
if (!r->v.refcount) {
r->k.type = KEY_TYPE_deleted;
r->k.size = 0;
set_bkey_val_u64s(&r->k, 0);
return 0;
}
}
check_indirect_extent_deleting(new, &flags);
return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
}
/* indirect inline data */
int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
@ -147,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_i *new,
unsigned flags)
{
if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
struct bkey_i_indirect_inline_data *r =
bkey_i_to_indirect_inline_data(new);
if (!r->v.refcount) {
r->k.type = KEY_TYPE_deleted;
r->k.size = 0;
set_bkey_val_u64s(&r->k, 0);
}
}
check_indirect_extent_deleting(new, &flags);
return 0;
}
@ -260,8 +253,9 @@ s64 bch2_remap_range(struct bch_fs *c,
struct bpos dst_start = POS(dst_inum.inum, dst_offset);
struct bpos src_start = POS(src_inum.inum, src_offset);
struct bpos dst_end = dst_start, src_end = src_start;
struct bch_io_opts opts;
struct bpos src_want;
u64 dst_done;
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
@ -277,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_init(&new_src);
trans = bch2_trans_get(c);
ret = bch2_inum_opts_get(trans, src_inum, &opts);
if (ret)
goto err;
bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
BTREE_ITER_INTENT);
bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@ -360,7 +358,10 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
ret = bch2_extent_update(trans, dst_inum, &dst_iter,
ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
opts.background_target,
opts.background_compression) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
true);
@ -394,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_trans_iter_exit(trans, &inode_iter);
} while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);

View File

@ -4,7 +4,7 @@
enum bkey_invalid_flags;
int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@ -19,7 +19,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.min_val_size = 16, \
})
int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
@ -35,7 +35,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
.min_val_size = 8, \
})
int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);

View File

@ -462,18 +462,13 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
lockdep_assert_held(&c->replicas_gc_lock);
if (ret)
goto err;
mutex_lock(&c->sb_lock);
percpu_down_write(&c->mark_lock);
ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
if (ret)
goto err;
ret = ret ?:
bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
replicas_table_update(c, &c->replicas_gc);
ret = replicas_table_update(c, &c->replicas_gc);
err:
kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
@ -579,12 +574,9 @@ int bch2_replicas_gc2(struct bch_fs *c)
bch2_cpu_replicas_sort(&new);
ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
if (ret)
goto err;
ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
replicas_table_update(c, &new);
ret = replicas_table_update(c, &new);
err:
kfree(new.entries);
percpu_up_write(&c->mark_lock);

View File

@ -82,6 +82,7 @@ int bch2_verify_superblock_clean(struct bch_fs *c,
int ret = 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
sb_clean_journal_seq_mismatch,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
le64_to_cpu(j->seq))) {
@ -119,6 +120,7 @@ int bch2_verify_superblock_clean(struct bch_fs *c,
k1->k.u64s != k2->k.u64s ||
memcmp(k1, k2, bkey_bytes(&k1->k)) ||
l1 != l2, c,
sb_clean_btree_root_mismatch,
"superblock btree root %u doesn't match journal after clean shutdown\n"
"sb: l=%u %s\n"
"journal: l=%u %s\n", i,
@ -140,6 +142,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
if (fsck_err_on(!sb_clean, c,
sb_clean_missing,
"superblock marked clean but clean section not present")) {
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
@ -373,7 +376,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
entry = sb_clean->start;
bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
memset(entry, 0,

172
fs/bcachefs/sb-errors.c Normal file
View File

@ -0,0 +1,172 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "sb-errors.h"
#include "super-io.h"
/*
 * Printable names for fsck error IDs, indexed by enum bch_sb_error_id.
 *
 * Every BCH_SB_ERRS() entry x(name, n) fills slot [n] with the string "name".
 * Because the table is built from designated initializers, a number that
 * BCH_SB_ERRS() does not list leaves a NULL slot, and if two entries share a
 * number the later one silently replaces the earlier. The trailing NULL
 * lands one past the last listed entry.
 */
static const char * const bch2_sb_error_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_SB_ERRS()
#undef x	/* don't leak the helper macro into the rest of the file */
	NULL
};
/*
 * Print the symbolic name of fsck error @id to @out.
 *
 * IDs at or beyond BCH_SB_ERR_MAX, and in-range IDs whose name table slot is
 * empty, are printed as "(unknown error N)" instead.
 */
static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
{
	/*
	 * The name table is built with designated initializers, so a number
	 * that BCH_SB_ERRS() skips leaves a NULL slot. IDs come from the
	 * on-disk superblock, so never hand such a slot to prt_str().
	 */
	if (id < BCH_SB_ERR_MAX && bch2_sb_error_strs[id])
		prt_str(out, bch2_sb_error_strs[id]);
	else
		prt_printf(out, "(unknown error %u)", id);
}
/*
 * Number of error entries held by superblock field @e: the field's size minus
 * its header, divided by the size of one entry. A NULL @e counts as 0.
 */
static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
{
	if (!e)
		return 0;

	return (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]);
}
static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
{
return (sizeof(struct bch_sb_field_errors) +
sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
}
static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
struct printbuf *err)
{
struct bch_sb_field_errors *e = field_to_type(f, errors);
unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
for (i = 0; i < nr; i++) {
if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
prt_printf(err, "entry with count 0 (id ");
bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
prt_printf(err, ")");
return -BCH_ERR_invalid_sb_errors;
}
if (i + 1 < nr &&
BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
prt_printf(err, "entries out of order");
return -BCH_ERR_invalid_sb_errors;
}
}
return 0;
}
/*
 * Dump the errors superblock section to @out, one line per entry:
 * error name, occurrence count, and time of the most recent occurrence,
 * separated by tabs.
 */
static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
				   struct bch_sb_field *f)
{
	struct bch_sb_field_errors *e = field_to_type(f, errors);
	unsigned nr = bch2_sb_field_errors_nr_entries(e);
	unsigned idx = 0;

	/* Only add a tabstop of our own if the caller hasn't set up columns */
	if (out->nr_tabstops <= 1)
		printbuf_tabstop_push(out, 16);

	while (idx < nr) {
		bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[idx]));
		prt_tab(out);

		prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[idx]));
		prt_tab(out);

		bch2_prt_datetime(out, le64_to_cpu(e->entries[idx].last_error_time));
		prt_newline(out);

		idx++;
	}
}
/*
 * Superblock field operations for the errors section: hooks up the
 * validation and text-dump routines defined above.
 */
const struct bch_sb_field_ops bch_sb_field_ops_errors = {
.validate = bch2_sb_errors_validate,
.to_text = bch2_sb_errors_to_text,
};
void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
{
bch_sb_errors_cpu *e = &c->fsck_error_counts;
struct bch_sb_error_entry_cpu n = {
.id = err,
.nr = 1,
.last_error_time = ktime_get_real_seconds()
};
unsigned i;
mutex_lock(&c->fsck_error_counts_lock);
for (i = 0; i < e->nr; i++) {
if (err == e->data[i].id) {
e->data[i].nr++;
e->data[i].last_error_time = n.last_error_time;
goto out;
}
if (err < e->data[i].id)
break;
}
if (darray_make_room(e, 1))
goto out;
darray_insert_item(e, i, n);
out:
mutex_unlock(&c->fsck_error_counts_lock);
}
/*
 * Copy the in-memory fsck error counters into the superblock errors section.
 *
 * The section is resized to hold exactly the current number of entries; if
 * the resize fails the superblock section is left untouched.
 *
 * fsck_error_counts_lock is held for the whole copy: bch2_sb_error_count()
 * may grow and shift the counters array under that lock, so both the entry
 * count used for sizing and the entries themselves must be read under it.
 *
 * NOTE(review): callers presumably hold sb_lock already; nothing in this file
 * takes sb_lock while holding fsck_error_counts_lock, so the nesting order
 * sb_lock -> fsck_error_counts_lock looks safe - confirm against callers.
 */
void bch2_sb_errors_from_cpu(struct bch_fs *c)
{
	bch_sb_errors_cpu *src = &c->fsck_error_counts;
	struct bch_sb_field_errors *dst;
	unsigned i;

	mutex_lock(&c->fsck_error_counts_lock);

	dst = bch2_sb_field_resize(&c->disk_sb, errors,
				   bch2_sb_field_errors_u64s(src->nr));
	if (!dst)
		goto out;

	for (i = 0; i < src->nr; i++) {
		SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
		SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
		dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
	}
out:
	mutex_unlock(&c->fsck_error_counts_lock);
}
/*
 * Load the superblock errors section into the in-memory counters.
 *
 * Returns 0 on success (including when the section is absent or empty), or
 * the error from darray_make_room() if the counters array cannot be grown.
 */
static int bch2_sb_errors_to_cpu(struct bch_fs *c)
{
	struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
	bch_sb_errors_cpu *dst = &c->fsck_error_counts;
	unsigned nr = bch2_sb_field_errors_nr_entries(src);
	int ret;

	if (!nr)
		return 0;

	mutex_lock(&c->fsck_error_counts_lock);

	ret = darray_make_room(dst, nr);
	if (!ret) {
		dst->nr = nr;

		for (unsigned i = 0; i < nr; i++) {
			dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
			dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
			dst->data[i].last_error_time =
				le64_to_cpu(src->entries[i].last_error_time);
		}
	}

	mutex_unlock(&c->fsck_error_counts_lock);
	return ret;
}
void bch2_fs_sb_errors_exit(struct bch_fs *c)
{
darray_exit(&c->fsck_error_counts);
}
void bch2_fs_sb_errors_init_early(struct bch_fs *c)
{
mutex_init(&c->fsck_error_counts_lock);
darray_init(&c->fsck_error_counts);
}
/*
 * Populate the in-memory fsck error counters from the superblock.
 * Returns 0 on success or the error from bch2_sb_errors_to_cpu().
 */
int bch2_fs_sb_errors_init(struct bch_fs *c)
{
	int ret = bch2_sb_errors_to_cpu(c);

	return ret;
}

270
fs/bcachefs/sb-errors.h Normal file
View File

@ -0,0 +1,270 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SB_ERRORS_H
#define _BCACHEFS_SB_ERRORS_H
#include "sb-errors_types.h"
/*
 * Master list of fsck error types, as x(name, id).
 *
 * The id is what gets stored in the superblock errors section and is also
 * the index into the error name table, so every id must be unique and must
 * never be renumbered once released. New errors go at the end with the next
 * unused number.
 */
#define BCH_SB_ERRS() \
	x(clean_but_journal_not_empty, 0) \
	x(dirty_but_no_journal_entries, 1) \
	x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \
	x(sb_clean_journal_seq_mismatch, 3) \
	x(sb_clean_btree_root_mismatch, 4) \
	x(sb_clean_missing, 5) \
	x(jset_unsupported_version, 6) \
	x(jset_unknown_csum, 7) \
	x(jset_last_seq_newer_than_seq, 8) \
	x(jset_past_bucket_end, 9) \
	x(jset_seq_blacklisted, 10) \
	x(journal_entries_missing, 11) \
	x(journal_entry_replicas_not_marked, 12) \
	x(journal_entry_past_jset_end, 13) \
	x(journal_entry_replicas_data_mismatch, 14) \
	x(journal_entry_bkey_u64s_0, 15) \
	x(journal_entry_bkey_past_end, 16) \
	x(journal_entry_bkey_bad_format, 17) \
	x(journal_entry_bkey_invalid, 18) \
	x(journal_entry_btree_root_bad_size, 19) \
	x(journal_entry_blacklist_bad_size, 20) \
	x(journal_entry_blacklist_v2_bad_size, 21) \
	x(journal_entry_blacklist_v2_start_past_end, 22) \
	x(journal_entry_usage_bad_size, 23) \
	x(journal_entry_data_usage_bad_size, 24) \
	x(journal_entry_clock_bad_size, 25) \
	x(journal_entry_clock_bad_rw, 26) \
	x(journal_entry_dev_usage_bad_size, 27) \
	x(journal_entry_dev_usage_bad_dev, 28) \
	x(journal_entry_dev_usage_bad_pad, 29) \
	x(btree_node_unreadable, 30) \
	x(btree_node_fault_injected, 31) \
	x(btree_node_bad_magic, 32) \
	x(btree_node_bad_seq, 33) \
	x(btree_node_unsupported_version, 34) \
	x(btree_node_bset_older_than_sb_min, 35) \
	x(btree_node_bset_newer_than_sb, 36) \
	x(btree_node_data_missing, 37) \
	x(btree_node_bset_after_end, 38) \
	x(btree_node_replicas_sectors_written_mismatch, 39) \
	x(btree_node_replicas_data_mismatch, 40) \
	x(bset_unknown_csum, 41) \
	x(bset_bad_csum, 42) \
	x(bset_past_end_of_btree_node, 43) \
	x(bset_wrong_sector_offset, 44) \
	x(bset_empty, 45) \
	x(bset_bad_seq, 46) \
	x(bset_blacklisted_journal_seq, 47) \
	x(first_bset_blacklisted_journal_seq, 48) \
	x(btree_node_bad_btree, 49) \
	x(btree_node_bad_level, 50) \
	x(btree_node_bad_min_key, 51) \
	x(btree_node_bad_max_key, 52) \
	x(btree_node_bad_format, 53) \
	x(btree_node_bkey_past_bset_end, 54) \
	x(btree_node_bkey_bad_format, 55) \
	x(btree_node_bad_bkey, 56) \
	x(btree_node_bkey_out_of_order, 57) \
	x(btree_root_bkey_invalid, 58) \
	x(btree_root_read_error, 59) \
	x(btree_root_bad_min_key, 60) \
	x(btree_root_bad_max_key, 61) \
	x(btree_node_read_error, 62) \
	x(btree_node_topology_bad_min_key, 63) \
	x(btree_node_topology_bad_max_key, 64) \
	x(btree_node_topology_overwritten_by_prev_node, 65) \
	x(btree_node_topology_overwritten_by_next_node, 66) \
	x(btree_node_topology_interior_node_empty, 67) \
	x(fs_usage_hidden_wrong, 68) \
	x(fs_usage_btree_wrong, 69) \
	x(fs_usage_data_wrong, 70) \
	x(fs_usage_cached_wrong, 71) \
	x(fs_usage_reserved_wrong, 72) \
	x(fs_usage_persistent_reserved_wrong, 73) \
	x(fs_usage_nr_inodes_wrong, 74) \
	x(fs_usage_replicas_wrong, 75) \
	x(dev_usage_buckets_wrong, 76) \
	x(dev_usage_sectors_wrong, 77) \
	x(dev_usage_fragmented_wrong, 78) \
	x(dev_usage_buckets_ec_wrong, 79) \
	x(bkey_version_in_future, 80) \
	x(bkey_u64s_too_small, 81) \
	x(bkey_invalid_type_for_btree, 82) \
	x(bkey_extent_size_zero, 83) \
	x(bkey_extent_size_greater_than_offset, 84) \
	x(bkey_size_nonzero, 85) \
	x(bkey_snapshot_nonzero, 86) \
	x(bkey_snapshot_zero, 87) \
	x(bkey_at_pos_max, 88) \
	x(bkey_before_start_of_btree_node, 89) \
	x(bkey_after_end_of_btree_node, 90) \
	x(bkey_val_size_nonzero, 91) \
	x(bkey_val_size_too_small, 92) \
	x(alloc_v1_val_size_bad, 93) \
	x(alloc_v2_unpack_error, 94) \
	x(alloc_v3_unpack_error, 95) \
	x(alloc_v4_val_size_bad, 96) \
	x(alloc_v4_backpointers_start_bad, 97) \
	x(alloc_key_data_type_bad, 98) \
	x(alloc_key_empty_but_have_data, 99) \
	x(alloc_key_dirty_sectors_0, 100) \
	x(alloc_key_data_type_inconsistency, 101) \
	x(alloc_key_to_missing_dev_bucket, 102) \
	x(alloc_key_cached_inconsistency, 103) \
	x(alloc_key_cached_but_read_time_zero, 104) \
	x(alloc_key_to_missing_lru_entry, 105) \
	x(alloc_key_data_type_wrong, 106) \
	x(alloc_key_gen_wrong, 107) \
	x(alloc_key_dirty_sectors_wrong, 108) \
	x(alloc_key_cached_sectors_wrong, 109) \
	x(alloc_key_stripe_wrong, 110) \
	x(alloc_key_stripe_redundancy_wrong, 111) \
	x(bucket_sector_count_overflow, 112) \
	x(bucket_metadata_type_mismatch, 113) \
	x(need_discard_key_wrong, 114) \
	x(freespace_key_wrong, 115) \
	x(freespace_hole_missing, 116) \
	x(bucket_gens_val_size_bad, 117) \
	x(bucket_gens_key_wrong, 118) \
	x(bucket_gens_hole_wrong, 119) \
	x(bucket_gens_to_invalid_dev, 120) \
	x(bucket_gens_to_invalid_buckets, 121) \
	x(bucket_gens_nonzero_for_invalid_buckets, 122) \
	x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \
	x(need_discard_freespace_key_bad, 124) \
	x(backpointer_pos_wrong, 125) \
	x(backpointer_to_missing_device, 126) \
	x(backpointer_to_missing_alloc, 127) \
	x(backpointer_to_missing_ptr, 128) \
	x(lru_entry_at_time_0, 129) \
	x(lru_entry_to_invalid_bucket, 130) \
	x(lru_entry_bad, 131) \
	x(btree_ptr_val_too_big, 132) \
	x(btree_ptr_v2_val_too_big, 133) \
	x(btree_ptr_has_non_ptr, 134) \
	x(extent_ptrs_invalid_entry, 135) \
	x(extent_ptrs_no_ptrs, 136) \
	x(extent_ptrs_too_many_ptrs, 137) \
	x(extent_ptrs_redundant_crc, 138) \
	x(extent_ptrs_redundant_stripe, 139) \
	x(extent_ptrs_unwritten, 140) \
	x(extent_ptrs_written_and_unwritten, 141) \
	x(ptr_to_invalid_device, 142) \
	x(ptr_to_duplicate_device, 143) \
	x(ptr_after_last_bucket, 144) \
	x(ptr_before_first_bucket, 145) \
	x(ptr_spans_multiple_buckets, 146) \
	x(ptr_to_missing_backpointer, 147) \
	x(ptr_to_missing_alloc_key, 148) \
	x(ptr_to_missing_replicas_entry, 149) \
	x(ptr_to_missing_stripe, 150) \
	x(ptr_to_incorrect_stripe, 151) \
	x(ptr_gen_newer_than_bucket_gen, 152) \
	x(ptr_too_stale, 153) \
	x(stale_dirty_ptr, 154) \
	x(ptr_bucket_data_type_mismatch, 155) \
	x(ptr_cached_and_erasure_coded, 156) \
	x(ptr_crc_uncompressed_size_too_small, 157) \
	x(ptr_crc_csum_type_unknown, 158) \
	x(ptr_crc_compression_type_unknown, 159) \
	x(ptr_crc_redundant, 160) \
	x(ptr_crc_uncompressed_size_too_big, 161) \
	x(ptr_crc_nonce_mismatch, 162) \
	x(ptr_stripe_redundant, 163) \
	x(reservation_key_nr_replicas_invalid, 164) \
	x(reflink_v_refcount_wrong, 165) \
	x(reflink_p_to_missing_reflink_v, 166) \
	x(stripe_pos_bad, 167) \
	x(stripe_val_size_bad, 168) \
	x(stripe_sector_count_wrong, 169) \
	x(snapshot_tree_pos_bad, 170) \
	x(snapshot_tree_to_missing_snapshot, 171) \
	x(snapshot_tree_to_missing_subvol, 172) \
	x(snapshot_tree_to_wrong_subvol, 173) \
	x(snapshot_tree_to_snapshot_subvol, 174) \
	x(snapshot_pos_bad, 175) \
	x(snapshot_parent_bad, 176) \
	x(snapshot_children_not_normalized, 177) \
	x(snapshot_child_duplicate, 178) \
	x(snapshot_child_bad, 179) \
	x(snapshot_skiplist_not_normalized, 180) \
	x(snapshot_skiplist_bad, 181) \
	x(snapshot_should_not_have_subvol, 182) \
	x(snapshot_to_bad_snapshot_tree, 183) \
	x(snapshot_bad_depth, 184) \
	x(snapshot_bad_skiplist, 185) \
	x(subvol_pos_bad, 186) \
	x(subvol_not_master_and_not_snapshot, 187) \
	x(subvol_to_missing_root, 188) \
	x(subvol_root_wrong_bi_subvol, 189) \
	x(bkey_in_missing_snapshot, 190) \
	x(inode_pos_inode_nonzero, 191) \
	x(inode_pos_blockdev_range, 192) \
	x(inode_unpack_error, 193) \
	x(inode_str_hash_invalid, 194) \
	x(inode_v3_fields_start_bad, 195) \
	x(inode_snapshot_mismatch, 196) \
	x(inode_unlinked_but_clean, 197) \
	x(inode_unlinked_but_nlink_nonzero, 198) \
	x(inode_checksum_type_invalid, 199) \
	x(inode_compression_type_invalid, 200) \
	x(inode_subvol_root_but_not_dir, 201) \
	x(inode_i_size_dirty_but_clean, 202) \
	x(inode_i_sectors_dirty_but_clean, 203) \
	x(inode_i_sectors_wrong, 204) \
	x(inode_dir_wrong_nlink, 205) \
	x(inode_dir_multiple_links, 206) \
	x(inode_multiple_links_but_nlink_0, 207) \
	x(inode_wrong_backpointer, 208) \
	x(inode_wrong_nlink, 209) \
	x(inode_unreachable, 210) \
	x(deleted_inode_but_clean, 211) \
	x(deleted_inode_missing, 212) \
	x(deleted_inode_is_dir, 213) \
	x(deleted_inode_not_unlinked, 214) \
	x(extent_overlapping, 215) \
	x(extent_in_missing_inode, 216) \
	x(extent_in_non_reg_inode, 217) \
	x(extent_past_end_of_inode, 218) \
	x(dirent_empty_name, 219) \
	x(dirent_val_too_big, 220) \
	x(dirent_name_too_long, 221) \
	x(dirent_name_embedded_nul, 222) \
	x(dirent_name_dot_or_dotdot, 223) \
	x(dirent_name_has_slash, 224) \
	x(dirent_d_type_wrong, 225) \
	x(dirent_d_parent_subvol_wrong, 226) \
	x(dirent_in_missing_dir_inode, 227) \
	x(dirent_in_non_dir_inode, 228) \
	x(dirent_to_missing_inode, 229) \
	x(dirent_to_missing_subvol, 230) \
	x(dirent_to_itself, 231) \
	x(quota_type_invalid, 232) \
	x(xattr_val_size_too_small, 233) \
	x(xattr_val_size_too_big, 234) \
	x(xattr_invalid_type, 235) \
	x(xattr_name_invalid_chars, 236) \
	x(xattr_in_missing_inode, 237) \
	x(root_subvol_missing, 238) \
	x(root_dir_missing, 239) \
	x(root_inode_not_dir, 240) \
	x(dir_loop, 241) \
	x(hash_table_key_duplicate, 242) \
	x(hash_table_key_wrong_offset, 243)
/*
 * Numeric IDs for every fsck error type, generated from BCH_SB_ERRS():
 * each x(name, n) entry becomes BCH_FSCK_ERR_name = n.
 */
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
BCH_SB_ERRS()
#undef x
/* One past the last entry of BCH_SB_ERRS(); not a real error */
BCH_SB_ERR_MAX
};
/* Validation and to_text hooks for the errors superblock section */
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
/* Count one occurrence of the given fsck error in the in-memory counters */
void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
/* Write the in-memory counters into the superblock errors section */
void bch2_sb_errors_from_cpu(struct bch_fs *);
/* Lifecycle of the in-memory counters: free, early init, load from superblock */
void bch2_fs_sb_errors_exit(struct bch_fs *);
void bch2_fs_sb_errors_init_early(struct bch_fs *);
int bch2_fs_sb_errors_init(struct bch_fs *);
#endif /* _BCACHEFS_SB_ERRORS_H */

View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
#define _BCACHEFS_SB_ERRORS_TYPES_H
#include "darray.h"
/*
 * In-memory (native endian) counter for one fsck error type.
 */
struct bch_sb_error_entry_cpu {
/* id: fsck error id (enum bch_sb_error_id); nr: number of occurrences */
u64 id:16,
nr:48;
/* Time of the most recent occurrence, from ktime_get_real_seconds() */
u64 last_error_time;
};
/* Dynamic array of per-error counters */
typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */

View File

@ -7,21 +7,28 @@
#include "sb-members.h"
#include "super-io.h"
/* Code for bch_sb_field_members_v1: */
#define x(t, n, ...) [n] = #t,
static const char * const bch2_iops_measurements[] = {
BCH_IOPS_MEASUREMENTS()
NULL
};
static struct bch_member *members_v2_get_mut(struct bch_sb_field_members_v2 *mi, int i)
{
return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
}
char * const bch2_member_error_strs[] = {
BCH_MEMBER_ERROR_TYPES()
NULL
};
#undef x
/* Code for bch_sb_field_members_v1: */
struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
{
return members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
}
static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
{
struct bch_member ret, *p = members_v2_get_mut(mi, i);
struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
memset(&ret, 0, sizeof(ret));
memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
return ret;
@ -36,7 +43,8 @@ static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int
{
struct bch_member ret, *p = members_v1_get_mut(mi, i);
memset(&ret, 0, sizeof(ret));
memcpy(&ret, p, min_t(size_t, sizeof(struct bch_member), sizeof(ret))); return ret;
memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
return ret;
}
struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
@ -62,7 +70,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
memmove(dst, members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
memset(dst + le16_to_cpu(mi->member_bytes),
0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
}
@ -71,7 +79,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
return 0;
}
int bch2_members_v2_init(struct bch_fs *c)
int bch2_sb_members_v2_init(struct bch_fs *c)
{
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
@ -91,7 +99,7 @@ int bch2_members_v2_init(struct bch_fs *c)
return sb_members_v2_resize_entries(c);
}
int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
{
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
@ -105,7 +113,7 @@ int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
memcpy(members_v1_get_mut(mi1, i), members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
return 0;
}
@ -155,6 +163,8 @@ static void member_to_text(struct printbuf *out,
u64 bucket_size = le16_to_cpu(m.bucket_size);
u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
if (!bch2_member_exists(&m))
return;
prt_printf(out, "Device:");
prt_tab(out);
@ -163,6 +173,21 @@ static void member_to_text(struct printbuf *out,
printbuf_indent_add(out, 2);
prt_printf(out, "Label:");
prt_tab(out);
if (BCH_MEMBER_GROUP(&m)) {
unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
if (idx < disk_groups_nr(gi))
prt_printf(out, "%s (%u)",
gi->entries[idx].label, idx);
else
prt_printf(out, "(bad disk labels section)");
} else {
prt_printf(out, "(none)");
}
prt_newline(out);
prt_printf(out, "UUID:");
prt_tab(out);
pr_uuid(out, m.uuid.b);
@ -173,6 +198,13 @@ static void member_to_text(struct printbuf *out,
prt_units_u64(out, device_size << 9);
prt_newline(out);
for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
prt_tab(out);
prt_u64(out, le64_to_cpu(m.errors[i]));
prt_newline(out);
}
for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
prt_tab(out);
@ -198,7 +230,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Last mount:");
prt_tab(out);
if (m.last_mount)
pr_time(out, le64_to_cpu(m.last_mount));
bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
else
prt_printf(out, "(never)");
prt_newline(out);
@ -211,21 +243,6 @@ static void member_to_text(struct printbuf *out,
: "unknown");
prt_newline(out);
prt_printf(out, "Label:");
prt_tab(out);
if (BCH_MEMBER_GROUP(&m)) {
unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
if (idx < disk_groups_nr(gi))
prt_printf(out, "%s (%u)",
gi->entries[idx].label, idx);
else
prt_printf(out, "(bad disk labels section)");
} else {
prt_printf(out, "(none)");
}
prt_newline(out);
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
@ -262,8 +279,7 @@ static int bch2_sb_members_v1_validate(struct bch_sb *sb,
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
unsigned i;
if ((void *) members_v1_get_mut(mi, sb->nr_devices) >
vstruct_end(&mi->field)) {
if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
prt_printf(err, "too many devices for section size");
return -BCH_ERR_invalid_sb_members;
}
@ -286,10 +302,8 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
unsigned i;
for (i = 0; i < sb->nr_devices; i++) {
struct bch_member m = members_v1_get(mi, i);
member_to_text(out, m, gi, sb, i);
}
for (i = 0; i < sb->nr_devices; i++)
member_to_text(out, members_v1_get(mi, i), gi, sb, i);
}
const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
@ -304,10 +318,8 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
unsigned i;
for (i = 0; i < sb->nr_devices; i++) {
struct bch_member m = members_v2_get(mi, i);
member_to_text(out, m, gi, sb, i);
}
for (i = 0; i < sb->nr_devices; i++)
member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
static int bch2_sb_members_v2_validate(struct bch_sb *sb,
@ -315,7 +327,7 @@ static int bch2_sb_members_v2_validate(struct bch_sb *sb,
struct printbuf *err)
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
size_t mi_bytes = (void *) members_v2_get_mut(mi, sb->nr_devices) -
size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
(void *) mi;
if (mi_bytes > vstruct_bytes(&mi->field)) {
@ -337,3 +349,72 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
.validate = bch2_sb_members_v2_validate,
.to_text = bch2_sb_members_v2_to_text,
};
/*
 * Copy each member device's in-memory IO error counters into its
 * members_v2 superblock entry, converting to little endian.
 */
void bch2_sb_members_from_cpu(struct bch_fs *c)
{
	struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
	struct bch_dev *ca;
	unsigned dev_idx;

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, dev_idx, NULL) {
		struct bch_member *m = __bch2_members_v2_get_mut(mi, dev_idx);
		unsigned type = 0;

		while (type < BCH_MEMBER_ERROR_NR) {
			m->errors[type] = cpu_to_le64(atomic64_read(&ca->errors[type]));
			type++;
		}
	}
	rcu_read_unlock();
}
void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct bch_member m;
mutex_lock(&ca->fs->sb_lock);
m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
mutex_unlock(&ca->fs->sb_lock);
printbuf_tabstop_push(out, 12);
prt_str(out, "IO errors since filesystem creation");
prt_newline(out);
printbuf_indent_add(out, 2);
for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
prt_printf(out, "%s:", bch2_member_error_strs[i]);
prt_tab(out);
prt_u64(out, atomic64_read(&ca->errors[i]));
prt_newline(out);
}
printbuf_indent_sub(out, 2);
prt_str(out, "IO errors since ");
bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
prt_str(out, " ago");
prt_newline(out);
printbuf_indent_add(out, 2);
for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
prt_printf(out, "%s:", bch2_member_error_strs[i]);
prt_tab(out);
prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
prt_newline(out);
}
printbuf_indent_sub(out, 2);
}
/*
 * Reset the "errors since reset" baseline for device @ca.
 *
 * Snapshots the current IO error counters into the member's errors_at_reset
 * array, stamps errors_reset_time with the current wall-clock time, and
 * writes the superblock, all under sb_lock.
 */
void bch2_dev_errors_reset(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;
	struct bch_member *m;

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
		m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
	/*
	 * Superblock member fields are little endian on disk, and
	 * bch2_dev_io_errors_to_text() reads this back with le64_to_cpu(), so
	 * it must be stored with cpu_to_le64() like errors_at_reset[] above.
	 */
	m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);
}

View File

@ -2,8 +2,16 @@
#ifndef _BCACHEFS_SB_MEMBERS_H
#define _BCACHEFS_SB_MEMBERS_H
int bch2_members_v2_init(struct bch_fs *c);
int bch_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
extern char * const bch2_member_error_strs[];
/*
 * Pointer to the @i'th member entry of a members_v2 section. Entries are
 * mi->member_bytes apart, as recorded in the section itself.
 */
static inline struct bch_member *
__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
{
	unsigned stride = le16_to_cpu(mi->member_bytes);

	return (void *) mi->_members + i * stride;
}
int bch2_sb_members_v2_init(struct bch_fs *c);
int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
@ -179,4 +187,41 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
/* A member slot is in use iff its UUID is not all zeroes. */
static inline bool bch2_member_exists(struct bch_member *m)
{
	bool uuid_is_zero = bch2_is_zero(&m->uuid, sizeof(m->uuid));

	return !uuid_is_zero;
}
/*
 * True if device index @dev is within sb->nr_devices and its member slot is
 * in use.
 */
static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
{
	struct bch_member m;

	if (dev >= sb->nr_devices)
		return false;

	m = bch2_sb_member_get(sb, dev);
	return bch2_member_exists(&m);
}
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
.nbuckets = le64_to_cpu(mi->nbuckets),
.first_bucket = le16_to_cpu(mi->first_bucket),
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
.state = BCH_MEMBER_STATE(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
.durability = BCH_MEMBER_DURABILITY(mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = bch2_member_exists(mi),
};
}
void bch2_sb_members_from_cpu(struct bch_fs *);
void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
void bch2_dev_errors_reset(struct bch_dev *);
#endif /* _BCACHEFS_SB_MEMBERS_H */

View File

@ -11,6 +11,8 @@
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <trace/events/lock.h>
#include "six.h"
#ifdef DEBUG
@ -462,11 +464,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
smp_mb__after_atomic();
}
trace_contention_begin(lock, 0);
lock_contended(&lock->dep_map, ip);
if (six_optimistic_spin(lock, type))
goto out;
lock_contended(&lock->dep_map, ip);
wait->task = current;
wait->lock_want = type;
wait->lock_acquired = false;
@ -546,6 +549,7 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
six_clear_bitmask(lock, SIX_LOCK_HELD_write);
six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
}
trace_contention_end(lock, 0);
return ret;
}

View File

@ -30,17 +30,18 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(t.v->root_snapshot));
}
int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
bkey_lt(k.k->p, POS(0, 1))) {
prt_printf(err, "bad pos");
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return 0;
bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
bkey_lt(k.k->p, POS(0, 1)), c, err,
snapshot_tree_pos_bad,
"bad pos");
fsck_err:
return ret;
}
int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
@ -202,68 +203,60 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
le32_to_cpu(s.v->skip[2]));
}
int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags,
struct printbuf *err)
{
struct bkey_s_c_snapshot s;
u32 i, id;
int ret = 0;
if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
bkey_lt(k.k->p, POS(0, 1))) {
prt_printf(err, "bad pos");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
bkey_lt(k.k->p, POS(0, 1)), c, err,
snapshot_pos_bad,
"bad pos");
s = bkey_s_c_to_snapshot(k);
id = le32_to_cpu(s.v->parent);
if (id && id <= k.k->p.offset) {
prt_printf(err, "bad parent node (%u <= %llu)",
bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
snapshot_parent_bad,
"bad parent node (%u <= %llu)",
id, k.k->p.offset);
return -BCH_ERR_invalid_bkey;
}
if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
prt_printf(err, "children not normalized");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
snapshot_children_not_normalized,
"children not normalized");
if (s.v->children[0] &&
s.v->children[0] == s.v->children[1]) {
prt_printf(err, "duplicate child nodes");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
snapshot_child_duplicate,
"duplicate child nodes");
for (i = 0; i < 2; i++) {
id = le32_to_cpu(s.v->children[i]);
if (id >= k.k->p.offset) {
prt_printf(err, "bad child node (%u >= %llu)",
bkey_fsck_err_on(id >= k.k->p.offset, c, err,
snapshot_child_bad,
"bad child node (%u >= %llu)",
id, k.k->p.offset);
return -BCH_ERR_invalid_bkey;
}
}
if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
prt_printf(err, "skiplist not normalized");
return -BCH_ERR_invalid_bkey;
}
bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
snapshot_skiplist_not_normalized,
"skiplist not normalized");
for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
id = le32_to_cpu(s.v->skip[i]);
if ((id && !s.v->parent) ||
(id && id <= k.k->p.offset)) {
prt_printf(err, "bad skiplist node %u", id);
return -BCH_ERR_invalid_bkey;
bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
snapshot_skiplist_bad,
"bad skiplist node %u", id);
}
}
}
return 0;
fsck_err:
return ret;
}
static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
@ -325,8 +318,9 @@ int bch2_mark_snapshot(struct btree_trans *trans,
__set_is_ancestor_bitmap(c, id);
if (BCH_SNAPSHOT_DELETED(s.v)) {
set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
bch2_delete_dead_snapshots_async(c);
}
} else {
memset(t, 0, sizeof(*t));
@ -529,7 +523,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
if (fsck_err_on(ret ||
root_id != bch2_snapshot_root(c, root_id) ||
st.k->p.offset != le32_to_cpu(s.tree),
c,
c, snapshot_tree_to_missing_snapshot,
"snapshot tree points to missing/incorrect snapshot:\n %s",
(bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
@ -541,17 +535,20 @@ static int check_snapshot_tree(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (fsck_err_on(ret, c,
if (fsck_err_on(ret,
c, snapshot_tree_to_missing_subvol,
"snapshot tree points to missing subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
le32_to_cpu(subvol.snapshot),
root_id), c,
root_id),
c, snapshot_tree_to_wrong_subvol,
"snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
c, snapshot_tree_to_snapshot_subvol,
"snapshot tree points to snapshot subvolume:\n %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
@ -787,7 +784,9 @@ static int check_snapshot(struct btree_trans *trans,
goto err;
}
} else {
if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s",
if (fsck_err_on(s.subvol,
c, snapshot_should_not_have_subvol,
"snapshot should not point to subvol:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@ -803,7 +802,8 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s",
if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
"snapshot points to missing/incorrect tree:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
if (ret)
@ -815,7 +815,8 @@ static int check_snapshot(struct btree_trans *trans,
if (le32_to_cpu(s.depth) != real_depth &&
(c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s",
fsck_err(c, snapshot_bad_depth,
"snapshot with incorrect depth field, should be %u:\n %s",
real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@ -832,7 +833,8 @@ static int check_snapshot(struct btree_trans *trans,
if (!ret &&
(c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
fsck_err(c, "snapshot with bad skiplist field:\n %s",
fsck_err(c, snapshot_bad_skiplist,
"snapshot with bad skiplist field:\n %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@ -1251,13 +1253,7 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans,
return 0;
}
/*
* For a given snapshot, if it doesn't have a subvolume that points to it, and
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
* as deleted.
*/
static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k)
static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
{
struct bkey_s_c_snapshot snap;
u32 children[2];
@ -1278,10 +1274,21 @@ static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btre
bch2_snapshot_live(trans, children[1]);
if (ret < 0)
return ret;
return !ret;
}
if (!ret)
return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
return 0;
/*
 * A snapshot that no subvolume points to and that has no child snapshot nodes
 * is redundant, and can be marked as deleted.
 *
 * Asks bch2_snapshot_needs_delete() about @k: a negative (error) or zero
 * result is returned unchanged; a positive result marks the snapshot node at
 * k.k->p.offset as deleted and returns the result of that update.
 */
static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
	int ret = bch2_snapshot_needs_delete(trans, k);

	/* Error, or nothing to delete: hand the result straight back */
	if (ret <= 0)
		return ret;

	return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
@ -1342,12 +1349,12 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
u32 id = le32_to_cpu(s->v.skip[j]);
if (snapshot_list_has_id(deleted, id)) {
id = depth > 1
? bch2_snapshot_nth_parent_skip(c,
id = bch2_snapshot_nth_parent_skip(c,
parent,
get_random_u32_below(depth - 1),
deleted)
: parent;
depth > 1
? get_random_u32_below(depth - 1)
: 0,
deleted);
s->v.skip[j] = cpu_to_le32(id);
}
}
@ -1369,6 +1376,9 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
u32 *i, id;
int ret = 0;
if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
return 0;
if (!test_bit(BCH_FS_STARTED, &c->flags)) {
ret = bch2_fs_read_write_early(c);
if (ret) {
@ -1386,7 +1396,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k,
NULL, NULL, 0,
bch2_delete_redundant_snapshot(trans, &iter, k));
bch2_delete_redundant_snapshot(trans, k));
if (ret) {
bch_err_msg(c, ret, "deleting redundant snapshots");
goto err;
@ -1427,6 +1437,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
if (!btree_type_has_snapshots(id))
continue;
/*
* deleted inodes btree is maintained by a trigger on the inodes
* btree - no work for us to do here, and it's not safe to scan
* it because we'll see out of date keys due to the btree write
* buffer:
*/
if (id == BTREE_ID_deleted_inodes)
continue;
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
@ -1447,6 +1466,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
}
}
bch2_trans_unlock(trans);
down_write(&c->snapshot_create_lock);
for_each_btree_key(trans, iter, BTREE_ID_snapshots,
@ -1491,8 +1511,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err_create_lock;
}
}
clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
err_create_lock:
up_write(&c->snapshot_create_lock);
err:
@ -1508,7 +1526,6 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
bch2_delete_dead_snapshots(c);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
@ -1520,20 +1537,6 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c)
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
/*
 * Transaction commit hook: record that the filesystem has deleted snapshots
 * and, once the current recovery pass is past delete_dead_snapshots, kick off
 * asynchronous dead snapshot deletion.
 *
 * @h is not used. Always returns 0.
 */
int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
				    struct btree_trans_commit_hook *h)
{
	struct bch_fs *c = trans->c;

	set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);

	/* At or before the delete_dead_snapshots pass, only the flag is set */
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
		bch2_delete_dead_snapshots_async(c);

	return 0;
}
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
enum btree_id id,
struct bpos pos)
@ -1664,6 +1667,26 @@ int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
return ret ?: trans_was_restarted(trans, restart_count);
}
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_s_c_snapshot snap;
int ret = 0;
if (k.k->type != KEY_TYPE_snapshot)
return 0;
snap = bkey_s_c_to_snapshot(k);
if (BCH_SNAPSHOT_DELETED(snap.v) ||
bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
(ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
return 0;
}
return ret;
}
int bch2_snapshots_read(struct bch_fs *c)
{
struct btree_iter iter;
@ -1674,7 +1697,8 @@ int bch2_snapshots_read(struct bch_fs *c)
for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k,
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
bch2_snapshot_set_equiv(trans, k)) ?:
bch2_snapshot_set_equiv(trans, k) ?:
bch2_check_snapshot_needs_deletion(trans, k)) ?:
for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));

View File

@ -5,7 +5,7 @@
enum bkey_invalid_flags;
void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \
@ -19,7 +19,7 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s_c, unsigned);
@ -244,8 +244,6 @@ int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
int bch2_delete_dead_snapshots_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
void bch2_delete_dead_snapshots_work(struct work_struct *);
int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);

View File

@ -62,7 +62,8 @@ static int check_subvol(struct btree_trans *trans,
if (ret)
return ret;
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
c, subvol_not_master_and_not_snapshot,
"subvolume %llu is not set as snapshot but is not master subvolume",
k.k->p.offset)) {
struct bkey_i_subvolume *s =
@ -97,16 +98,17 @@ int bch2_check_subvols(struct bch_fs *c)
/* Subvolumes: */
int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
enum bkey_invalid_flags flags, struct printbuf *err)
{
if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
prt_printf(err, "invalid pos");
return -BCH_ERR_invalid_bkey;
}
int ret = 0;
return 0;
bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
subvol_pos_bad,
"invalid pos");
fsck_err:
return ret;
}
void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
@ -230,7 +232,6 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
struct btree_iter iter;
struct bkey_s_c_subvolume subvol;
struct btree_trans_commit_hook *h;
u32 snapid;
int ret = 0;
@ -246,22 +247,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
snapid = le32_to_cpu(subvol.v->snapshot);
ret = bch2_btree_delete_at(trans, &iter, 0);
if (ret)
goto err;
ret = bch2_snapshot_node_set_deleted(trans, snapid);
if (ret)
goto err;
h = bch2_trans_kmalloc(trans, sizeof(*h));
ret = PTR_ERR_OR_ZERO(h);
if (ret)
goto err;
h->fn = bch2_delete_dead_snapshots_hook;
bch2_trans_commit_hook(trans, h);
err:
ret = bch2_btree_delete_at(trans, &iter, 0) ?:
bch2_snapshot_node_set_deleted(trans, snapid);
bch2_trans_iter_exit(trans, &iter);
return ret;
}

View File

@ -9,7 +9,7 @@ enum bkey_invalid_flags;
int bch2_check_subvols(struct bch_fs *);
int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);

View File

@ -13,6 +13,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "super-io.h"
#include "super.h"
@ -720,7 +721,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
if (opt_defined(*opts, sb))
goto err;
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
printbuf_reset(&err);
@ -782,7 +783,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
ret = bch2_sb_validate(sb, &err, READ);
if (ret) {
printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
goto err_no_print;
}
@ -790,7 +791,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
printbuf_exit(&err);
return ret;
err:
printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
path, err.buf);
err_no_print:
bch2_free_super(sb);
@ -805,7 +806,12 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
if (bch2_dev_io_err_on(bio->bi_status, ca,
bio_data_dir(bio)
? BCH_MEMBER_ERROR_write
: BCH_MEMBER_ERROR_read,
"superblock %s error: %s",
bio_data_dir(bio) ? "write" : "read",
bch2_blk_status_to_str(bio->bi_status)))
ca->sb_write_error = 1;
@ -892,7 +898,9 @@ int bch2_write_super(struct bch_fs *c)
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
bch2_sb_counters_from_cpu(c);
bch_members_cpy_v2_v1(&c->disk_sb);
bch2_sb_members_from_cpu(c);
bch2_sb_members_cpy_v2_v1(&c->disk_sb);
bch2_sb_errors_from_cpu(c);
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
@ -1175,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "Created:");
prt_tab(out);
if (sb->time_base_lo)
pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
else
prt_printf(out, "(not set)");
prt_newline(out);

View File

@ -23,6 +23,11 @@ u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
unsigned,
unsigned);
/*
 * Size of superblock field @f in bytes: its little-endian u64s count
 * multiplied by sizeof(u64).
 */
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
{
	size_t nr_u64s = le32_to_cpu(f->u64s);

	return nr_u64s * sizeof(u64);
}
#define field_to_type(_f, _name) \
container_of_or_null(_f, struct bch_sb_field_##_name, field)
@ -78,41 +83,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
__bch2_check_set_feature(c, feat);
}
/* BCH_SB_FIELD_members_v1: */
static inline bool bch2_member_exists(struct bch_member *m)
{
return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
}
/*
 * Returns true when @dev is below sb->nr_devices and the member entry fetched
 * for it with bch2_sb_member_get() passes bch2_member_exists().
 */
static inline bool bch2_dev_exists(struct bch_sb *sb,
				   unsigned dev)
{
	struct bch_member m;

	/* Index outside the superblock's device count: no such device */
	if (dev >= sb->nr_devices)
		return false;

	m = bch2_sb_member_get(sb, dev);
	return bch2_member_exists(&m);
}
/*
 * Build a struct bch_member_cpu from the superblock member entry @mi.
 *
 * nbuckets, first_bucket and bucket_size are converted from little-endian;
 * the remaining fields are read through the BCH_MEMBER_*() accessors, and
 * valid is taken from bch2_member_exists().
 */
static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
{
return (struct bch_member_cpu) {
.nbuckets = le64_to_cpu(mi->nbuckets),
.first_bucket = le16_to_cpu(mi->first_bucket),
.bucket_size = le16_to_cpu(mi->bucket_size),
.group = BCH_MEMBER_GROUP(mi),
.state = BCH_MEMBER_STATE(mi),
.discard = BCH_MEMBER_DISCARD(mi),
.data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
/*
 * A stored durability of 0 yields 1; any other stored value yields
 * that value minus one.
 */
.durability = BCH_MEMBER_DURABILITY(mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
.valid = bch2_member_exists(mi),
};
}
void bch2_sb_maybe_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned);

View File

@ -49,6 +49,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
@ -400,7 +401,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch_info(c, "going read-write");
ret = bch2_members_v2_init(c);
ret = bch2_sb_members_v2_init(c);
if (ret)
goto err;
@ -481,6 +482,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_time_stats_exit(&c->times[i]);
bch2_free_pending_node_rewrites(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);
bch2_fs_snapshots_exit(c);
bch2_fs_quota_exit(c);
@ -713,6 +715,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_quota_init(c);
bch2_fs_ec_init_early(c);
bch2_fs_move_init(c);
bch2_fs_sb_errors_init_early(c);
INIT_LIST_HEAD(&c->list);
@ -729,8 +732,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->journal_iters);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
INIT_LIST_HEAD(&c->fsck_error_msgs);
mutex_init(&c->fsck_error_msgs_lock);
seqcount_init(&c->gc_pos_lock);
@ -840,6 +843,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
}
ret = bch2_fs_counters_init(c) ?:
bch2_fs_sb_errors_init(c) ?:
bch2_io_clock_init(&c->io_clock[READ]) ?:
bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_journal_init(&c->journal) ?:
@ -942,15 +946,12 @@ int bch2_fs_start(struct bch_fs *c)
mutex_lock(&c->sb_lock);
ret = bch2_members_v2_init(c);
ret = bch2_sb_members_v2_init(c);
if (ret) {
mutex_unlock(&c->sb_lock);
goto err;
}
for_each_online_member(ca, c, i)
bch2_sb_from_fs(c, ca);
for_each_online_member(ca, c, i)
bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
@ -960,12 +961,6 @@ int bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
mutex_lock(&c->btree_transaction_stats[i].lock);
bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
mutex_unlock(&c->btree_transaction_stats[i].lock);
}
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
? bch2_fs_recovery(c)
: bch2_fs_initialize(c);
@ -1140,6 +1135,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
struct bch_member *member)
{
struct bch_dev *ca;
unsigned i;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
@ -1157,6 +1153,10 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
bch2_time_stats_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
for (i = 0; i < ARRAY_SIZE(member->errors); i++)
atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
ca->uuid = member->uuid;
ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
@ -1591,7 +1591,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
if (BCH_MEMBER_GROUP(&dev_mi)) {
bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
if (label.allocation_failure) {
ret = -ENOMEM;
goto err;
@ -1631,16 +1631,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
}
mi = bch2_sb_field_get(ca->disk_sb.sb, members_v2);
if (!bch2_sb_field_resize(&ca->disk_sb, members_v2,
le32_to_cpu(mi->field.u64s) +
sizeof(dev_mi) / sizeof(u64))) {
ret = -BCH_ERR_ENOSPC_sb_members;
bch_err_msg(c, ret, "setting up new superblock");
goto err_unlock;
}
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
@ -1654,6 +1644,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
@ -1689,13 +1681,13 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
bch_err_msg(c, ret, "marking new superblock");
bch_err_msg(ca, ret, "marking new superblock");
goto err_late;
}
ret = bch2_fs_freespace_init(c);
if (ret) {
bch_err_msg(c, ret, "initializing free space");
bch_err_msg(ca, ret, "initializing free space");
goto err_late;
}
@ -1763,19 +1755,26 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
if (!ca->mi.freespace_initialized) {
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
bch_err_msg(ca, ret, "initializing free space");
if (ret)
goto err;
}
if (!ca->journal.nr) {
ret = bch2_dev_journal_alloc(ca);
bch_err_msg(ca, ret, "allocating journal");
if (ret)
goto err;
}
mutex_lock(&c->sb_lock);
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
m->last_mount =
bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
ret = bch2_fs_freespace_init(c);
if (ret)
bch_err_msg(c, ret, "initializing free space");
up_write(&c->state_lock);
return 0;
err:
@ -1886,9 +1885,9 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
struct bch_sb_handle *sb = NULL;
DARRAY(struct bch_sb_handle) sbs = { 0 };
struct bch_fs *c = NULL;
unsigned i, best_sb = 0;
struct bch_sb_handle *sb, *best = NULL;
struct printbuf errbuf = PRINTBUF;
int ret = 0;
@ -1900,49 +1899,46 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err;
}
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
if (!sb) {
ret = -ENOMEM;
goto err;
}
for (i = 0; i < nr_devices; i++) {
ret = bch2_read_super(devices[i], &opts, &sb[i]);
ret = darray_make_room(&sbs, nr_devices);
if (ret)
goto err;
for (unsigned i = 0; i < nr_devices; i++) {
struct bch_sb_handle sb = { NULL };
ret = bch2_read_super(devices[i], &opts, &sb);
if (ret)
goto err;
BUG_ON(darray_push(&sbs, sb));
}
for (i = 1; i < nr_devices; i++)
if (le64_to_cpu(sb[i].sb->seq) >
le64_to_cpu(sb[best_sb].sb->seq))
best_sb = i;
darray_for_each(sbs, sb)
if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
best = sb;
i = 0;
while (i < nr_devices) {
if (i != best_sb &&
!bch2_dev_exists(sb[best_sb].sb, sb[i].sb->dev_idx)) {
pr_info("%pg has been removed, skipping", sb[i].bdev);
bch2_free_super(&sb[i]);
array_remove_item(sb, nr_devices, i);
darray_for_each_reverse(sbs, sb) {
if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
pr_info("%pg has been removed, skipping", sb->bdev);
bch2_free_super(sb);
darray_remove_item(&sbs, sb);
best -= best > sb;
continue;
}
ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
ret = bch2_dev_in_fs(best->sb, sb->sb);
if (ret)
goto err_print;
i++;
}
c = bch2_fs_alloc(sb[best_sb].sb, opts);
if (IS_ERR(c)) {
ret = PTR_ERR(c);
c = bch2_fs_alloc(best->sb, opts);
ret = PTR_ERR_OR_ZERO(c);
if (ret)
goto err;
}
down_write(&c->state_lock);
for (i = 0; i < nr_devices; i++) {
ret = bch2_dev_attach_bdev(c, &sb[i]);
darray_for_each(sbs, sb) {
ret = bch2_dev_attach_bdev(c, sb);
if (ret) {
up_write(&c->state_lock);
goto err;
@ -1961,7 +1957,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err;
}
out:
kfree(sb);
darray_for_each(sbs, sb)
bch2_free_super(sb);
darray_exit(&sbs);
printbuf_exit(&errbuf);
module_put(THIS_MODULE);
return c;
@ -1971,9 +1969,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
err:
if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
if (sb)
for (i = 0; i < nr_devices; i++)
bch2_free_super(&sb[i]);
c = ERR_PTR(ret);
goto out;
}

View File

@ -37,16 +37,4 @@ struct bch_member_cpu {
u8 valid;
};
/*
 * One entry of struct bch_disk_groups_cpu.
 * NOTE(review): presumably the in-memory form of a single disk group; field
 * meanings below are inferred from names only - confirm against the users.
 */
struct bch_disk_group_cpu {
/* presumably set when this group slot is no longer in use - TODO confirm */
bool deleted;
/* presumably identifies the parent group - TODO confirm encoding */
u16 parent;
/* device mask associated with this group */
struct bch_devs_mask devs;
};
/*
 * Table of struct bch_disk_group_cpu entries, stored as a flexible array
 * whose length is given by nr (see the __counted_by() annotation).
 */
struct bch_disk_groups_cpu {
/* NOTE(review): presumably used to free the table via RCU - confirm */
struct rcu_head rcu;
/* number of elements in entries[] */
unsigned nr;
struct bch_disk_group_cpu entries[] __counted_by(nr);
};
#endif /* _BCACHEFS_SUPER_TYPES_H */

Some files were not shown because too many files have changed in this diff Show More