mirror of https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
bcachefs: Add a debug mode that always reads from every btree replica
There's a new module parameter, verify_all_btree_replicas, that enables reading from every btree replica when reading in btree nodes and comparing them against each other. We've been seeing some strange btree corruption - this will hopefully aid in tracking it down and catching it more often.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
parent 596d3bdc1e
commit 1ce0cf5fe9
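The new verify_all_btree_replicas knob is declared through the existing BCH_DEBUG_PARAM() lists, which is what exposes it as a module parameter. As a rough sketch of that pattern (the exact expansion lives in bcachefs.h and super.c and is not part of this diff; names and permissions below are illustrative):

#include <linux/moduleparam.h>

/* Illustrative only: how a BCH_DEBUG_PARAM() entry typically becomes a
 * global flag plus a writable module parameter in bcachefs. */
#define BCH_DEBUG_PARAM(name, description)				\
	bool bch2_##name;						\
	module_param_named(name, bch2_##name, bool, 0644);		\
	MODULE_PARM_DESC(name, description);

/* The entry added below then yields bch2_verify_all_btree_replicas, which
 * bch2_btree_node_read() checks; assuming the usual 0644 permissions it can
 * be flipped at runtime via
 * /sys/module/bcachefs/parameters/verify_all_btree_replicas. */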
fs/bcachefs/bcachefs.h:

@@ -264,7 +264,10 @@ do { \
	BCH_DEBUG_PARAM(verify_btree_ondisk,				\
		"Reread btree nodes at various points to verify the "	\
		"mergesort in the read path against modifications "	\
		"done in memory")
		"done in memory")					\
	BCH_DEBUG_PARAM(verify_all_btree_replicas,			\
		"When reading btree nodes, read all replicas and "	\
		"compare them")

/* Parameters that should only be compiled in in debug mode: */
#define BCH_DEBUG_PARAMS_DEBUG()					\
fs/bcachefs/btree_io.c:

@@ -521,7 +521,7 @@ enum btree_validate_ret {
									\
	switch (write) {						\
	case READ:							\
		bch_err(c, "%s", _buf2);				\
		bch_err(c, "%s", _buf2);				\
									\
		switch (type) {						\
		case BTREE_ERR_FIXABLE:					\
@@ -1036,8 +1036,8 @@ static void btree_node_read_work(struct work_struct *work)
	struct btree_read_bio *rb =
		container_of(work, struct btree_read_bio, work);
	struct bch_fs *c = rb->c;
	struct btree *b = rb->b;
	struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
	struct btree *b = rb->bio.bi_private;
	struct bio *bio = &rb->bio;
	struct bch_io_failures failed = { .nr = 0 };
	char buf[200];
@@ -1112,6 +1112,262 @@ static void btree_node_read_endio(struct bio *bio)
	queue_work(system_unbound_wq, &rb->work);
}

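/*
 * State for one read-all-replicas operation: a bounce buffer, bio and error
 * status per replica, tied together by a closure that completes once every
 * replica read has finished.
 */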
struct btree_node_read_all {
	struct closure cl;
	struct bch_fs *c;
	struct btree *b;
	unsigned nr;
	void *buf[BCH_REPLICAS_MAX];
	struct bio *bio[BCH_REPLICAS_MAX];
	int err[BCH_REPLICAS_MAX];
};

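/*
 * Scan a btree node buffer and return how many sectors were written: the
 * initial bset plus each following btree_node_entry whose sequence number
 * matches the node's. Returns 0 if the buffer doesn't have a valid btree
 * node magic.
 */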
static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;
	unsigned offset = 0;

	if (le64_to_cpu(bn->magic) != bset_magic(c))
		return 0;

	while (offset < c->opts.btree_node_size) {
		if (!offset) {
			offset += vstruct_sectors(bn, c->block_bits);
		} else {
			bne = data + (offset << 9);
			if (bne->keys.seq != bn->keys.seq)
				break;
			offset += vstruct_sectors(bne, c->block_bits);
		}
	}

	return offset;
}

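/*
 * Check whether any bset with a matching sequence number exists past the
 * given offset, i.e. whether there is data beyond where this replica
 * appears to end.
 */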
static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
{
	struct btree_node *bn = data;
	struct btree_node_entry *bne;

	if (!offset)
		return false;

	while (offset < c->opts.btree_node_size) {
		bne = data + (offset << 9);
		if (bne->keys.seq == bn->keys.seq)
			return true;
		offset++;
	}

	return false;
}

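/*
 * Runs once all replica reads have completed: pick a good copy (preferring
 * one whose sequence number matches the btree pointer), verify that every
 * replica wrote the same number of sectors with identical contents, and dump
 * per-replica bset maps if they disagree.
 */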
static void btree_node_read_all_replicas_done(struct closure *cl)
{
	struct btree_node_read_all *ra =
		container_of(cl, struct btree_node_read_all, cl);
	struct bch_fs *c = ra->c;
	struct btree *b = ra->b;
	bool have_good_copy = false;
	bool dump_bset_maps = false;
	bool have_retry = false;
	int ret = 0, write = READ;
	unsigned i, written, written2;
	__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
		? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;

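	/*
	 * Take the first successfully read replica as the good copy, then
	 * compare every other replica's written size and contents against it;
	 * any mismatch is flagged as a fixable error and triggers the bset
	 * map dump below.
	 */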
	for (i = 0; i < ra->nr; i++) {
		if (ra->err[i])
			continue;

		if (!have_good_copy) {
			memcpy(b->data, ra->buf[i], btree_bytes(c));
			have_good_copy = true;
			written = btree_node_sectors_written(c, b->data);
		}

		/* Try to get the right btree node: */
		if (have_good_copy &&
		    seq &&
		    b->data->keys.seq != seq &&
		    ((struct btree_node *) ra->buf[i])->keys.seq == seq) {
			memcpy(b->data, ra->buf[i], btree_bytes(c));
			written = btree_node_sectors_written(c, b->data);
		}

		written2 = btree_node_sectors_written(c, ra->buf[i]);
		if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
				 "btree node sectors written mismatch: %u != %u",
				 written, written2) ||
		    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
				 BTREE_ERR_FIXABLE, c, NULL, b, NULL,
				 "found bset signature after last bset") ||
		    btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
				 BTREE_ERR_FIXABLE, c, NULL, b, NULL,
				 "btree node replicas content mismatch"))
			dump_bset_maps = true;

		if (written2 > written) {
			written = written2;
			memcpy(b->data, ra->buf[i], btree_bytes(c));
		}
	}
fsck_err:
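	/*
	 * On mismatch, log each replica's bset map: "offset-end" sector
	 * ranges, '*' marking bsets whose journal seq is blacklisted, and
	 * "GAP" where valid-looking bsets appear past the written area.
	 */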
	if (dump_bset_maps) {
		for (i = 0; i < ra->nr; i++) {
			char buf[200];
			struct printbuf out = PBUF(buf);
			struct btree_node *bn = ra->buf[i];
			struct btree_node_entry *bne = NULL;
			unsigned offset = 0, sectors;
			bool gap = false;

			if (ra->err[i])
				continue;

			while (offset < c->opts.btree_node_size) {
				if (!offset) {
					sectors = vstruct_sectors(bn, c->block_bits);
				} else {
					bne = ra->buf[i] + (offset << 9);
					if (bne->keys.seq != bn->keys.seq)
						break;
					sectors = vstruct_sectors(bne, c->block_bits);
				}

				pr_buf(&out, " %u-%u", offset, offset + sectors);
				if (bne && bch2_journal_seq_is_blacklisted(c,
						le64_to_cpu(bne->keys.journal_seq), false))
					pr_buf(&out, "*");
				offset += sectors;
			}

			while (offset < c->opts.btree_node_size) {
				bne = ra->buf[i] + (offset << 9);
				if (bne->keys.seq == bn->keys.seq) {
					if (!gap)
						pr_buf(&out, " GAP");
					gap = true;

					sectors = vstruct_sectors(bne, c->block_bits);
					pr_buf(&out, " %u-%u", offset, offset + sectors);
					if (bch2_journal_seq_is_blacklisted(c,
							le64_to_cpu(bne->keys.journal_seq), false))
						pr_buf(&out, "*");
				}
				offset++;
			}

			bch_err(c, "replica %u:%s", i, buf);
		}
	}

	if (have_good_copy)
		bch2_btree_node_read_done(c, NULL, b, false);
	else
		set_btree_node_read_error(b);

	for (i = 0; i < ra->nr; i++) {
		mempool_free(ra->buf[i], &c->btree_bounce_pool);
		bio_put(ra->bio[i]);
	}

	closure_debug_destroy(&ra->cl);
	kfree(ra);

	clear_btree_node_read_in_flight(b);
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}

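/*
 * Per-replica completion: record this bio's status and drop our ref on the
 * closure; the final put runs btree_node_read_all_replicas_done().
 */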
static void btree_node_read_all_replicas_endio(struct bio *bio)
{
	struct btree_read_bio *rb =
		container_of(bio, struct btree_read_bio, bio);
	struct bch_fs *c = rb->c;
	struct btree_node_read_all *ra = rb->ra;

	if (rb->have_ioref) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
		bch2_latency_acct(ca, rb->start_time, READ);
	}

	ra->err[rb->idx] = bio->bi_status;
	closure_put(&ra->cl);
}

/*
 * XXX This allocates multiple times from the same mempools, and can deadlock
 * under sufficient memory pressure (but is only a debug path)
 */
static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
{
	struct bkey_s_c k = bkey_i_to_s_c(&b->key);
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded pick;
	struct btree_node_read_all *ra;
	unsigned i;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;

	closure_init(&ra->cl, NULL);
	ra->c = c;
	ra->b = b;
	ra->nr = bch2_bkey_nr_ptrs(k);

	for (i = 0; i < ra->nr; i++) {
		ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
		ra->bio[i] = bio_alloc_bioset(NULL,
					      buf_pages(ra->buf[i], btree_bytes(c)),
					      REQ_OP_READ|REQ_SYNC|REQ_META,
					      GFP_NOFS,
					      &c->btree_bio);
	}

	i = 0;
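	/*
	 * Submit one read per pointer in the btree node key; replicas on
	 * devices we can't get an ioref for are marked failed immediately.
	 */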
	bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
		struct btree_read_bio *rb =
			container_of(ra->bio[i], struct btree_read_bio, bio);
		rb->c = c;
		rb->b = b;
		rb->ra = ra;
		rb->start_time = local_clock();
		rb->have_ioref = bch2_dev_get_ioref(ca, READ);
		rb->idx = i;
		rb->pick = pick;
		rb->bio.bi_iter.bi_sector = pick.ptr.offset;
		rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
		bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));

		if (rb->have_ioref) {
			this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
				     bio_sectors(&rb->bio));
			bio_set_dev(&rb->bio, ca->disk_sb.bdev);

			closure_get(&ra->cl);
			submit_bio(&rb->bio);
		} else {
			ra->err[i] = BLK_STS_REMOVED;
		}

		i++;
	}

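	/*
	 * Synchronous reads wait for every replica bio here and run the
	 * comparison inline; otherwise the closure runs it from a workqueue
	 * after the last completion.
	 */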
	if (sync) {
		closure_sync(&ra->cl);
		btree_node_read_all_replicas_done(&ra->cl);
	} else {
		continue_at(&ra->cl, btree_node_read_all_replicas_done, system_unbound_wq);
	}

	return 0;
}

void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
			  bool sync)
{
@@ -1125,6 +1381,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
	btree_pos_to_text(&PBUF(buf), c, b);
	trace_btree_read(c, b);

	set_btree_node_read_in_flight(b);

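	/*
	 * With verify_all_btree_replicas set, read and compare every replica;
	 * if that path is taken successfully we're done and skip the normal
	 * single-device read below.
	 */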
	if (bch2_verify_all_btree_replicas &&
	    !btree_node_read_all_replicas(c, b, sync))
		return;

	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
					 NULL, &pick);
	if (bch2_fs_fatal_err_on(ret <= 0, c,
@@ -1143,17 +1405,16 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
			       &c->btree_bio);
	rb = container_of(bio, struct btree_read_bio, bio);
	rb->c = c;
	rb->b = b;
	rb->ra = NULL;
	rb->start_time = local_clock();
	rb->have_ioref = bch2_dev_get_ioref(ca, READ);
	rb->pick = pick;
	INIT_WORK(&rb->work, btree_node_read_work);
	bio->bi_iter.bi_sector = pick.ptr.offset;
	bio->bi_end_io = btree_node_read_endio;
	bio->bi_private = b;
	bch2_bio_map(bio, b->data, btree_bytes(c));

	set_btree_node_read_in_flight(b);

	if (rb->have_ioref) {
		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
			     bio_sectors(bio));
@@ -1162,7 +1423,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
	if (sync) {
		submit_bio_wait(bio);

		bio->bi_private = b;
		btree_node_read_work(&rb->work);
	} else {
		submit_bio(bio);
@@ -1174,7 +1434,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
			btree_node_read_work(&rb->work);
		else
			queue_work(system_unbound_wq, &rb->work);

	}
}

fs/bcachefs/btree_io.h:

@@ -13,6 +13,7 @@ struct bch_fs;
struct btree_write;
struct btree;
struct btree_iter;
struct btree_node_read_all;

static inline bool btree_node_dirty(struct btree *b)
{
@@ -33,8 +34,11 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)

struct btree_read_bio {
	struct bch_fs *c;
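	/*
	 * b points back at the btree node; ra is non-NULL (and idx gives this
	 * bio's replica slot) only for read-all-replicas reads.
	 */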
	struct btree *b;
	struct btree_node_read_all *ra;
	u64 start_time;
	unsigned have_ioref:1;
	unsigned idx:7;
	struct extent_ptr_decoded pick;
	struct work_struct work;
	struct bio bio;