From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Wed, 13 Nov 2024 11:11:55 -0500
Subject: [PATCH 1/4] btrfs: fix improper generation check in snapshot delete

We have been using the following check

	if (generation <= root->root_key.offset)

to make decisions about whether or not to visit a node during snapshot
delete.  This is because for normal subvolumes this is set to 0, and for
snapshots it's set to the creation generation.  The idea is that if the
generation of the node is less than or equal to our creation generation
then we don't need to visit that node, because it doesn't belong to us
and we can simply drop our reference and move on.

However reloc roots don't have their generation stored in
root->root_key.offset; instead that is the objectid of their
corresponding fs root.  This means we can incorrectly decide not to walk
into nodes that need to be dropped when deleting a reloc root.

Making the wrong choice has consequences in two distinct areas.

visit_node_for_delete()

1. False positive.  We think we are newer than the block when we really
   aren't.  We don't visit the node, we drop our reference to it, and we
   carry on.  This would result in leaked space.

2. False negative.  We decide to walk down into a block that we should
   have just dropped our reference to.  However this means that the
   child node will have refs > 1, so we will switch to UPDATE_BACKREF,
   and then the subsequent walk_down_proc() will notice that
   btrfs_header_owner(node) != root->root_key.objectid and it'll break
   out of the loop, and then walk_up_proc() will drop our reference, so
   this appears to be ok.

do_walk_down()

1. False positive.  We are in UPDATE_BACKREF and incorrectly decide that
   we are done and don't need to update the backref for our lower nodes.
   This is another case that simply won't happen with relocation, as we
   only have to do UPDATE_BACKREF if the node below us was shared and
   didn't have FULL_BACKREF set, and since we don't own that node
   because we're a reloc root we won't end up in this case anyway.

2. False negative.  Again this is tricky because, as described above, we
   simply wouldn't be here from relocation: we don't own any of the
   nodes because we never set btrfs_header_owner() to the reloc root
   objectid, and since we always use FULL_BACKREF we never actually need
   to set FULL_BACKREF on any children.

Having spent a lot of time stressing relocation/snapshot delete recently
I've not seen this pop in practice.  But this is objectively incorrect,
so fix it to get the correct starting generation based on the root we're
dropping, to keep me from thinking there's a problem here.

CC: stable@vger.kernel.org
Reviewed-by: Filipe Manana
Signed-off-by: Josef Bacik
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h       | 19 +++++++++++++++++++
 fs/btrfs/extent-tree.c |  6 +++---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 307dedf95c70..2c341956a01c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -370,6 +370,25 @@ static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transi
 	WRITE_ONCE(root->last_trans, transid);
 }
 
+/*
+ * Return the generation this root started with.
+ *
+ * Every normal root is created with root->root_key.offset set to its
+ * originating generation.  If it is a snapshot it is the generation when the
+ * snapshot was created.
+ *
+ * However for TREE_RELOC roots root_key.offset is the objectid of the owning
+ * tree root.
+ * Thankfully we copy the root item of the owning tree root, which has its
+ * last_snapshot set to what we would have root_key.offset set to, so return
+ * that if this is a TREE_RELOC root.
+ */
+static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root)
+{
+	if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
+		return btrfs_root_last_snapshot(&root->root_item);
+	return root->root_key.offset;
+}
+
 /*
  * Structure that conveys information about an extent that is going to replace
  * all the extents in a file range.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 412e318e4a22..43a771f7bd7a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5285,7 +5285,7 @@ static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *
 	 * reference to it.
 	 */
 	generation = btrfs_node_ptr_generation(eb, slot);
-	if (!wc->update_ref || generation <= root->root_key.offset)
+	if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
 		return false;
 
 	/*
@@ -5340,7 +5340,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 				goto reada;
 
 			if (wc->stage == UPDATE_BACKREF &&
-			    generation <= root->root_key.offset)
+			    generation <= btrfs_root_origin_generation(root))
 				continue;
 
 			/* We don't lock the tree block, it's OK to be racy here */
@@ -5683,7 +5683,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	 * for the subtree
 	 */
 	if (wc->stage == UPDATE_BACKREF &&
-	    generation <= root->root_key.offset) {
+	    generation <= btrfs_root_origin_generation(root)) {
 		wc->lookup_info = 1;
 		return 1;
 	}

From 6c3864e055486fadb5b97793b57688082e14b43b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 4 Nov 2024 07:26:32 +0100
Subject: [PATCH 2/4] btrfs: use bio_is_zone_append() in the completion handler

Otherwise it won't catch bios turned into regular writes by the block
level zone write plugging.  The additional test it adds is for emulated
zone append.
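For reference, this is roughly what the helper being switched to looks
like; a sketch paraphrased from include/linux/bio.h, so the exact body
and the BIO_EMULATES_ZONE_APPEND flag name should be read as an
assumption rather than a verbatim quote:

	static inline bool bio_is_zone_append(struct bio *bio)
	{
		if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
			return false;
		/*
		 * Match both real zone append bios and regular writes that
		 * were converted from zone append by zone write plugging.
		 */
		return bio_op(bio) == REQ_OP_ZONE_APPEND ||
			bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
	}

A plain bio_op() == REQ_OP_ZONE_APPEND check misses the emulated case,
because by completion time the bio has been turned into a regular write
and only the flag still records that it started as a zone append.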
Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
CC: stable@vger.kernel.org # 6.12
Reviewed-by: Johannes Thumshirn
Reviewed-by: Damien Le Moal
Signed-off-by: Christoph Hellwig
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/bio.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 1f216d07eff6..011cc97be3b5 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -355,7 +355,7 @@ static void btrfs_simple_end_io(struct bio *bio)
 		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
 		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
 	} else {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+		if (bio_is_zone_append(bio) && !bio->bi_status)
 			btrfs_record_physical_zoned(bbio);
 		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
 	}
@@ -398,7 +398,7 @@ static void btrfs_orig_write_end_io(struct bio *bio)
 	else
 		bio->bi_status = BLK_STS_OK;
 
-	if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
+	if (bio_is_zone_append(bio) && !bio->bi_status)
 		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 
 	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
@@ -412,7 +412,7 @@ static void btrfs_clone_write_end_io(struct bio *bio)
 	if (bio->bi_status) {
 		atomic_inc(&stripe->bioc->error);
 		btrfs_log_dev_io_error(bio, stripe->dev);
-	} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+	} else if (bio_is_zone_append(bio)) {
 		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
 	}
 

From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 4 Nov 2024 07:26:33 +0100
Subject: [PATCH 3/4] btrfs: split bios to the fs sector size boundary

Btrfs, like other file systems, can't really deal with I/O that is not
aligned to its internal block size (which, strangely, is called sector
size in btrfs for historical reasons), but the block layer split helper
doesn't even know about that.  Round down the split boundary so that all
I/Os are aligned.

Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio")
CC: stable@vger.kernel.org # 6.12
Reviewed-by: Johannes Thumshirn
Signed-off-by: Christoph Hellwig
Reviewed-by: Damien Le Moal
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/bio.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 011cc97be3b5..78f5606baacb 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
 	map_length = min(map_length, bbio->fs_info->max_zone_append_size);
 	sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
 					&nr_segs, map_length);
-	if (sector_offset)
-		return sector_offset << SECTOR_SHIFT;
+	if (sector_offset) {
+		/*
+		 * bio_split_rw_at() could split at a size smaller than our
+		 * sectorsize and thus cause unaligned I/Os.  Fix that by
+		 * always rounding down to the nearest boundary.
+		 */
+		return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
+	}
 	return map_length;
 }
 

From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 4 Dec 2024 13:30:46 +1030
Subject: [PATCH 4/4] btrfs: tree-checker: reject inline extent items with 0 ref count

[BUG]
There is a bug report in the mailing list where btrfs_run_delayed_refs()
failed to drop the ref count for logical 25870311358464 num_bytes 2113536.
The involved leaf dump looks like this:

  item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50
    extent refs 1 gen 84178 flags 1
    ref#0: shared data backref parent 32399126528000 count 0 <<<
    ref#1: shared data backref parent 31808973717504 count 1

Notice the count number is 0.

[CAUSE]
There is no concrete evidence yet, but considering that 0 -> 1 is also a
single bit flip, it's possible that a hardware memory bitflip is
involved, causing the on-disk extent tree to be corrupted.

[FIX]
To prevent us from reading such a corrupted extent item, or writing such
a damaged extent item back to disk, enhance the handling of the
BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys, for both
inlined and keyed items, to detect such a 0 ref count and reject it.

CC: stable@vger.kernel.org # 5.4+
Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/
Reported-by: Frankie Fisher
Reviewed-by: Filipe Manana
Signed-off-by: Qu Wenruo
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/tree-checker.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 148d8cefa40e..dfeee033f31f 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1527,6 +1527,11 @@ static int check_extent_item(struct extent_buffer *leaf,
 					   dref_offset, fs_info->sectorsize);
 				return -EUCLEAN;
 			}
+			if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
+				extent_err(leaf, slot,
+			"invalid data ref count, should have non-zero value");
+				return -EUCLEAN;
+			}
 			inline_refs += btrfs_extent_data_ref_count(leaf, dref);
 			break;
 		/* Contains parent bytenr and ref count */
@@ -1539,6 +1544,11 @@ static int check_extent_item(struct extent_buffer *leaf,
 					   inline_offset, fs_info->sectorsize);
 				return -EUCLEAN;
 			}
+			if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
+				extent_err(leaf, slot,
+			"invalid shared data ref count, should have non-zero value");
+				return -EUCLEAN;
+			}
 			inline_refs += btrfs_shared_data_ref_count(leaf, sref);
 			break;
 		case BTRFS_EXTENT_OWNER_REF_KEY:
@@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struct extent_buffer *leaf,
 {
 	u32 expect_item_size = 0;
 
-	if (key->type == BTRFS_SHARED_DATA_REF_KEY)
+	if (key->type == BTRFS_SHARED_DATA_REF_KEY) {
+		struct btrfs_shared_data_ref *sref;
+
+		sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref);
+		if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
+			extent_err(leaf, slot,
+		"invalid shared data backref count, should have non-zero value");
+			return -EUCLEAN;
+		}
+
 		expect_item_size = sizeof(struct btrfs_shared_data_ref);
+	}
 
 	if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
 		generic_err(leaf, slot,
@@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct extent_buffer *leaf,
 					offset, leaf->fs_info->sectorsize);
 			return -EUCLEAN;
 		}
+		if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
+			extent_err(leaf, slot,
+		"invalid extent data backref count, should have non-zero value");
+			return -EUCLEAN;
+		}
 	}
 	return 0;
 }