Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git

Stephen Rothwell 2024-12-20 09:19:12 +11:00
commit 389534fa27
53 changed files with 2781 additions and 1048 deletions


@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
# misc-next marker
config BTRFS_FS
tristate "Btrfs filesystem support"


@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
tests/free-space-tree-tests.o tests/extent-map-tests.o \
tests/raid-stripe-tree-tests.o
tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o


@ -3022,9 +3022,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
cache->rb_root = RB_ROOT;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
INIT_LIST_HEAD(&cache->pending[i]);
INIT_LIST_HEAD(&cache->changed);
INIT_LIST_HEAD(&cache->detached);
INIT_LIST_HEAD(&cache->leaves);
INIT_LIST_HEAD(&cache->pending_edge);
INIT_LIST_HEAD(&cache->useless_node);
cache->fs_info = fs_info;
@ -3132,29 +3129,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node)
{
struct btrfs_backref_node *upper;
struct btrfs_backref_edge *edge;
if (!node)
return;
BUG_ON(!node->lowest && !node->detached);
while (!list_empty(&node->upper)) {
edge = list_entry(node->upper.next, struct btrfs_backref_edge,
list[LOWER]);
upper = edge->node[UPPER];
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
btrfs_backref_free_edge(cache, edge);
/*
* Add the node to leaf node list if no other child block
* cached.
*/
if (list_empty(&upper->lower)) {
list_add_tail(&upper->lower, &cache->leaves);
upper->lowest = 1;
}
}
btrfs_backref_drop_node(cache, node);
@ -3166,33 +3151,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
{
struct btrfs_backref_node *node;
int i;
while (!list_empty(&cache->detached)) {
node = list_entry(cache->detached.next,
struct btrfs_backref_node, list);
while ((node = rb_entry_safe(rb_first(&cache->rb_root),
struct btrfs_backref_node, rb_node)))
btrfs_backref_cleanup_node(cache, node);
}
while (!list_empty(&cache->leaves)) {
node = list_entry(cache->leaves.next,
struct btrfs_backref_node, lower);
btrfs_backref_cleanup_node(cache, node);
}
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
while (!list_empty(&cache->pending[i])) {
node = list_first_entry(&cache->pending[i],
struct btrfs_backref_node,
list);
btrfs_backref_cleanup_node(cache, node);
}
}
ASSERT(list_empty(&cache->pending_edge));
ASSERT(list_empty(&cache->useless_node));
ASSERT(list_empty(&cache->changed));
ASSERT(list_empty(&cache->detached));
ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
ASSERT(!cache->nr_nodes);
ASSERT(!cache->nr_edges);
}
@ -3316,8 +3281,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
if (IS_ERR(root))
return PTR_ERR(root);
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
cur->cowonly = 1;
/* We shouldn't be using backref cache for non-shareable roots. */
if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
btrfs_put_root(root);
return -EUCLEAN;
}
if (btrfs_root_level(&root->root_item) == cur->level) {
/* Tree root */
@ -3403,8 +3372,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
goto out;
}
upper->owner = btrfs_header_owner(eb);
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
upper->cowonly = 1;
/* We shouldn't be using backref cache for non-shareable roots. */
if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
btrfs_put_root(root);
btrfs_backref_free_edge(cache, edge);
btrfs_backref_free_node(cache, upper);
ret = -EUCLEAN;
goto out;
}
/*
* If we know the block isn't shared we can avoid
@ -3595,15 +3571,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
ASSERT(start->checked);
/* Insert this node to cache if it's not COW-only */
if (!start->cowonly) {
rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
&start->rb_node);
if (rb_node)
btrfs_backref_panic(cache->fs_info, start->bytenr,
-EEXIST);
list_add_tail(&start->lower, &cache->leaves);
}
rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
if (rb_node)
btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
/*
* Use breadth first search to iterate all related edges.
@ -3642,11 +3612,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
* parents have already been linked.
*/
if (!RB_EMPTY_NODE(&upper->rb_node)) {
if (upper->lowest) {
list_del_init(&upper->lower);
upper->lowest = 0;
}
list_add_tail(&edge->list[UPPER], &upper->lower);
continue;
}
@ -3657,23 +3622,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
return -EUCLEAN;
}
/* Sanity check, COW-only node has non-COW-only parent */
if (start->cowonly != upper->cowonly) {
ASSERT(0);
rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
if (unlikely(rb_node)) {
btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
return -EUCLEAN;
}
/* Only cache non-COW-only (subvolume trees) tree blocks */
if (!upper->cowonly) {
rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
&upper->rb_node);
if (rb_node) {
btrfs_backref_panic(cache->fs_info,
upper->bytenr, -EEXIST);
return -EUCLEAN;
}
}
list_add_tail(&edge->list[UPPER], &upper->lower);
/*


@ -318,6 +318,12 @@ struct btrfs_backref_node {
u64 bytenr;
}; /* Use rb_simple_node for search/insert */
/*
* This is a sanity check: whenever we COW a block we will update
* new_bytenr with its current location, and we will check this in
* various places to validate that the cache makes sense; it shouldn't
* be used for anything else.
*/
u64 new_bytenr;
/* Objectid of tree block owner, may not be uptodate */
u64 owner;
@ -335,10 +341,6 @@ struct btrfs_backref_node {
struct extent_buffer *eb;
/* Level of the tree block */
unsigned int level:8;
/* Is the block in a non-shareable tree */
unsigned int cowonly:1;
/* 1 if no child node is in the cache */
unsigned int lowest:1;
/* Is the extent buffer locked */
unsigned int locked:1;
/* Has the block been processed */
@ -391,12 +393,6 @@ struct btrfs_backref_cache {
* level blocks may not reflect the new location
*/
struct list_head pending[BTRFS_MAX_LEVEL];
/* List of backref nodes with no child node */
struct list_head leaves;
/* List of blocks that have been COWed in current transaction */
struct list_head changed;
/* List of detached backref node. */
struct list_head detached;
u64 last_trans;


@ -725,8 +725,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
if (is_data_bbio(bbio) && bioc &&
btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
/*
* No locking for the list update, as we only add to
* the list in the I/O submission path, and list


@ -1223,7 +1223,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->space_info->total_bytes -= block_group->length;
block_group->space_info->bytes_readonly -=
(block_group->length - block_group->zone_unusable);
btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info,
btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
-block_group->zone_unusable);
block_group->space_info->disk_total -= block_group->length * factor;
@ -1396,8 +1396,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
if (btrfs_is_zoned(cache->fs_info)) {
/* Migrate zone_unusable bytes to readonly */
sinfo->bytes_readonly += cache->zone_unusable;
btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
-cache->zone_unusable);
btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
cache->zone_unusable = 0;
}
cache->ro++;
@ -1645,8 +1644,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
-block_group->pinned);
btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
space_info->bytes_readonly += block_group->pinned;
block_group->pinned = 0;
@ -3060,8 +3058,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
(cache->alloc_offset - cache->used - cache->pinned -
cache->reserved) +
(cache->length - cache->zone_capacity);
btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
cache->zone_unusable);
btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
sinfo->bytes_readonly -= cache->zone_unusable;
}
num_bytes = cache->length - cache->reserved -
@ -3699,7 +3696,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
old_val -= num_bytes;
cache->used = old_val;
cache->pinned += num_bytes;
btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
space_info->bytes_used -= num_bytes;
space_info->disk_used -= num_bytes * factor;
if (READ_ONCE(space_info->periodic_reclaim))
@ -3781,8 +3778,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info->bytes_reserved += num_bytes;
trace_btrfs_space_reservation(cache->fs_info, "space_info",
space_info->flags, num_bytes, 1);
btrfs_space_info_update_bytes_may_use(cache->fs_info,
space_info, -ram_bytes);
btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
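
These block group hunks (and the block reserve hunks that follow) drop the fs_info argument from the btrfs_space_info_update_bytes_*() helpers, which now take only the space_info. An illustrative sketch of the updated calling convention, not part of the commit, modeled on the pin_down_extent() call site shown later in this diff (the function name here is hypothetical):

/* Sketch only: pin bytes in a block group using the reduced helper signature. */
static void example_pin_bytes(struct btrfs_block_group *block_group, u64 num_bytes)
{
	struct btrfs_space_info *space_info = block_group->space_info;

	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	block_group->pinned += num_bytes;
	/* Previously: btrfs_space_info_update_bytes_pinned(fs_info, space_info, num_bytes). */
	btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);
}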


@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
spin_unlock(&dest->lock);
}
if (num_bytes)
btrfs_space_info_free_bytes_may_use(fs_info,
space_info,
num_bytes);
btrfs_space_info_free_bytes_may_use(space_info, num_bytes);
}
if (qgroup_to_release_ret)
*qgroup_to_release_ret = qgroup_to_release;
@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
if (block_rsv->reserved < block_rsv->size) {
num_bytes = block_rsv->size - block_rsv->reserved;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
num_bytes);
btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);
block_rsv->reserved = block_rsv->size;
} else if (block_rsv->reserved > block_rsv->size) {
num_bytes = block_rsv->reserved - block_rsv->size;
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
-num_bytes);
btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
block_rsv->reserved = block_rsv->size;
btrfs_try_granting_tickets(fs_info, sinfo);
}


@ -526,7 +526,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait, bool strict);
bool nowait);
void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
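
With the strict parameter removed from can_nocow_extent() above, callers now pass five arguments. A hedged sketch of a caller using the new prototype, not part of the commit and with a hypothetical wrapper name; it mirrors the updated call sites in the direct IO and buffered write paths later in this diff:

/* Sketch only: probe whether [start, start + *len) can be written NOCOW. */
static int example_try_nocow(struct btrfs_inode *inode, u64 start, u64 *len)
{
	struct btrfs_file_extent file_extent;
	int ret;

	/* The former "strict" argument is gone; only nowait remains. */
	ret = can_nocow_extent(&inode->vfs_inode, start, len, &file_extent, false);
	if (ret <= 0)
		return ret;	/* 0: must COW, < 0: error */
	return 1;		/* NOCOW is possible for *len bytes */
}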


@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *dst_buf,
struct extent_buffer *src_buf);
static const struct btrfs_csums {
u16 size;
const char name[10];
const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
.driver = "blake2b-256" },
};
/*
* The leaf data grows from end-to-front in the node. this returns the address
* of the start of the last item, which is the stop of the leaf data stack.
@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst,
nr_items * sizeof(struct btrfs_item));
}
/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
return btrfs_csums[type].size;
}
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
/*
* csum type is validated at mount time
*/
return btrfs_csum_type_size(t);
}
const char *btrfs_super_csum_name(u16 csum_type)
{
/* csum type is validated at mount time */
return btrfs_csums[csum_type].name;
}
/*
* Return driver name if defined, otherwise the name that's also a valid driver
* name
*/
const char *btrfs_super_csum_driver(u16 csum_type)
{
/* csum type is validated at mount time */
return btrfs_csums[csum_type].driver[0] ?
btrfs_csums[csum_type].driver :
btrfs_csums[csum_type].name;
}
size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
struct btrfs_path *btrfs_alloc_path(void)
{
might_sleep();
@ -225,22 +174,6 @@ noinline void btrfs_release_path(struct btrfs_path *p)
}
}
/*
* We want the transaction abort to print stack trace only for errors where the
* cause could be a bug, eg. due to ENOSPC, and not for common errors that are
* caused by external factors.
*/
bool __cold abort_should_print_stack(int error)
{
switch (error) {
case -EIO:
case -EROFS:
case -ENOMEM:
return false;
}
return true;
}
/*
* safely gets a reference on the root node of a tree. A lock
* is not taken, so a concurrent writer may put a different node
@ -654,6 +587,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
goto error_unlock_cow;
}
}
trace_btrfs_cow_block(root, buf, cow);
if (unlock_orig)
btrfs_tree_unlock(buf);
free_extent_buffer_stale(buf);
@ -710,7 +645,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
int ret;
if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
btrfs_abort_transaction(trans, -EUCLEAN);
@ -751,12 +685,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
* Also we don't care about the error, as it's handled internally.
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
return ret;
return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
cow_ret, search_start, 0, nest);
}
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);


@ -7,7 +7,6 @@
#define BTRFS_CTREE_H
#include "linux/cleanup.h"
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
@ -506,20 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
}
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
((bytes) >> (fs_info)->sectorsize_bits)
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
/* ctree.c */
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
@ -756,18 +741,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
}
u16 btrfs_csum_type_size(u16 type);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);
/*
* We use folio flag owner_2 to indicate there is an ordered extent with
* unfinished IO.
*/
#define folio_test_ordered(folio) folio_test_owner_2(folio)
#define folio_set_ordered(folio) folio_set_owner_2(folio)
#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
#endif


@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
btrfs_space_info_free_bytes_may_use(data_sinfo, len);
}
/*


@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
u64 num_bytes;
u64 reserved_bytes;
if (btrfs_is_testing(fs_info))
return;
num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
trans->delayed_ref_csum_deletions);
@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
if (to_free > 0)
btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
btrfs_space_info_free_bytes_may_use(space_info, to_free);
if (refilled_bytes > 0)
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
@ -555,6 +558,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
delayed_refs->num_heads_ready--;
}
struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
lockdep_assert_held(&head->mutex);
lockdep_assert_held(&head->lock);
if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return NULL;
/*
* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
* This is to prevent a ref count from going down to zero, which deletes
* the extent item from the extent tree, when there still are references
* to add, which would fail because they would not find the extent item.
*/
if (!list_empty(&head->ref_add_list))
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
ref = rb_entry(rb_first_cached(&head->ref_tree),
struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
}
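
For context, the consumer of this helper is btrfs_run_delayed_refs_for_head(), updated later in this diff. A rough sketch of that loop shape, illustrative only (the function name is hypothetical and the ref processing is elided):

/* Sketch: drain one delayed ref head, taking BTRFS_ADD_DELAYED_REF entries first. */
static void example_drain_head(struct btrfs_fs_info *fs_info,
			       struct btrfs_delayed_ref_head *locked_ref)
{
	struct btrfs_delayed_ref_node *ref;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = btrfs_select_delayed_ref(locked_ref))) {
		/* Refs covered by an active tree mod log window must wait. */
		if (ref->seq && btrfs_check_delayed_seq(fs_info, ref->seq))
			break;
		/* ... remove the selected ref from the head and run it ... */
	}
}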
/*
* Helper to insert the ref_node to the tail or merge with tail.
*
@ -1234,6 +1263,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
{
struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_fs_info *fs_info = trans->fs_info;
bool testing = btrfs_is_testing(fs_info);
spin_lock(&delayed_refs->lock);
while (true) {
@ -1263,7 +1293,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_unlock(&delayed_refs->lock);
mutex_unlock(&head->mutex);
if (pin_bytes) {
if (!testing && pin_bytes) {
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, head->bytenr);
@ -1281,8 +1311,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
spin_lock(&bg->space_info->lock);
spin_lock(&bg->lock);
bg->pinned += head->num_bytes;
btrfs_space_info_update_bytes_pinned(fs_info,
bg->space_info,
btrfs_space_info_update_bytes_pinned(bg->space_info,
head->num_bytes);
bg->reserved -= head->num_bytes;
bg->space_info->bytes_reserved -= head->num_bytes;
@ -1295,12 +1324,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
head->bytenr + head->num_bytes - 1);
}
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
if (!testing)
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
btrfs_put_delayed_ref_head(head);
cond_resched();
spin_lock(&delayed_refs->lock);
}
btrfs_qgroup_destroy_extent_records(trans);
if (!testing)
btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
}


@ -402,6 +402,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
struct btrfs_delayed_ref_root *delayed_refs);
void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);


@ -248,8 +248,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
block_start = extent_map_block_start(em) + (start - em->start);
if (can_nocow_extent(inode, start, &len,
&file_extent, false, false) == 1) {
if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) {
bg = btrfs_inc_nocow_writers(fs_info, block_start);
if (bg)
can_nocow = true;


@ -2337,7 +2337,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
* 1, 2 2nd and 3rd backup copy
* -1 skip bytenr check
*/
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num)
{
u64 nodesize = btrfs_super_nodesize(sb);
@ -2495,24 +2495,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
/*
* Obvious sys_chunk_array corruptions, it must hold at least one key
* and one chunk
*/
if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
btrfs_err(fs_info, "system chunk array too big %u > %u",
btrfs_super_sys_array_size(sb),
BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
ret = -EINVAL;
}
if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ sizeof(struct btrfs_chunk)) {
btrfs_err(fs_info, "system chunk array too small %u < %zu",
btrfs_super_sys_array_size(sb),
sizeof(struct btrfs_disk_key)
+ sizeof(struct btrfs_chunk));
ret = -EINVAL;
}
ret = btrfs_check_system_chunk_array(fs_info, sb);
/*
* The generation is a global counter, we'll trust it more than the others
@ -3321,6 +3304,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
/*
* Handle the space caching options appropriately now that we have the


@ -54,7 +54,7 @@ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);


@ -1803,30 +1803,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
return ret;
}
static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return NULL;
/*
* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
* This is to prevent a ref count from going down to zero, which deletes
* the extent item from the extent tree, when there still are references
* to add, which would fail because they would not find the extent item.
*/
if (!list_empty(&head->ref_add_list))
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
ref = rb_entry(rb_first_cached(&head->ref_tree),
struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
}
static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head)
{
@ -1959,7 +1935,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
lockdep_assert_held(&locked_ref->mutex);
lockdep_assert_held(&locked_ref->lock);
while ((ref = select_delayed_ref(locked_ref))) {
while ((ref = btrfs_select_delayed_ref(locked_ref))) {
if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
@ -2230,10 +2206,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
return ret;
}
static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_delayed_ref(struct btrfs_inode *inode,
struct btrfs_path *path,
u64 objectid, u64 offset, u64 bytenr)
u64 offset, u64 bytenr)
{
struct btrfs_root *root = inode->root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_root *delayed_refs;
@ -2307,7 +2284,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
* then we have a cross reference.
*/
if (ref->ref_root != btrfs_root_id(root) ||
ref_owner != objectid || ref_offset != offset) {
ref_owner != btrfs_ino(inode) || ref_offset != offset) {
ret = 1;
break;
}
@ -2318,11 +2295,54 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
return ret;
}
static noinline int check_committed_ref(struct btrfs_root *root,
/*
* Check if there are references for a data extent other than the one belonging
* to the given inode and offset.
*
* @inode: The only inode we expect to find associated with the data extent.
* @path: A path to use for searching the extent tree.
* @offset: The only offset we expect to find associated with the data
* extent.
* @bytenr: The logical address of the data extent.
*
* When the extent does not have any other references other than the one we
* expect to find, we always return a value of 0 with the path having a locked
* leaf that contains the extent's extent item - this is necessary to ensure
* we don't race with a task running delayed references, and our caller must
* have such a path when calling check_delayed_ref() - it must lock a delayed
* ref head while holding the leaf locked. In case the extent item is not found
* in the extent tree, we return -ENOENT with the path having the leaf (locked)
* where the extent item should be, in order to prevent races with another task
* running delayed references, so that we don't miss any reference when calling
* check_delayed_ref().
*
* Note: this may return false positives, and this is because we want to be
* quick here as we're called in write paths (when flushing delalloc and
* in the direct IO write path). For example we can have an extent with
* a single reference but that reference is not inlined, or we may have
* many references in the extent tree but we also have delayed references
* that cancel all the references except the one for our inode and offset,
* but it would be expensive to do such checks and complex due to all
* locking to avoid races between the checks and flushing delayed refs,
* plus non-inline references may be located on leaves other than the one
* that contains the extent item in the extent tree. The important thing
* here is to not return false negatives and that the false positives are
* not very common.
*
* Returns: 0 if there are no cross references and with the path having a locked
* leaf from the extent tree that contains the extent's extent item.
*
* 1 if there are cross references (false positives can happen).
*
* < 0 in case of an error. In case of -ENOENT the leaf in the extent
* tree where the extent item should be located is read locked and
* accessible in the given path.
*/
static noinline int check_committed_ref(struct btrfs_inode *inode,
struct btrfs_path *path,
u64 objectid, u64 offset, u64 bytenr,
bool strict)
u64 offset, u64 bytenr)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
struct extent_buffer *leaf;
@ -2341,35 +2361,32 @@ static noinline int check_committed_ref(struct btrfs_root *root,
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist an extent
* item with such offset, but this is out of the valid range.
*/
ret = -EUCLEAN;
goto out;
return -EUCLEAN;
}
ret = -ENOENT;
if (path->slots[0] == 0)
goto out;
return -ENOENT;
path->slots[0]--;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
goto out;
return -ENOENT;
ret = 1;
item_size = btrfs_item_size(leaf, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
/* No inline refs; we need to bail before checking for owner ref. */
if (item_size == sizeof(*ei))
goto out;
return 1;
/* Check for an owner ref; skip over it to the real inline refs. */
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@ -2377,56 +2394,69 @@ static noinline int check_committed_ref(struct btrfs_root *root,
if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
iref = (struct btrfs_extent_inline_ref *)(iref + 1);
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
}
/* If extent item has more than 1 inline ref then it's shared */
if (item_size != expected_size)
goto out;
/*
* If extent created before last snapshot => it's shared unless the
* snapshot has been deleted. Use the heuristic if strict is false.
*/
if (!strict &&
(btrfs_extent_generation(leaf, ei) <=
btrfs_root_last_snapshot(&root->root_item)))
goto out;
return 1;
/* If this extent has SHARED_DATA_REF then it's shared */
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
if (type != BTRFS_EXTENT_DATA_REF_KEY)
goto out;
return 1;
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
if (btrfs_extent_refs(leaf, ei) !=
btrfs_extent_data_ref_count(leaf, ref) ||
btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
goto out;
return 1;
ret = 0;
out:
return ret;
return 0;
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
u64 bytenr, bool strict, struct btrfs_path *path)
int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset,
u64 bytenr, struct btrfs_path *path)
{
int ret;
do {
ret = check_committed_ref(root, path, objectid,
offset, bytenr, strict);
ret = check_committed_ref(inode, path, offset, bytenr);
if (ret && ret != -ENOENT)
goto out;
ret = check_delayed_ref(root, path, objectid, offset, bytenr);
/*
* The path must have a locked leaf from the extent tree where
* the extent item for our extent is located, in case it exists,
* or where it should be located in case it doesn't exist yet
* because it's new and its delayed ref was not yet flushed.
* We need to lock the delayed ref head at check_delayed_ref(),
* if one exists, while holding the leaf locked in order to not
* race with delayed ref flushing, which could result in missing
* references and incorrectly reporting that the extent is not shared.
*/
if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
struct extent_buffer *leaf = path->nodes[0];
ASSERT(leaf != NULL);
btrfs_assert_tree_read_locked(leaf);
if (ret != -ENOENT) {
struct btrfs_key key;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ASSERT(key.objectid == bytenr);
ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY);
}
}
ret = check_delayed_ref(inode, path, offset, bytenr);
} while (ret == -EAGAIN && !path->nowait);
out:
btrfs_release_path(path);
if (btrfs_is_data_reloc_root(root))
if (btrfs_is_data_reloc_root(inode->root))
WARN_ON(ret > 0);
return ret;
}
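
A hedged sketch of the reworked calling convention, not part of the commit; it follows the updated call in can_nocow_file_extent() further down, with a hypothetical wrapper name and simplified return handling:

/* Sketch only: ask whether any other reference to this data extent may exist. */
static int example_extent_maybe_shared(struct btrfs_inode *inode, u64 file_offset,
				       u64 disk_bytenr, struct btrfs_path *path)
{
	int ret;

	/* objectid and strict are gone; the inode now carries the root and ino. */
	ret = btrfs_cross_ref_exist(inode, file_offset, disk_bytenr, path);
	if (ret < 0)
		return ret;	/* error */
	if (ret > 0)
		return 1;	/* shared (possibly a false positive): fall back to COW */
	return 0;		/* no other references, NOCOW is safe */
}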
@ -2571,13 +2601,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
struct btrfs_block_group *cache,
u64 bytenr, u64 num_bytes, int reserved)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
num_bytes);
btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes);
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
@ -2724,15 +2751,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
{
struct btrfs_block_group *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
struct btrfs_free_cluster *cluster = NULL;
u64 len;
u64 total_unpinned = 0;
u64 empty_cluster = 0;
bool readonly;
int ret = 0;
while (start <= end) {
u64 len;
readonly = false;
if (!cache ||
start >= cache->start + cache->length) {
@ -2778,37 +2805,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
btrfs_space_info_update_bytes_pinned(space_info, -len);
space_info->max_extent_size = 0;
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
} else if (btrfs_is_zoned(fs_info)) {
/* Need reset before reusing in a zoned block group */
btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
len);
btrfs_space_info_update_bytes_zone_unusable(space_info, len);
readonly = true;
}
spin_unlock(&cache->lock);
if (!readonly && return_free_space &&
global_rsv->space_info == space_info) {
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
u64 to_add = min(len, global_rsv->size -
global_rsv->reserved);
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(fs_info,
space_info, to_add);
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
len -= to_add;
}
spin_unlock(&global_rsv->lock);
}
/* Add to any tickets we may have */
if (!readonly && return_free_space && len)
btrfs_try_granting_tickets(fs_info, space_info);
if (!readonly && return_free_space)
btrfs_return_free_space(space_info, len);
spin_unlock(&space_info->lock);
}
@ -5142,8 +5151,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
parent = ins.objectid;
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
owning_root = reloc_src_root;
} else
BUG_ON(parent > 0);
} else {
if (unlikely(parent > 0)) {
/*
* Roots other than the reloc tree don't expect a start
* offset of a parent block.
*/
ret = -EUCLEAN;
goto out_free_reserved;
}
}
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;


@ -116,8 +116,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr,
struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@ -163,5 +162,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
#endif


@ -1167,6 +1167,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* last delalloc end.
*/
u64 last_delalloc_end = 0;
/*
* Save the end (exclusive) of the last successfully run delalloc range.
* This is for error handling, to avoid ranges with an ordered extent
* created but no IO submitted due to an error.
*/
u64 last_finished = page_start;
u64 delalloc_start = page_start;
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
@ -1235,11 +1241,28 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
if (ret >= 0) {
/*
* Some delalloc ranges may have been created by previous folios.
* Thus we still need to clean those ranges up during error
* handling.
*/
last_finished = found_start;
/* No errors hit so far, run the current delalloc range. */
ret = btrfs_run_delalloc_range(inode, folio,
found_start,
found_start + found_len - 1,
wbc);
if (ret >= 0)
last_finished = found_start + found_len;
if (unlikely(ret < 0))
btrfs_err_rl(fs_info,
"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
inode->root->root_key.objectid,
btrfs_ino(inode),
folio_pos(folio),
fs_info->sectors_per_page,
&bio_ctrl->submit_bitmap,
found_start, found_len, ret);
} else {
/*
* We've hit an error during previous delalloc range,
@ -1274,8 +1297,21 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = found_start + found_len;
}
if (ret < 0)
/*
* It's possible we have some ordered extents created before we hit
* an error; clean up the non-async, successfully created delalloc ranges.
*/
if (unlikely(ret < 0)) {
unsigned int bitmap_size = min(
(last_finished - page_start) >> fs_info->sectorsize_bits,
fs_info->sectors_per_page);
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
btrfs_mark_ordered_io_finished(inode, folio,
page_start + (bit << fs_info->sectorsize_bits),
fs_info->sectorsize, false);
return ret;
}
out:
if (last_delalloc_end)
delalloc_end = last_delalloc_end;
@ -1335,7 +1371,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
if (IS_ERR(em))
return PTR_ERR_OR_ZERO(em);
return PTR_ERR(em);
extent_offset = filepos - em->start;
em_end = extent_map_end(em);
@ -1391,6 +1427,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long range_bitmap = 0;
bool submitted_io = false;
bool error = false;
const u64 folio_start = folio_pos(folio);
u64 cur;
int bit;
@ -1433,11 +1470,21 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
break;
}
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
if (ret < 0)
goto out;
if (unlikely(ret < 0)) {
submit_one_bio(bio_ctrl);
/*
* Failed to grab the extent map, which should be very rare.
* Since there is no bio submitted to finish the ordered
* extent, we have to manually finish this sector.
*/
btrfs_mark_ordered_io_finished(inode, folio, cur,
fs_info->sectorsize, false);
error = true;
continue;
}
submitted_io = true;
}
out:
/*
* If we didn't submit any sector (>= i_size), the folio dirty flag gets
* cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
@ -1445,8 +1492,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
*
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*
* If we hit any error, the corresponding sector will still be dirty
* thus no need to clear PAGECACHE_TAG_DIRTY.
*/
if (!submitted_io) {
if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
@ -1466,7 +1516,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
{
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
const u64 page_start = folio_pos(folio);
int ret;
size_t pg_offset;
loff_t i_size = i_size_read(inode);
@ -1506,16 +1555,19 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
PAGE_SIZE, bio_ctrl, i_size);
if (ret == 1)
return 0;
if (ret < 0)
btrfs_err_rl(fs_info,
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
BTRFS_I(inode)->root->root_key.objectid,
btrfs_ino(BTRFS_I(inode)),
folio_pos(folio), fs_info->sectors_per_page,
&bio_ctrl->submit_bitmap, ret);
bio_ctrl->wbc->nr_to_write--;
done:
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
page_start, PAGE_SIZE, !ret);
if (ret < 0)
mapping_set_error(folio->mapping, ret);
}
/*
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
@ -2292,11 +2344,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
if (ret == 1)
goto next_page;
if (ret) {
btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
cur, cur_len, !ret);
if (ret)
mapping_set_error(mapping, ret);
}
btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;


@ -36,52 +36,7 @@
#include "ioctl.h"
#include "file.h"
#include "super.h"
/*
* Helper to fault in page and copy. This should go away and be replaced with
* calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
struct folio *folio, struct iov_iter *i)
{
size_t copied = 0;
size_t total_copied = 0;
int offset = offset_in_page(pos);
while (write_bytes > 0) {
size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
/*
* Copy data from userspace to the current page
*/
copied = copy_folio_from_iter_atomic(folio, offset, count, i);
/* Flush processor's dcache for this page */
flush_dcache_folio(folio);
/*
* if we get a partial write, we can end up with
* partially up to date page. These add
* a lot of complexity, so make sure they don't
* happen by forcing this copy to be retried.
*
* The rest of the btrfs_file_write code will fall
* back to page at a time copies after we return 0.
*/
if (unlikely(copied < count)) {
if (!folio_test_uptodate(folio)) {
iov_iter_revert(i, copied);
copied = 0;
}
if (!copied)
break;
}
write_bytes -= copied;
total_copied += copied;
offset += copied;
}
return total_copied;
}
#include "print-tree.h"
/*
* Unlock folio after btrfs_file_write() is done with it.
@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
}
/*
* After btrfs_copy_from_user(), update the following things for delalloc:
* After copy_folio_from_iter_atomic(), update the following things for delalloc:
* - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back.
* - Mark modified folio as Uptodate/Dirty and not needing COW fixup
@ -224,7 +179,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->drop_cache)
btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
if (args->start >= inode->disk_i_size && !args->replace_extent)
if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
modify_tree = 0;
update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
@ -245,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
BUG_ON(del_nr > 0);
if (WARN_ON(del_nr > 0)) {
btrfs_print_leaf(leaf);
ret = -EINVAL;
break;
}
ret = btrfs_next_leaf(root, path);
if (ret < 0)
break;
@ -321,7 +280,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end < extent_end) {
BUG_ON(del_nr > 0);
if (WARN_ON(del_nr > 0)) {
btrfs_print_leaf(leaf);
ret = -EINVAL;
break;
}
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
@ -409,7 +372,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
* | -------- extent -------- |
*/
if (args->start > key.offset && args->end >= extent_end) {
BUG_ON(del_nr > 0);
if (WARN_ON(del_nr > 0)) {
btrfs_print_leaf(leaf);
ret = -EINVAL;
break;
}
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -EOPNOTSUPP;
break;
@ -437,7 +404,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
del_slot = path->slots[0];
del_nr = 1;
} else {
BUG_ON(del_slot + del_nr != path->slots[0]);
if (WARN_ON(del_slot + del_nr != path->slots[0])) {
btrfs_print_leaf(leaf);
ret = -EINVAL;
break;
}
del_nr++;
}
@ -1052,7 +1023,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
&cached_state);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, nowait, false);
NULL, nowait);
if (ret <= 0)
btrfs_drew_write_unlock(&root->snapshot_lock);
else
@ -1252,7 +1223,23 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
break;
}
copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
copied = copy_folio_from_iter_atomic(folio,
offset_in_folio(folio, pos), write_bytes, i);
flush_dcache_folio(folio);
/*
* If we get a partial write, we can end up with a partially
* uptodate page. Although we can handle it if sector size < page
* size, if it's not sector aligned it can cause a lot of
* complexity, so make sure that doesn't happen by forcing a
* retry of this copy.
*/
if (unlikely(copied < write_bytes)) {
if (!folio_test_uptodate(folio)) {
iov_iter_revert(i, copied);
copied = 0;
}
}
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset,


@ -12,7 +12,7 @@
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include <linux/string_choices.h>
#include "ctree.h"
#include "extent-tree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"


@ -4,6 +4,136 @@
#include "ctree.h"
#include "fs.h"
#include "accessors.h"
#include "volumes.h"
static const struct btrfs_csums {
u16 size;
const char name[10];
const char driver[12];
} btrfs_csums[] = {
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
.driver = "blake2b-256" },
};
/* This exists for btrfs-progs usages. */
u16 btrfs_csum_type_size(u16 type)
{
return btrfs_csums[type].size;
}
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
/* csum type is validated at mount time. */
return btrfs_csum_type_size(t);
}
const char *btrfs_super_csum_name(u16 csum_type)
{
/* csum type is validated at mount time. */
return btrfs_csums[csum_type].name;
}
/*
* Return driver name if defined, otherwise the name that's also a valid driver
* name.
*/
const char *btrfs_super_csum_driver(u16 csum_type)
{
/* csum type is validated at mount time */
return btrfs_csums[csum_type].driver[0] ?
btrfs_csums[csum_type].driver :
btrfs_csums[csum_type].name;
}
size_t __attribute_const__ btrfs_get_num_csums(void)
{
return ARRAY_SIZE(btrfs_csums);
}
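
These checksum helpers only moved here from ctree.c; their behavior is unchanged. A minimal usage sketch, not part of the commit, assuming a super block that already passed mount-time validation (the function name is hypothetical):

/* Sketch: report the checksum algorithm recorded in a super block. */
static void example_report_csum(struct btrfs_fs_info *fs_info,
				const struct btrfs_super_block *sb)
{
	u16 csum_type = btrfs_super_csum_type(sb);

	btrfs_info(fs_info, "csum %s (driver %s), %d bytes per csum",
		   btrfs_super_csum_name(csum_type),
		   btrfs_super_csum_driver(csum_type),
		   btrfs_super_csum_size(sb));
}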
/*
* Start exclusive operation @type, return true on success.
*/
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
bool ret = false;
spin_lock(&fs_info->super_lock);
if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
fs_info->exclusive_operation = type;
ret = true;
}
spin_unlock(&fs_info->super_lock);
return ret;
}
/*
* Conditionally allow to enter the exclusive operation in case it's compatible
* with the running one. This must be paired with btrfs_exclop_start_unlock()
* and btrfs_exclop_finish().
*
* Compatibility:
* - the same type is already running
* - when trying to add a device and balance has been paused
* - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
* must check the condition first that would allow none -> @type
*/
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
spin_lock(&fs_info->super_lock);
if (fs_info->exclusive_operation == type ||
(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
type == BTRFS_EXCLOP_DEV_ADD))
return true;
spin_unlock(&fs_info->super_lock);
return false;
}
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
{
spin_unlock(&fs_info->super_lock);
}
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
{
spin_lock(&fs_info->super_lock);
WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
spin_unlock(&fs_info->super_lock);
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
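
The comment above btrfs_exclop_start_try_lock() requires pairing it with btrfs_exclop_start_unlock() and btrfs_exclop_finish(). A hedged sketch of that pairing, loosely modeled on the device-add path and not part of the commit (the function name, error code and compatibility handling are illustrative):

/* Sketch: run some work under an exclusive operation. */
static int example_do_exclusive(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
		/* On success, try_lock returns with fs_info->super_lock held. */
		if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
			return -EBUSY;
		/* ... e.g. switch a paused balance over to the DEV_ADD exclop ... */
		btrfs_exclop_start_unlock(fs_info);
	}

	/* ... perform the exclusive operation ... */

	btrfs_exclop_finish(fs_info);
	return 0;
}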
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation op)
{
switch (op) {
case BTRFS_EXCLOP_BALANCE_PAUSED:
spin_lock(&fs_info->super_lock);
ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
spin_unlock(&fs_info->super_lock);
break;
case BTRFS_EXCLOP_BALANCE:
spin_lock(&fs_info->super_lock);
ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
spin_unlock(&fs_info->super_lock);
break;
default:
btrfs_warn(fs_info,
"invalid exclop balance operation %d requested", op);
}
}
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name)


@ -18,6 +18,7 @@
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/list.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
@ -887,6 +888,11 @@ struct btrfs_fs_info {
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
struct inode *: (_inode)))->root->fs_info)
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->generation);
@ -953,6 +959,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits)
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0;
@ -982,6 +990,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);
u16 btrfs_csum_type_size(u16 type);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);
static inline bool btrfs_is_empty_uuid(const u8 *uuid)
{
return uuid_is_null((const uuid_t *)uuid);
}
/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
@ -1058,6 +1077,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
&(fs_info)->fs_state)))
/*
* We use folio flag owner_2 to indicate there is an ordered extent with
* unfinished IO.
*/
#define folio_test_ordered(folio) folio_test_owner_2(folio)
#define folio_set_ordered(folio) folio_set_owner_2(folio)
#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
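
The owner_2 wrappers above are used around ordered extent IO. A tiny illustrative sketch of the intended usage, not part of the commit:

/* Sketch: track an unfinished ordered extent on a folio via the owner_2 flag. */
folio_set_ordered(folio);		/* when the ordered extent is created */
/* ... writeback runs ... */
if (folio_test_ordered(folio))
	folio_clear_ordered(folio);	/* once the ordered extent completes */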
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
#define EXPORT_FOR_TESTS


@ -1129,19 +1129,15 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
&wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
btrfs_cleanup_ordered_extents(inode, locked_folio,
btrfs_cleanup_ordered_extents(inode, NULL,
start, end - start + 1);
if (locked_folio) {
const u64 page_start = folio_pos(locked_folio);
folio_start_writeback(locked_folio);
folio_end_writeback(locked_folio);
btrfs_mark_ordered_io_finished(inode, locked_folio,
page_start, PAGE_SIZE,
!ret);
mapping_set_error(locked_folio->mapping, ret);
folio_unlock(locked_folio);
}
if (locked_folio)
btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
start, async_extent->ram_size);
btrfs_err_rl(inode->root->fs_info,
"%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
__func__, btrfs_root_id(inode->root),
btrfs_ino(inode), start, async_extent->ram_size, ret);
}
}
@ -1254,7 +1250,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
free_async_extent_pages(async_extent);
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
btrfs_debug(fs_info,
btrfs_debug_rl(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
btrfs_root_id(root), btrfs_ino(inode), start,
async_extent->ram_size, ret);
@ -1372,6 +1368,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
/*
* We're not doing compressed IO, so don't unlock the first page
* (which the caller expects to stay locked), don't clear any
* dirty bits and don't set any writeback bits.
*
* Do set the Ordered (Private2) bit so we know this page was
* properly set up for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
/*
* Relocation relies on the relocated extents to have exactly the same
* size as the original extents. Normally writeback for relocation data
@ -1431,6 +1438,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
/*
* The locked range will be released either during error cleanup or
* after the whole range is finished.
*/
lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
@ -1476,21 +1487,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
/*
* We're not doing compressed IO, don't unlock the first page
* (which the caller expects to stay locked), don't clear any
* dirty bits and don't set any writeback bits
*
* Do set the Ordered flag so we know this page was
* properly setup for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
if (num_bytes < cur_alloc_size)
num_bytes = 0;
else
@ -1507,6 +1503,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret)
goto out_unlock;
}
extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
done:
if (done_offset)
*done_offset = end;
@ -1527,35 +1526,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* We process each region below.
*/
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* For the range (1). We have already instantiated the ordered extents
* for this region. They are cleaned up by
* btrfs_cleanup_ordered_extents() in, e.g.,
* btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
* already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
* EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
* function.
* btrfs_run_delalloc_range().
* EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
* are also handled by the cleanup function.
*
* However, in case of @keep_locked, we still need to unlock the pages
* (except @locked_folio) to ensure all the pages are unlocked.
* So here we only clear the EXTENT_LOCKED and EXTENT_DELALLOC flags,
* and finish the writeback of the involved folios, which will
* never be submitted.
*/
if (keep_locked && orig_start < start) {
if (orig_start < start) {
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
locked_folio, NULL, 0, page_ops);
locked_folio, NULL, clear_bits, page_ops);
}
/*
* At this point we're unlocked, we want to make sure we're only
* clearing these flags under the extent lock, so lock the rest of the
* range and clear everything up.
*/
lock_extent(&inode->io_tree, start, end, NULL);
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
/*
* For the range (2). If we reserved an extent for our delalloc range
@ -1589,6 +1584,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
end - start - cur_alloc_size + 1, NULL);
}
btrfs_err_rl(fs_info,
"%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
__func__, btrfs_root_id(inode->root),
btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
return ret;
}
@ -1809,7 +1808,7 @@ static int fallback_to_cow(struct btrfs_inode *inode,
bytes = range_bytes;
spin_lock(&sinfo->lock);
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
btrfs_space_info_update_bytes_may_use(sinfo, bytes);
spin_unlock(&sinfo->lock);
if (count > 0)
@ -1837,7 +1836,6 @@ struct can_nocow_file_extent_args {
/* End file offset (inclusive) of the range we want to NOCOW. */
u64 end;
bool writeback_path;
bool strict;
/*
* Free the path passed to can_nocow_file_extent() once it's not needed
* anymore.
@ -1892,8 +1890,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
* for its subvolume was created, then this implies the extent is shared,
* hence we must COW.
*/
if (!args->strict &&
btrfs_file_extent_generation(leaf, fi) <=
if (btrfs_file_extent_generation(leaf, fi) <=
btrfs_root_last_snapshot(&root->root_item))
goto out;
@ -1922,9 +1919,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
*/
btrfs_release_path(path);
ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
key->offset - args->file_extent.offset,
args->file_extent.disk_bytenr, args->strict, path);
ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
args->file_extent.disk_bytenr, path);
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
if (ret != 0)
goto out;
@ -1970,6 +1966,48 @@ static int can_nocow_file_extent(struct btrfs_path *path,
return ret < 0 ? ret : can_nocow;
}
static void cleanup_dirty_folios(struct btrfs_inode *inode,
struct folio *locked_folio,
u64 start, u64 end, int error)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
pgoff_t start_index = start >> PAGE_SHIFT;
pgoff_t end_index = end >> PAGE_SHIFT;
u32 len;
ASSERT(end + 1 - start < U32_MAX);
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
len = end + 1 - start;
/*
* Handle the locked folio first.
* The btrfs_folio_clamp_*() helpers can handle ranges outside the folio.
*/
btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len);
btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len);
btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len);
for (pgoff_t index = start_index; index <= end_index; index++) {
struct folio *folio;
/* Already handled at the beginning. */
if (index == locked_folio->index)
continue;
folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
/* Cache already dropped, no need to do any cleanup. */
if (IS_ERR(folio))
continue;
btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
folio_unlock(folio);
folio_put(folio);
}
mapping_set_error(mapping, error);
}
/*
* When the nocow writeback calls back, this checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@ -1985,6 +2023,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
struct btrfs_path *path;
u64 cow_start = (u64)-1;
/*
* If not 0, represents the inclusive end of the last fallback_to_cow()
* range. Only for error handling.
*/
u64 cow_end = 0;
u64 cur_offset = start;
int ret;
bool check_prev = true;
@ -2145,6 +2188,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
found_key.offset - 1);
cow_start = (u64)-1;
if (ret) {
cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
@ -2218,11 +2262,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
cow_start = cur_offset;
if (cow_start != (u64)-1) {
cur_offset = end;
ret = fallback_to_cow(inode, locked_folio, cow_start, end);
cow_start = (u64)-1;
if (ret)
if (ret) {
cow_end = end;
goto error;
}
}
btrfs_free_path(path);
@ -2230,12 +2275,42 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
error:
/*
* If an error happened while a COW region is outstanding, cur_offset
* needs to be reset to cow_start to ensure the COW region is unlocked
* as well.
* There are several error cases:
*
* 1) Failed without falling back to COW
*    start         cur_offset                              end
*    |/////////////|                                        |
*
* For range [start, cur_offset) the folios are already unlocked (except
* @locked_folio), and EXTENT_DELALLOC is already removed.
* We only need to clear the dirty flag as they will never be submitted.
* Ordered extents and extent maps are handled by
* btrfs_mark_ordered_io_finished() inside run_delalloc_range().
*
* 2) Failed with error from fallback_to_cow()
*    start         cur_offset  cow_end                     end
*    |/////////////|-----------|                            |
*
* For range [start, cur_offset) it's the same as case 1).
* But for range [cur_offset, cow_end), the folios have their dirty flags
* cleared and are unlocked, and EXTENT_DELALLOC is cleared.
* There may or may not be any ordered extents/extent maps allocated.
*
* We should not call extent_clear_unlock_delalloc() on range [cur_offset,
* cow_end), as the folios are already unlocked.
*
* So clear the folio dirty flags for [start, cur_offset) first.
*/
if (cow_start != (u64)-1)
cur_offset = cow_start;
if (cur_offset > start)
cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
/*
* If an error happened while a COW region is outstanding, cur_offset
* needs to be reset to @cow_end + 1 to skip the COW range, as
* cow_file_range() will do the proper cleanup at error.
*/
if (cow_end)
cur_offset = cow_end + 1;
/*
* We need to lock the extent here because we're clearing DELALLOC and
@ -2255,6 +2330,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
}
btrfs_free_path(path);
btrfs_err_rl(fs_info,
"%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
__func__, btrfs_root_id(inode->root),
btrfs_ino(inode), start, end + 1 - start, ret);
return ret;
}
@ -7011,8 +7090,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* @orig_start: (optional) Return the original file offset of the file extent
* @orig_len: (optional) Return the original on-disk length of the file extent
* @ram_bytes: (optional) Return the ram_bytes of the file extent
* @strict: if true, omit optimizations that might force us into unnecessary
* cow. e.g., don't trust generation number.
*
* Return:
* >0 and update @len if we can do nocow write
@ -7024,7 +7101,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait, bool strict)
bool nowait)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct can_nocow_file_extent_args nocow_args = { 0 };
@ -7077,7 +7154,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
nocow_args.start = offset;
nocow_args.end = offset + *len - 1;
nocow_args.strict = strict;
nocow_args.free_path = true;
ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
@ -9078,9 +9154,9 @@ static ssize_t btrfs_encoded_read_inline(
}
struct btrfs_encoded_read_private {
wait_queue_head_t wait;
struct completion done;
void *uring_ctx;
atomic_t pending;
refcount_t pending_refs;
blk_status_t status;
};
@ -9099,14 +9175,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
if (atomic_dec_and_test(&priv->pending)) {
if (refcount_dec_and_test(&priv->pending_refs)) {
int err = blk_status_to_errno(READ_ONCE(priv->status));
if (priv->uring_ctx) {
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
kfree(priv);
} else {
wake_up(&priv->wait);
complete(&priv->done);
}
}
bio_put(&bbio->bio);
@ -9126,8 +9202,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
if (!priv)
return -ENOMEM;
init_waitqueue_head(&priv->wait);
atomic_set(&priv->pending, 1);
init_completion(&priv->done);
refcount_set(&priv->pending_refs, 1);
priv->status = 0;
priv->uring_ctx = uring_ctx;
@ -9140,7 +9216,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
atomic_inc(&priv->pending);
refcount_inc(&priv->pending_refs);
btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
@ -9155,11 +9231,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
atomic_inc(&priv->pending);
refcount_inc(&priv->pending_refs);
btrfs_submit_bbio(bbio, 0);
if (uring_ctx) {
if (atomic_dec_return(&priv->pending) == 0) {
if (refcount_dec_and_test(&priv->pending_refs)) {
ret = blk_status_to_errno(READ_ONCE(priv->status));
btrfs_uring_read_extent_endio(uring_ctx, ret);
kfree(priv);
@ -9168,8 +9244,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
return -EIOCBQUEUED;
} else {
if (atomic_dec_return(&priv->pending) != 0)
io_wait_event(priv->wait, !atomic_read(&priv->pending));
if (!refcount_dec_and_test(&priv->pending_refs))
wait_for_completion_io(&priv->done);
/* See btrfs_encoded_read_endio() for ordering. */
ret = blk_status_to_errno(READ_ONCE(priv->status));
kfree(priv);
@ -9799,15 +9875,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
};
struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
struct btrfs_path *path = NULL;
int ret = 0;
u64 isize;
u64 start;
u64 prev_extent_end = 0;
/*
* Acquire the inode's mmap lock to prevent races with memory mapped
* writes, as they could happen after we flush delalloc below and before
* we lock the extent range further below. The inode was already locked
* up in the call chain.
*/
btrfs_assert_inode_locked(BTRFS_I(inode));
down_write(&BTRFS_I(inode)->i_mmap_lock);
/*
* If the swap file was just created, make sure delalloc is done. If the
@ -9816,22 +9902,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
*/
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
return ret;
goto out_unlock_mmap;
/*
* The inode is locked, so these flags won't change after we check them.
*/
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
btrfs_warn(fs_info, "swapfile must not be compressed");
return -EINVAL;
ret = -EINVAL;
goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
return -EINVAL;
ret = -EINVAL;
goto out_unlock_mmap;
}
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
btrfs_warn(fs_info, "swapfile must not be checksummed");
return -EINVAL;
ret = -EINVAL;
goto out_unlock_mmap;
}
path = btrfs_alloc_path();
backref_ctx = btrfs_alloc_backref_share_check_ctx();
if (!path || !backref_ctx) {
ret = -ENOMEM;
goto out_unlock_mmap;
}
/*
@ -9846,7 +9942,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
ret = -EBUSY;
goto out_unlock_mmap;
}
/*
@ -9860,7 +9957,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
btrfs_warn(fs_info,
"cannot activate swapfile because snapshot creation is in progress");
return -EINVAL;
ret = -EINVAL;
goto out_unlock_mmap;
}
/*
* Snapshots can create extents which require COW even if NODATACOW is
@ -9881,7 +9979,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_warn(fs_info,
"cannot activate swapfile because subvolume %llu is being deleted",
btrfs_root_id(root));
return -EPERM;
ret = -EPERM;
goto out_unlock_mmap;
}
atomic_inc(&root->nr_swapfiles);
spin_unlock(&root->root_item_lock);
@ -9889,24 +9988,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
lock_extent(io_tree, 0, isize - 1, &cached_state);
start = 0;
while (start < isize) {
u64 logical_block_start, physical_block_start;
while (prev_extent_end < isize) {
struct btrfs_key key;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *ei;
struct btrfs_block_group *bg;
u64 len = isize - start;
u64 logical_block_start;
u64 physical_block_start;
u64 extent_gen;
u64 disk_bytenr;
u64 len;
em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = prev_extent_end;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
}
if (em->disk_bytenr == EXTENT_MAP_HOLE) {
/*
* If key not found it means we have an implicit hole (NO_HOLES
* is enabled).
*/
if (ret > 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
if (em->disk_bytenr == EXTENT_MAP_INLINE) {
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
/*
* It's unlikely we'll ever actually find ourselves
* here, as a file small enough to fit inline won't be
@ -9918,23 +10032,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
if (extent_map_is_compressed(em)) {
if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
}
logical_block_start = extent_map_block_start(em) + (start - em->start);
len = min(len, em->len - (start - em->start));
free_extent_map(em);
em = NULL;
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
if (disk_bytenr == 0) {
btrfs_warn(fs_info, "swapfile must not have holes");
ret = -EINVAL;
goto out;
}
ret = can_nocow_extent(inode, start, &len, NULL, false, true);
logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
extent_gen = btrfs_file_extent_generation(leaf, ei);
prev_extent_end = btrfs_file_extent_end(path);
if (prev_extent_end > isize)
len = isize - key.offset;
else
len = btrfs_file_extent_num_bytes(leaf, ei);
backref_ctx->curr_leaf_bytenr = leaf->start;
/*
* Don't need the path anymore, release to avoid deadlocks when
* calling btrfs_is_data_extent_shared() because when joining a
* transaction it can block waiting for the current one's commit
* which in turn may be trying to lock the same leaf to flush
* delayed items for example.
*/
btrfs_release_path(path);
ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
extent_gen, backref_ctx);
if (ret < 0) {
goto out;
} else if (ret) {
ret = 0;
} else {
} else if (ret > 0) {
btrfs_warn(fs_info,
"swapfile must not be copy-on-write");
ret = -EINVAL;
@ -9969,7 +10105,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
physical_block_start = (map->stripes[0].physical +
(logical_block_start - map->start));
len = min(len, map->chunk_len - (logical_block_start - map->start));
btrfs_free_chunk_map(map);
map = NULL;
@ -10010,20 +10145,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
if (ret)
goto out;
}
bsi.start = start;
bsi.start = key.offset;
bsi.block_start = physical_block_start;
bsi.block_len = len;
}
start += len;
if (fatal_signal_pending(current)) {
ret = -EINTR;
goto out;
}
cond_resched();
}
if (bsi.block_len)
ret = btrfs_add_swap_extent(sis, &bsi);
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
if (!IS_ERR_OR_NULL(map))
btrfs_free_chunk_map(map);
@ -10036,6 +10174,10 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
btrfs_exclop_finish(fs_info);
out_unlock_mmap:
up_write(&BTRFS_I(inode)->i_mmap_lock);
btrfs_free_backref_share_ctx(backref_ctx);
btrfs_free_path(path);
if (ret)
return ret;

View File

@ -403,86 +403,6 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return ret;
}
/*
* Start exclusive operation @type, return true on success
*/
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
bool ret = false;
spin_lock(&fs_info->super_lock);
if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
fs_info->exclusive_operation = type;
ret = true;
}
spin_unlock(&fs_info->super_lock);
return ret;
}
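/*
 * Illustrative pairing sketch, not part of this change: a typical caller
 * starts the exclusive op, does its work and then finishes it, as the
 * swapfile activation path above does. do_the_exclusive_work() is a
 * hypothetical placeholder for the caller's own logic.
 *
 *	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE))
 *		return -EBUSY;
 *	ret = do_the_exclusive_work(fs_info);
 *	btrfs_exclop_finish(fs_info);
 */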
/*
* Conditionally allow to enter the exclusive operation in case it's compatible
* with the running one. This must be paired with btrfs_exclop_start_unlock and
* btrfs_exclop_finish.
*
* Compatibility:
* - the same type is already running
* - when trying to add a device and balance has been paused
* - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
* must check the condition first that would allow none -> @type
*/
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
spin_lock(&fs_info->super_lock);
if (fs_info->exclusive_operation == type ||
(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
type == BTRFS_EXCLOP_DEV_ADD))
return true;
spin_unlock(&fs_info->super_lock);
return false;
}
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
{
spin_unlock(&fs_info->super_lock);
}
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
{
spin_lock(&fs_info->super_lock);
WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
spin_unlock(&fs_info->super_lock);
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation op)
{
switch (op) {
case BTRFS_EXCLOP_BALANCE_PAUSED:
spin_lock(&fs_info->super_lock);
ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
spin_unlock(&fs_info->super_lock);
break;
case BTRFS_EXCLOP_BALANCE:
spin_lock(&fs_info->super_lock);
ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
spin_unlock(&fs_info->super_lock);
break;
default:
btrfs_warn(fs_info,
"invalid exclop balance operation %d requested", op);
}
}
static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
{
return put_user(inode->i_generation, arg);
@ -551,17 +471,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
return ret;
}
int __pure btrfs_is_empty_uuid(const u8 *uuid)
{
int i;
for (i = 0; i < BTRFS_UUID_SIZE; i++) {
if (uuid[i])
return 0;
}
return 1;
}
/*
* Calculate the number of transaction items to reserve for creating a subvolume
* or snapshot, not including the inode, directory entries, or parent directory.
@ -4984,15 +4893,14 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
* undo this.
*/
if (!iov) {
iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
iov = kmemdup(iovstack, sizeof(struct iovec) * args.iovcnt,
GFP_NOFS);
if (!iov) {
unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
ret = -ENOMEM;
goto out_acct;
}
memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
}
count = min_t(u64, iov_iter_count(&iter), disk_io_size);
@ -5300,6 +5208,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return fsverity_ioctl_enable(file, (const void __user *)argp);
case FS_IOC_MEASURE_VERITY:
return fsverity_ioctl_measure(file, argp);
case FS_IOC_READ_VERITY_METADATA:
return fsverity_ioctl_read_metadata(file, argp);
case BTRFS_IOC_ENCODED_READ:
return btrfs_ioctl_encoded_read(file, argp, false);
case BTRFS_IOC_ENCODED_WRITE:

View File

@ -19,7 +19,6 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);

View File

@ -199,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
{
lockdep_assert_held_write(&eb->lock);
}
static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
lockdep_assert_held_read(&eb->lock);
}
#else
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
#endif
void btrfs_unlock_up_safe(struct btrfs_path *path, int level);

View File

@ -163,4 +163,32 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
return (found_set == start + nbits);
}
/*
* Count how many bits are set in the bitmap.
*
* Similar to bitmap_weight() but accepts a subrange of the bitmap.
*/
static inline unsigned int bitmap_count_set(const unsigned long *addr,
unsigned long start,
unsigned long nbits)
{
const unsigned long bitmap_nbits = start + nbits;
unsigned long cur = start;
unsigned long total_set = 0;
while (cur < bitmap_nbits) {
unsigned long found_zero;
unsigned long found_set;
found_zero = find_next_zero_bit(addr, bitmap_nbits, cur);
total_set += found_zero - cur;
cur = found_zero;
if (cur >= bitmap_nbits)
break;
found_set = find_next_bit(addr, bitmap_nbits, cur);
cur = found_set;
}
return total_set;
}
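/*
 * Illustrative usage sketch (a standalone example, not btrfs code), showing
 * how the subrange counting behaves:
 *
 *	DECLARE_BITMAP(map, 16);
 *	unsigned int count;
 *
 *	bitmap_zero(map, 16);
 *	bitmap_set(map, 2, 4);
 *	count = bitmap_count_set(map, 0, 16);	// == 4, bits 2..5 are set
 *	count = bitmap_count_set(map, 4, 8);	// == 2, only bits 4 and 5 fall in [4, 12)
 */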
#endif

View File

@ -194,6 +194,14 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
INIT_LIST_HEAD(&entry->bioc_list);
init_completion(&entry->completion);
#ifdef CONFIG_BTRFS_DEBUG
entry->finished_bitmap = bitmap_zalloc(
num_bytes >> inode->root->fs_info->sectorsize_bits, GFP_NOFS);
if (!entry->finished_bitmap) {
kmem_cache_free(btrfs_ordered_extent_cache, entry);
return ERR_PTR(-ENOMEM);
}
#endif
/*
* We don't need the count_max_extents here, we can assume that all of
* that work has been done at higher layers, so this is truly the
@ -356,13 +364,39 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
}
#ifdef CONFIG_BTRFS_DEBUG
{
unsigned long start_bit;
unsigned long nbits;
unsigned long nr_set;
ASSERT(file_offset >= ordered->file_offset);
ASSERT(file_offset + len <= ordered->file_offset + ordered->num_bytes);
start_bit = (file_offset - ordered->file_offset) >> fs_info->sectorsize_bits;
nbits = len >> fs_info->sectorsize_bits;
nr_set = bitmap_count_set(ordered->finished_bitmap, start_bit, nbits);
if (WARN_ON(nr_set)) {
btrfs_crit(fs_info,
"double ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu range offset=%llu range len=%llu already finished len=%lu finish_bitmap=%*pbl",
btrfs_root_id(inode->root), btrfs_ino(inode),
ordered->file_offset, ordered->num_bytes,
file_offset, len, nr_set << fs_info->sectorsize_bits,
(int)(ordered->num_bytes >> fs_info->sectorsize_bits),
ordered->finished_bitmap);
}
bitmap_set(ordered->finished_bitmap, start_bit, nbits);
len -= (nr_set << fs_info->sectorsize_bits);
}
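/*
 * Worked example (illustrative only, assuming a 4K sectorsize): for an
 * ordered extent with num_bytes=256K (64 bits in finished_bitmap),
 * finishing the sub-range at file_offset - ordered->file_offset = 64K
 * with len = 32K gives start_bit = 64K >> 12 = 16 and nbits = 32K >> 12 = 8,
 * i.e. bits 16..23.  Any of those bits already being set means the same
 * blocks were reported finished twice.
 */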
#endif
/* Now we're fine to update the accounting. */
if (WARN_ON_ONCE(len > ordered->bytes_left)) {
btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu range start=%llu range len=%llu left=%llu",
btrfs_root_id(inode->root), btrfs_ino(inode),
ordered->file_offset, ordered->num_bytes,
len, ordered->bytes_left);
file_offset, len, ordered->bytes_left);
ordered->bytes_left = 0;
} else {
ordered->bytes_left -= len;
@ -379,6 +413,28 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
* the finish_func to be executed.
*/
set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
#ifdef CONFIG_BTRFS_DEBUG
{
u64 real_len;
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
real_len = ordered->truncated_len;
else
real_len = ordered->num_bytes;
if (WARN_ON(!bitmap_full(ordered->finished_bitmap,
real_len >> fs_info->sectorsize_bits))) {
btrfs_crit(fs_info,
"ordered extent finished bitmap desync, root=%llu ino=%llu OE offset=%llu OE len=%llu bytes_left=%llu bitmap=%*pbl",
btrfs_root_id(inode->root), btrfs_ino(inode),
ordered->file_offset, ordered->num_bytes,
ordered->bytes_left,
(int)(real_len >> fs_info->sectorsize_bits),
ordered->finished_bitmap);
}
}
#endif
cond_wake_up(&ordered->wait);
refcount_inc(&ordered->refs);
trace_btrfs_ordered_extent_mark_finished(inode, ordered);
@ -624,6 +680,9 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
list_del(&sum->list);
kvfree(sum);
}
#ifdef CONFIG_BTRFS_DEBUG
bitmap_free(entry->finished_bitmap);
#endif
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}

View File

@ -154,6 +154,15 @@ struct btrfs_ordered_extent {
struct list_head work_list;
struct list_head bioc_list;
#ifdef CONFIG_BTRFS_DEBUG
/*
* One bit per block, set once that block has finished.
*
* Used to catch double accounting of finished ranges with more accuracy.
*/
unsigned long *finished_bitmap;
#endif
};
int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);

View File

@ -1121,6 +1121,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
if (simple) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
} else {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@ -1254,8 +1255,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
spin_lock(&fs_info->qgroup_lock);
fs_info->quota_root = quota_root;
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
if (simple)
btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
spin_unlock(&fs_info->qgroup_lock);
/* Skip rescan for simple qgroups. */
@ -1839,9 +1838,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
* Thus its reserved space should all be zero, no matter if qgroup
* is consistent or the mode.
*/
WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
btrfs_qgroup_level(qgroup->qgroupid),
btrfs_qgroup_subvolid(qgroup->qgroupid),
qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
}
/*
* The same for rfer/excl numbers, but that's only if our qgroup is
* consistent and if it's in regular qgroup mode.
@ -1850,8 +1859,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
*/
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
if (WARN_ON(qgroup->rfer || qgroup->excl ||
qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
if (qgroup->rfer || qgroup->excl ||
qgroup->rfer_cmpr || qgroup->excl_cmpr) {
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
btrfs_qgroup_level(qgroup->qgroupid),

View File

@ -199,12 +199,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
for (int i = 0; i < num_stripes; i++) {
u64 devid = bioc->stripes[i].dev->devid;
u64 physical = bioc->stripes[i].physical;
u64 length = bioc->stripes[i].length;
struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];
if (length == 0)
length = bioc->size;
btrfs_set_stack_raid_stride_devid(raid_stride, devid);
btrfs_set_stack_raid_stride_physical(raid_stride, physical);
}

View File

@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,
if (cur == node)
ret = true;
/* The node is the lowest node */
if (cur->lowest) {
list_del_init(&cur->lower);
cur->lowest = 0;
}
/* Cleanup the lower edges */
while (!list_empty(&cur->lower)) {
struct btrfs_backref_edge *edge;
@ -373,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,
* cache to avoid unnecessary backref lookup.
*/
if (cur->level > 0) {
list_add(&cur->list, &cache->detached);
cur->detached = 1;
} else {
rb_erase(&cur->rb_node, &cache->rb_root);
@ -426,7 +419,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
goto out;
}
node->lowest = 1;
cur = node;
/* Breadth-first search to build backref cache */
@ -469,92 +461,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
return node;
}
/*
* helper to add backref node for the newly created snapshot.
* the backref node is created by cloning backref node that
* corresponds to root of source tree
*/
static int clone_backref_node(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
const struct btrfs_root *src,
struct btrfs_root *dest)
{
struct btrfs_root *reloc_root = src->reloc_root;
struct btrfs_backref_cache *cache = &rc->backref_cache;
struct btrfs_backref_node *node = NULL;
struct btrfs_backref_node *new_node;
struct btrfs_backref_edge *edge;
struct btrfs_backref_edge *new_edge;
struct rb_node *rb_node;
rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
if (node->detached)
node = NULL;
else
BUG_ON(node->new_bytenr != reloc_root->node->start);
}
if (!node) {
rb_node = rb_simple_search(&cache->rb_root,
reloc_root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct btrfs_backref_node,
rb_node);
BUG_ON(node->detached);
}
}
if (!node)
return 0;
new_node = btrfs_backref_alloc_node(cache, dest->node->start,
node->level);
if (!new_node)
return -ENOMEM;
new_node->lowest = node->lowest;
new_node->checked = 1;
new_node->root = btrfs_grab_root(dest);
ASSERT(new_node->root);
if (!node->lowest) {
list_for_each_entry(edge, &node->lower, list[UPPER]) {
new_edge = btrfs_backref_alloc_edge(cache);
if (!new_edge)
goto fail;
btrfs_backref_link_edge(new_edge, edge->node[LOWER],
new_node, LINK_UPPER);
}
} else {
list_add_tail(&new_node->lower, &cache->leaves);
}
rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
&new_node->rb_node);
if (rb_node)
btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
if (!new_node->lowest) {
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
list_add_tail(&new_edge->list[LOWER],
&new_edge->node[LOWER]->upper);
}
}
return 0;
fail:
while (!list_empty(&new_node->lower)) {
new_edge = list_entry(new_node->lower.next,
struct btrfs_backref_edge, list[UPPER]);
list_del(&new_edge->list[UPPER]);
btrfs_backref_free_edge(cache, new_edge);
}
btrfs_backref_free_node(cache, new_node);
return -ENOMEM;
}
/*
* helper to add 'address of tree root -> reloc tree' mapping
*/
@ -2058,100 +1964,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
int index = 0;
int ret;
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
next = walk_up_backref(node, edges, &index);
root = next->root;
/*
* If there is no root, then our references for this block are
* incomplete, as we should be able to walk all the way up to a
* block that is owned by a root.
*
* This path is only for SHAREABLE roots, so if we come upon a
* non-SHAREABLE root then we have backrefs that resolve
* improperly.
*
* Both of these cases indicate file system corruption, or a bug
* in the backref walking code.
*/
if (!root) {
ASSERT(0);
btrfs_err(trans->fs_info,
"bytenr %llu doesn't have a backref path ending in a root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ASSERT(0);
btrfs_err(trans->fs_info,
"bytenr %llu has multiple refs with one ending in a non-shareable root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
/*
* If there is no root, then our references for this block are
* incomplete, as we should be able to walk all the way up to a block
* that is owned by a root.
*
* This path is only for SHAREABLE roots, so if we come upon a
* non-SHAREABLE root then we have backrefs that resolve improperly.
*
* Both of these cases indicate file system corruption, or a bug in the
* backref walking code.
*/
if (unlikely(!root)) {
btrfs_err(trans->fs_info,
"bytenr %llu doesn't have a backref path ending in a root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
btrfs_err(trans->fs_info,
"bytenr %llu has multiple refs with one ending in a non-shareable root",
node->bytenr);
return ERR_PTR(-EUCLEAN);
}
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
break;
}
ret = btrfs_record_root_in_trans(trans, root);
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
ret = record_reloc_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
root = root->reloc_root;
/*
* We could have raced with another thread which failed, so
* root->reloc_root may not be set, return ENOENT in this case.
*/
if (!root)
return ERR_PTR(-ENOENT);
if (next->new_bytenr != root->node->start) {
/*
* We just created the reloc root, so we shouldn't have
* ->new_bytenr set and this shouldn't be in the changed
* list. If it is then we have multiple roots pointing
* at the same bytenr which indicates corruption, or
* we've made a mistake in the backref walking code.
*/
ASSERT(next->new_bytenr == 0);
ASSERT(list_empty(&next->list));
if (next->new_bytenr || !list_empty(&next->list)) {
btrfs_err(trans->fs_info,
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
node->bytenr, next->bytenr);
return ERR_PTR(-EUCLEAN);
}
next->new_bytenr = root->node->start;
btrfs_put_root(next->root);
next->root = btrfs_grab_root(root);
ASSERT(next->root);
list_add_tail(&next->list,
&rc->backref_cache.changed);
mark_block_processed(rc, next);
break;
}
WARN_ON(1);
root = NULL;
next = walk_down_backref(edges, &index);
if (!next || next->level <= node->level)
break;
goto found;
}
if (!root) {
/*
* This can happen if there's fs corruption or if there's a bug
* in the backref lookup code.
*/
ASSERT(0);
ret = btrfs_record_root_in_trans(trans, root);
if (ret)
return ERR_PTR(ret);
root = root->reloc_root;
/*
* We could have raced with another thread which failed, so
* root->reloc_root may not be set, return ENOENT in this case.
*/
if (!root)
return ERR_PTR(-ENOENT);
if (next->new_bytenr) {
/*
* We just created the reloc root, so we shouldn't have
* ->new_bytenr set yet. If it is then we have multiple roots
* pointing at the same bytenr which indicates corruption, or
* we've made a mistake in the backref walking code.
*/
ASSERT(next->new_bytenr == 0);
btrfs_err(trans->fs_info,
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
node->bytenr, next->bytenr);
return ERR_PTR(-EUCLEAN);
}
next->new_bytenr = root->node->start;
btrfs_put_root(next->root);
next->root = btrfs_grab_root(root);
ASSERT(next->root);
mark_block_processed(rc, next);
found:
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
while (1) {
@ -2247,17 +2125,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
return num_bytes;
}
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_backref_node *node)
static int refill_metadata_space(struct btrfs_trans_handle *trans,
struct reloc_control *rc, u64 num_bytes)
{
struct btrfs_root *root = rc->extent_root;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 num_bytes;
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
u64 tmp;
num_bytes = calcu_metadata_size(rc, node) * 2;
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
@ -2270,7 +2142,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
while (tmp <= rc->reserved_bytes)
tmp <<= 1;
/*
@ -2288,6 +2161,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
return 0;
}
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct btrfs_backref_node *node)
{
u64 num_bytes;
num_bytes = calcu_metadata_size(rc, node) * 2;
return refill_metadata_space(trans, rc, num_bytes);
}
/*
* relocate a block tree, and then update pointers in upper level
* blocks that reference the block to point to the new location.
@ -2442,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!ret && node->pending) {
btrfs_backref_drop_node_buffer(node);
list_move_tail(&node->list, &rc->backref_cache.changed);
list_del_init(&node->list);
node->pending = 0;
}
@ -2605,8 +2488,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
/*
* This block was the root block of a root, and this is
* the first time we're processing the block and thus it
* should not have had the ->new_bytenr modified and
* should have not been included on the changed list.
* should not have had the ->new_bytenr modified.
*
* However in the case of corruption we could have
* multiple refs pointing to the same block improperly,
@ -2616,8 +2498,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
* normal user in the case of corruption.
*/
ASSERT(node->new_bytenr == 0);
ASSERT(list_empty(&node->list));
if (node->new_bytenr || !list_empty(&node->list)) {
if (node->new_bytenr) {
btrfs_err(root->fs_info,
"bytenr %llu has improper references to it",
node->bytenr);
@ -2640,17 +2521,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
btrfs_put_root(node->root);
node->root = btrfs_grab_root(root);
ASSERT(node->root);
list_add_tail(&node->list, &rc->backref_cache.changed);
} else {
path->lowest_level = node->level;
if (root == root->fs_info->chunk_root)
btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
btrfs_release_path(path);
if (root == root->fs_info->chunk_root)
btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
btrfs_err(root->fs_info,
"bytenr %llu resolved to a non-shareable root",
node->bytenr);
ret = -EUCLEAN;
goto out;
}
if (!ret)
update_processed_blocks(rc, node);
@ -2658,11 +2534,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
ret = do_relocation(trans, rc, node, key, path, 1);
}
out:
if (ret || node->level == 0 || node->cowonly)
if (ret || node->level == 0)
btrfs_backref_cleanup_node(&rc->backref_cache, node);
return ret;
}
static int relocate_cowonly_block(struct btrfs_trans_handle *trans,
struct reloc_control *rc, struct tree_block *block,
struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root;
u64 num_bytes;
int nr_levels;
int ret;
root = btrfs_get_fs_root(fs_info, block->owner, true);
if (IS_ERR(root))
return PTR_ERR(root);
nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1;
num_bytes = fs_info->nodesize * nr_levels;
ret = refill_metadata_space(trans, rc, num_bytes);
if (ret) {
btrfs_put_root(root);
return ret;
}
path->lowest_level = block->level;
if (root == root->fs_info->chunk_root)
btrfs_reserve_chunk_metadata(trans, false);
ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1);
path->lowest_level = 0;
btrfs_release_path(path);
if (root == root->fs_info->chunk_root)
btrfs_trans_release_chunk_metadata(trans);
if (ret > 0)
ret = 0;
btrfs_put_root(root);
return ret;
}
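/*
 * Illustrative numbers (an assumption, using the common 16K nodesize): for a
 * COW-only tree whose root is at level 3 and a relocated block at level 0,
 * nr_levels = max(3 - 0, 0) + 1 = 4, so refill_metadata_space() above is
 * asked for 4 * 16K = 64K before COWing down that path.
 */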
/*
* relocate a list of blocks
*/
@ -2702,6 +2617,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
/* Do tree relocation */
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
/*
* For COWonly blocks, or the data reloc tree, we only need to
* COW down to the block, there's no need to generate a backref
* tree.
*/
if (block->owner &&
(!is_fstree(block->owner) ||
block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
ret = relocate_cowonly_block(trans, rc, block, path);
if (ret)
break;
continue;
}
node = build_backref_tree(trans, rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
@ -2902,6 +2831,7 @@ static int relocate_one_folio(struct reloc_control *rc,
const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
ASSERT(index <= last_index);
again:
folio = filemap_lock_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
@ -2937,6 +2867,11 @@ static int relocate_one_folio(struct reloc_control *rc,
ret = -EIO;
goto release_folio;
}
if (folio->mapping != inode->i_mapping) {
folio_unlock(folio);
folio_put(folio);
goto again;
}
}
/*
@ -4399,8 +4334,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
WARN_ON(!first_cow && level == 0);
node = rc->backref_cache.path[level];
BUG_ON(node->bytenr != buf->start &&
node->new_bytenr != buf->start);
/*
* If node->bytenr != buf->start and node->new_bytenr !=
* buf->start then we've got the wrong backref node for what we
* expected to see here and the cache is incorrect.
*/
if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
btrfs_err(fs_info,
"bytenr %llu was found but our backref cache was expecting %llu or %llu",
buf->start, node->bytenr, node->new_bytenr);
return -EUCLEAN;
}
btrfs_backref_drop_node_buffer(node);
atomic_inc(&cow->refs);
@ -4500,10 +4445,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)
ret = clone_backref_node(trans, rc, root, reloc_root);
return ret;
return 0;
}
/*

View File

@ -226,6 +226,7 @@ struct scrub_warning {
u64 physical;
u64 logical;
struct btrfs_device *dev;
bool message_printed;
};
static void release_scrub_stripe(struct scrub_stripe *stripe)
@ -388,17 +389,13 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *warn_ctx)
{
u32 nlink;
int ret;
int i;
unsigned nofs_flag;
struct extent_buffer *eb;
struct btrfs_inode_item *inode_item;
struct scrub_warning *swarn = warn_ctx;
struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
struct btrfs_key key;
local_root = btrfs_get_fs_root(fs_info, root, true);
if (IS_ERR(local_root)) {
@ -406,26 +403,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
goto err;
}
/*
* this makes the path point to (inum INODE_ITEM ioff)
*/
key.objectid = inum;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
btrfs_put_root(local_root);
btrfs_release_path(swarn->path);
goto err;
}
eb = swarn->path->nodes[0];
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
struct btrfs_inode_item);
nlink = btrfs_inode_nlink(eb, inode_item);
btrfs_release_path(swarn->path);
/*
* init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
* uses GFP_NOFS in this context, so we keep it consistent but it does
@ -449,34 +426,35 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
* we deliberately ignore the bit ipath might have been too small to
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
btrfs_warn_rl_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, path: %s",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset,
fs_info->sectorsize, nlink,
(char *)(unsigned long)ipath->fspath->val[i]);
swarn->message_printed = true;
}
btrfs_put_root(local_root);
free_ipath(ipath);
return 0;
err:
btrfs_warn_in_rcu(fs_info,
btrfs_warn_rl_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset, ret);
swarn->message_printed = true;
free_ipath(ipath);
return 0;
}
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
bool is_super, u64 logical, u64 physical)
u64 logical, u64 physical)
{
struct btrfs_fs_info *fs_info = dev->fs_info;
struct btrfs_path *path;
@ -488,12 +466,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
u32 item_size;
int ret;
/* Super block error, no need to search extent tree. */
if (is_super) {
btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
errstr, btrfs_dev_name(dev), physical);
return;
}
path = btrfs_alloc_path();
if (!path)
return;
@ -502,6 +474,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
swarn.logical = logical;
swarn.errstr = errstr;
swarn.dev = NULL;
swarn.message_printed = false;
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
&flags);
@ -523,20 +496,22 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level);
if (ret < 0) {
btrfs_warn(fs_info,
"failed to resolve tree backref for logical %llu: %d",
swarn.logical, ret);
if (ret < 0)
break;
}
if (ret > 0)
break;
btrfs_warn_in_rcu(fs_info,
btrfs_warn_rl_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical, btrfs_dev_name(dev),
swarn.physical, (ref_level ? "node" : "leaf"),
ref_level, ref_root);
swarn.message_printed = true;
}
if (!swarn.message_printed)
btrfs_warn_rl_in_rcu(fs_info,
"%s at metadata, logical %llu on dev %s physical %llu",
errstr, swarn.logical,
btrfs_dev_name(dev), swarn.physical);
btrfs_release_path(path);
} else {
struct btrfs_backref_walk_ctx ctx = { 0 };
@ -551,6 +526,11 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
swarn.dev = dev;
iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
if (!swarn.message_printed)
btrfs_warn_rl_in_rcu(fs_info,
"%s at data, filename unresolved, logical %llu on dev %s physical %llu",
errstr, swarn.logical,
btrfs_dev_name(dev), swarn.physical);
}
out:
@ -866,11 +846,9 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
struct scrub_stripe *stripe)
{
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_device *dev = NULL;
u64 physical = 0;
struct btrfs_device *dev = stripe->dev;
u64 stripe_physical = stripe->physical;
int nr_data_sectors = 0;
int nr_meta_sectors = 0;
int nr_nodatacsum_sectors = 0;
@ -880,36 +858,12 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
return;
/*
* Init needed infos for error reporting.
*
* Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
u64 mapped_len = fs_info->sectorsize;
struct btrfs_io_context *bioc = NULL;
int stripe_index = stripe->mirror_num - 1;
int ret;
/* For scrub, our mirror_num should always start at 1. */
ASSERT(stripe->mirror_num >= 1);
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
stripe->logical, &mapped_len, &bioc,
NULL, NULL);
/*
* If we failed, dev will be NULL, and later detailed reports
* will just be skipped.
*/
if (ret < 0)
goto skip;
physical = bioc->stripes[stripe_index].physical;
dev = bioc->stripes[stripe_index].dev;
btrfs_put_bioc(bioc);
}
skip:
ASSERT(dev);
for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
const u64 logical = stripe->logical +
(sector_nr << fs_info->sectorsize_bits);
const u64 physical = stripe_physical +
(sector_nr << fs_info->sectorsize_bits);
bool repaired = false;
if (stripe->sectors[sector_nr].is_metadata) {
@ -935,43 +889,23 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
* output the message of repaired message.
*/
if (repaired) {
if (dev) {
btrfs_err_rl_in_rcu(fs_info,
btrfs_err_rl_in_rcu(fs_info,
"fixed up error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
logical, btrfs_dev_name(dev),
physical);
} else {
btrfs_err_rl_in_rcu(fs_info,
"fixed up error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
continue;
}
/* The remaining are all for unrepaired. */
if (dev) {
btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (regular) error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (regular) error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
if (test_bit(sector_nr, &stripe->io_error_bitmap))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("i/o error", dev, false,
stripe->logical, physical);
scrub_print_common_warning("i/o error", dev,
logical, physical);
if (test_bit(sector_nr, &stripe->csum_error_bitmap))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("checksum error", dev, false,
stripe->logical, physical);
scrub_print_common_warning("checksum error", dev,
logical, physical);
if (test_bit(sector_nr, &stripe->meta_error_bitmap))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false,
stripe->logical, physical);
scrub_print_common_warning("header error", dev,
logical, physical);
}
spin_lock(&sctx->stat_lock);

View File

@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
again:
folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {
page_cache_sync_readahead(mapping,
@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
ret = -EIO;
break;
}
if (folio->mapping != mapping) {
folio_unlock(folio);
folio_put(folio);
goto again;
}
}
memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
@ -7253,7 +7259,7 @@ static int changed_cb(struct btrfs_path *left_path,
enum btrfs_compare_tree_result result,
struct send_ctx *sctx)
{
int ret = 0;
int ret;
/*
* We can not hold the commit root semaphore here. This is because in
@ -7313,7 +7319,6 @@ static int changed_cb(struct btrfs_path *left_path,
return 0;
}
result = BTRFS_COMPARE_TREE_CHANGED;
ret = 0;
}
sctx->left_path = left_path;

View File

@ -14,6 +14,7 @@
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "zoned.h"
/*
* HOW DOES SPACE RESERVATION WORK
@ -127,6 +128,14 @@
* churn a lot and we can avoid making some extent tree modifications if we
* are able to delay for as long as possible.
*
* RESET_ZONES
* This state only applies to zoned mode. In zoned mode we cannot reuse a
* region once it has been allocated and then freed until we reset the
* zone, due to the sequential write zone requirement. The RESET_ZONES
* state resets the zones of an unused block group and lets us reuse the
* space. Reusing the zones is faster than removing the block group and
* allocating another block group on the zones.
*
* ALLOC_CHUNK
* We will skip this the first time through space reservation, because of
* overcommit and we don't want to have a lot of useless metadata space when
@ -316,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
found->bytes_used += block_group->used;
found->disk_used += block_group->used * factor;
found->bytes_readonly += block_group->bytes_super;
btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable);
if (block_group->length > 0)
found->full = 0;
btrfs_try_granting_tickets(info, found);
@ -489,9 +498,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
if ((used + ticket->bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);
remove_ticket(space_info, ticket);
ticket->bytes = 0;
space_info->tickets_id++;
@ -834,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
*/
ret = btrfs_commit_current_transaction(root);
break;
case RESET_ZONES:
ret = btrfs_reset_unused_block_groups(space_info, num_bytes);
break;
default:
ret = -ENOSPC;
break;
@ -1086,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
enum btrfs_flush_state final_state;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
if (btrfs_is_zoned(fs_info))
final_state = RESET_ZONES;
else
final_state = COMMIT_TRANS;
spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
@ -1141,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
flush_state++;
if (flush_state > COMMIT_TRANS) {
if (flush_state > final_state) {
commit_cycles++;
if (commit_cycles > 2) {
if (maybe_fail_all_tickets(fs_info, space_info)) {
@ -1155,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
}
spin_unlock(&space_info->lock);
} while (flush_state <= COMMIT_TRANS);
} while (flush_state <= final_state);
}
/*
@ -1286,6 +1301,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* This is where we reclaim all of the pinned space generated by running the
* iputs
*
* RESET_ZONES
* This state only applies to zoned mode. We scan the unused block group
* list, reset their zones and reuse the block groups.
*
* ALLOC_CHUNK_FORCE
* For data we start with alloc chunk force, however we could have been full
* before, and then the transaction commit could have freed new block groups,
@ -1295,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = {
FLUSH_DELALLOC_FULL,
RUN_DELAYED_IPUTS,
COMMIT_TRANS,
RESET_ZONES,
ALLOC_CHUNK_FORCE,
};
@ -1386,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
static const enum btrfs_flush_state priority_flush_states[] = {
FLUSH_DELAYED_ITEMS_NR,
FLUSH_DELAYED_ITEMS,
RESET_ZONES,
ALLOC_CHUNK,
};
@ -1399,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
FLUSH_DELALLOC_FULL,
ALLOC_CHUNK,
COMMIT_TRANS,
RESET_ZONES,
};
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
@ -1690,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
@ -1703,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
used = btrfs_space_info_used(space_info, false);
if (used + orig_bytes <= space_info->total_bytes) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
ret = 0;
}
}
@ -2082,3 +2102,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
do_reclaim_sweep(space_info, raid);
}
}
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
{
struct btrfs_fs_info *fs_info = space_info->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
lockdep_assert_held(&space_info->lock);
/* Prioritize the global reservation to receive the freed space. */
if (global_rsv->space_info != space_info)
goto grant;
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
u64 to_add = min(len, global_rsv->size - global_rsv->reserved);
global_rsv->reserved += to_add;
btrfs_space_info_update_bytes_may_use(space_info, to_add);
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
len -= to_add;
}
spin_unlock(&global_rsv->lock);
grant:
/* Add to any tickets we may have. */
if (len)
btrfs_try_granting_tickets(fs_info, space_info);
}
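/*
 * Usage sketch (illustrative, not part of this change): the caller must
 * already hold the space_info lock when returning freed reservation, e.g.
 *
 *	spin_lock(&space_info->lock);
 *	btrfs_return_free_space(space_info, len);
 *	spin_unlock(&space_info->lock);
 */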

View File

@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_EMERGENCY,
};
/*
* Note that the order of the enum values determines the order in which the
* reclaim steps are run in btrfs_async_reclaim_metadata_space().
*/
enum btrfs_flush_state {
FLUSH_DELAYED_ITEMS_NR = 1,
FLUSH_DELAYED_ITEMS = 2,
@ -91,6 +95,7 @@ enum btrfs_flush_state {
ALLOC_CHUNK_FORCE = 9,
RUN_DELAYED_IPUTS = 10,
COMMIT_TRANS = 11,
RESET_ZONES = 12,
};
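To illustrate the note above, a hedged sketch of how an ordered states array drives reclaim; the array contents mirror priority_flush_states from this patch, while run_flush_state() is a hypothetical stand-in for the kernel's real flush routine:
/* run_flush_state() is hypothetical; the kernel's flush_space() plays this role. */
static const enum btrfs_flush_state example_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	RESET_ZONES,
	ALLOC_CHUNK,
};

static void flush_in_order_example(struct btrfs_fs_info *fs_info,
				   struct btrfs_space_info *space_info)
{
	/* The enum order above is the order the steps are attempted in. */
	for (int i = 0; i < ARRAY_SIZE(example_states); i++)
		run_flush_state(fs_info, space_info, example_states[i]);
}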
struct btrfs_space_info {
@ -229,10 +234,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i
*/
#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \
static inline void \
btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
struct btrfs_space_info *sinfo, \
btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \
s64 bytes) \
{ \
struct btrfs_fs_info *fs_info = sinfo->fs_info; \
const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \
lockdep_assert_held(&sinfo->lock); \
trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
@ -275,13 +280,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 num_bytes)
{
spin_lock(&space_info->lock);
btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
btrfs_try_granting_tickets(fs_info, space_info);
btrfs_space_info_update_bytes_may_use(space_info, -num_bytes);
btrfs_try_granting_tickets(space_info->fs_info, space_info);
spin_unlock(&space_info->lock);
}
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
@ -295,5 +299,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool
bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
#endif /* BTRFS_SPACE_INFO_H */

View File

@ -635,6 +635,28 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
folio_test_checked);
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \
const int sectors_per_page = fs_info->sectors_per_page; \
\
ASSERT(sectors_per_page < BITS_PER_LONG); \
*dst = bitmap_read(subpage->bitmaps, \
sectors_per_page * btrfs_bitmap_nr_##name, \
sectors_per_page); \
}
#define subpage_dump_bitmap(fs_info, folio, name, start, len) \
{ \
struct btrfs_subpage *subpage = folio_get_private(folio); \
unsigned long bitmap; \
\
GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \
btrfs_warn(fs_info, \
"dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
start, len, folio_pos(folio), \
fs_info->sectors_per_page, &bitmap); \
}
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
@ -660,6 +682,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
subpage = folio_get_private(folio);
ASSERT(subpage);
spin_lock_irqsave(&subpage->lock, flags);
if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
subpage_dump_bitmap(fs_info, folio, dirty, start, len);
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
}
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
spin_unlock_irqrestore(&subpage->lock, flags);
}
@ -689,23 +715,17 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
nbits = len >> fs_info->sectorsize_bits;
spin_lock_irqsave(&subpage->lock, flags);
/* Target range should not yet be locked. */
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
subpage_dump_bitmap(fs_info, folio, locked, start, len);
btrfs_warn(fs_info, "nr_locked=%u", atomic_read(&subpage->nr_locked));
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
}
bitmap_set(subpage->bitmaps, start_bit, nbits);
ret = atomic_add_return(nbits, &subpage->nr_locked);
ASSERT(ret <= fs_info->sectors_per_page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \
const int sectors_per_page = fs_info->sectors_per_page; \
\
ASSERT(sectors_per_page < BITS_PER_LONG); \
*dst = bitmap_read(subpage->bitmaps, \
sectors_per_page * btrfs_bitmap_nr_##name, \
sectors_per_page); \
}
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
@ -716,6 +736,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long writeback_bitmap;
unsigned long ordered_bitmap;
unsigned long checked_bitmap;
unsigned long locked_bitmap;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
@ -728,15 +749,16 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap);
GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
sectors_per_page, &uptodate_bitmap,
sectors_per_page, &dirty_bitmap,
sectors_per_page, &locked_bitmap,
sectors_per_page, &writeback_bitmap,
sectors_per_page, &ordered_bitmap,
sectors_per_page, &checked_bitmap);

View File

@ -971,7 +971,7 @@ static int btrfs_fill_super(struct super_block *sb,
err = open_ctree(sb, fs_devices);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
btrfs_err(fs_info, "open_ctree failed: %d", err);
return err;
}
@ -2446,6 +2446,9 @@ static __cold void btrfs_interface_exit(void)
static int __init btrfs_print_mod_info(void)
{
static const char options[] = ""
#ifdef CONFIG_BTRFS_EXPERIMENTAL
", experimental=on"
#endif
#ifdef CONFIG_BTRFS_DEBUG
", debug=on"
#endif
@ -2466,7 +2469,17 @@ static int __init btrfs_print_mod_info(void)
", fsverity=no"
#endif
;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
if (btrfs_get_raid1_balancing() == NULL)
pr_info("Btrfs loaded%s\n", options);
else
pr_info("Btrfs loaded%s, raid1_balancing=%s\n",
options, btrfs_get_raid1_balancing());
#else
pr_info("Btrfs loaded%s\n", options);
#endif
return 0;
}
@ -2524,6 +2537,11 @@ static const struct init_sequence mod_init_seq[] = {
}, {
.init_func = extent_map_init,
.exit_func = extent_map_exit,
#ifdef CONFIG_BTRFS_EXPERIMENTAL
}, {
.init_func = btrfs_raid1_balancing_init,
.exit_func = NULL,
#endif
}, {
.init_func = ordered_data_init,
.exit_func = ordered_data_exit,

View File

@ -1305,7 +1305,74 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
}
BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
static const char * const btrfs_read_policy_name[] = { "pid" };
static const char * const btrfs_read_policy_name[] = {
"pid",
#ifdef CONFIG_BTRFS_EXPERIMENTAL
"round-robin",
"devid",
#endif
};
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Global module configuration parameters */
static char *raid1_balancing;
char *btrfs_get_raid1_balancing(void)
{
return raid1_balancing;
}
/* Permissions 0: disable the /sys/module/btrfs/parameters/raid1_balancing interface. */
module_param(raid1_balancing, charp, 0);
MODULE_PARM_DESC(raid1_balancing,
"Global read policy; pid (default), round-robin:[min_contiguous_read], devid:[[devid]|[latest-gen]|[oldest-gen]]");
#endif
int btrfs_read_policy_to_enum(const char *str, s64 *value)
{
char param[32] = {'\0'};
char *__maybe_unused value_str;
int index;
bool found = false;
if (!str || strlen(str) == 0)
return 0;
if (strscpy(param, str, sizeof(param)) < 0)
return -EINVAL;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Separate value from input in policy:value format. */
value_str = strchr(param, ':');
if (value_str) {
*value_str = '\0';
value_str++;
if (value && kstrtos64(value_str, 10, value) != 0)
return -EINVAL;
}
#endif
for (index = 0; index < BTRFS_NR_READ_POLICY; index++) {
if (sysfs_streq(param, btrfs_read_policy_name[index])) {
found = true;
break;
}
}
if (found)
return index;
return -EINVAL;
}
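A few hedged examples of what the parser above accepts on an experimental build; the wrapper function is hypothetical, while the return convention (policy index or -EINVAL, optional value through the s64 pointer) follows the function as written:
static int parse_read_policy_example(void)
{
	s64 value = -1;
	int index;

	index = btrfs_read_policy_to_enum("round-robin:262144", &value);
	/* index == BTRFS_READ_POLICY_RR, value == 262144 */

	index = btrfs_read_policy_to_enum("devid:2", &value);
	/* index == BTRFS_READ_POLICY_DEVID, value == 2 */

	index = btrfs_read_policy_to_enum("pid", &value);
	/* index == BTRFS_READ_POLICY_PID, value left untouched */

	index = btrfs_read_policy_to_enum("bogus", &value);
	/* index == -EINVAL */

	return index;
}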
#ifdef CONFIG_BTRFS_EXPERIMENTAL
int __init btrfs_raid1_balancing_init(void)
{
if (btrfs_read_policy_to_enum(raid1_balancing, NULL) == -EINVAL) {
btrfs_err(NULL, "invalid raid1_balancing parameter: %s", raid1_balancing);
return -EINVAL;
}
return 0;
}
#endif
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
@ -1316,14 +1383,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
int i;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (policy == i)
ret += sysfs_emit_at(buf, ret, "%s[%s]",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
else
ret += sysfs_emit_at(buf, ret, "%s%s",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
if (ret != 0)
ret += sysfs_emit_at(buf, ret, " ");
if (i == policy)
ret += sysfs_emit_at(buf, ret, "[");
ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
#ifdef CONFIG_BTRFS_EXPERIMENTAL
if (i == BTRFS_READ_POLICY_RR)
ret += sysfs_emit_at(buf, ret, ":%d",
fs_devices->rr_min_contiguous_read);
if (i == BTRFS_READ_POLICY_DEVID)
ret += sysfs_emit_at(buf, ret, ":%llu",
fs_devices->read_devid);
#endif
if (i == policy)
ret += sysfs_emit_at(buf, ret, "]");
}
ret += sysfs_emit_at(buf, ret, "\n");
@ -1336,21 +1414,78 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
const char *buf, size_t len)
{
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
int i;
int index;
s64 value = -1;
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
if (i != READ_ONCE(fs_devices->read_policy)) {
WRITE_ONCE(fs_devices->read_policy, i);
btrfs_info(fs_devices->fs_info,
"read policy set to '%s'",
btrfs_read_policy_name[i]);
index = btrfs_read_policy_to_enum(buf, &value);
if (index == -EINVAL)
return -EINVAL;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
if (index == BTRFS_READ_POLICY_RR) {
if (value != -1) {
u32 sectorsize = fs_devices->fs_info->sectorsize;
if (!IS_ALIGNED(value, sectorsize)) {
u64 temp_value = round_up(value, sectorsize);
btrfs_warn(fs_devices->fs_info,
"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu",
value, sectorsize, temp_value);
value = temp_value;
}
} else {
value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
}
if (index != READ_ONCE(fs_devices->read_policy) ||
value != READ_ONCE(fs_devices->rr_min_contiguous_read)) {
WRITE_ONCE(fs_devices->read_policy, index);
WRITE_ONCE(fs_devices->rr_min_contiguous_read, value);
atomic_set(&fs_devices->total_reads, 0);
btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
btrfs_read_policy_name[index], value);
}
return len;
}
return -EINVAL;
if (index == BTRFS_READ_POLICY_DEVID) {
if (value != -1) {
BTRFS_DEV_LOOKUP_ARGS(args);
/* Validate input devid */
args.devid = value;
if (btrfs_find_device(fs_devices, &args) == NULL)
return -EINVAL;
} else {
/* Set default devid to the devid of the latest device */
value = fs_devices->latest_dev->devid;
}
if (index != READ_ONCE(fs_devices->read_policy) ||
(value != READ_ONCE(fs_devices->read_devid))) {
WRITE_ONCE(fs_devices->read_policy, index);
WRITE_ONCE(fs_devices->read_devid, value);
btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'",
btrfs_read_policy_name[index], value);
}
return len;
}
#endif
if (index != READ_ONCE(fs_devices->read_policy)) {
WRITE_ONCE(fs_devices->read_policy, index);
btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
btrfs_read_policy_name[index]);
}
return len;
}
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
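A brief usage note for the attribute above; the sysfs path and the example devid/output values are assumptions, not taken from the patch:
/*
 * Sysfs usage sketch (experimental builds, paths assumed):
 *   echo "round-robin:262144" > /sys/fs/btrfs/<FSID>/read_policy
 *   echo "devid:2"            > /sys/fs/btrfs/<FSID>/read_policy
 * Reading the file lists every policy with the active one bracketed, e.g.
 *   pid [round-robin:262144] devid:1
 */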

View File

@ -47,5 +47,10 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup);
int btrfs_read_policy_to_enum(const char *str, s64 *value);
#ifdef CONFIG_BTRFS_EXPERIMENTAL
int __init btrfs_raid1_balancing_init(void);
char *btrfs_get_raid1_balancing(void);
#endif
#endif

View File

@ -30,6 +30,7 @@ const char *test_error[] = {
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
[TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
[TEST_ALLOC_TRANSACTION] = "cannot allocate transaction",
};
static const struct super_operations btrfs_test_super_ops = {
@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
fs_info->nodesize = nodesize;
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
/* CRC32C csum size. */
fs_info->csum_size = 4;
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) /
fs_info->csum_size;
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
test_mnt->mnt_sb->s_fs_info = fs_info;
@ -247,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
kfree(cache);
}
void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info)
{
memset(trans, 0, sizeof(*trans));
trans->fs_info = fs_info;
xa_init(&trans->delayed_refs.head_refs);
xa_init(&trans->delayed_refs.dirty_extents);
spin_lock_init(&trans->delayed_refs.lock);
}
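A hypothetical sketch of how a self-test could pair the new helper with the existing dummy trans handle; the test body and the function name are illustrative, only the two init helpers and the trans->transaction link come from this file:
static int dummy_delayed_refs_setup_example(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction transaction;
	struct btrfs_trans_handle trans;

	btrfs_init_dummy_trans(&trans, fs_info);
	btrfs_init_dummy_transaction(&transaction, fs_info);
	trans.transaction = &transaction;

	/* ... exercise delayed ref insertion/merging against &transaction ... */
	return 0;
}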
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info)
{
@ -295,6 +310,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
if (ret)
goto out;
ret = btrfs_test_delayed_refs(sectorsize, nodesize);
if (ret)
goto out;
}
}
ret = btrfs_test_extent_map();

View File

@ -6,6 +6,8 @@
#ifndef BTRFS_TESTS_H
#define BTRFS_TESTS_H
#include <linux/types.h>
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_run_sanity_tests(void);
@ -25,12 +27,14 @@ enum {
TEST_ALLOC_EXTENT_MAP,
TEST_ALLOC_CHUNK_MAP,
TEST_ALLOC_IO_CONTEXT,
TEST_ALLOC_TRANSACTION,
};
extern const char *test_error[];
struct btrfs_root;
struct btrfs_trans_handle;
struct btrfs_transaction;
int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void);
int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info);
struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
#else
static inline int btrfs_run_sanity_tests(void)

File diff suppressed because it is too large Load Diff

View File

@ -795,8 +795,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
if (num_bytes)
btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
if (delayed_refs_bytes)
btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
delayed_refs_bytes);
btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes);
reserve_fail:
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
return ERR_PTR(ret);

View File

@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
delayed_refs->qgroup_to_skip = 0;
}
bool __cold abort_should_print_stack(int error);
/*
* We want the transaction abort to print a stack trace only for errors where
* the cause could be a bug, e.g. due to ENOSPC, and not for common errors that
* are caused by external factors.
*/
static inline bool btrfs_abort_should_print_stack(int error)
{
switch (error) {
case -EIO:
case -EROFS:
case -ENOMEM:
return false;
}
return true;
}
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
@ -240,7 +254,7 @@ do { \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
__first = true; \
if (WARN(abort_should_print_stack(error), \
if (WARN(btrfs_abort_should_print_stack(error), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
(error))) { \

View File

@ -973,6 +973,105 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return 0;
}
int btrfs_check_system_chunk_array(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb)
{
struct extent_buffer *dummy;
u32 array_size;
u32 cur_offset = 0;
u32 len;
int ret = 0;
/*
* We allocate a dummy extent buffer just to use the extent buffer accessors.
* There will be unused space after BTRFS_SUPER_INFO_SIZE, but that's fine;
* we will not go beyond the system chunk array anyway.
*/
dummy = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
if (!dummy)
return -ENOMEM;
set_extent_buffer_uptodate(dummy);
write_extent_buffer(dummy, sb, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(sb);
if (array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
btrfs_crit(fs_info,
"superblock syschunk too large, have %u expect <=%u",
array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
ret = -EUCLEAN;
goto out;
}
while (cur_offset < array_size) {
struct btrfs_disk_key *disk_key;
struct btrfs_key key;
struct btrfs_chunk *chunk;
u32 num_stripes;
u64 type;
len = sizeof(*disk_key);
if (cur_offset + len > array_size)
goto out_short_read;
disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur_offset);
btrfs_disk_key_to_cpu(&key, disk_key);
cur_offset += len;
if (key.type != BTRFS_CHUNK_ITEM_KEY) {
btrfs_crit(fs_info,
"unexpected item type %u in sys_array at offset %u",
(u32)key.type, cur_offset);
ret = -EUCLEAN;
goto out;
}
/*
* At least one btrfs_chunk with one stripe must be present;
* the exact stripe count check comes afterwards.
*/
len = btrfs_chunk_item_size(1);
if (cur_offset + len > array_size)
goto out_short_read;
chunk = (struct btrfs_chunk *)
(offsetof(struct btrfs_super_block, sys_chunk_array) +
cur_offset);
num_stripes = btrfs_chunk_num_stripes(dummy, chunk);
if (!num_stripes) {
btrfs_crit(fs_info,
"invalid number of stripes %u in sys_array at offset %u",
num_stripes, cur_offset);
ret = -EUCLEAN;
goto out;
}
type = btrfs_chunk_type(dummy, chunk);
if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
btrfs_err(fs_info,
"invalid chunk type %llu in sys_array at offset %u",
type, cur_offset);
ret = -EUCLEAN;
goto out;
}
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
ret = btrfs_check_chunk_valid(dummy, chunk, key.offset);
if (ret)
goto out;
cur_offset += len;
}
out:
free_extent_buffer_stale(dummy);
return ret;
out_short_read:
btrfs_crit(fs_info,
"sys_array too short to read %u bytes at offset %u array size %u",
len, cur_offset, array_size);
free_extent_buffer_stale(dummy);
return ret;
}
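For reference, a hedged picture of the layout the loop above walks; the btrfs_chunk_item_size() expansion is an assumption based on its usual definition, not something introduced by this patch:
/*
 * sys_chunk_array is a packed sequence of (disk_key, chunk) pairs:
 *
 *   [btrfs_disk_key][btrfs_chunk + (num_stripes - 1) stripes][btrfs_disk_key]...
 *
 * Assuming btrfs_chunk_item_size(n) is
 * sizeof(struct btrfs_chunk) + (n - 1) * sizeof(struct btrfs_stripe),
 * each iteration advances cur_offset by sizeof(struct btrfs_disk_key) plus
 * btrfs_chunk_item_size(num_stripes), and cur_offset must never exceed
 * array_size.
 */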
/*
* Enhanced version of chunk item checker.
*

View File

@ -8,6 +8,7 @@
#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
#include "fs.h"
struct extent_buffer;
struct btrfs_chunk;
@ -68,6 +69,8 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
int btrfs_check_system_chunk_array(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
int btrfs_verify_level_key(struct extent_buffer *eb,
const struct btrfs_tree_parent_check *check);

View File

@ -13,8 +13,8 @@
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "extent-tree.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
@ -48,6 +48,7 @@ struct btrfs_io_geometry {
u64 raid56_full_stripe_start;
int max_errors;
enum btrfs_map_op op;
bool use_rst;
};
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
@ -1327,7 +1328,14 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
fs_devices->latest_dev = latest_dev;
fs_devices->total_rw_bytes = 0;
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
fs_devices->read_devid = latest_dev->devid;
fs_devices->read_policy =
btrfs_read_policy_to_enum(btrfs_get_raid1_balancing(), NULL);
#else
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
#endif
return 0;
}
@ -5959,6 +5967,88 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
#ifdef CONFIG_BTRFS_EXPERIMENTAL
static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
int num_stripe)
{
int last = first + num_stripe;
int stripe_index;
for (stripe_index = first; stripe_index < last; stripe_index++) {
struct btrfs_device *device = map->stripes[stripe_index].dev;
if (device->devid == READ_ONCE(device->fs_devices->read_devid))
return stripe_index;
}
/* If there is no read-preferred device, use the first stripe. */
return first;
}
struct stripe_mirror {
u64 devid;
int num;
};
static int btrfs_cmp_devid(const void *a, const void *b)
{
const struct stripe_mirror *s1 = (struct stripe_mirror *)a;
const struct stripe_mirror *s2 = (struct stripe_mirror *)b;
if (s1->devid < s2->devid)
return -1;
if (s1->devid > s2->devid)
return 1;
return 0;
}
/*
* btrfs_read_rr.
*
* Select a stripe for reading using a round-robin algorithm:
*
* 1. Compute the read cycle as the total number of reads divided by the
*    minimum number of reads per device (derived from rr_min_contiguous_read).
* 2. Determine the stripe number for the current read by taking the modulus
*    of the read cycle with the total number of stripes:
*
*    stripe index = (total reads / reads per device) % num stripes
*
* The calculated stripe index is then used to select the corresponding device
* from the list of devices, which is ordered by devid.
*/
static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
{
struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
struct btrfs_fs_devices *fs_devices;
struct btrfs_device *device;
int read_cycle;
int index;
int ret_stripe;
int total_reads;
int reads_per_dev = 0;
device = map->stripes[first].dev;
fs_devices = device->fs_devices;
reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT;
index = 0;
for (int i = first; i < first + num_stripe; i++) {
stripes[index].devid = map->stripes[i].dev->devid;
stripes[index].num = i;
index++;
}
sort(stripes, num_stripe, sizeof(struct stripe_mirror),
btrfs_cmp_devid, NULL);
total_reads = atomic_inc_return(&fs_devices->total_reads);
read_cycle = total_reads / reads_per_dev;
ret_stripe = stripes[read_cycle % num_stripe].num;
return ret_stripe;
}
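A standalone sketch of the selection math above; with the default 256K window, reads_per_dev is 262144 >> SECTOR_SHIFT = 512, so the devid-ordered stripe choice advances every 512 reads under this scheme. The function name is illustrative:
static int rr_pick_example(int total_reads, int reads_per_dev, int num_stripe)
{
	int read_cycle = total_reads / reads_per_dev;

	/* Index into the devid-sorted stripes[] array. */
	return read_cycle % num_stripe;
}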
#endif
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
@ -5988,6 +6078,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
case BTRFS_READ_POLICY_PID:
preferred_mirror = first + (current->pid % num_stripes);
break;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
case BTRFS_READ_POLICY_RR:
preferred_mirror = btrfs_read_rr(map, first, num_stripes);
break;
case BTRFS_READ_POLICY_DEVID:
preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
break;
#endif
}
if (dev_replace_is_ongoing &&
@ -6346,8 +6444,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
{
dst->dev = map->stripes[io_geom->stripe_index].dev;
if (io_geom->op == BTRFS_MAP_READ &&
btrfs_need_stripe_tree_update(fs_info, map->type))
if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)
return btrfs_get_raid_extent_offset(fs_info, logical, length,
map->type,
io_geom->stripe_index, dst);
@ -6362,7 +6459,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
const struct btrfs_io_stripe *smap,
const struct btrfs_chunk_map *map,
int num_alloc_stripes,
enum btrfs_map_op op, int mirror_num)
struct btrfs_io_geometry *io_geom)
{
if (!smap)
return false;
@ -6370,10 +6467,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
if (num_alloc_stripes != 1)
return false;
if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)
return false;
if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)
return false;
return true;
@ -6579,6 +6676,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
io_geom.raid56_full_stripe_start = (u64)-1;
max_len = btrfs_max_io_len(map, map_offset, &io_geom);
*length = min_t(u64, map->chunk_len - map_offset, max_len);
io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
if (dev_replace->replace_task != current)
down_read(&dev_replace->rwsem);
@ -6647,8 +6745,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
io_geom.mirror_num)) {
if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {
ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
*mirror_num_ret = io_geom.mirror_num;
@ -6662,6 +6759,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
goto out;
}
bioc->map_type = map->type;
bioc->use_rst = io_geom.use_rst;
/*
* For RAID56 full map, we need to make sure the stripes[] follows the
@ -7002,16 +7100,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif
/*
* Only need to verify chunk item if we're reading from sys chunk array,
* as chunk item in tree block is already verified by tree-checker.
*/
if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
ret = btrfs_check_chunk_valid(leaf, chunk, logical);
if (ret)
return ret;
}
map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
@ -7274,11 +7362,9 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
u8 *array_ptr;
unsigned long sb_array_offset;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
u32 cur_offset;
u64 type;
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
@ -7301,10 +7387,17 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
cur_offset = 0;
while (cur_offset < array_size) {
u32 num_stripes;
disk_key = (struct btrfs_disk_key *)array_ptr;
len = sizeof(*disk_key);
if (cur_offset + len > array_size)
goto out_short_read;
/*
* The superblock should have already passed
* btrfs_check_system_chunk_array(), so we only ASSERT() on these
* sanity checks here.
*/
ASSERT(cur_offset + len <= array_size);
btrfs_disk_key_to_cpu(&key, disk_key);
@ -7312,44 +7405,24 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
sb_array_offset += len;
cur_offset += len;
if (key.type != BTRFS_CHUNK_ITEM_KEY) {
btrfs_err(fs_info,
"unexpected item type %u in sys_array at offset %u",
(u32)key.type, cur_offset);
ret = -EIO;
break;
}
ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);
chunk = (struct btrfs_chunk *)sb_array_offset;
/*
* At least one btrfs_chunk with one stripe must be present,
* exact stripe count check comes afterwards
*/
len = btrfs_chunk_item_size(1);
if (cur_offset + len > array_size)
goto out_short_read;
ASSERT(cur_offset + btrfs_chunk_item_size(1) <= array_size);
num_stripes = btrfs_chunk_num_stripes(sb, chunk);
if (!num_stripes) {
btrfs_err(fs_info,
"invalid number of stripes %u in sys_array at offset %u",
num_stripes, cur_offset);
ret = -EIO;
break;
}
/* Should have at least one stripe. */
ASSERT(num_stripes);
type = btrfs_chunk_type(sb, chunk);
if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
btrfs_err(fs_info,
"invalid chunk type %llu in sys_array at offset %u",
type, cur_offset);
ret = -EIO;
break;
}
/* Only system chunks are allowed in system chunk array. */
ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM);
len = btrfs_chunk_item_size(num_stripes);
if (cur_offset + len > array_size)
goto out_short_read;
ASSERT(cur_offset + len <= array_size);
ret = read_one_chunk(&key, sb, chunk);
if (ret)
@ -7362,13 +7435,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return ret;
out_short_read:
btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
len, cur_offset);
clear_extent_buffer_uptodate(sb);
free_extent_buffer_stale(sb);
return -EIO;
}
/*
@ -7568,8 +7634,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
int ret = 0;
fs_devices->fs_info = fs_info;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list)
device->fs_info = fs_info;
@ -7798,7 +7862,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
if (!dev->dev_stats_valid)
return;
btrfs_err_rl_in_rcu(dev->fs_info,
btrfs_debug_rl_in_rcu(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),

View File

@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy {
BTRFS_CHUNK_ALLOC_ZONED,
};
#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ (SZ_256K)
#define BTRFS_RAID1_MAX_MIRRORS (4)
/*
* Read policies for mirrored block group profiles, read picks the stripe based
* on these policies.
@ -303,6 +305,12 @@ enum btrfs_chunk_allocation_policy {
enum btrfs_read_policy {
/* Use process PID to choose the stripe */
BTRFS_READ_POLICY_PID,
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Balance raid1 reads across all mirrored devices (round-robin). */
BTRFS_READ_POLICY_RR,
/* Read from a specific device. */
BTRFS_READ_POLICY_DEVID,
#endif
BTRFS_NR_READ_POLICY,
};
@ -431,6 +439,14 @@ struct btrfs_fs_devices {
enum btrfs_read_policy read_policy;
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* IO stat, read counter. */
atomic_t total_reads;
/* Minimum contiguous read size (in bytes) before switching to the next device. */
int rr_min_contiguous_read;
/* Device to be used for reading in case of RAID1. */
u64 read_devid;
/* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode;
#endif
@ -485,6 +501,7 @@ struct btrfs_io_context {
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
bool use_rst;
u64 logical;
u64 size;

View File

@ -741,12 +741,23 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* we add the pages one by one to a bio, and cannot increase the
* metadata reservation even if it increases the number of extents, it
* is safe to stick with the limit.
*
* If there is no zoned device in the filesystem, we have
* max_zone_append_sectors = 0, which would make both
* fs_info->max_zone_append_size and fs_info->max_extent_size zero
* below. Fall back to the regular queue limits in that case.
*/
fs_info->max_zone_append_size = ALIGN_DOWN(
min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
(u64)lim->max_sectors << SECTOR_SHIFT,
(u64)lim->max_segments << PAGE_SHIFT),
fs_info->sectorsize);
if (lim->features & BLK_FEAT_ZONED)
fs_info->max_zone_append_size = ALIGN_DOWN(
min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
(u64)lim->max_sectors << SECTOR_SHIFT,
(u64)lim->max_segments << PAGE_SHIFT),
fs_info->sectorsize);
else
fs_info->max_zone_append_size = ALIGN_DOWN(
min((u64)lim->max_sectors << SECTOR_SHIFT,
(u64)lim->max_segments << PAGE_SHIFT),
fs_info->sectorsize);
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
if (fs_info->max_zone_append_size < fs_info->max_extent_size)
fs_info->max_extent_size = fs_info->max_zone_append_size;
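A worked example of the clamp above, with illustrative queue limits that are not taken from the patch:
/*
 * Example: a zoned device with max_zone_append_sectors = 128 (64K),
 * max_sectors = 2560 (1280K) and max_segments = 32 (128K with 4K pages):
 *
 *   max_zone_append_size = ALIGN_DOWN(min3(64K, 1280K, 128K), sectorsize)
 *                        = 64K
 *
 * max_extent_size is then lowered to 64K if it was larger.
 */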
@ -1671,6 +1682,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return -EINVAL;
}
/* Reject non-SINGLE data profiles without RST. */
if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
(map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
!fs_info->stripe_root) {
btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
btrfs_bg_type_to_raid_name(map->type));
return -EINVAL;
}
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@ -2651,3 +2671,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->zone_active_bgs_lock);
}
/*
* Reset the zones of unused block groups from @space_info->bytes_zone_unusable.
*
* @space_info: the space to work on
* @num_bytes: targeting reclaim bytes
*
* This one resets the zones of a block group, so we can reuse the region
* without removing the block group. On the other hand, btrfs_delete_unused_bgs()
* just removes a block group and frees up the underlying zones. So, we still
* need to allocate a new block group to reuse the zones.
*
* Resetting is faster than deleting/recreating a block group. It is similar
* to freeing the logical space on the regular mode. However, we cannot change
* the block group's profile with this operation.
*/
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = space_info->fs_info;
const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;
if (!btrfs_is_zoned(fs_info))
return 0;
while (num_bytes > 0) {
struct btrfs_chunk_map *map;
struct btrfs_block_group *bg = NULL;
bool found = false;
u64 reclaimed = 0;
/*
* Here, we choose a fully zone_unusable block group. It's
* technically possible to reset a partly zone_unusable block
* group, which still has some free space left. However,
* handling that needs to cope with the allocation side, which
* makes the logic more complex. So, let's handle the easy case
* for now.
*/
spin_lock(&fs_info->unused_bgs_lock);
list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
continue;
/*
* Use trylock to avoid locking order violation. In
* btrfs_reclaim_bgs_work(), the lock order is
* &bg->lock -> &fs_info->unused_bgs_lock. We skip a
* block group if we cannot take its lock.
*/
if (!spin_trylock(&bg->lock))
continue;
if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
spin_unlock(&bg->lock);
continue;
}
spin_unlock(&bg->lock);
found = true;
break;
}
if (!found) {
spin_unlock(&fs_info->unused_bgs_lock);
return 0;
}
list_del_init(&bg->bg_list);
btrfs_put_block_group(bg);
spin_unlock(&fs_info->unused_bgs_lock);
/*
* Since the block group is fully zone_unusable and we cannot
* allocate from this block group anymore, we don't need to set
* this block group read-only.
*/
down_read(&fs_info->dev_replace.rwsem);
map = bg->physical_map;
for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_io_stripe *stripe = &map->stripes[i];
unsigned int nofs_flags;
int ret;
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
stripe->physical >> SECTOR_SHIFT,
zone_size_sectors);
memalloc_nofs_restore(nofs_flags);
if (ret) {
up_read(&fs_info->dev_replace.rwsem);
return ret;
}
}
up_read(&fs_info->dev_replace.rwsem);
spin_lock(&space_info->lock);
spin_lock(&bg->lock);
ASSERT(!btrfs_is_block_group_used(bg));
if (bg->ro) {
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
continue;
}
reclaimed = bg->alloc_offset;
bg->zone_unusable = bg->length - bg->zone_capacity;
bg->alloc_offset = 0;
/*
* This holds because we currently only reset block groups that were
* fully used and then freed.
*/
ASSERT(reclaimed == bg->zone_capacity);
bg->free_space_ctl->free_space += reclaimed;
space_info->bytes_zone_unusable -= reclaimed;
spin_unlock(&bg->lock);
btrfs_return_free_space(space_info, reclaimed);
spin_unlock(&space_info->lock);
if (num_bytes <= reclaimed)
break;
num_bytes -= reclaimed;
}
return 0;
}
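A hypothetical caller sketch; the real hook is the RESET_ZONES flush state added earlier in this series, and only the function signature below is taken from this patch:
static void reset_zones_flush_step_example(struct btrfs_space_info *space_info,
					   u64 num_bytes)
{
	int ret = btrfs_reset_unused_block_groups(space_info, num_bytes);

	if (ret)
		btrfs_err(space_info->fs_info, "zone reset failed: %d", ret);
}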

View File

@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }
static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info,
u64 num_bytes)
{
return 0;
}
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)

View File

@ -100,7 +100,8 @@ struct find_free_extent_ctl;
EM( ALLOC_CHUNK, "ALLOC_CHUNK") \
EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \
EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \
EMe(COMMIT_TRANS, "COMMIT_TRANS")
EM( COMMIT_TRANS, "COMMIT_TRANS") \
EMe(RESET_ZONES, "RESET_ZONES")
/*
* First define the enums in the above macros to be exported to userspace via