mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2024-12-28 16:52:18 +00:00
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git
This commit is contained in:
commit
389534fa27
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
# misc-next marker
|
||||
|
||||
config BTRFS_FS
|
||||
tristate "Btrfs filesystem support"
|
||||
|
@ -44,4 +44,4 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
|
||||
tests/extent-buffer-tests.o tests/btrfs-tests.o \
|
||||
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
|
||||
tests/free-space-tree-tests.o tests/extent-map-tests.o \
|
||||
tests/raid-stripe-tree-tests.o
|
||||
tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o
|
||||
|
@ -3022,9 +3022,6 @@ void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info,
|
||||
cache->rb_root = RB_ROOT;
|
||||
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
|
||||
INIT_LIST_HEAD(&cache->pending[i]);
|
||||
INIT_LIST_HEAD(&cache->changed);
|
||||
INIT_LIST_HEAD(&cache->detached);
|
||||
INIT_LIST_HEAD(&cache->leaves);
|
||||
INIT_LIST_HEAD(&cache->pending_edge);
|
||||
INIT_LIST_HEAD(&cache->useless_node);
|
||||
cache->fs_info = fs_info;
|
||||
@ -3132,29 +3129,17 @@ void btrfs_backref_drop_node(struct btrfs_backref_cache *tree,
|
||||
void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
|
||||
struct btrfs_backref_node *node)
|
||||
{
|
||||
struct btrfs_backref_node *upper;
|
||||
struct btrfs_backref_edge *edge;
|
||||
|
||||
if (!node)
|
||||
return;
|
||||
|
||||
BUG_ON(!node->lowest && !node->detached);
|
||||
while (!list_empty(&node->upper)) {
|
||||
edge = list_entry(node->upper.next, struct btrfs_backref_edge,
|
||||
list[LOWER]);
|
||||
upper = edge->node[UPPER];
|
||||
list_del(&edge->list[LOWER]);
|
||||
list_del(&edge->list[UPPER]);
|
||||
btrfs_backref_free_edge(cache, edge);
|
||||
|
||||
/*
|
||||
* Add the node to leaf node list if no other child block
|
||||
* cached.
|
||||
*/
|
||||
if (list_empty(&upper->lower)) {
|
||||
list_add_tail(&upper->lower, &cache->leaves);
|
||||
upper->lowest = 1;
|
||||
}
|
||||
}
|
||||
|
||||
btrfs_backref_drop_node(cache, node);
|
||||
@ -3166,33 +3151,13 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
|
||||
void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
|
||||
{
|
||||
struct btrfs_backref_node *node;
|
||||
int i;
|
||||
|
||||
while (!list_empty(&cache->detached)) {
|
||||
node = list_entry(cache->detached.next,
|
||||
struct btrfs_backref_node, list);
|
||||
while ((node = rb_entry_safe(rb_first(&cache->rb_root),
|
||||
struct btrfs_backref_node, rb_node)))
|
||||
btrfs_backref_cleanup_node(cache, node);
|
||||
}
|
||||
|
||||
while (!list_empty(&cache->leaves)) {
|
||||
node = list_entry(cache->leaves.next,
|
||||
struct btrfs_backref_node, lower);
|
||||
btrfs_backref_cleanup_node(cache, node);
|
||||
}
|
||||
|
||||
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
|
||||
while (!list_empty(&cache->pending[i])) {
|
||||
node = list_first_entry(&cache->pending[i],
|
||||
struct btrfs_backref_node,
|
||||
list);
|
||||
btrfs_backref_cleanup_node(cache, node);
|
||||
}
|
||||
}
|
||||
ASSERT(list_empty(&cache->pending_edge));
|
||||
ASSERT(list_empty(&cache->useless_node));
|
||||
ASSERT(list_empty(&cache->changed));
|
||||
ASSERT(list_empty(&cache->detached));
|
||||
ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
|
||||
ASSERT(!cache->nr_nodes);
|
||||
ASSERT(!cache->nr_edges);
|
||||
}
|
||||
@ -3316,8 +3281,12 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
|
||||
root = btrfs_get_fs_root(fs_info, ref_key->offset, false);
|
||||
if (IS_ERR(root))
|
||||
return PTR_ERR(root);
|
||||
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
|
||||
cur->cowonly = 1;
|
||||
|
||||
/* We shouldn't be using backref cache for non-shareable roots. */
|
||||
if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
|
||||
btrfs_put_root(root);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
|
||||
if (btrfs_root_level(&root->root_item) == cur->level) {
|
||||
/* Tree root */
|
||||
@ -3403,8 +3372,15 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
|
||||
goto out;
|
||||
}
|
||||
upper->owner = btrfs_header_owner(eb);
|
||||
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
|
||||
upper->cowonly = 1;
|
||||
|
||||
/* We shouldn't be using backref cache for non shareable roots. */
|
||||
if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
|
||||
btrfs_put_root(root);
|
||||
btrfs_backref_free_edge(cache, edge);
|
||||
btrfs_backref_free_node(cache, upper);
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we know the block isn't shared we can avoid
|
||||
@ -3595,15 +3571,9 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
|
||||
|
||||
ASSERT(start->checked);
|
||||
|
||||
/* Insert this node to cache if it's not COW-only */
|
||||
if (!start->cowonly) {
|
||||
rb_node = rb_simple_insert(&cache->rb_root, start->bytenr,
|
||||
&start->rb_node);
|
||||
if (rb_node)
|
||||
btrfs_backref_panic(cache->fs_info, start->bytenr,
|
||||
-EEXIST);
|
||||
list_add_tail(&start->lower, &cache->leaves);
|
||||
}
|
||||
rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
|
||||
if (rb_node)
|
||||
btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
|
||||
|
||||
/*
|
||||
* Use breadth first search to iterate all related edges.
|
||||
@ -3642,11 +3612,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
|
||||
* parents have already been linked.
|
||||
*/
|
||||
if (!RB_EMPTY_NODE(&upper->rb_node)) {
|
||||
if (upper->lowest) {
|
||||
list_del_init(&upper->lower);
|
||||
upper->lowest = 0;
|
||||
}
|
||||
|
||||
list_add_tail(&edge->list[UPPER], &upper->lower);
|
||||
continue;
|
||||
}
|
||||
@ -3657,23 +3622,13 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
|
||||
return -EUCLEAN;
|
||||
}
|
||||
|
||||
/* Sanity check, COW-only node has non-COW-only parent */
|
||||
if (start->cowonly != upper->cowonly) {
|
||||
ASSERT(0);
|
||||
rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
|
||||
&upper->rb_node);
|
||||
if (unlikely(rb_node)) {
|
||||
btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
|
||||
/* Only cache non-COW-only (subvolume trees) tree blocks */
|
||||
if (!upper->cowonly) {
|
||||
rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
|
||||
&upper->rb_node);
|
||||
if (rb_node) {
|
||||
btrfs_backref_panic(cache->fs_info,
|
||||
upper->bytenr, -EEXIST);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
}
|
||||
|
||||
list_add_tail(&edge->list[UPPER], &upper->lower);
|
||||
|
||||
/*
|
||||
|
@ -318,6 +318,12 @@ struct btrfs_backref_node {
|
||||
u64 bytenr;
|
||||
}; /* Use rb_simple_node for search/insert */
|
||||
|
||||
/*
|
||||
* This is a sanity check, whenever we COW a block we will update
|
||||
* new_bytenr with it's current location, and we will check this in
|
||||
* various places to validate that the cache makes sense, it shouldn't
|
||||
* be used for anything else.
|
||||
*/
|
||||
u64 new_bytenr;
|
||||
/* Objectid of tree block owner, can be not uptodate */
|
||||
u64 owner;
|
||||
@ -335,10 +341,6 @@ struct btrfs_backref_node {
|
||||
struct extent_buffer *eb;
|
||||
/* Level of the tree block */
|
||||
unsigned int level:8;
|
||||
/* Is the block in a non-shareable tree */
|
||||
unsigned int cowonly:1;
|
||||
/* 1 if no child node is in the cache */
|
||||
unsigned int lowest:1;
|
||||
/* Is the extent buffer locked */
|
||||
unsigned int locked:1;
|
||||
/* Has the block been processed */
|
||||
@ -391,12 +393,6 @@ struct btrfs_backref_cache {
|
||||
* level blocks may not reflect the new location
|
||||
*/
|
||||
struct list_head pending[BTRFS_MAX_LEVEL];
|
||||
/* List of backref nodes with no child node */
|
||||
struct list_head leaves;
|
||||
/* List of blocks that have been COWed in current transaction */
|
||||
struct list_head changed;
|
||||
/* List of detached backref node. */
|
||||
struct list_head detached;
|
||||
|
||||
u64 last_trans;
|
||||
|
||||
|
@ -725,8 +725,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
|
||||
bio->bi_opf |= REQ_OP_ZONE_APPEND;
|
||||
}
|
||||
|
||||
if (is_data_bbio(bbio) && bioc &&
|
||||
btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
|
||||
if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
|
||||
/*
|
||||
* No locking for the list update, as we only add to
|
||||
* the list in the I/O submission path, and list
|
||||
|
@ -1223,7 +1223,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
|
||||
block_group->space_info->total_bytes -= block_group->length;
|
||||
block_group->space_info->bytes_readonly -=
|
||||
(block_group->length - block_group->zone_unusable);
|
||||
btrfs_space_info_update_bytes_zone_unusable(fs_info, block_group->space_info,
|
||||
btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
|
||||
-block_group->zone_unusable);
|
||||
block_group->space_info->disk_total -= block_group->length * factor;
|
||||
|
||||
@ -1396,8 +1396,7 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
|
||||
if (btrfs_is_zoned(cache->fs_info)) {
|
||||
/* Migrate zone_unusable bytes to readonly */
|
||||
sinfo->bytes_readonly += cache->zone_unusable;
|
||||
btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
|
||||
-cache->zone_unusable);
|
||||
btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
|
||||
cache->zone_unusable = 0;
|
||||
}
|
||||
cache->ro++;
|
||||
@ -1645,8 +1644,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
|
||||
spin_lock(&space_info->lock);
|
||||
spin_lock(&block_group->lock);
|
||||
|
||||
btrfs_space_info_update_bytes_pinned(fs_info, space_info,
|
||||
-block_group->pinned);
|
||||
btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
|
||||
space_info->bytes_readonly += block_group->pinned;
|
||||
block_group->pinned = 0;
|
||||
|
||||
@ -3060,8 +3058,7 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
|
||||
(cache->alloc_offset - cache->used - cache->pinned -
|
||||
cache->reserved) +
|
||||
(cache->length - cache->zone_capacity);
|
||||
btrfs_space_info_update_bytes_zone_unusable(cache->fs_info, sinfo,
|
||||
cache->zone_unusable);
|
||||
btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
|
||||
sinfo->bytes_readonly -= cache->zone_unusable;
|
||||
}
|
||||
num_bytes = cache->length - cache->reserved -
|
||||
@ -3699,7 +3696,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
|
||||
old_val -= num_bytes;
|
||||
cache->used = old_val;
|
||||
cache->pinned += num_bytes;
|
||||
btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes);
|
||||
btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
|
||||
space_info->bytes_used -= num_bytes;
|
||||
space_info->disk_used -= num_bytes * factor;
|
||||
if (READ_ONCE(space_info->periodic_reclaim))
|
||||
@ -3781,8 +3778,7 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
|
||||
space_info->bytes_reserved += num_bytes;
|
||||
trace_btrfs_space_reservation(cache->fs_info, "space_info",
|
||||
space_info->flags, num_bytes, 1);
|
||||
btrfs_space_info_update_bytes_may_use(cache->fs_info,
|
||||
space_info, -ram_bytes);
|
||||
btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
|
||||
if (delalloc)
|
||||
cache->delalloc_bytes += num_bytes;
|
||||
|
||||
|
@ -150,9 +150,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
|
||||
spin_unlock(&dest->lock);
|
||||
}
|
||||
if (num_bytes)
|
||||
btrfs_space_info_free_bytes_may_use(fs_info,
|
||||
space_info,
|
||||
num_bytes);
|
||||
btrfs_space_info_free_bytes_may_use(space_info, num_bytes);
|
||||
}
|
||||
if (qgroup_to_release_ret)
|
||||
*qgroup_to_release_ret = qgroup_to_release;
|
||||
@ -383,13 +381,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
|
||||
|
||||
if (block_rsv->reserved < block_rsv->size) {
|
||||
num_bytes = block_rsv->size - block_rsv->reserved;
|
||||
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
|
||||
num_bytes);
|
||||
btrfs_space_info_update_bytes_may_use(sinfo, num_bytes);
|
||||
block_rsv->reserved = block_rsv->size;
|
||||
} else if (block_rsv->reserved > block_rsv->size) {
|
||||
num_bytes = block_rsv->reserved - block_rsv->size;
|
||||
btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
|
||||
-num_bytes);
|
||||
btrfs_space_info_update_bytes_may_use(sinfo, -num_bytes);
|
||||
block_rsv->reserved = block_rsv->size;
|
||||
btrfs_try_granting_tickets(fs_info, sinfo);
|
||||
}
|
||||
|
@ -526,7 +526,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
|
||||
u32 bio_offset, struct bio_vec *bv);
|
||||
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
|
||||
struct btrfs_file_extent *file_extent,
|
||||
bool nowait, bool strict);
|
||||
bool nowait);
|
||||
|
||||
void btrfs_del_delalloc_inode(struct btrfs_inode *inode);
|
||||
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
|
||||
|
@ -37,19 +37,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
|
||||
static int balance_node_right(struct btrfs_trans_handle *trans,
|
||||
struct extent_buffer *dst_buf,
|
||||
struct extent_buffer *src_buf);
|
||||
|
||||
static const struct btrfs_csums {
|
||||
u16 size;
|
||||
const char name[10];
|
||||
const char driver[12];
|
||||
} btrfs_csums[] = {
|
||||
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
|
||||
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
|
||||
[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
|
||||
[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
|
||||
.driver = "blake2b-256" },
|
||||
};
|
||||
|
||||
/*
|
||||
* The leaf data grows from end-to-front in the node. this returns the address
|
||||
* of the start of the last item, which is the stop of the leaf data stack.
|
||||
@ -148,44 +135,6 @@ static inline void copy_leaf_items(const struct extent_buffer *dst,
|
||||
nr_items * sizeof(struct btrfs_item));
|
||||
}
|
||||
|
||||
/* This exists for btrfs-progs usages. */
|
||||
u16 btrfs_csum_type_size(u16 type)
|
||||
{
|
||||
return btrfs_csums[type].size;
|
||||
}
|
||||
|
||||
int btrfs_super_csum_size(const struct btrfs_super_block *s)
|
||||
{
|
||||
u16 t = btrfs_super_csum_type(s);
|
||||
/*
|
||||
* csum type is validated at mount time
|
||||
*/
|
||||
return btrfs_csum_type_size(t);
|
||||
}
|
||||
|
||||
const char *btrfs_super_csum_name(u16 csum_type)
|
||||
{
|
||||
/* csum type is validated at mount time */
|
||||
return btrfs_csums[csum_type].name;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return driver name if defined, otherwise the name that's also a valid driver
|
||||
* name
|
||||
*/
|
||||
const char *btrfs_super_csum_driver(u16 csum_type)
|
||||
{
|
||||
/* csum type is validated at mount time */
|
||||
return btrfs_csums[csum_type].driver[0] ?
|
||||
btrfs_csums[csum_type].driver :
|
||||
btrfs_csums[csum_type].name;
|
||||
}
|
||||
|
||||
size_t __attribute_const__ btrfs_get_num_csums(void)
|
||||
{
|
||||
return ARRAY_SIZE(btrfs_csums);
|
||||
}
|
||||
|
||||
struct btrfs_path *btrfs_alloc_path(void)
|
||||
{
|
||||
might_sleep();
|
||||
@ -225,22 +174,6 @@ noinline void btrfs_release_path(struct btrfs_path *p)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We want the transaction abort to print stack trace only for errors where the
|
||||
* cause could be a bug, eg. due to ENOSPC, and not for common errors that are
|
||||
* caused by external factors.
|
||||
*/
|
||||
bool __cold abort_should_print_stack(int error)
|
||||
{
|
||||
switch (error) {
|
||||
case -EIO:
|
||||
case -EROFS:
|
||||
case -ENOMEM:
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* safely gets a reference on the root node of a tree. A lock
|
||||
* is not taken, so a concurrent writer may put a different node
|
||||
@ -654,6 +587,8 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
|
||||
goto error_unlock_cow;
|
||||
}
|
||||
}
|
||||
|
||||
trace_btrfs_cow_block(root, buf, cow);
|
||||
if (unlock_orig)
|
||||
btrfs_tree_unlock(buf);
|
||||
free_extent_buffer_stale(buf);
|
||||
@ -710,7 +645,6 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
u64 search_start;
|
||||
int ret;
|
||||
|
||||
if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
|
||||
btrfs_abort_transaction(trans, -EUCLEAN);
|
||||
@ -751,12 +685,8 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans,
|
||||
* Also We don't care about the error, as it's handled internally.
|
||||
*/
|
||||
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
|
||||
ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
|
||||
cow_ret, search_start, 0, nest);
|
||||
|
||||
trace_btrfs_cow_block(root, buf, *cow_ret);
|
||||
|
||||
return ret;
|
||||
return btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
|
||||
cow_ret, search_start, 0, nest);
|
||||
}
|
||||
ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
|
||||
|
||||
|
@ -7,7 +7,6 @@
|
||||
#define BTRFS_CTREE_H
|
||||
|
||||
#include "linux/cleanup.h"
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/mutex.h>
|
||||
@ -506,20 +505,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
|
||||
return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item);
|
||||
}
|
||||
|
||||
#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \
|
||||
((bytes) >> (fs_info)->sectorsize_bits)
|
||||
|
||||
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
|
||||
{
|
||||
return mapping_gfp_constraint(mapping, ~__GFP_FS);
|
||||
}
|
||||
|
||||
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
|
||||
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
|
||||
u64 num_bytes, u64 *actual_bytes);
|
||||
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
|
||||
|
||||
/* ctree.c */
|
||||
int __init btrfs_ctree_init(void);
|
||||
void __cold btrfs_ctree_exit(void);
|
||||
|
||||
@ -756,18 +741,4 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
|
||||
return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID;
|
||||
}
|
||||
|
||||
u16 btrfs_csum_type_size(u16 type);
|
||||
int btrfs_super_csum_size(const struct btrfs_super_block *s);
|
||||
const char *btrfs_super_csum_name(u16 csum_type);
|
||||
const char *btrfs_super_csum_driver(u16 csum_type);
|
||||
size_t __attribute_const__ btrfs_get_num_csums(void);
|
||||
|
||||
/*
|
||||
* We use folio flag owner_2 to indicate there is an ordered extent with
|
||||
* unfinished IO.
|
||||
*/
|
||||
#define folio_test_ordered(folio) folio_test_owner_2(folio)
|
||||
#define folio_set_ordered(folio) folio_set_owner_2(folio)
|
||||
#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
|
||||
|
||||
#endif
|
||||
|
@ -176,7 +176,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
|
||||
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
|
||||
|
||||
data_sinfo = fs_info->data_sinfo;
|
||||
btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
|
||||
btrfs_space_info_free_bytes_may_use(data_sinfo, len);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -93,6 +93,9 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
|
||||
u64 num_bytes;
|
||||
u64 reserved_bytes;
|
||||
|
||||
if (btrfs_is_testing(fs_info))
|
||||
return;
|
||||
|
||||
num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates);
|
||||
num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info,
|
||||
trans->delayed_ref_csum_deletions);
|
||||
@ -254,7 +257,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
|
||||
spin_unlock(&block_rsv->lock);
|
||||
|
||||
if (to_free > 0)
|
||||
btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free);
|
||||
btrfs_space_info_free_bytes_may_use(space_info, to_free);
|
||||
|
||||
if (refilled_bytes > 0)
|
||||
trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0,
|
||||
@ -555,6 +558,32 @@ void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
|
||||
delayed_refs->num_heads_ready--;
|
||||
}
|
||||
|
||||
struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head)
|
||||
{
|
||||
struct btrfs_delayed_ref_node *ref;
|
||||
|
||||
lockdep_assert_held(&head->mutex);
|
||||
lockdep_assert_held(&head->lock);
|
||||
|
||||
if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
|
||||
* This is to prevent a ref count from going down to zero, which deletes
|
||||
* the extent item from the extent tree, when there still are references
|
||||
* to add, which would fail because they would not find the extent item.
|
||||
*/
|
||||
if (!list_empty(&head->ref_add_list))
|
||||
return list_first_entry(&head->ref_add_list,
|
||||
struct btrfs_delayed_ref_node, add_list);
|
||||
|
||||
ref = rb_entry(rb_first_cached(&head->ref_tree),
|
||||
struct btrfs_delayed_ref_node, ref_node);
|
||||
ASSERT(list_empty(&ref->add_list));
|
||||
return ref;
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper to insert the ref_node to the tail or merge with tail.
|
||||
*
|
||||
@ -1234,6 +1263,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
|
||||
{
|
||||
struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
|
||||
struct btrfs_fs_info *fs_info = trans->fs_info;
|
||||
bool testing = btrfs_is_testing(fs_info);
|
||||
|
||||
spin_lock(&delayed_refs->lock);
|
||||
while (true) {
|
||||
@ -1263,7 +1293,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
|
||||
spin_unlock(&delayed_refs->lock);
|
||||
mutex_unlock(&head->mutex);
|
||||
|
||||
if (pin_bytes) {
|
||||
if (!testing && pin_bytes) {
|
||||
struct btrfs_block_group *bg;
|
||||
|
||||
bg = btrfs_lookup_block_group(fs_info, head->bytenr);
|
||||
@ -1281,8 +1311,7 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
|
||||
spin_lock(&bg->space_info->lock);
|
||||
spin_lock(&bg->lock);
|
||||
bg->pinned += head->num_bytes;
|
||||
btrfs_space_info_update_bytes_pinned(fs_info,
|
||||
bg->space_info,
|
||||
btrfs_space_info_update_bytes_pinned(bg->space_info,
|
||||
head->num_bytes);
|
||||
bg->reserved -= head->num_bytes;
|
||||
bg->space_info->bytes_reserved -= head->num_bytes;
|
||||
@ -1295,12 +1324,15 @@ void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
|
||||
btrfs_error_unpin_extent_range(fs_info, head->bytenr,
|
||||
head->bytenr + head->num_bytes - 1);
|
||||
}
|
||||
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
|
||||
if (!testing)
|
||||
btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
|
||||
btrfs_put_delayed_ref_head(head);
|
||||
cond_resched();
|
||||
spin_lock(&delayed_refs->lock);
|
||||
}
|
||||
btrfs_qgroup_destroy_extent_records(trans);
|
||||
|
||||
if (!testing)
|
||||
btrfs_qgroup_destroy_extent_records(trans);
|
||||
|
||||
spin_unlock(&delayed_refs->lock);
|
||||
}
|
||||
|
@ -402,6 +402,7 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head(
|
||||
struct btrfs_delayed_ref_root *delayed_refs);
|
||||
void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
|
||||
struct btrfs_delayed_ref_head *head);
|
||||
struct btrfs_delayed_ref_node *btrfs_select_delayed_ref(struct btrfs_delayed_ref_head *head);
|
||||
|
||||
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
|
||||
|
||||
|
@ -248,8 +248,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
|
||||
len = min(len, em->len - (start - em->start));
|
||||
block_start = extent_map_block_start(em) + (start - em->start);
|
||||
|
||||
if (can_nocow_extent(inode, start, &len,
|
||||
&file_extent, false, false) == 1) {
|
||||
if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) {
|
||||
bg = btrfs_inc_nocow_writers(fs_info, block_start);
|
||||
if (bg)
|
||||
can_nocow = true;
|
||||
|
@ -2337,7 +2337,7 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
|
||||
* 1, 2 2nd and 3rd backup copy
|
||||
* -1 skip bytenr check
|
||||
*/
|
||||
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
|
||||
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
|
||||
const struct btrfs_super_block *sb, int mirror_num)
|
||||
{
|
||||
u64 nodesize = btrfs_super_nodesize(sb);
|
||||
@ -2495,24 +2495,7 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
|
||||
ret = -EINVAL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Obvious sys_chunk_array corruptions, it must hold at least one key
|
||||
* and one chunk
|
||||
*/
|
||||
if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
|
||||
btrfs_err(fs_info, "system chunk array too big %u > %u",
|
||||
btrfs_super_sys_array_size(sb),
|
||||
BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
|
||||
ret = -EINVAL;
|
||||
}
|
||||
if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
|
||||
+ sizeof(struct btrfs_chunk)) {
|
||||
btrfs_err(fs_info, "system chunk array too small %u < %zu",
|
||||
btrfs_super_sys_array_size(sb),
|
||||
sizeof(struct btrfs_disk_key)
|
||||
+ sizeof(struct btrfs_chunk));
|
||||
ret = -EINVAL;
|
||||
}
|
||||
ret = btrfs_check_system_chunk_array(fs_info, sb);
|
||||
|
||||
/*
|
||||
* The generation is a global counter, we'll trust it more than the others
|
||||
@ -3321,6 +3304,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
|
||||
fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
|
||||
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
|
||||
fs_info->stripesize = stripesize;
|
||||
fs_info->fs_devices->fs_info = fs_info;
|
||||
|
||||
/*
|
||||
* Handle the space caching options appropriately now that we have the
|
||||
|
@ -54,7 +54,7 @@ int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
|
||||
const struct btrfs_super_block *disk_sb);
|
||||
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
|
||||
void __cold close_ctree(struct btrfs_fs_info *fs_info);
|
||||
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
|
||||
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
|
||||
const struct btrfs_super_block *sb, int mirror_num);
|
||||
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
|
||||
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
|
||||
|
@ -1803,30 +1803,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct btrfs_delayed_ref_node *
|
||||
select_delayed_ref(struct btrfs_delayed_ref_head *head)
|
||||
{
|
||||
struct btrfs_delayed_ref_node *ref;
|
||||
|
||||
if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
|
||||
* This is to prevent a ref count from going down to zero, which deletes
|
||||
* the extent item from the extent tree, when there still are references
|
||||
* to add, which would fail because they would not find the extent item.
|
||||
*/
|
||||
if (!list_empty(&head->ref_add_list))
|
||||
return list_first_entry(&head->ref_add_list,
|
||||
struct btrfs_delayed_ref_node, add_list);
|
||||
|
||||
ref = rb_entry(rb_first_cached(&head->ref_tree),
|
||||
struct btrfs_delayed_ref_node, ref_node);
|
||||
ASSERT(list_empty(&ref->add_list));
|
||||
return ref;
|
||||
}
|
||||
|
||||
static struct btrfs_delayed_extent_op *cleanup_extent_op(
|
||||
struct btrfs_delayed_ref_head *head)
|
||||
{
|
||||
@ -1959,7 +1935,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
|
||||
lockdep_assert_held(&locked_ref->mutex);
|
||||
lockdep_assert_held(&locked_ref->lock);
|
||||
|
||||
while ((ref = select_delayed_ref(locked_ref))) {
|
||||
while ((ref = btrfs_select_delayed_ref(locked_ref))) {
|
||||
if (ref->seq &&
|
||||
btrfs_check_delayed_seq(fs_info, ref->seq)) {
|
||||
spin_unlock(&locked_ref->lock);
|
||||
@ -2230,10 +2206,11 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static noinline int check_delayed_ref(struct btrfs_root *root,
|
||||
static noinline int check_delayed_ref(struct btrfs_inode *inode,
|
||||
struct btrfs_path *path,
|
||||
u64 objectid, u64 offset, u64 bytenr)
|
||||
u64 offset, u64 bytenr)
|
||||
{
|
||||
struct btrfs_root *root = inode->root;
|
||||
struct btrfs_delayed_ref_head *head;
|
||||
struct btrfs_delayed_ref_node *ref;
|
||||
struct btrfs_delayed_ref_root *delayed_refs;
|
||||
@ -2307,7 +2284,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
|
||||
* then we have a cross reference.
|
||||
*/
|
||||
if (ref->ref_root != btrfs_root_id(root) ||
|
||||
ref_owner != objectid || ref_offset != offset) {
|
||||
ref_owner != btrfs_ino(inode) || ref_offset != offset) {
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
@ -2318,11 +2295,54 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static noinline int check_committed_ref(struct btrfs_root *root,
|
||||
/*
|
||||
* Check if there are references for a data extent other than the one belonging
|
||||
* to the given inode and offset.
|
||||
*
|
||||
* @inode: The only inode we expect to find associated with the data extent.
|
||||
* @path: A path to use for searching the extent tree.
|
||||
* @offset: The only offset we expect to find associated with the data
|
||||
* extent.
|
||||
* @bytenr: The logical address of the data extent.
|
||||
*
|
||||
* When the extent does not have any other references other than the one we
|
||||
* expect to find, we always return a value of 0 with the path having a locked
|
||||
* leaf that contains the extent's extent item - this is necessary to ensure
|
||||
* we don't race with a task running delayed references, and our caller must
|
||||
* have such a path when calling check_delayed_ref() - it must lock a delayed
|
||||
* ref head while holding the leaf locked. In case the extent item is not found
|
||||
* in the extent tree, we return -ENOENT with the path having the leaf (locked)
|
||||
* where the extent item should be, in order to prevent races with another task
|
||||
* running delayed references, so that we don't miss any reference when calling
|
||||
* check_delayed_ref().
|
||||
*
|
||||
* Note: this may return false positives, and this is because we want to be
|
||||
* quick here as we're called in write paths (when flushing delalloc and
|
||||
* in the direct IO write path). For example we can have an extent with
|
||||
* a single reference but that reference is not inlined, or we may have
|
||||
* many references in the extent tree but we also have delayed references
|
||||
* that cancel all the reference except the one for our inode and offset,
|
||||
* but it would be expensive to do such checks and complex due to all
|
||||
* locking to avoid races between the checks and flushing delayed refs,
|
||||
* plus non-inline references may be located on leaves other than the one
|
||||
* that contains the extent item in the extent tree. The important thing
|
||||
* here is to not return false negatives and that the false positives are
|
||||
* not very common.
|
||||
*
|
||||
* Returns: 0 if there are no cross references and with the path having a locked
|
||||
* leaf from the extent tree that contains the extent's extent item.
|
||||
*
|
||||
* 1 if there are cross references (false positives can happen).
|
||||
*
|
||||
* < 0 in case of an error. In case of -ENOENT the leaf in the extent
|
||||
* tree where the extent item should be located at is read locked and
|
||||
* accessible in the given path.
|
||||
*/
|
||||
static noinline int check_committed_ref(struct btrfs_inode *inode,
|
||||
struct btrfs_path *path,
|
||||
u64 objectid, u64 offset, u64 bytenr,
|
||||
bool strict)
|
||||
u64 offset, u64 bytenr)
|
||||
{
|
||||
struct btrfs_root *root = inode->root;
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
|
||||
struct extent_buffer *leaf;
|
||||
@ -2341,35 +2361,32 @@ static noinline int check_committed_ref(struct btrfs_root *root,
|
||||
|
||||
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
return ret;
|
||||
if (ret == 0) {
|
||||
/*
|
||||
* Key with offset -1 found, there would have to exist an extent
|
||||
* item with such offset, but this is out of the valid range.
|
||||
*/
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
return -EUCLEAN;
|
||||
}
|
||||
|
||||
ret = -ENOENT;
|
||||
if (path->slots[0] == 0)
|
||||
goto out;
|
||||
return -ENOENT;
|
||||
|
||||
path->slots[0]--;
|
||||
leaf = path->nodes[0];
|
||||
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
||||
|
||||
if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
|
||||
goto out;
|
||||
return -ENOENT;
|
||||
|
||||
ret = 1;
|
||||
item_size = btrfs_item_size(leaf, path->slots[0]);
|
||||
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
|
||||
expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY);
|
||||
|
||||
/* No inline refs; we need to bail before checking for owner ref. */
|
||||
if (item_size == sizeof(*ei))
|
||||
goto out;
|
||||
return 1;
|
||||
|
||||
/* Check for an owner ref; skip over it to the real inline refs. */
|
||||
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
|
||||
@ -2377,56 +2394,69 @@ static noinline int check_committed_ref(struct btrfs_root *root,
|
||||
if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) {
|
||||
expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY);
|
||||
iref = (struct btrfs_extent_inline_ref *)(iref + 1);
|
||||
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
|
||||
}
|
||||
|
||||
/* If extent item has more than 1 inline ref then it's shared */
|
||||
if (item_size != expected_size)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If extent created before last snapshot => it's shared unless the
|
||||
* snapshot has been deleted. Use the heuristic if strict is false.
|
||||
*/
|
||||
if (!strict &&
|
||||
(btrfs_extent_generation(leaf, ei) <=
|
||||
btrfs_root_last_snapshot(&root->root_item)))
|
||||
goto out;
|
||||
return 1;
|
||||
|
||||
/* If this extent has SHARED_DATA_REF then it's shared */
|
||||
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
|
||||
if (type != BTRFS_EXTENT_DATA_REF_KEY)
|
||||
goto out;
|
||||
return 1;
|
||||
|
||||
ref = (struct btrfs_extent_data_ref *)(&iref->offset);
|
||||
if (btrfs_extent_refs(leaf, ei) !=
|
||||
btrfs_extent_data_ref_count(leaf, ref) ||
|
||||
btrfs_extent_data_ref_root(leaf, ref) != btrfs_root_id(root) ||
|
||||
btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
|
||||
btrfs_extent_data_ref_objectid(leaf, ref) != btrfs_ino(inode) ||
|
||||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
|
||||
goto out;
|
||||
return 1;
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
|
||||
u64 bytenr, bool strict, struct btrfs_path *path)
|
||||
int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset,
|
||||
u64 bytenr, struct btrfs_path *path)
|
||||
{
|
||||
int ret;
|
||||
|
||||
do {
|
||||
ret = check_committed_ref(root, path, objectid,
|
||||
offset, bytenr, strict);
|
||||
ret = check_committed_ref(inode, path, offset, bytenr);
|
||||
if (ret && ret != -ENOENT)
|
||||
goto out;
|
||||
|
||||
ret = check_delayed_ref(root, path, objectid, offset, bytenr);
|
||||
/*
|
||||
* The path must have a locked leaf from the extent tree where
|
||||
* the extent item for our extent is located, in case it exists,
|
||||
* or where it should be located in case it doesn't exist yet
|
||||
* because it's new and its delayed ref was not yet flushed.
|
||||
* We need to lock the delayed ref head at check_delayed_ref(),
|
||||
* if one exists, while holding the leaf locked in order to not
|
||||
* race with delayed ref flushing, missing references and
|
||||
* incorrectly reporting that the extent is not shared.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
|
||||
struct extent_buffer *leaf = path->nodes[0];
|
||||
|
||||
ASSERT(leaf != NULL);
|
||||
btrfs_assert_tree_read_locked(leaf);
|
||||
|
||||
if (ret != -ENOENT) {
|
||||
struct btrfs_key key;
|
||||
|
||||
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
|
||||
ASSERT(key.objectid == bytenr);
|
||||
ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY);
|
||||
}
|
||||
}
|
||||
|
||||
ret = check_delayed_ref(inode, path, offset, bytenr);
|
||||
} while (ret == -EAGAIN && !path->nowait);
|
||||
|
||||
out:
|
||||
btrfs_release_path(path);
|
||||
if (btrfs_is_data_reloc_root(root))
|
||||
if (btrfs_is_data_reloc_root(inode->root))
|
||||
WARN_ON(ret > 0);
|
||||
return ret;
|
||||
}
|
||||
@ -2571,13 +2601,10 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_block_group *cache,
|
||||
u64 bytenr, u64 num_bytes, int reserved)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = cache->fs_info;
|
||||
|
||||
spin_lock(&cache->space_info->lock);
|
||||
spin_lock(&cache->lock);
|
||||
cache->pinned += num_bytes;
|
||||
btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
|
||||
num_bytes);
|
||||
btrfs_space_info_update_bytes_pinned(cache->space_info, num_bytes);
|
||||
if (reserved) {
|
||||
cache->reserved -= num_bytes;
|
||||
cache->space_info->bytes_reserved -= num_bytes;
|
||||
@ -2724,15 +2751,15 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
|
||||
{
|
||||
struct btrfs_block_group *cache = NULL;
|
||||
struct btrfs_space_info *space_info;
|
||||
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
|
||||
struct btrfs_free_cluster *cluster = NULL;
|
||||
u64 len;
|
||||
u64 total_unpinned = 0;
|
||||
u64 empty_cluster = 0;
|
||||
bool readonly;
|
||||
int ret = 0;
|
||||
|
||||
while (start <= end) {
|
||||
u64 len;
|
||||
|
||||
readonly = false;
|
||||
if (!cache ||
|
||||
start >= cache->start + cache->length) {
|
||||
@ -2778,37 +2805,19 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
|
||||
spin_lock(&space_info->lock);
|
||||
spin_lock(&cache->lock);
|
||||
cache->pinned -= len;
|
||||
btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
|
||||
btrfs_space_info_update_bytes_pinned(space_info, -len);
|
||||
space_info->max_extent_size = 0;
|
||||
if (cache->ro) {
|
||||
space_info->bytes_readonly += len;
|
||||
readonly = true;
|
||||
} else if (btrfs_is_zoned(fs_info)) {
|
||||
/* Need reset before reusing in a zoned block group */
|
||||
btrfs_space_info_update_bytes_zone_unusable(fs_info, space_info,
|
||||
len);
|
||||
btrfs_space_info_update_bytes_zone_unusable(space_info, len);
|
||||
readonly = true;
|
||||
}
|
||||
spin_unlock(&cache->lock);
|
||||
if (!readonly && return_free_space &&
|
||||
global_rsv->space_info == space_info) {
|
||||
spin_lock(&global_rsv->lock);
|
||||
if (!global_rsv->full) {
|
||||
u64 to_add = min(len, global_rsv->size -
|
||||
global_rsv->reserved);
|
||||
|
||||
global_rsv->reserved += to_add;
|
||||
btrfs_space_info_update_bytes_may_use(fs_info,
|
||||
space_info, to_add);
|
||||
if (global_rsv->reserved >= global_rsv->size)
|
||||
global_rsv->full = 1;
|
||||
len -= to_add;
|
||||
}
|
||||
spin_unlock(&global_rsv->lock);
|
||||
}
|
||||
/* Add to any tickets we may have */
|
||||
if (!readonly && return_free_space && len)
|
||||
btrfs_try_granting_tickets(fs_info, space_info);
|
||||
if (!readonly && return_free_space)
|
||||
btrfs_return_free_space(space_info, len);
|
||||
spin_unlock(&space_info->lock);
|
||||
}
|
||||
|
||||
@ -5142,8 +5151,16 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
|
||||
parent = ins.objectid;
|
||||
flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
|
||||
owning_root = reloc_src_root;
|
||||
} else
|
||||
BUG_ON(parent > 0);
|
||||
} else {
|
||||
if (unlikely(parent > 0)) {
|
||||
/*
|
||||
* Other roots than reloc tree don't expect start
|
||||
* offset of a parent block.
|
||||
*/
|
||||
ret = -EUCLEAN;
|
||||
goto out_free_reserved;
|
||||
}
|
||||
}
|
||||
|
||||
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
|
||||
struct btrfs_delayed_extent_op *extent_op;
|
||||
|
@ -116,8 +116,7 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
|
||||
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
|
||||
const struct extent_buffer *eb);
|
||||
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
|
||||
int btrfs_cross_ref_exist(struct btrfs_root *root,
|
||||
u64 objectid, u64 offset, u64 bytenr, bool strict,
|
||||
int btrfs_cross_ref_exist(struct btrfs_inode *inode, u64 offset, u64 bytenr,
|
||||
struct btrfs_path *path);
|
||||
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root,
|
||||
@ -163,5 +162,9 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_root *root,
|
||||
struct extent_buffer *node,
|
||||
struct extent_buffer *parent);
|
||||
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
|
||||
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
|
||||
u64 num_bytes, u64 *actual_bytes);
|
||||
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
|
||||
|
||||
#endif
|
||||
|
@ -1167,6 +1167,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
|
||||
* last delalloc end.
|
||||
*/
|
||||
u64 last_delalloc_end = 0;
|
||||
/*
|
||||
* Save the last successfully ran delalloc range end (exclusive).
|
||||
* This is for error handling to avoid ranges with ordered extent created
|
||||
* but no IO will be submitted due to error.
|
||||
*/
|
||||
u64 last_finished = page_start;
|
||||
u64 delalloc_start = page_start;
|
||||
u64 delalloc_end = page_end;
|
||||
u64 delalloc_to_write = 0;
|
||||
@ -1235,11 +1241,28 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
|
||||
found_len = last_delalloc_end + 1 - found_start;
|
||||
|
||||
if (ret >= 0) {
|
||||
/*
|
||||
* Some delalloc range may be created by previous folios.
|
||||
* Thus we still need to clean those range up during error
|
||||
* handling.
|
||||
*/
|
||||
last_finished = found_start;
|
||||
/* No errors hit so far, run the current delalloc range. */
|
||||
ret = btrfs_run_delalloc_range(inode, folio,
|
||||
found_start,
|
||||
found_start + found_len - 1,
|
||||
wbc);
|
||||
if (ret >= 0)
|
||||
last_finished = found_start + found_len;
|
||||
if (unlikely(ret < 0))
|
||||
btrfs_err_rl(fs_info,
|
||||
"failed to run delalloc range, root=%lld ino=%llu folio=%llu submit_bitmap=%*pbl start=%llu len=%u: %d",
|
||||
inode->root->root_key.objectid,
|
||||
btrfs_ino(inode),
|
||||
folio_pos(folio),
|
||||
fs_info->sectors_per_page,
|
||||
&bio_ctrl->submit_bitmap,
|
||||
found_start, found_len, ret);
|
||||
} else {
|
||||
/*
|
||||
* We've hit an error during previous delalloc range,
|
||||
@ -1274,8 +1297,21 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
|
||||
|
||||
delalloc_start = found_start + found_len;
|
||||
}
|
||||
if (ret < 0)
|
||||
/*
|
||||
* It's possible we have some ordered extents created before we hit
|
||||
* an error, cleanup non-async successfully created delalloc ranges.
|
||||
*/
|
||||
if (unlikely(ret < 0)) {
|
||||
unsigned int bitmap_size = min(
|
||||
(last_finished - page_start) >> fs_info->sectorsize_bits,
|
||||
fs_info->sectors_per_page);
|
||||
|
||||
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
|
||||
btrfs_mark_ordered_io_finished(inode, folio,
|
||||
page_start + (bit << fs_info->sectorsize_bits),
|
||||
fs_info->sectorsize, false);
|
||||
return ret;
|
||||
}
|
||||
out:
|
||||
if (last_delalloc_end)
|
||||
delalloc_end = last_delalloc_end;
|
||||
@ -1335,7 +1371,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
|
||||
|
||||
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
|
||||
if (IS_ERR(em))
|
||||
return PTR_ERR_OR_ZERO(em);
|
||||
return PTR_ERR(em);
|
||||
|
||||
extent_offset = filepos - em->start;
|
||||
em_end = extent_map_end(em);
|
||||
@ -1391,6 +1427,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
|
||||
struct btrfs_fs_info *fs_info = inode->root->fs_info;
|
||||
unsigned long range_bitmap = 0;
|
||||
bool submitted_io = false;
|
||||
bool error = false;
|
||||
const u64 folio_start = folio_pos(folio);
|
||||
u64 cur;
|
||||
int bit;
|
||||
@ -1433,11 +1470,21 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
|
||||
break;
|
||||
}
|
||||
ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
if (unlikely(ret < 0)) {
|
||||
submit_one_bio(bio_ctrl);
|
||||
/*
|
||||
* Failed to grab the extent map which should be very rare.
|
||||
* Since there is no bio submitted to finish the ordered
|
||||
* extent, we have to manually finish this sector.
|
||||
*/
|
||||
btrfs_mark_ordered_io_finished(inode, folio, cur,
|
||||
fs_info->sectorsize, false);
|
||||
error = true;
|
||||
continue;
|
||||
}
|
||||
submitted_io = true;
|
||||
}
|
||||
out:
|
||||
|
||||
/*
|
||||
* If we didn't submitted any sector (>= i_size), folio dirty get
|
||||
* cleared but PAGECACHE_TAG_DIRTY is not cleared (only cleared
|
||||
@ -1445,8 +1492,11 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
|
||||
*
|
||||
* Here we set writeback and clear for the range. If the full folio
|
||||
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
|
||||
*
|
||||
* If we hit any error, the corresponding sector will still be dirty
|
||||
* thus no need to clear PAGECACHE_TAG_DIRTY.
|
||||
*/
|
||||
if (!submitted_io) {
|
||||
if (!submitted_io && !error) {
|
||||
btrfs_folio_set_writeback(fs_info, folio, start, len);
|
||||
btrfs_folio_clear_writeback(fs_info, folio, start, len);
|
||||
}
|
||||
@ -1466,7 +1516,6 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
|
||||
{
|
||||
struct inode *inode = folio->mapping->host;
|
||||
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
|
||||
const u64 page_start = folio_pos(folio);
|
||||
int ret;
|
||||
size_t pg_offset;
|
||||
loff_t i_size = i_size_read(inode);
|
||||
@ -1506,16 +1555,19 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
|
||||
PAGE_SIZE, bio_ctrl, i_size);
|
||||
if (ret == 1)
|
||||
return 0;
|
||||
if (ret < 0)
|
||||
btrfs_err_rl(fs_info,
|
||||
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
|
||||
BTRFS_I(inode)->root->root_key.objectid,
|
||||
btrfs_ino(BTRFS_I(inode)),
|
||||
folio_pos(folio), fs_info->sectors_per_page,
|
||||
&bio_ctrl->submit_bitmap, ret);
|
||||
|
||||
bio_ctrl->wbc->nr_to_write--;
|
||||
|
||||
done:
|
||||
if (ret) {
|
||||
btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
|
||||
page_start, PAGE_SIZE, !ret);
|
||||
if (ret < 0)
|
||||
mapping_set_error(folio->mapping, ret);
|
||||
}
|
||||
|
||||
/*
|
||||
* Only unlock ranges that are submitted. As there can be some async
|
||||
* submitted ranges inside the folio.
|
||||
@ -2292,11 +2344,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
|
||||
if (ret == 1)
|
||||
goto next_page;
|
||||
|
||||
if (ret) {
|
||||
btrfs_mark_ordered_io_finished(BTRFS_I(inode), folio,
|
||||
cur, cur_len, !ret);
|
||||
if (ret)
|
||||
mapping_set_error(mapping, ret);
|
||||
}
|
||||
btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
|
||||
if (ret < 0)
|
||||
found_error = true;
|
||||
|
@ -36,52 +36,7 @@
|
||||
#include "ioctl.h"
|
||||
#include "file.h"
|
||||
#include "super.h"
|
||||
|
||||
/*
|
||||
* Helper to fault in page and copy. This should go away and be replaced with
|
||||
* calls into generic code.
|
||||
*/
|
||||
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
|
||||
struct folio *folio, struct iov_iter *i)
|
||||
{
|
||||
size_t copied = 0;
|
||||
size_t total_copied = 0;
|
||||
int offset = offset_in_page(pos);
|
||||
|
||||
while (write_bytes > 0) {
|
||||
size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
|
||||
/*
|
||||
* Copy data from userspace to the current page
|
||||
*/
|
||||
copied = copy_folio_from_iter_atomic(folio, offset, count, i);
|
||||
|
||||
/* Flush processor's dcache for this page */
|
||||
flush_dcache_folio(folio);
|
||||
|
||||
/*
|
||||
* if we get a partial write, we can end up with
|
||||
* partially up to date page. These add
|
||||
* a lot of complexity, so make sure they don't
|
||||
* happen by forcing this copy to be retried.
|
||||
*
|
||||
* The rest of the btrfs_file_write code will fall
|
||||
* back to page at a time copies after we return 0.
|
||||
*/
|
||||
if (unlikely(copied < count)) {
|
||||
if (!folio_test_uptodate(folio)) {
|
||||
iov_iter_revert(i, copied);
|
||||
copied = 0;
|
||||
}
|
||||
if (!copied)
|
||||
break;
|
||||
}
|
||||
|
||||
write_bytes -= copied;
|
||||
total_copied += copied;
|
||||
offset += copied;
|
||||
}
|
||||
return total_copied;
|
||||
}
|
||||
#include "print-tree.h"
|
||||
|
||||
/*
|
||||
* Unlock folio after btrfs_file_write() is done with it.
|
||||
@ -106,7 +61,7 @@ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
|
||||
}
|
||||
|
||||
/*
|
||||
* After btrfs_copy_from_user(), update the following things for delalloc:
|
||||
* After copy_folio_from_iter_atomic(), update the following things for delalloc:
|
||||
* - Mark newly dirtied folio as DELALLOC in the io tree.
|
||||
* Used to advise which range is to be written back.
|
||||
* - Mark modified folio as Uptodate/Dirty and not needing COW fixup
|
||||
@ -224,7 +179,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
|
||||
if (args->drop_cache)
|
||||
btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
|
||||
|
||||
if (args->start >= inode->disk_i_size && !args->replace_extent)
|
||||
if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
|
||||
modify_tree = 0;
|
||||
|
||||
update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
|
||||
@ -245,7 +200,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
|
||||
next_slot:
|
||||
leaf = path->nodes[0];
|
||||
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
|
||||
BUG_ON(del_nr > 0);
|
||||
if (WARN_ON(del_nr > 0)) {
|
||||
btrfs_print_leaf(leaf);
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
ret = btrfs_next_leaf(root, path);
|
||||
if (ret < 0)
|
||||
break;
|
||||
@ -321,7 +280,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
|
||||
* | -------- extent -------- |
|
||||
*/
|
||||
if (args->start > key.offset && args->end < extent_end) {
|
||||
BUG_ON(del_nr > 0);
|
||||
if (WARN_ON(del_nr > 0)) {
|
||||
btrfs_print_leaf(leaf);
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
||||
ret = -EOPNOTSUPP;
|
||||
break;
|
||||
@ -409,7 +372,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
|
||||
* | -------- extent -------- |
|
||||
*/
|
||||
if (args->start > key.offset && args->end >= extent_end) {
|
||||
BUG_ON(del_nr > 0);
|
||||
if (WARN_ON(del_nr > 0)) {
|
||||
btrfs_print_leaf(leaf);
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
|
||||
ret = -EOPNOTSUPP;
|
||||
break;
|
||||
@ -437,7 +404,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
|
||||
del_slot = path->slots[0];
|
||||
del_nr = 1;
|
||||
} else {
|
||||
BUG_ON(del_slot + del_nr != path->slots[0]);
|
||||
if (WARN_ON(del_slot + del_nr != path->slots[0])) {
|
||||
btrfs_print_leaf(leaf);
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
del_nr++;
|
||||
}
|
||||
|
||||
@ -1052,7 +1023,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
|
||||
&cached_state);
|
||||
}
|
||||
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
|
||||
NULL, nowait, false);
|
||||
NULL, nowait);
|
||||
if (ret <= 0)
|
||||
btrfs_drew_write_unlock(&root->snapshot_lock);
|
||||
else
|
||||
@ -1252,7 +1223,23 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
|
||||
break;
|
||||
}
|
||||
|
||||
copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
|
||||
copied = copy_folio_from_iter_atomic(folio,
|
||||
offset_in_folio(folio, pos), write_bytes, i);
|
||||
flush_dcache_folio(folio);
|
||||
|
||||
/*
|
||||
* If we get a partial write, we can end up with partially
|
||||
* uptodate page. Although if sector size < page size we can
|
||||
* handle it, but if it's not sector aligned it can cause
|
||||
* a lot of complexity, so make sure they don't happen by
|
||||
* forcing retry this copy.
|
||||
*/
|
||||
if (unlikely(copied < write_bytes)) {
|
||||
if (!folio_test_uptodate(folio)) {
|
||||
iov_iter_revert(i, copied);
|
||||
copied = 0;
|
||||
}
|
||||
}
|
||||
|
||||
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
|
||||
dirty_sectors = round_up(copied + sector_offset,
|
||||
|
@ -12,7 +12,7 @@
|
||||
#include <linux/error-injection.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/string_choices.h>
|
||||
#include "ctree.h"
|
||||
#include "extent-tree.h"
|
||||
#include "fs.h"
|
||||
#include "messages.h"
|
||||
#include "misc.h"
|
||||
|
130
fs/btrfs/fs.c
130
fs/btrfs/fs.c
@ -4,6 +4,136 @@
|
||||
#include "ctree.h"
|
||||
#include "fs.h"
|
||||
#include "accessors.h"
|
||||
#include "volumes.h"
|
||||
|
||||
static const struct btrfs_csums {
|
||||
u16 size;
|
||||
const char name[10];
|
||||
const char driver[12];
|
||||
} btrfs_csums[] = {
|
||||
[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
|
||||
[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
|
||||
[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
|
||||
[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
|
||||
.driver = "blake2b-256" },
|
||||
};
|
||||
|
||||
/* This exists for btrfs-progs usages. */
|
||||
u16 btrfs_csum_type_size(u16 type)
|
||||
{
|
||||
return btrfs_csums[type].size;
|
||||
}
|
||||
|
||||
int btrfs_super_csum_size(const struct btrfs_super_block *s)
|
||||
{
|
||||
u16 t = btrfs_super_csum_type(s);
|
||||
|
||||
/* csum type is validated at mount time. */
|
||||
return btrfs_csum_type_size(t);
|
||||
}
|
||||
|
||||
const char *btrfs_super_csum_name(u16 csum_type)
|
||||
{
|
||||
/* csum type is validated at mount time. */
|
||||
return btrfs_csums[csum_type].name;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return driver name if defined, otherwise the name that's also a valid driver
|
||||
* name.
|
||||
*/
|
||||
const char *btrfs_super_csum_driver(u16 csum_type)
|
||||
{
|
||||
/* csum type is validated at mount time */
|
||||
return btrfs_csums[csum_type].driver[0] ?
|
||||
btrfs_csums[csum_type].driver :
|
||||
btrfs_csums[csum_type].name;
|
||||
}
|
||||
|
||||
size_t __attribute_const__ btrfs_get_num_csums(void)
|
||||
{
|
||||
return ARRAY_SIZE(btrfs_csums);
|
||||
}
|
||||
|
||||
/*
 * Start exclusive operation @type, return true on success.
 */
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
			enum btrfs_exclusive_operation type)
{
	bool ret = false;

	spin_lock(&fs_info->super_lock);
	if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
		fs_info->exclusive_operation = type;
		ret = true;
	}
	spin_unlock(&fs_info->super_lock);

	return ret;
}

/*
 * Conditionally allow to enter the exclusive operation in case it's compatible
 * with the running one. This must be paired with btrfs_exclop_start_unlock()
 * and btrfs_exclop_finish().
 *
 * Compatibility:
 * - the same type is already running
 * - when trying to add a device and balance has been paused
 * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
 *   must check the condition first that would allow none -> @type
 */
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
				 enum btrfs_exclusive_operation type)
{
	spin_lock(&fs_info->super_lock);
	if (fs_info->exclusive_operation == type ||
	    (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
	     type == BTRFS_EXCLOP_DEV_ADD))
		return true;

	spin_unlock(&fs_info->super_lock);
	return false;
}

void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
{
	spin_unlock(&fs_info->super_lock);
}

void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
{
	spin_lock(&fs_info->super_lock);
	WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
	spin_unlock(&fs_info->super_lock);
	sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
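A sketch of the pairing the try-lock comment asks for, loosely modelled on the device-add path. example_start_dev_add() is a hypothetical caller used only for illustration; the real call sites perform additional state checks under the lock before deciding to proceed.

static int example_start_dev_add(struct btrfs_fs_info *fs_info)
{
	if (btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
		/* On success super_lock is still held, drop it before working. */
		btrfs_exclop_start_unlock(fs_info);
	} else if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
		/* Some other, incompatible exclusive operation is running. */
		return -EBUSY;
	}

	/* ... do the work that must be exclusive ... */

	btrfs_exclop_finish(fs_info);
	return 0;
}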

void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
			  enum btrfs_exclusive_operation op)
{
	switch (op) {
	case BTRFS_EXCLOP_BALANCE_PAUSED:
		spin_lock(&fs_info->super_lock);
		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
		       fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
		       fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
		       fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
		spin_unlock(&fs_info->super_lock);
		break;
	case BTRFS_EXCLOP_BALANCE:
		spin_lock(&fs_info->super_lock);
		ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
		fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
		spin_unlock(&fs_info->super_lock);
		break;
	default:
		btrfs_warn(fs_info,
			   "invalid exclop balance operation %d requested", op);
	}
}

void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
			     const char *name)
@ -18,6 +18,7 @@
#include <linux/rwsem.h>
#include <linux/semaphore.h>
#include <linux/list.h>
#include <linux/pagemap.h>
#include <linux/radix-tree.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
@ -887,6 +888,11 @@ struct btrfs_fs_info {
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
			struct inode *: (_inode)))->root->fs_info)

static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
	return mapping_gfp_constraint(mapping, ~__GFP_FS);
}

static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info)
{
	return READ_ONCE(fs_info->generation);
@ -953,6 +959,8 @@ static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info,
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
				       sizeof(struct btrfs_item))

#define BTRFS_BYTES_TO_BLKS(fs_info, bytes) ((bytes) >> (fs_info)->sectorsize_bits)

static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
	return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && fs_info->zone_size > 0;
@ -982,6 +990,17 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,

int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args);

u16 btrfs_csum_type_size(u16 type);
int btrfs_super_csum_size(const struct btrfs_super_block *s);
const char *btrfs_super_csum_name(u16 csum_type);
const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);

static inline bool btrfs_is_empty_uuid(const u8 *uuid)
{
	return uuid_is_null((const uuid_t *)uuid);
}

/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
			     const char *name);
@ -1058,6 +1077,14 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
	(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
			   &(fs_info)->fs_state)))

/*
 * We use folio flag owner_2 to indicate there is an ordered extent with
 * unfinished IO.
 */
#define folio_test_ordered(folio)	folio_test_owner_2(folio)
#define folio_set_ordered(folio)	folio_set_owner_2(folio)
#define folio_clear_ordered(folio)	folio_clear_owner_2(folio)

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS

#define EXPORT_FOR_TESTS
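A quick worked example of the BTRFS_BYTES_TO_BLKS() conversion added above (illustrative, not part of the patch): with an assumed 4K sector size, sectorsize_bits is 12 and the macro is a plain right shift.

static void bytes_to_blks_example(const struct btrfs_fs_info *fs_info)
{
	u64 reserve_bytes = SZ_64K;
	u64 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);

	/* With sectorsize_bits == 12, 65536 >> 12 == 16 blocks. */
	ASSERT(fs_info->sectorsize_bits != 12 || num_sectors == 16);
}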
358	fs/btrfs/inode.c
@ -1129,19 +1129,15 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
				     &wbc, false);
	wbc_detach_inode(&wbc);
	if (ret < 0) {
		btrfs_cleanup_ordered_extents(inode, locked_folio,
		btrfs_cleanup_ordered_extents(inode, NULL,
					      start, end - start + 1);
		if (locked_folio) {
			const u64 page_start = folio_pos(locked_folio);

			folio_start_writeback(locked_folio);
			folio_end_writeback(locked_folio);
			btrfs_mark_ordered_io_finished(inode, locked_folio,
						       page_start, PAGE_SIZE,
						       !ret);
			mapping_set_error(locked_folio->mapping, ret);
			folio_unlock(locked_folio);
		}
		if (locked_folio)
			btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
					     start, async_extent->ram_size);
		btrfs_err_rl(inode->root->fs_info,
			     "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
			     __func__, btrfs_root_id(inode->root),
			     btrfs_ino(inode), start, async_extent->ram_size, ret);
	}
}

@ -1254,7 +1250,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
	free_async_extent_pages(async_extent);
	if (async_chunk->blkcg_css)
		kthread_associate_blkcg(NULL);
	btrfs_debug(fs_info,
	btrfs_debug_rl(fs_info,
		       "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
		       btrfs_root_id(root), btrfs_ino(inode), start,
		       async_extent->ram_size, ret);
@ -1372,6 +1368,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
|
||||
|
||||
alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
|
||||
|
||||
/*
|
||||
* We're not doing compressed IO, don't unlock the first page
|
||||
* (which the caller expects to stay locked), don't clear any
|
||||
* dirty bits and don't set any writeback bits
|
||||
*
|
||||
* Do set the Ordered (Private2) bit so we know this page was
|
||||
* properly setup for writepage.
|
||||
*/
|
||||
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
|
||||
page_ops |= PAGE_SET_ORDERED;
|
||||
|
||||
/*
|
||||
* Relocation relies on the relocated extents to have exactly the same
|
||||
* size as the original extents. Normally writeback for relocation data
|
||||
@ -1431,6 +1438,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
|
||||
file_extent.offset = 0;
|
||||
file_extent.compression = BTRFS_COMPRESS_NONE;
|
||||
|
||||
/*
|
||||
* Locked range will be released either during error clean up or
|
||||
* after the whole range is finished.
|
||||
*/
|
||||
lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
|
||||
&cached);
|
||||
|
||||
@ -1476,21 +1487,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
|
||||
|
||||
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
|
||||
|
||||
/*
|
||||
* We're not doing compressed IO, don't unlock the first page
|
||||
* (which the caller expects to stay locked), don't clear any
|
||||
* dirty bits and don't set any writeback bits
|
||||
*
|
||||
* Do set the Ordered flag so we know this page was
|
||||
* properly setup for writepage.
|
||||
*/
|
||||
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
|
||||
page_ops |= PAGE_SET_ORDERED;
|
||||
|
||||
extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
|
||||
locked_folio, &cached,
|
||||
EXTENT_LOCKED | EXTENT_DELALLOC,
|
||||
page_ops);
|
||||
if (num_bytes < cur_alloc_size)
|
||||
num_bytes = 0;
|
||||
else
|
||||
@ -1507,6 +1503,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
}
|
||||
extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
|
||||
EXTENT_LOCKED | EXTENT_DELALLOC,
|
||||
page_ops);
|
||||
done:
|
||||
if (done_offset)
|
||||
*done_offset = end;
|
||||
@ -1527,35 +1526,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
|
||||
* We process each region below.
|
||||
*/
|
||||
|
||||
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
|
||||
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
|
||||
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
|
||||
|
||||
/*
|
||||
* For the range (1). We have already instantiated the ordered extents
|
||||
* for this region. They are cleaned up by
|
||||
* btrfs_cleanup_ordered_extents() in e.g,
|
||||
* btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
|
||||
* already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
|
||||
* EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
|
||||
* function.
|
||||
* btrfs_run_delalloc_range().
|
||||
* EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
|
||||
* are also handled by the cleanup function.
|
||||
*
|
||||
* However, in case of @keep_locked, we still need to unlock the pages
|
||||
* (except @locked_folio) to ensure all the pages are unlocked.
|
||||
* So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag,
|
||||
* and finish the writeback of the involved folios, which will be
|
||||
* never submitted.
|
||||
*/
|
||||
if (keep_locked && orig_start < start) {
|
||||
if (orig_start < start) {
|
||||
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
|
||||
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
|
||||
|
||||
if (!locked_folio)
|
||||
mapping_set_error(inode->vfs_inode.i_mapping, ret);
|
||||
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
|
||||
locked_folio, NULL, 0, page_ops);
|
||||
locked_folio, NULL, clear_bits, page_ops);
|
||||
}
|
||||
|
||||
/*
|
||||
* At this point we're unlocked, we want to make sure we're only
|
||||
* clearing these flags under the extent lock, so lock the rest of the
|
||||
* range and clear everything up.
|
||||
*/
|
||||
lock_extent(&inode->io_tree, start, end, NULL);
|
||||
clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
|
||||
EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
|
||||
page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
|
||||
|
||||
/*
|
||||
* For the range (2). If we reserved an extent for our delalloc range
|
||||
@ -1589,6 +1584,10 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
|
||||
btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
|
||||
end - start - cur_alloc_size + 1, NULL);
|
||||
}
|
||||
btrfs_err_rl(fs_info,
|
||||
"%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
|
||||
__func__, btrfs_root_id(inode->root),
|
||||
btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1809,7 +1808,7 @@ static int fallback_to_cow(struct btrfs_inode *inode,
|
||||
bytes = range_bytes;
|
||||
|
||||
spin_lock(&sinfo->lock);
|
||||
btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
|
||||
btrfs_space_info_update_bytes_may_use(sinfo, bytes);
|
||||
spin_unlock(&sinfo->lock);
|
||||
|
||||
if (count > 0)
|
||||
@ -1837,7 +1836,6 @@ struct can_nocow_file_extent_args {
|
||||
/* End file offset (inclusive) of the range we want to NOCOW. */
|
||||
u64 end;
|
||||
bool writeback_path;
|
||||
bool strict;
|
||||
/*
|
||||
* Free the path passed to can_nocow_file_extent() once it's not needed
|
||||
* anymore.
|
||||
@ -1892,8 +1890,7 @@ static int can_nocow_file_extent(struct btrfs_path *path,
|
||||
* for its subvolume was created, then this implies the extent is shared,
|
||||
* hence we must COW.
|
||||
*/
|
||||
if (!args->strict &&
|
||||
btrfs_file_extent_generation(leaf, fi) <=
|
||||
if (btrfs_file_extent_generation(leaf, fi) <=
|
||||
btrfs_root_last_snapshot(&root->root_item))
|
||||
goto out;
|
||||
|
||||
@ -1922,9 +1919,8 @@ static int can_nocow_file_extent(struct btrfs_path *path,
|
||||
*/
|
||||
btrfs_release_path(path);
|
||||
|
||||
ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
|
||||
key->offset - args->file_extent.offset,
|
||||
args->file_extent.disk_bytenr, args->strict, path);
|
||||
ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
|
||||
args->file_extent.disk_bytenr, path);
|
||||
WARN_ON_ONCE(ret > 0 && is_freespace_inode);
|
||||
if (ret != 0)
|
||||
goto out;
|
||||
@ -1970,6 +1966,48 @@ static int can_nocow_file_extent(struct btrfs_path *path,
|
||||
return ret < 0 ? ret : can_nocow;
|
||||
}
|
||||
|
||||
static void cleanup_dirty_folios(struct btrfs_inode *inode,
				 struct folio *locked_folio,
				 u64 start, u64 end, int error)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	u32 len;

	ASSERT(end + 1 - start < U32_MAX);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(end + 1, fs_info->sectorsize));
	len = end + 1 - start;

	/*
	 * Handle the locked folio first.
	 * btrfs_folio_clamp_*() helpers can handle range out of the folio case.
	 */
	btrfs_folio_clamp_clear_dirty(fs_info, locked_folio, start, len);
	btrfs_folio_clamp_set_writeback(fs_info, locked_folio, start, len);
	btrfs_folio_clamp_clear_writeback(fs_info, locked_folio, start, len);

	for (pgoff_t index = start_index; index <= end_index; index++) {
		struct folio *folio;

		/* Already handled at the beginning. */
		if (index == locked_folio->index)
			continue;
		folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
		/* Cache already dropped, no need to do any cleanup. */
		if (IS_ERR(folio))
			continue;
		btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
		btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
		btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
		folio_unlock(folio);
		folio_put(folio);
	}
	mapping_set_error(mapping, error);
}

/*
 * When the nocow writeback calls back. This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
@ -1985,6 +2023,11 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
	struct btrfs_root *root = inode->root;
	struct btrfs_path *path;
	u64 cow_start = (u64)-1;
	/*
	 * If not 0, represents the inclusive end of the last fallback_to_cow()
	 * range. Only for error handling.
	 */
	u64 cow_end = 0;
	u64 cur_offset = start;
	int ret;
	bool check_prev = true;
@ -2145,6 +2188,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
					     found_key.offset - 1);
			cow_start = (u64)-1;
			if (ret) {
				cow_end = found_key.offset - 1;
				btrfs_dec_nocow_writers(nocow_bg);
				goto error;
			}
@ -2218,11 +2262,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
		cow_start = cur_offset;

	if (cow_start != (u64)-1) {
		cur_offset = end;
		ret = fallback_to_cow(inode, locked_folio, cow_start, end);
		cow_start = (u64)-1;
		if (ret)
		if (ret) {
			cow_end = end;
			goto error;
		}
	}

	btrfs_free_path(path);
@ -2230,12 +2275,42 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
error:
	/*
	 * If an error happened while a COW region is outstanding, cur_offset
	 * needs to be reset to cow_start to ensure the COW region is unlocked
	 * as well.
	 * There are several error cases:
	 *
	 * 1) Failed without falling back to COW
	 *    start         cur_start      end
	 *    |/////////////|              |
	 *
	 * For range [start, cur_start) the folios are already unlocked (except
	 * @locked_folio), EXTENT_DELALLOC already removed.
	 * Only need to clear the dirty flag as they will never be submitted.
	 * Ordered extent and extent maps are handled by
	 * btrfs_mark_ordered_io_finished() inside run_delalloc_range().
	 *
	 * 2) Failed with error from fallback_to_cow()
	 *    start         cur_start  cow_end    end
	 *    |/////////////|-----------|          |
	 *
	 * For range [start, cur_start) it's the same as case 1).
	 * But for range [cur_start, cow_end), the folios have the dirty flag
	 * cleared and unlocked, EXTENT_DELALLOC cleared.
	 * There may or may not be any ordered extents/extent maps allocated.
	 *
	 * We should not call extent_clear_unlock_delalloc() on range [cur_start,
	 * cow_end), as the folios are already unlocked.
	 *
	 * So clear the folio dirty flags for [start, cur_offset) first.
	 */
	if (cow_start != (u64)-1)
		cur_offset = cow_start;
	if (cur_offset > start)
		cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);

	/*
	 * If an error happened while a COW region is outstanding, cur_offset
	 * needs to be reset to @cow_end + 1 to skip the COW range, as
	 * cow_file_range() will do the proper cleanup at error.
	 */
	if (cow_end)
		cur_offset = cow_end + 1;

	/*
	 * We need to lock the extent here because we're clearing DELALLOC and
@ -2255,6 +2330,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
|
||||
btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
|
||||
}
|
||||
btrfs_free_path(path);
|
||||
btrfs_err_rl(fs_info,
|
||||
"%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
|
||||
__func__, btrfs_root_id(inode->root),
|
||||
btrfs_ino(inode), start, end + 1 - start, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -7011,8 +7090,6 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
|
||||
* @orig_start: (optional) Return the original file offset of the file extent
|
||||
* @orig_len: (optional) Return the original on-disk length of the file extent
|
||||
* @ram_bytes: (optional) Return the ram_bytes of the file extent
|
||||
* @strict: if true, omit optimizations that might force us into unnecessary
|
||||
* cow. e.g., don't trust generation number.
|
||||
*
|
||||
* Return:
|
||||
* >0 and update @len if we can do nocow write
|
||||
@ -7024,7 +7101,7 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
|
||||
*/
|
||||
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
|
||||
struct btrfs_file_extent *file_extent,
|
||||
bool nowait, bool strict)
|
||||
bool nowait)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
|
||||
struct can_nocow_file_extent_args nocow_args = { 0 };
|
||||
@ -7077,7 +7154,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
|
||||
|
||||
nocow_args.start = offset;
|
||||
nocow_args.end = offset + *len - 1;
|
||||
nocow_args.strict = strict;
|
||||
nocow_args.free_path = true;
|
||||
|
||||
ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
|
||||
@ -9078,9 +9154,9 @@ static ssize_t btrfs_encoded_read_inline(
|
||||
}
|
||||
|
||||
struct btrfs_encoded_read_private {
|
||||
wait_queue_head_t wait;
|
||||
struct completion done;
|
||||
void *uring_ctx;
|
||||
atomic_t pending;
|
||||
refcount_t pending_refs;
|
||||
blk_status_t status;
|
||||
};
|
||||
|
||||
@ -9099,14 +9175,14 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
|
||||
*/
|
||||
WRITE_ONCE(priv->status, bbio->bio.bi_status);
|
||||
}
|
||||
if (atomic_dec_and_test(&priv->pending)) {
|
||||
if (refcount_dec_and_test(&priv->pending_refs)) {
|
||||
int err = blk_status_to_errno(READ_ONCE(priv->status));
|
||||
|
||||
if (priv->uring_ctx) {
|
||||
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
|
||||
kfree(priv);
|
||||
} else {
|
||||
wake_up(&priv->wait);
|
||||
complete(&priv->done);
|
||||
}
|
||||
}
|
||||
bio_put(&bbio->bio);
|
||||
@ -9126,8 +9202,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
if (!priv)
|
||||
return -ENOMEM;
|
||||
|
||||
init_waitqueue_head(&priv->wait);
|
||||
atomic_set(&priv->pending, 1);
|
||||
init_completion(&priv->done);
|
||||
refcount_set(&priv->pending_refs, 1);
|
||||
priv->status = 0;
|
||||
priv->uring_ctx = uring_ctx;
|
||||
|
||||
@ -9140,7 +9216,7 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
|
||||
|
||||
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
|
||||
atomic_inc(&priv->pending);
|
||||
refcount_inc(&priv->pending_refs);
|
||||
btrfs_submit_bbio(bbio, 0);
|
||||
|
||||
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
|
||||
@ -9155,11 +9231,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
disk_io_size -= bytes;
|
||||
} while (disk_io_size);
|
||||
|
||||
atomic_inc(&priv->pending);
|
||||
refcount_inc(&priv->pending_refs);
|
||||
btrfs_submit_bbio(bbio, 0);
|
||||
|
||||
if (uring_ctx) {
|
||||
if (atomic_dec_return(&priv->pending) == 0) {
|
||||
if (refcount_dec_and_test(&priv->pending_refs)) {
|
||||
ret = blk_status_to_errno(READ_ONCE(priv->status));
|
||||
btrfs_uring_read_extent_endio(uring_ctx, ret);
|
||||
kfree(priv);
|
||||
@ -9168,8 +9244,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
|
||||
|
||||
return -EIOCBQUEUED;
|
||||
} else {
|
||||
if (atomic_dec_return(&priv->pending) != 0)
|
||||
io_wait_event(priv->wait, !atomic_read(&priv->pending));
|
||||
if (!refcount_dec_and_test(&priv->pending_refs))
|
||||
wait_for_completion_io(&priv->done);
|
||||
/* See btrfs_encoded_read_endio() for ordering. */
|
||||
ret = blk_status_to_errno(READ_ONCE(priv->status));
|
||||
kfree(priv);
|
||||
@ -9799,15 +9875,25 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
|
||||
struct extent_state *cached_state = NULL;
|
||||
struct extent_map *em = NULL;
|
||||
struct btrfs_chunk_map *map = NULL;
|
||||
struct btrfs_device *device = NULL;
|
||||
struct btrfs_swap_info bsi = {
|
||||
.lowest_ppage = (sector_t)-1ULL,
|
||||
};
|
||||
struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
|
||||
struct btrfs_path *path = NULL;
|
||||
int ret = 0;
|
||||
u64 isize;
|
||||
u64 start;
|
||||
u64 prev_extent_end = 0;
|
||||
|
||||
/*
|
||||
* Acquire the inode's mmap lock to prevent races with memory mapped
|
||||
* writes, as they could happen after we flush delalloc below and before
|
||||
* we lock the extent range further below. The inode was already locked
|
||||
* up in the call chain.
|
||||
*/
|
||||
btrfs_assert_inode_locked(BTRFS_I(inode));
|
||||
down_write(&BTRFS_I(inode)->i_mmap_lock);
|
||||
|
||||
/*
|
||||
* If the swap file was just created, make sure delalloc is done. If the
|
||||
@ -9816,22 +9902,32 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
*/
|
||||
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
|
||||
if (ret)
|
||||
return ret;
|
||||
goto out_unlock_mmap;
|
||||
|
||||
/*
|
||||
* The inode is locked, so these flags won't change after we check them.
|
||||
*/
|
||||
if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
|
||||
btrfs_warn(fs_info, "swapfile must not be compressed");
|
||||
return -EINVAL;
|
||||
ret = -EINVAL;
|
||||
goto out_unlock_mmap;
|
||||
}
|
||||
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
|
||||
btrfs_warn(fs_info, "swapfile must not be copy-on-write");
|
||||
return -EINVAL;
|
||||
ret = -EINVAL;
|
||||
goto out_unlock_mmap;
|
||||
}
|
||||
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
|
||||
btrfs_warn(fs_info, "swapfile must not be checksummed");
|
||||
return -EINVAL;
|
||||
ret = -EINVAL;
|
||||
goto out_unlock_mmap;
|
||||
}
|
||||
|
||||
path = btrfs_alloc_path();
|
||||
backref_ctx = btrfs_alloc_backref_share_check_ctx();
|
||||
if (!path || !backref_ctx) {
|
||||
ret = -ENOMEM;
|
||||
goto out_unlock_mmap;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -9846,7 +9942,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
|
||||
btrfs_warn(fs_info,
|
||||
"cannot activate swapfile while exclusive operation is running");
|
||||
return -EBUSY;
|
||||
ret = -EBUSY;
|
||||
goto out_unlock_mmap;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -9860,7 +9957,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
btrfs_exclop_finish(fs_info);
|
||||
btrfs_warn(fs_info,
|
||||
"cannot activate swapfile because snapshot creation is in progress");
|
||||
return -EINVAL;
|
||||
ret = -EINVAL;
|
||||
goto out_unlock_mmap;
|
||||
}
|
||||
/*
|
||||
* Snapshots can create extents which require COW even if NODATACOW is
|
||||
@ -9881,7 +9979,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
btrfs_warn(fs_info,
|
||||
"cannot activate swapfile because subvolume %llu is being deleted",
|
||||
btrfs_root_id(root));
|
||||
return -EPERM;
|
||||
ret = -EPERM;
|
||||
goto out_unlock_mmap;
|
||||
}
|
||||
atomic_inc(&root->nr_swapfiles);
|
||||
spin_unlock(&root->root_item_lock);
|
||||
@ -9889,24 +9988,39 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
|
||||
|
||||
lock_extent(io_tree, 0, isize - 1, &cached_state);
|
||||
start = 0;
|
||||
while (start < isize) {
|
||||
u64 logical_block_start, physical_block_start;
|
||||
while (prev_extent_end < isize) {
|
||||
struct btrfs_key key;
|
||||
struct extent_buffer *leaf;
|
||||
struct btrfs_file_extent_item *ei;
|
||||
struct btrfs_block_group *bg;
|
||||
u64 len = isize - start;
|
||||
u64 logical_block_start;
|
||||
u64 physical_block_start;
|
||||
u64 extent_gen;
|
||||
u64 disk_bytenr;
|
||||
u64 len;
|
||||
|
||||
em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
|
||||
if (IS_ERR(em)) {
|
||||
ret = PTR_ERR(em);
|
||||
key.objectid = btrfs_ino(BTRFS_I(inode));
|
||||
key.type = BTRFS_EXTENT_DATA_KEY;
|
||||
key.offset = prev_extent_end;
|
||||
|
||||
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (em->disk_bytenr == EXTENT_MAP_HOLE) {
|
||||
/*
|
||||
* If key not found it means we have an implicit hole (NO_HOLES
|
||||
* is enabled).
|
||||
*/
|
||||
if (ret > 0) {
|
||||
btrfs_warn(fs_info, "swapfile must not have holes");
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
if (em->disk_bytenr == EXTENT_MAP_INLINE) {
|
||||
|
||||
leaf = path->nodes[0];
|
||||
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
|
||||
|
||||
if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
|
||||
/*
|
||||
* It's unlikely we'll ever actually find ourselves
|
||||
* here, as a file small enough to fit inline won't be
|
||||
@ -9918,23 +10032,45 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
if (extent_map_is_compressed(em)) {
|
||||
|
||||
if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
|
||||
btrfs_warn(fs_info, "swapfile must not be compressed");
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
logical_block_start = extent_map_block_start(em) + (start - em->start);
|
||||
len = min(len, em->len - (start - em->start));
|
||||
free_extent_map(em);
|
||||
em = NULL;
|
||||
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
|
||||
if (disk_bytenr == 0) {
|
||||
btrfs_warn(fs_info, "swapfile must not have holes");
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = can_nocow_extent(inode, start, &len, NULL, false, true);
|
||||
logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
|
||||
extent_gen = btrfs_file_extent_generation(leaf, ei);
|
||||
prev_extent_end = btrfs_file_extent_end(path);
|
||||
|
||||
if (prev_extent_end > isize)
|
||||
len = isize - key.offset;
|
||||
else
|
||||
len = btrfs_file_extent_num_bytes(leaf, ei);
|
||||
|
||||
backref_ctx->curr_leaf_bytenr = leaf->start;
|
||||
|
||||
/*
|
||||
* Don't need the path anymore, release to avoid deadlocks when
|
||||
* calling btrfs_is_data_extent_shared() because when joining a
|
||||
* transaction it can block waiting for the current one's commit
|
||||
* which in turn may be trying to lock the same leaf to flush
|
||||
* delayed items for example.
|
||||
*/
|
||||
btrfs_release_path(path);
|
||||
|
||||
ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
|
||||
extent_gen, backref_ctx);
|
||||
if (ret < 0) {
|
||||
goto out;
|
||||
} else if (ret) {
|
||||
ret = 0;
|
||||
} else {
|
||||
} else if (ret > 0) {
|
||||
btrfs_warn(fs_info,
|
||||
"swapfile must not be copy-on-write");
|
||||
ret = -EINVAL;
|
||||
@ -9969,7 +10105,6 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
|
||||
physical_block_start = (map->stripes[0].physical +
|
||||
(logical_block_start - map->start));
|
||||
len = min(len, map->chunk_len - (logical_block_start - map->start));
|
||||
btrfs_free_chunk_map(map);
|
||||
map = NULL;
|
||||
|
||||
@ -10010,20 +10145,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
bsi.start = start;
|
||||
bsi.start = key.offset;
|
||||
bsi.block_start = physical_block_start;
|
||||
bsi.block_len = len;
|
||||
}
|
||||
|
||||
start += len;
|
||||
if (fatal_signal_pending(current)) {
|
||||
ret = -EINTR;
|
||||
goto out;
|
||||
}
|
||||
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
if (bsi.block_len)
|
||||
ret = btrfs_add_swap_extent(sis, &bsi);
|
||||
|
||||
out:
|
||||
if (!IS_ERR_OR_NULL(em))
|
||||
free_extent_map(em);
|
||||
if (!IS_ERR_OR_NULL(map))
|
||||
btrfs_free_chunk_map(map);
|
||||
|
||||
@ -10036,6 +10174,10 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
|
||||
btrfs_exclop_finish(fs_info);
|
||||
|
||||
out_unlock_mmap:
|
||||
up_write(&BTRFS_I(inode)->i_mmap_lock);
|
||||
btrfs_free_backref_share_ctx(backref_ctx);
|
||||
btrfs_free_path(path);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
|
@ -403,86 +403,6 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Start exclusive operation @type, return true on success
|
||||
*/
|
||||
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
|
||||
enum btrfs_exclusive_operation type)
|
||||
{
|
||||
bool ret = false;
|
||||
|
||||
spin_lock(&fs_info->super_lock);
|
||||
if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
|
||||
fs_info->exclusive_operation = type;
|
||||
ret = true;
|
||||
}
|
||||
spin_unlock(&fs_info->super_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Conditionally allow to enter the exclusive operation in case it's compatible
|
||||
* with the running one. This must be paired with btrfs_exclop_start_unlock and
|
||||
* btrfs_exclop_finish.
|
||||
*
|
||||
* Compatibility:
|
||||
* - the same type is already running
|
||||
* - when trying to add a device and balance has been paused
|
||||
* - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
|
||||
* must check the condition first that would allow none -> @type
|
||||
*/
|
||||
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
|
||||
enum btrfs_exclusive_operation type)
|
||||
{
|
||||
spin_lock(&fs_info->super_lock);
|
||||
if (fs_info->exclusive_operation == type ||
|
||||
(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
|
||||
type == BTRFS_EXCLOP_DEV_ADD))
|
||||
return true;
|
||||
|
||||
spin_unlock(&fs_info->super_lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
|
||||
{
|
||||
spin_unlock(&fs_info->super_lock);
|
||||
}
|
||||
|
||||
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
|
||||
{
|
||||
spin_lock(&fs_info->super_lock);
|
||||
WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
|
||||
spin_unlock(&fs_info->super_lock);
|
||||
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
|
||||
}
|
||||
|
||||
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
|
||||
enum btrfs_exclusive_operation op)
|
||||
{
|
||||
switch (op) {
|
||||
case BTRFS_EXCLOP_BALANCE_PAUSED:
|
||||
spin_lock(&fs_info->super_lock);
|
||||
ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
|
||||
fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD ||
|
||||
fs_info->exclusive_operation == BTRFS_EXCLOP_NONE ||
|
||||
fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
|
||||
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
|
||||
spin_unlock(&fs_info->super_lock);
|
||||
break;
|
||||
case BTRFS_EXCLOP_BALANCE:
|
||||
spin_lock(&fs_info->super_lock);
|
||||
ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
|
||||
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
|
||||
spin_unlock(&fs_info->super_lock);
|
||||
break;
|
||||
default:
|
||||
btrfs_warn(fs_info,
|
||||
"invalid exclop balance operation %d requested", op);
|
||||
}
|
||||
}
|
||||
|
||||
static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
|
||||
{
|
||||
return put_user(inode->i_generation, arg);
|
||||
@ -551,17 +471,6 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __pure btrfs_is_empty_uuid(const u8 *uuid)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BTRFS_UUID_SIZE; i++) {
|
||||
if (uuid[i])
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the number of transaction items to reserve for creating a subvolume
|
||||
* or snapshot, not including the inode, directory entries, or parent directory.
|
||||
@ -4984,15 +4893,14 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
|
||||
* undo this.
|
||||
*/
|
||||
if (!iov) {
|
||||
iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
|
||||
iov = kmemdup(iovstack, sizeof(struct iovec) * args.iovcnt,
|
||||
GFP_NOFS);
|
||||
if (!iov) {
|
||||
unlock_extent(io_tree, start, lockend, &cached_state);
|
||||
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
|
||||
ret = -ENOMEM;
|
||||
goto out_acct;
|
||||
}
|
||||
|
||||
memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
|
||||
}
|
||||
|
||||
count = min_t(u64, iov_iter_count(&iter), disk_io_size);
|
||||
@ -5300,6 +5208,8 @@ long btrfs_ioctl(struct file *file, unsigned int
|
||||
return fsverity_ioctl_enable(file, (const void __user *)argp);
|
||||
case FS_IOC_MEASURE_VERITY:
|
||||
return fsverity_ioctl_measure(file, argp);
|
||||
case FS_IOC_READ_VERITY_METADATA:
|
||||
return fsverity_ioctl_read_metadata(file, argp);
|
||||
case BTRFS_IOC_ENCODED_READ:
|
||||
return btrfs_ioctl_encoded_read(file, argp, false);
|
||||
case BTRFS_IOC_ENCODED_WRITE:
|
||||
|
@ -19,7 +19,6 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
|
||||
struct dentry *dentry, struct fileattr *fa);
|
||||
int btrfs_ioctl_get_supported_features(void __user *arg);
|
||||
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
|
||||
int __pure btrfs_is_empty_uuid(const u8 *uuid);
|
||||
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_ioctl_balance_args *bargs);
|
||||
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
|
||||
|
@ -199,8 +199,13 @@ static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
{
	lockdep_assert_held_write(&eb->lock);
}
static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
{
	lockdep_assert_held_read(&eb->lock);
}
#else
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { }
static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
#endif

void btrfs_unlock_up_safe(struct btrfs_path *path, int level);
@ -163,4 +163,32 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
	return (found_set == start + nbits);
}

/*
 * Count how many bits are set in the bitmap.
 *
 * Similar to bitmap_weight() but accepts a subrange of the bitmap.
 */
static inline unsigned int bitmap_count_set(const unsigned long *addr,
					    unsigned long start,
					    unsigned long nbits)
{
	const unsigned long bitmap_nbits = start + nbits;
	unsigned long cur = start;
	unsigned long total_set = 0;

	while (cur < bitmap_nbits) {
		unsigned long found_zero;
		unsigned long found_set;

		found_zero = find_next_zero_bit(addr, bitmap_nbits, cur);
		total_set += found_zero - cur;

		cur = found_zero;
		if (cur >= bitmap_nbits)
			break;
		found_set = find_next_bit(addr, bitmap_nbits, cur);
		cur = found_set;
	}
	return total_set;
}
#endif
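A tiny usage sketch of the new helper (illustrative only, not part of the patch); bitmap_count_set_example() is a hypothetical self-test-style function built on the standard kernel bitmap API.

static void bitmap_count_set_example(void)
{
	DECLARE_BITMAP(map, 16);

	bitmap_zero(map, 16);
	bitmap_set(map, 2, 4);	/* bits 2, 3, 4 and 5 are now set */

	/* Whole first byte: four of its bits are set. */
	ASSERT(bitmap_count_set(map, 0, 8) == 4);
	/* Subrange of four bits starting at bit 4: only bits 4 and 5 fall inside. */
	ASSERT(bitmap_count_set(map, 4, 4) == 2);
}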
@ -194,6 +194,14 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
|
||||
INIT_LIST_HEAD(&entry->bioc_list);
|
||||
init_completion(&entry->completion);
|
||||
|
||||
#ifdef CONFIG_BTRFS_DEBUG
|
||||
entry->finished_bitmap = bitmap_zalloc(
|
||||
num_bytes >> inode->root->fs_info->sectorsize_bits, GFP_NOFS);
|
||||
if (!entry->finished_bitmap) {
|
||||
kmem_cache_free(btrfs_ordered_extent_cache, entry);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
#endif
|
||||
/*
|
||||
* We don't need the count_max_extents here, we can assume that all of
|
||||
* that work has been done at higher layers, so this is truly the
|
||||
@ -356,13 +364,39 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
|
||||
btrfs_folio_clear_ordered(fs_info, folio, file_offset, len);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BTRFS_DEBUG
|
||||
{
|
||||
unsigned long start_bit;
|
||||
unsigned long nbits;
|
||||
unsigned long nr_set;
|
||||
|
||||
ASSERT(file_offset >= ordered->file_offset);
|
||||
ASSERT(file_offset + len <= ordered->file_offset + ordered->num_bytes);
|
||||
|
||||
start_bit = (file_offset - ordered->file_offset) >> fs_info->sectorsize_bits;
|
||||
nbits = len >> fs_info->sectorsize_bits;
|
||||
|
||||
nr_set = bitmap_count_set(ordered->finished_bitmap, start_bit, nbits);
|
||||
if (WARN_ON(nr_set)) {
|
||||
btrfs_crit(fs_info,
|
||||
"double ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu range offset=%llu range len=%llu already finished len=%lu finish_bitmap=%*pbl",
|
||||
btrfs_root_id(inode->root), btrfs_ino(inode),
|
||||
ordered->file_offset, ordered->num_bytes,
|
||||
file_offset, len, nr_set << fs_info->sectorsize_bits,
|
||||
(int)(ordered->num_bytes >> fs_info->sectorsize_bits),
|
||||
ordered->finished_bitmap);
|
||||
}
|
||||
bitmap_set(ordered->finished_bitmap, start_bit, nbits);
|
||||
len -= (nr_set << fs_info->sectorsize_bits);
|
||||
}
|
||||
#endif
|
||||
/* Now we're fine to update the accounting. */
|
||||
if (WARN_ON_ONCE(len > ordered->bytes_left)) {
|
||||
btrfs_crit(fs_info,
|
||||
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%llu left=%llu",
|
||||
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu range start=%llu range len=%llu left=%llu",
|
||||
btrfs_root_id(inode->root), btrfs_ino(inode),
|
||||
ordered->file_offset, ordered->num_bytes,
|
||||
len, ordered->bytes_left);
|
||||
file_offset, len, ordered->bytes_left);
|
||||
ordered->bytes_left = 0;
|
||||
} else {
|
||||
ordered->bytes_left -= len;
|
||||
@ -379,6 +413,28 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
|
||||
* the finish_func to be executed.
|
||||
*/
|
||||
set_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags);
|
||||
|
||||
#ifdef CONFIG_BTRFS_DEBUG
|
||||
{
|
||||
u64 real_len;
|
||||
|
||||
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
|
||||
real_len = ordered->truncated_len;
|
||||
else
|
||||
real_len = ordered->num_bytes;
|
||||
|
||||
if (WARN_ON(!bitmap_full(ordered->finished_bitmap,
|
||||
real_len >> fs_info->sectorsize_bits))) {
|
||||
btrfs_crit(fs_info,
|
||||
"ordered extent finished bitmap desync, root=%llu ino=%llu OE offset=%llu OE len=%llu bytes_left=%llu bitmap=%*pbl",
|
||||
btrfs_root_id(inode->root), btrfs_ino(inode),
|
||||
ordered->file_offset, ordered->num_bytes,
|
||||
ordered->bytes_left,
|
||||
(int)(real_len >> fs_info->sectorsize_bits),
|
||||
ordered->finished_bitmap);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
cond_wake_up(&ordered->wait);
|
||||
refcount_inc(&ordered->refs);
|
||||
trace_btrfs_ordered_extent_mark_finished(inode, ordered);
|
||||
@ -624,6 +680,9 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
|
||||
list_del(&sum->list);
|
||||
kvfree(sum);
|
||||
}
|
||||
#ifdef CONFIG_BTRFS_DEBUG
|
||||
bitmap_free(entry->finished_bitmap);
|
||||
#endif
|
||||
kmem_cache_free(btrfs_ordered_extent_cache, entry);
|
||||
}
|
||||
}
|
||||
|
@ -154,6 +154,15 @@ struct btrfs_ordered_extent {
	struct list_head work_list;

	struct list_head bioc_list;

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * One bit per block, set when that block has finished.
	 *
	 * Used to catch double accounting of finished blocks with more accuracy.
	 */
	unsigned long *finished_bitmap;
#endif
};

int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent);
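A simplified sketch of the debug check this bitmap enables, condensed from the can_finish_ordered_extent() changes earlier in this series; the helper name is hypothetical and the locking and crit-level reporting are elided.

static void mark_range_finished(struct btrfs_fs_info *fs_info,
				struct btrfs_ordered_extent *ordered,
				u64 file_offset, u64 len)
{
	unsigned long start_bit = (file_offset - ordered->file_offset) >>
				  fs_info->sectorsize_bits;
	unsigned long nbits = len >> fs_info->sectorsize_bits;

	/* Any bit already set means the same blocks were finished twice. */
	WARN_ON(bitmap_count_set(ordered->finished_bitmap, start_bit, nbits));
	bitmap_set(ordered->finished_bitmap, start_bit, nbits);
}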
|
||||
|
@ -1121,6 +1121,7 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
|
||||
fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON;
|
||||
if (simple) {
|
||||
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
|
||||
btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
|
||||
btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid);
|
||||
} else {
|
||||
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
|
||||
@ -1254,8 +1255,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
|
||||
spin_lock(&fs_info->qgroup_lock);
|
||||
fs_info->quota_root = quota_root;
|
||||
set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
|
||||
if (simple)
|
||||
btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA);
|
||||
spin_unlock(&fs_info->qgroup_lock);
|
||||
|
||||
/* Skip rescan for simple qgroups. */
|
||||
@ -1839,9 +1838,19 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
|
||||
* Thus its reserved space should all be zero, no matter if qgroup
|
||||
* is consistent or the mode.
|
||||
*/
|
||||
WARN_ON(qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
|
||||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
|
||||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
|
||||
if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
|
||||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
|
||||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
|
||||
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
|
||||
btrfs_warn_rl(fs_info,
|
||||
"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
|
||||
btrfs_qgroup_level(qgroup->qgroupid),
|
||||
btrfs_qgroup_subvolid(qgroup->qgroupid),
|
||||
qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA],
|
||||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC],
|
||||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]);
|
||||
|
||||
}
|
||||
/*
|
||||
* The same for rfer/excl numbers, but that's only if our qgroup is
|
||||
* consistent and if it's in regular qgroup mode.
|
||||
@ -1850,8 +1859,9 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
|
||||
*/
|
||||
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL &&
|
||||
!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
|
||||
if (WARN_ON(qgroup->rfer || qgroup->excl ||
|
||||
qgroup->rfer_cmpr || qgroup->excl_cmpr)) {
|
||||
if (qgroup->rfer || qgroup->excl ||
|
||||
qgroup->rfer_cmpr || qgroup->excl_cmpr) {
|
||||
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
|
||||
btrfs_warn_rl(fs_info,
|
||||
"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
|
||||
btrfs_qgroup_level(qgroup->qgroupid),
|
||||
|
@ -199,12 +199,8 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
	for (int i = 0; i < num_stripes; i++) {
		u64 devid = bioc->stripes[i].dev->devid;
		u64 physical = bioc->stripes[i].physical;
		u64 length = bioc->stripes[i].length;
		struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];

		if (length == 0)
			length = bioc->size;

		btrfs_set_stack_raid_stride_devid(raid_stride, devid);
		btrfs_set_stack_raid_stride_physical(raid_stride, physical);
	}
@ -342,12 +342,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,
|
||||
if (cur == node)
|
||||
ret = true;
|
||||
|
||||
/* The node is the lowest node */
|
||||
if (cur->lowest) {
|
||||
list_del_init(&cur->lower);
|
||||
cur->lowest = 0;
|
||||
}
|
||||
|
||||
/* Cleanup the lower edges */
|
||||
while (!list_empty(&cur->lower)) {
|
||||
struct btrfs_backref_edge *edge;
|
||||
@ -373,7 +367,6 @@ static bool handle_useless_nodes(struct reloc_control *rc,
|
||||
* cache to avoid unnecessary backref lookup.
|
||||
*/
|
||||
if (cur->level > 0) {
|
||||
list_add(&cur->list, &cache->detached);
|
||||
cur->detached = 1;
|
||||
} else {
|
||||
rb_erase(&cur->rb_node, &cache->rb_root);
|
||||
@ -426,7 +419,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
|
||||
goto out;
|
||||
}
|
||||
|
||||
node->lowest = 1;
|
||||
cur = node;
|
||||
|
||||
/* Breadth-first search to build backref cache */
|
||||
@ -469,92 +461,6 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
|
||||
return node;
|
||||
}
|
||||
|
||||
/*
|
||||
* helper to add backref node for the newly created snapshot.
|
||||
* the backref node is created by cloning backref node that
|
||||
* corresponds to root of source tree
|
||||
*/
|
||||
static int clone_backref_node(struct btrfs_trans_handle *trans,
|
||||
struct reloc_control *rc,
|
||||
const struct btrfs_root *src,
|
||||
struct btrfs_root *dest)
|
||||
{
|
||||
struct btrfs_root *reloc_root = src->reloc_root;
|
||||
struct btrfs_backref_cache *cache = &rc->backref_cache;
|
||||
struct btrfs_backref_node *node = NULL;
|
||||
struct btrfs_backref_node *new_node;
|
||||
struct btrfs_backref_edge *edge;
|
||||
struct btrfs_backref_edge *new_edge;
|
||||
struct rb_node *rb_node;
|
||||
|
||||
rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
|
||||
if (rb_node) {
|
||||
node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
|
||||
if (node->detached)
|
||||
node = NULL;
|
||||
else
|
||||
BUG_ON(node->new_bytenr != reloc_root->node->start);
|
||||
}
|
||||
|
||||
if (!node) {
|
||||
rb_node = rb_simple_search(&cache->rb_root,
|
||||
reloc_root->commit_root->start);
|
||||
if (rb_node) {
|
||||
node = rb_entry(rb_node, struct btrfs_backref_node,
|
||||
rb_node);
|
||||
BUG_ON(node->detached);
|
||||
}
|
||||
}
|
||||
|
||||
if (!node)
|
||||
return 0;
|
||||
|
||||
new_node = btrfs_backref_alloc_node(cache, dest->node->start,
|
||||
node->level);
|
||||
if (!new_node)
|
||||
return -ENOMEM;
|
||||
|
||||
new_node->lowest = node->lowest;
|
||||
new_node->checked = 1;
|
||||
new_node->root = btrfs_grab_root(dest);
|
||||
ASSERT(new_node->root);
|
||||
|
||||
if (!node->lowest) {
|
||||
list_for_each_entry(edge, &node->lower, list[UPPER]) {
|
||||
new_edge = btrfs_backref_alloc_edge(cache);
|
||||
if (!new_edge)
|
||||
goto fail;
|
||||
|
||||
btrfs_backref_link_edge(new_edge, edge->node[LOWER],
|
||||
new_node, LINK_UPPER);
|
||||
}
|
||||
} else {
|
||||
list_add_tail(&new_node->lower, &cache->leaves);
|
||||
}
|
||||
|
||||
rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
|
||||
&new_node->rb_node);
|
||||
if (rb_node)
|
||||
btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
|
||||
|
||||
if (!new_node->lowest) {
|
||||
list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
|
||||
list_add_tail(&new_edge->list[LOWER],
|
||||
&new_edge->node[LOWER]->upper);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
fail:
|
||||
while (!list_empty(&new_node->lower)) {
|
||||
new_edge = list_entry(new_node->lower.next,
|
||||
struct btrfs_backref_edge, list[UPPER]);
|
||||
list_del(&new_edge->list[UPPER]);
|
||||
btrfs_backref_free_edge(cache, new_edge);
|
||||
}
|
||||
btrfs_backref_free_node(cache, new_node);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
/*
|
||||
* helper to add 'address of tree root -> reloc tree' mapping
|
||||
*/
|
||||
@ -2058,100 +1964,72 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
|
||||
int index = 0;
|
||||
int ret;
|
||||
|
||||
next = node;
|
||||
while (1) {
|
||||
cond_resched();
|
||||
next = walk_up_backref(next, edges, &index);
|
||||
root = next->root;
|
||||
next = walk_up_backref(node, edges, &index);
|
||||
root = next->root;
|
||||
|
||||
/*
|
||||
* If there is no root, then our references for this block are
|
||||
* incomplete, as we should be able to walk all the way up to a
|
||||
* block that is owned by a root.
|
||||
*
|
||||
* This path is only for SHAREABLE roots, so if we come upon a
|
||||
* non-SHAREABLE root then we have backrefs that resolve
|
||||
* improperly.
|
||||
*
|
||||
* Both of these cases indicate file system corruption, or a bug
|
||||
* in the backref walking code.
|
||||
*/
|
||||
if (!root) {
|
||||
ASSERT(0);
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu doesn't have a backref path ending in a root",
|
||||
node->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
|
||||
ASSERT(0);
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu has multiple refs with one ending in a non-shareable root",
|
||||
node->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
/*
|
||||
* If there is no root, then our references for this block are
|
||||
* incomplete, as we should be able to walk all the way up to a block
|
||||
* that is owned by a root.
|
||||
*
|
||||
* This path is only for SHAREABLE roots, so if we come upon a
|
||||
* non-SHAREABLE root then we have backrefs that resolve improperly.
|
||||
*
|
||||
* Both of these cases indicate file system corruption, or a bug in the
|
||||
* backref walking code.
|
||||
*/
|
||||
if (unlikely(!root)) {
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu doesn't have a backref path ending in a root",
|
||||
node->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
if (unlikely(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))) {
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu has multiple refs with one ending in a non-shareable root",
|
||||
node->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
|
||||
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
|
||||
ret = record_reloc_root_in_trans(trans, root);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
break;
|
||||
}
|
||||
|
||||
ret = btrfs_record_root_in_trans(trans, root);
|
||||
if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) {
|
||||
ret = record_reloc_root_in_trans(trans, root);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
root = root->reloc_root;
|
||||
|
||||
/*
|
||||
* We could have raced with another thread which failed, so
|
||||
* root->reloc_root may not be set, return ENOENT in this case.
|
||||
*/
|
||||
if (!root)
|
||||
return ERR_PTR(-ENOENT);
|
||||
|
||||
if (next->new_bytenr != root->node->start) {
|
||||
/*
|
||||
* We just created the reloc root, so we shouldn't have
|
||||
* ->new_bytenr set and this shouldn't be in the changed
|
||||
* list. If it is then we have multiple roots pointing
|
||||
* at the same bytenr which indicates corruption, or
|
||||
* we've made a mistake in the backref walking code.
|
||||
*/
|
||||
ASSERT(next->new_bytenr == 0);
|
||||
ASSERT(list_empty(&next->list));
|
||||
if (next->new_bytenr || !list_empty(&next->list)) {
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
|
||||
node->bytenr, next->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
|
||||
next->new_bytenr = root->node->start;
|
||||
btrfs_put_root(next->root);
|
||||
next->root = btrfs_grab_root(root);
|
||||
ASSERT(next->root);
|
||||
list_add_tail(&next->list,
|
||||
&rc->backref_cache.changed);
|
||||
mark_block_processed(rc, next);
|
||||
break;
|
||||
}
|
||||
|
||||
WARN_ON(1);
|
||||
root = NULL;
|
||||
next = walk_down_backref(edges, &index);
|
||||
if (!next || next->level <= node->level)
|
||||
break;
|
||||
goto found;
|
||||
}
|
||||
if (!root) {
|
||||
/*
|
||||
* This can happen if there's fs corruption or if there's a bug
|
||||
* in the backref lookup code.
|
||||
*/
|
||||
ASSERT(0);
|
||||
|
||||
ret = btrfs_record_root_in_trans(trans, root);
|
||||
if (ret)
|
||||
return ERR_PTR(ret);
|
||||
root = root->reloc_root;
|
||||
|
||||
/*
|
||||
* We could have raced with another thread which failed, so
|
||||
* root->reloc_root may not be set, return ENOENT in this case.
|
||||
*/
|
||||
if (!root)
|
||||
return ERR_PTR(-ENOENT);
|
||||
|
||||
if (next->new_bytenr) {
|
||||
/*
|
||||
* We just created the reloc root, so we shouldn't have
|
||||
* ->new_bytenr set yet. If it is then we have multiple roots
|
||||
* pointing at the same bytenr which indicates corruption, or
|
||||
* we've made a mistake in the backref walking code.
|
||||
*/
|
||||
ASSERT(next->new_bytenr == 0);
|
||||
btrfs_err(trans->fs_info,
|
||||
"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
|
||||
node->bytenr, next->bytenr);
|
||||
return ERR_PTR(-EUCLEAN);
|
||||
}
|
||||
|
||||
next->new_bytenr = root->node->start;
|
||||
btrfs_put_root(next->root);
|
||||
next->root = btrfs_grab_root(root);
|
||||
ASSERT(next->root);
|
||||
mark_block_processed(rc, next);
|
||||
found:
|
||||
next = node;
|
||||
/* setup backref node path for btrfs_reloc_cow_block */
|
||||
while (1) {
|
||||
@ -2247,17 +2125,11 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
|
||||
return num_bytes;
|
||||
}
|
||||
|
||||
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
|
||||
struct reloc_control *rc,
|
||||
struct btrfs_backref_node *node)
|
||||
static int refill_metadata_space(struct btrfs_trans_handle *trans,
|
||||
struct reloc_control *rc, u64 num_bytes)
|
||||
{
|
||||
struct btrfs_root *root = rc->extent_root;
|
||||
struct btrfs_fs_info *fs_info = root->fs_info;
|
||||
u64 num_bytes;
|
||||
struct btrfs_fs_info *fs_info = trans->fs_info;
|
||||
int ret;
|
||||
u64 tmp;
|
||||
|
||||
num_bytes = calcu_metadata_size(rc, node) * 2;
|
||||
|
||||
trans->block_rsv = rc->block_rsv;
|
||||
rc->reserved_bytes += num_bytes;
|
||||
@ -2270,7 +2142,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
|
||||
ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
|
||||
BTRFS_RESERVE_FLUSH_LIMIT);
|
||||
if (ret) {
|
||||
tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
|
||||
u64 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
|
||||
|
||||
while (tmp <= rc->reserved_bytes)
|
||||
tmp <<= 1;
|
||||
/*
|
||||
@ -2288,6 +2161,16 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int reserve_metadata_space(struct btrfs_trans_handle *trans,
|
||||
struct reloc_control *rc,
|
||||
struct btrfs_backref_node *node)
|
||||
{
|
||||
u64 num_bytes;
|
||||
|
||||
num_bytes = calcu_metadata_size(rc, node) * 2;
|
||||
return refill_metadata_space(trans, rc, num_bytes);
|
||||
}
|
||||
|
||||
/*
|
||||
* relocate a block tree, and then update pointers in upper level
|
||||
* blocks that reference the block to point to the new location.
|
||||
@ -2442,7 +2325,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
|
||||
|
||||
if (!ret && node->pending) {
|
||||
btrfs_backref_drop_node_buffer(node);
|
||||
list_move_tail(&node->list, &rc->backref_cache.changed);
|
||||
list_del_init(&node->list);
|
||||
node->pending = 0;
|
||||
}
|
||||
|
||||
@ -2605,8 +2488,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
|
||||
/*
|
||||
* This block was the root block of a root, and this is
|
||||
* the first time we're processing the block and thus it
|
||||
* should not have had the ->new_bytenr modified and
|
||||
* should have not been included on the changed list.
|
||||
* should not have had the ->new_bytenr modified.
|
||||
*
|
||||
* However in the case of corruption we could have
|
||||
* multiple refs pointing to the same block improperly,
|
||||
@ -2616,8 +2498,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
|
||||
* normal user in the case of corruption.
|
||||
*/
|
||||
ASSERT(node->new_bytenr == 0);
|
||||
ASSERT(list_empty(&node->list));
|
||||
if (node->new_bytenr || !list_empty(&node->list)) {
|
||||
if (node->new_bytenr) {
|
||||
btrfs_err(root->fs_info,
|
||||
"bytenr %llu has improper references to it",
|
||||
node->bytenr);
|
||||
@ -2640,17 +2521,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
|
||||
btrfs_put_root(node->root);
|
||||
node->root = btrfs_grab_root(root);
|
||||
ASSERT(node->root);
|
||||
list_add_tail(&node->list, &rc->backref_cache.changed);
|
||||
} else {
|
||||
path->lowest_level = node->level;
|
||||
if (root == root->fs_info->chunk_root)
|
||||
btrfs_reserve_chunk_metadata(trans, false);
|
||||
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
|
||||
btrfs_release_path(path);
|
||||
if (root == root->fs_info->chunk_root)
|
||||
btrfs_trans_release_chunk_metadata(trans);
|
||||
if (ret > 0)
|
||||
ret = 0;
|
||||
btrfs_err(root->fs_info,
|
||||
"bytenr %llu resolved to a non-shareable root",
|
||||
node->bytenr);
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
}
|
||||
if (!ret)
|
||||
update_processed_blocks(rc, node);
|
||||
@ -2658,11 +2534,50 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
|
||||
ret = do_relocation(trans, rc, node, key, path, 1);
|
||||
}
|
||||
out:
|
||||
if (ret || node->level == 0 || node->cowonly)
|
||||
if (ret || node->level == 0)
|
||||
btrfs_backref_cleanup_node(&rc->backref_cache, node);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int relocate_cowonly_block(struct btrfs_trans_handle *trans,
|
||||
struct reloc_control *rc, struct tree_block *block,
|
||||
struct btrfs_path *path)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = trans->fs_info;
|
||||
struct btrfs_root *root;
|
||||
u64 num_bytes;
|
||||
int nr_levels;
|
||||
int ret;
|
||||
|
||||
root = btrfs_get_fs_root(fs_info, block->owner, true);
|
||||
if (IS_ERR(root))
|
||||
return PTR_ERR(root);
|
||||
|
||||
nr_levels = max(btrfs_header_level(root->node) - block->level, 0) + 1;
|
||||
|
||||
num_bytes = fs_info->nodesize * nr_levels;
|
||||
ret = refill_metadata_space(trans, rc, num_bytes);
|
||||
if (ret) {
|
||||
btrfs_put_root(root);
|
||||
return ret;
|
||||
}
|
||||
path->lowest_level = block->level;
|
||||
if (root == root->fs_info->chunk_root)
|
||||
btrfs_reserve_chunk_metadata(trans, false);
|
||||
|
||||
ret = btrfs_search_slot(trans, root, &block->key, path, 0, 1);
|
||||
path->lowest_level = 0;
|
||||
btrfs_release_path(path);
|
||||
|
||||
if (root == root->fs_info->chunk_root)
|
||||
btrfs_trans_release_chunk_metadata(trans);
|
||||
if (ret > 0)
|
||||
ret = 0;
|
||||
btrfs_put_root(root);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
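As a rough illustration of the reservation sizing in relocate_cowonly_block() above: the worst case COWs every level from the target block up to the root, so the byte count is nodesize times (root level minus block level, clamped at zero, plus one). A minimal standalone sketch with hypothetical values, not part of this patch:

#include <stdint.h>
#include <stdio.h>

/* Mirror of the sizing logic: levels = max(root_level - block_level, 0) + 1. */
static uint64_t cowonly_reserve_bytes(int root_level, int block_level,
				      uint64_t nodesize)
{
	int nr_levels = root_level - block_level;

	if (nr_levels < 0)
		nr_levels = 0;
	nr_levels += 1;
	return nodesize * (uint64_t)nr_levels;
}

int main(void)
{
	/* Example: a level-1 block under a level-3 root with 16K nodes -> 49152. */
	printf("%llu\n",
	       (unsigned long long)cowonly_reserve_bytes(3, 1, 16384));
	return 0;
}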
/*
|
||||
* relocate a list of blocks
|
||||
*/
|
||||
@ -2702,6 +2617,20 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
|
||||
|
||||
/* Do tree relocation */
|
||||
rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
|
||||
/*
|
||||
* For COWonly blocks, or the data reloc tree, we only need to
|
||||
* COW down to the block, there's no need to generate a backref
|
||||
* tree.
|
||||
*/
|
||||
if (block->owner &&
|
||||
(!is_fstree(block->owner) ||
|
||||
block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
|
||||
ret = relocate_cowonly_block(trans, rc, block, path);
|
||||
if (ret)
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
|
||||
node = build_backref_tree(trans, rc, &block->key,
|
||||
block->level, block->bytenr);
|
||||
if (IS_ERR(node)) {
|
||||
@ -2902,6 +2831,7 @@ static int relocate_one_folio(struct reloc_control *rc,
|
||||
const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags);
|
||||
|
||||
ASSERT(index <= last_index);
|
||||
again:
|
||||
folio = filemap_lock_folio(inode->i_mapping, index);
|
||||
if (IS_ERR(folio)) {
|
||||
|
||||
@ -2937,6 +2867,11 @@ static int relocate_one_folio(struct reloc_control *rc,
|
||||
ret = -EIO;
|
||||
goto release_folio;
|
||||
}
|
||||
if (folio->mapping != inode->i_mapping) {
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -4399,8 +4334,18 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
|
||||
WARN_ON(!first_cow && level == 0);
|
||||
|
||||
node = rc->backref_cache.path[level];
|
||||
BUG_ON(node->bytenr != buf->start &&
|
||||
node->new_bytenr != buf->start);
|
||||
|
||||
/*
|
||||
* If node->bytenr != buf->start and node->new_bytenr !=
|
||||
* buf->start then we've got the wrong backref node for what we
|
||||
* expected to see here and the cache is incorrect.
|
||||
*/
|
||||
if (unlikely(node->bytenr != buf->start && node->new_bytenr != buf->start)) {
|
||||
btrfs_err(fs_info,
|
||||
"bytenr %llu was found but our backref cache was expecting %llu or %llu",
|
||||
buf->start, node->bytenr, node->new_bytenr);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
|
||||
btrfs_backref_drop_node_buffer(node);
|
||||
atomic_inc(&cow->refs);
|
||||
@ -4500,10 +4445,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
|
||||
return ret;
|
||||
}
|
||||
new_root->reloc_root = btrfs_grab_root(reloc_root);
|
||||
|
||||
if (rc->create_reloc_tree)
|
||||
ret = clone_backref_node(trans, rc, root, reloc_root);
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
|
fs/btrfs/scrub.c (142 changed lines)
@ -226,6 +226,7 @@ struct scrub_warning {
|
||||
u64 physical;
|
||||
u64 logical;
|
||||
struct btrfs_device *dev;
|
||||
bool message_printed;
|
||||
};
|
||||
|
||||
static void release_scrub_stripe(struct scrub_stripe *stripe)
|
||||
@ -388,17 +389,13 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
|
||||
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
|
||||
u64 root, void *warn_ctx)
|
||||
{
|
||||
u32 nlink;
|
||||
int ret;
|
||||
int i;
|
||||
unsigned nofs_flag;
|
||||
struct extent_buffer *eb;
|
||||
struct btrfs_inode_item *inode_item;
|
||||
struct scrub_warning *swarn = warn_ctx;
|
||||
struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
|
||||
struct inode_fs_paths *ipath = NULL;
|
||||
struct btrfs_root *local_root;
|
||||
struct btrfs_key key;
|
||||
|
||||
local_root = btrfs_get_fs_root(fs_info, root, true);
|
||||
if (IS_ERR(local_root)) {
|
||||
@ -406,26 +403,6 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
|
||||
goto err;
|
||||
}
|
||||
|
||||
/*
|
||||
* this makes the path point to (inum INODE_ITEM ioff)
|
||||
*/
|
||||
key.objectid = inum;
|
||||
key.type = BTRFS_INODE_ITEM_KEY;
|
||||
key.offset = 0;
|
||||
|
||||
ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
|
||||
if (ret) {
|
||||
btrfs_put_root(local_root);
|
||||
btrfs_release_path(swarn->path);
|
||||
goto err;
|
||||
}
|
||||
|
||||
eb = swarn->path->nodes[0];
|
||||
inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
|
||||
struct btrfs_inode_item);
|
||||
nlink = btrfs_inode_nlink(eb, inode_item);
|
||||
btrfs_release_path(swarn->path);
|
||||
|
||||
/*
|
||||
* init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
|
||||
* uses GFP_NOFS in this context, so we keep it consistent but it does
|
||||
@ -449,34 +426,35 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
|
||||
* we deliberately ignore the bit ipath might have been too small to
|
||||
* hold all of the paths here
|
||||
*/
|
||||
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
|
||||
btrfs_warn_in_rcu(fs_info,
|
||||
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
|
||||
for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
|
||||
btrfs_warn_rl_in_rcu(fs_info,
|
||||
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, path: %s",
|
||||
swarn->errstr, swarn->logical,
|
||||
btrfs_dev_name(swarn->dev),
|
||||
swarn->physical,
|
||||
root, inum, offset,
|
||||
fs_info->sectorsize, nlink,
|
||||
(char *)(unsigned long)ipath->fspath->val[i]);
|
||||
swarn->message_printed = true;
|
||||
}
|
||||
|
||||
btrfs_put_root(local_root);
|
||||
free_ipath(ipath);
|
||||
return 0;
|
||||
|
||||
err:
|
||||
btrfs_warn_in_rcu(fs_info,
|
||||
btrfs_warn_rl_in_rcu(fs_info,
|
||||
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
|
||||
swarn->errstr, swarn->logical,
|
||||
btrfs_dev_name(swarn->dev),
|
||||
swarn->physical,
|
||||
root, inum, offset, ret);
|
||||
|
||||
swarn->message_printed = true;
|
||||
free_ipath(ipath);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev,
|
||||
bool is_super, u64 logical, u64 physical)
|
||||
u64 logical, u64 physical)
|
||||
{
|
||||
struct btrfs_fs_info *fs_info = dev->fs_info;
|
||||
struct btrfs_path *path;
|
||||
@ -488,12 +466,6 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
|
||||
u32 item_size;
|
||||
int ret;
|
||||
|
||||
/* Super block error, no need to search extent tree. */
|
||||
if (is_super) {
|
||||
btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
|
||||
errstr, btrfs_dev_name(dev), physical);
|
||||
return;
|
||||
}
|
||||
path = btrfs_alloc_path();
|
||||
if (!path)
|
||||
return;
|
||||
@ -502,6 +474,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
|
||||
swarn.logical = logical;
|
||||
swarn.errstr = errstr;
|
||||
swarn.dev = NULL;
|
||||
swarn.message_printed = false;
|
||||
|
||||
ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
|
||||
&flags);
|
||||
@ -523,20 +496,22 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
|
||||
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
|
||||
item_size, &ref_root,
|
||||
&ref_level);
|
||||
if (ret < 0) {
|
||||
btrfs_warn(fs_info,
|
||||
"failed to resolve tree backref for logical %llu: %d",
|
||||
swarn.logical, ret);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
if (ret > 0)
|
||||
break;
|
||||
btrfs_warn_in_rcu(fs_info,
|
||||
btrfs_warn_rl_in_rcu(fs_info,
|
||||
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
|
||||
errstr, swarn.logical, btrfs_dev_name(dev),
|
||||
swarn.physical, (ref_level ? "node" : "leaf"),
|
||||
ref_level, ref_root);
|
||||
swarn.message_printed = true;
|
||||
}
|
||||
if (!swarn.message_printed)
|
||||
btrfs_warn_rl_in_rcu(fs_info,
|
||||
"%s at metadata, logical %llu on dev %s physical %llu",
|
||||
errstr, swarn.logical,
|
||||
btrfs_dev_name(dev), swarn.physical);
|
||||
btrfs_release_path(path);
|
||||
} else {
|
||||
struct btrfs_backref_walk_ctx ctx = { 0 };
|
||||
@ -551,6 +526,11 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
|
||||
swarn.dev = dev;
|
||||
|
||||
iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
|
||||
if (!swarn.message_printed)
|
||||
btrfs_warn_rl_in_rcu(fs_info,
|
||||
"%s at data, filename unresolved, logical %llu on dev %s physical %llu",
|
||||
errstr, swarn.logical,
|
||||
btrfs_dev_name(dev), swarn.physical);
|
||||
}
|
||||
|
||||
out:
|
||||
@ -866,11 +846,9 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
|
||||
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
|
||||
struct scrub_stripe *stripe)
|
||||
{
|
||||
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
struct btrfs_fs_info *fs_info = sctx->fs_info;
|
||||
struct btrfs_device *dev = NULL;
|
||||
u64 physical = 0;
|
||||
struct btrfs_device *dev = stripe->dev;
|
||||
u64 stripe_physical = stripe->physical;
|
||||
int nr_data_sectors = 0;
|
||||
int nr_meta_sectors = 0;
|
||||
int nr_nodatacsum_sectors = 0;
|
||||
@ -880,36 +858,12 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
|
||||
if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Init needed infos for error reporting.
|
||||
*
|
||||
* Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
|
||||
* thus no need for dev/physical, error reporting still needs dev and physical.
|
||||
*/
|
||||
if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
|
||||
u64 mapped_len = fs_info->sectorsize;
|
||||
struct btrfs_io_context *bioc = NULL;
|
||||
int stripe_index = stripe->mirror_num - 1;
|
||||
int ret;
|
||||
|
||||
/* For scrub, our mirror_num should always start at 1. */
|
||||
ASSERT(stripe->mirror_num >= 1);
|
||||
ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
|
||||
stripe->logical, &mapped_len, &bioc,
|
||||
NULL, NULL);
|
||||
/*
|
||||
* If we failed, dev will be NULL, and later detailed reports
|
||||
* will just be skipped.
|
||||
*/
|
||||
if (ret < 0)
|
||||
goto skip;
|
||||
physical = bioc->stripes[stripe_index].physical;
|
||||
dev = bioc->stripes[stripe_index].dev;
|
||||
btrfs_put_bioc(bioc);
|
||||
}
|
||||
|
||||
skip:
|
||||
ASSERT(dev);
|
||||
for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
|
||||
const u64 logical = stripe->logical +
|
||||
(sector_nr << fs_info->sectorsize_bits);
|
||||
const u64 physical = stripe_physical +
|
||||
(sector_nr << fs_info->sectorsize_bits);
|
||||
bool repaired = false;
|
||||
|
||||
if (stripe->sectors[sector_nr].is_metadata) {
|
||||
@ -935,43 +889,23 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
|
||||
* output the message of repaired message.
|
||||
*/
|
||||
if (repaired) {
|
||||
if (dev) {
|
||||
btrfs_err_rl_in_rcu(fs_info,
|
||||
btrfs_err_rl_in_rcu(fs_info,
|
||||
"fixed up error at logical %llu on dev %s physical %llu",
|
||||
stripe->logical, btrfs_dev_name(dev),
|
||||
logical, btrfs_dev_name(dev),
|
||||
physical);
|
||||
} else {
|
||||
btrfs_err_rl_in_rcu(fs_info,
|
||||
"fixed up error at logical %llu on mirror %u",
|
||||
stripe->logical, stripe->mirror_num);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/* The remaining are all for unrepaired. */
|
||||
if (dev) {
|
||||
btrfs_err_rl_in_rcu(fs_info,
|
||||
"unable to fixup (regular) error at logical %llu on dev %s physical %llu",
|
||||
stripe->logical, btrfs_dev_name(dev),
|
||||
physical);
|
||||
} else {
|
||||
btrfs_err_rl_in_rcu(fs_info,
|
||||
"unable to fixup (regular) error at logical %llu on mirror %u",
|
||||
stripe->logical, stripe->mirror_num);
|
||||
}
|
||||
|
||||
if (test_bit(sector_nr, &stripe->io_error_bitmap))
|
||||
if (__ratelimit(&rs) && dev)
|
||||
scrub_print_common_warning("i/o error", dev, false,
|
||||
stripe->logical, physical);
|
||||
scrub_print_common_warning("i/o error", dev,
|
||||
logical, physical);
|
||||
if (test_bit(sector_nr, &stripe->csum_error_bitmap))
|
||||
if (__ratelimit(&rs) && dev)
|
||||
scrub_print_common_warning("checksum error", dev, false,
|
||||
stripe->logical, physical);
|
||||
scrub_print_common_warning("checksum error", dev,
|
||||
logical, physical);
|
||||
if (test_bit(sector_nr, &stripe->meta_error_bitmap))
|
||||
if (__ratelimit(&rs) && dev)
|
||||
scrub_print_common_warning("header error", dev, false,
|
||||
stripe->logical, physical);
|
||||
scrub_print_common_warning("header error", dev,
|
||||
logical, physical);
|
||||
}
|
||||
|
||||
spin_lock(&sctx->stat_lock);
|
||||
|
@ -5280,6 +5280,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
|
||||
unsigned cur_len = min_t(unsigned, len,
|
||||
PAGE_SIZE - pg_offset);
|
||||
|
||||
again:
|
||||
folio = filemap_lock_folio(mapping, index);
|
||||
if (IS_ERR(folio)) {
|
||||
page_cache_sync_readahead(mapping,
|
||||
@ -5312,6 +5313,11 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
if (folio->mapping != mapping) {
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
|
||||
memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
|
||||
@ -7253,7 +7259,7 @@ static int changed_cb(struct btrfs_path *left_path,
|
||||
enum btrfs_compare_tree_result result,
|
||||
struct send_ctx *sctx)
|
||||
{
|
||||
int ret = 0;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* We can not hold the commit root semaphore here. This is because in
|
||||
@ -7313,7 +7319,6 @@ static int changed_cb(struct btrfs_path *left_path,
|
||||
return 0;
|
||||
}
|
||||
result = BTRFS_COMPARE_TREE_CHANGED;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
sctx->left_path = left_path;
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include "fs.h"
|
||||
#include "accessors.h"
|
||||
#include "extent-tree.h"
|
||||
#include "zoned.h"
|
||||
|
||||
/*
|
||||
* HOW DOES SPACE RESERVATION WORK
|
||||
@ -127,6 +128,14 @@
|
||||
* churn a lot and we can avoid making some extent tree modifications if we
|
||||
* are able to delay for as long as possible.
|
||||
*
|
||||
* RESET_ZONES
|
||||
* This state works only for the zoned mode. On the zoned mode, we cannot
|
||||
* reuse once allocated then freed region until we reset the zone, due to
|
||||
* the sequential write zone requirement. The RESET_ZONES state resets the
|
||||
* zones of an unused block group and let us reuse the space. The reusing
|
||||
* is faster than removing the block group and allocating another block
|
||||
* group on the zones.
|
||||
*
|
||||
* ALLOC_CHUNK
|
||||
* We will skip this the first time through space reservation, because of
|
||||
* overcommit and we don't want to have a lot of useless metadata space when
|
||||
@ -316,7 +325,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
|
||||
found->bytes_used += block_group->used;
|
||||
found->disk_used += block_group->used * factor;
|
||||
found->bytes_readonly += block_group->bytes_super;
|
||||
btrfs_space_info_update_bytes_zone_unusable(info, found, block_group->zone_unusable);
|
||||
btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable);
|
||||
if (block_group->length > 0)
|
||||
found->full = 0;
|
||||
btrfs_try_granting_tickets(info, found);
|
||||
@ -489,9 +498,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
|
||||
if ((used + ticket->bytes <= space_info->total_bytes) ||
|
||||
btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
|
||||
flush)) {
|
||||
btrfs_space_info_update_bytes_may_use(fs_info,
|
||||
space_info,
|
||||
ticket->bytes);
|
||||
btrfs_space_info_update_bytes_may_use(space_info, ticket->bytes);
|
||||
remove_ticket(space_info, ticket);
|
||||
ticket->bytes = 0;
|
||||
space_info->tickets_id++;
|
||||
@ -834,6 +841,9 @@ static void flush_space(struct btrfs_fs_info *fs_info,
|
||||
*/
|
||||
ret = btrfs_commit_current_transaction(root);
|
||||
break;
|
||||
case RESET_ZONES:
|
||||
ret = btrfs_reset_unused_block_groups(space_info, num_bytes);
|
||||
break;
|
||||
default:
|
||||
ret = -ENOSPC;
|
||||
break;
|
||||
@ -1086,9 +1096,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
|
||||
enum btrfs_flush_state flush_state;
|
||||
int commit_cycles = 0;
|
||||
u64 last_tickets_id;
|
||||
enum btrfs_flush_state final_state;
|
||||
|
||||
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
|
||||
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
|
||||
if (btrfs_is_zoned(fs_info))
|
||||
final_state = RESET_ZONES;
|
||||
else
|
||||
final_state = COMMIT_TRANS;
|
||||
|
||||
spin_lock(&space_info->lock);
|
||||
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
|
||||
@ -1141,7 +1156,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
|
||||
if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
|
||||
flush_state++;
|
||||
|
||||
if (flush_state > COMMIT_TRANS) {
|
||||
if (flush_state > final_state) {
|
||||
commit_cycles++;
|
||||
if (commit_cycles > 2) {
|
||||
if (maybe_fail_all_tickets(fs_info, space_info)) {
|
||||
@ -1155,7 +1170,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
|
||||
}
|
||||
}
|
||||
spin_unlock(&space_info->lock);
|
||||
} while (flush_state <= COMMIT_TRANS);
|
||||
} while (flush_state <= final_state);
|
||||
}
|
||||
|
||||
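For illustration, a hedged standalone model of the reclaim loop above: the worker walks the flush states in order and the terminal state depends on whether the filesystem is zoned (RESET_ZONES) or not (COMMIT_TRANS). State handling is reduced to a print and the enum values are simplified stand-ins:

#include <stdbool.h>
#include <stdio.h>

/* Reduced model of the metadata flush states, in reclaim order. */
enum flush_state {
	FLUSH_DELAYED_ITEMS_NR = 1,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELALLOC,
	COMMIT_TRANS,
	RESET_ZONES,
};

static void run_reclaim(bool zoned)
{
	enum flush_state final_state = zoned ? RESET_ZONES : COMMIT_TRANS;
	int state = FLUSH_DELAYED_ITEMS_NR;

	/* Walk every state up to and including the terminal one. */
	do {
		printf("flush state %d\n", state);
		state++;
	} while (state <= (int)final_state);
}

int main(void)
{
	run_reclaim(false);	/* regular: stops after COMMIT_TRANS */
	run_reclaim(true);	/* zoned: also runs RESET_ZONES */
	return 0;
}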
/*
|
||||
@ -1286,6 +1301,10 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
|
||||
* This is where we reclaim all of the pinned space generated by running the
|
||||
* iputs
|
||||
*
|
||||
* RESET_ZONES
|
||||
* This state works only for the zoned mode. We scan the unused block group
|
||||
* list and reset the zones and reuse the block group.
|
||||
*
|
||||
* ALLOC_CHUNK_FORCE
|
||||
* For data we start with alloc chunk force, however we could have been full
|
||||
* before, and then the transaction commit could have freed new block groups,
|
||||
@ -1295,6 +1314,7 @@ static const enum btrfs_flush_state data_flush_states[] = {
|
||||
FLUSH_DELALLOC_FULL,
|
||||
RUN_DELAYED_IPUTS,
|
||||
COMMIT_TRANS,
|
||||
RESET_ZONES,
|
||||
ALLOC_CHUNK_FORCE,
|
||||
};
|
||||
|
||||
@ -1386,6 +1406,7 @@ void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
|
||||
static const enum btrfs_flush_state priority_flush_states[] = {
|
||||
FLUSH_DELAYED_ITEMS_NR,
|
||||
FLUSH_DELAYED_ITEMS,
|
||||
RESET_ZONES,
|
||||
ALLOC_CHUNK,
|
||||
};
|
||||
|
||||
@ -1399,6 +1420,7 @@ static const enum btrfs_flush_state evict_flush_states[] = {
|
||||
FLUSH_DELALLOC_FULL,
|
||||
ALLOC_CHUNK,
|
||||
COMMIT_TRANS,
|
||||
RESET_ZONES,
|
||||
};
|
||||
|
||||
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
|
||||
@ -1690,8 +1712,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
|
||||
if (!pending_tickets &&
|
||||
((used + orig_bytes <= space_info->total_bytes) ||
|
||||
btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
|
||||
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
|
||||
orig_bytes);
|
||||
btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
@ -1703,8 +1724,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
|
||||
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
|
||||
used = btrfs_space_info_used(space_info, false);
|
||||
if (used + orig_bytes <= space_info->total_bytes) {
|
||||
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
|
||||
orig_bytes);
|
||||
btrfs_space_info_update_bytes_may_use(space_info, orig_bytes);
|
||||
ret = 0;
|
||||
}
|
||||
}
|
||||
@ -2082,3 +2102,32 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
|
||||
do_reclaim_sweep(space_info, raid);
|
||||
}
|
||||
}
|
||||
|
||||
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len)
{
	struct btrfs_fs_info *fs_info = space_info->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;

	lockdep_assert_held(&space_info->lock);

	/* Prioritize the global reservation to receive the freed space. */
	if (global_rsv->space_info != space_info)
		goto grant;

	spin_lock(&global_rsv->lock);
	if (!global_rsv->full) {
		u64 to_add = min(len, global_rsv->size - global_rsv->reserved);

		global_rsv->reserved += to_add;
		btrfs_space_info_update_bytes_may_use(space_info, to_add);
		if (global_rsv->reserved >= global_rsv->size)
			global_rsv->full = 1;
		len -= to_add;
	}
	spin_unlock(&global_rsv->lock);

grant:
	/* Add to any tickets we may have. */
	if (len)
		btrfs_try_granting_tickets(fs_info, space_info);
}
|
||||
|
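A hedged standalone model of the policy in btrfs_return_free_space() above: freed bytes first top up the global reserve when it charges the same space_info, and only the remainder goes to waiting tickets. The struct and the granting step are simplified placeholders:

#include <stdint.h>
#include <stdio.h>

struct rsv {
	uint64_t size;
	uint64_t reserved;
};

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

/* Returns the bytes left over for ticket granting after refilling the rsv. */
static uint64_t return_free_space(struct rsv *global, uint64_t len)
{
	if (global->reserved < global->size) {
		uint64_t to_add = min_u64(len, global->size - global->reserved);

		global->reserved += to_add;
		len -= to_add;
	}
	return len;
}

int main(void)
{
	struct rsv global = { .size = 1024, .reserved = 1000 };
	uint64_t leftover = return_free_space(&global, 100);

	/* 24 bytes refill the reserve, 76 remain for tickets. */
	printf("leftover=%llu reserved=%llu\n",
	       (unsigned long long)leftover,
	       (unsigned long long)global.reserved);
	return 0;
}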
@ -79,6 +79,10 @@ enum btrfs_reserve_flush_enum {
|
||||
BTRFS_RESERVE_FLUSH_EMERGENCY,
|
||||
};
|
||||
|
||||
/*
|
||||
* Please be aware that the order of enum values will be the order of the reclaim
|
||||
* process in btrfs_async_reclaim_metadata_space().
|
||||
*/
|
||||
enum btrfs_flush_state {
|
||||
FLUSH_DELAYED_ITEMS_NR = 1,
|
||||
FLUSH_DELAYED_ITEMS = 2,
|
||||
@ -91,6 +95,7 @@ enum btrfs_flush_state {
|
||||
ALLOC_CHUNK_FORCE = 9,
|
||||
RUN_DELAYED_IPUTS = 10,
|
||||
COMMIT_TRANS = 11,
|
||||
RESET_ZONES = 12,
|
||||
};
|
||||
|
||||
struct btrfs_space_info {
|
||||
@ -229,10 +234,10 @@ static inline bool btrfs_mixed_space_info(const struct btrfs_space_info *space_i
|
||||
*/
|
||||
#define DECLARE_SPACE_INFO_UPDATE(name, trace_name) \
|
||||
static inline void \
|
||||
btrfs_space_info_update_##name(struct btrfs_fs_info *fs_info, \
|
||||
struct btrfs_space_info *sinfo, \
|
||||
btrfs_space_info_update_##name(struct btrfs_space_info *sinfo, \
|
||||
s64 bytes) \
|
||||
{ \
|
||||
struct btrfs_fs_info *fs_info = sinfo->fs_info; \
|
||||
const u64 abs_bytes = (bytes < 0) ? -bytes : bytes; \
|
||||
lockdep_assert_held(&sinfo->lock); \
|
||||
trace_update_##name(fs_info, sinfo, sinfo->name, bytes); \
|
||||
@ -275,13 +280,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
|
||||
enum btrfs_reserve_flush_enum flush);
|
||||
|
||||
static inline void btrfs_space_info_free_bytes_may_use(
|
||||
struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_space_info *space_info,
|
||||
u64 num_bytes)
|
||||
{
|
||||
spin_lock(&space_info->lock);
|
||||
btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
|
||||
btrfs_try_granting_tickets(fs_info, space_info);
|
||||
btrfs_space_info_update_bytes_may_use(space_info, -num_bytes);
|
||||
btrfs_try_granting_tickets(space_info->fs_info, space_info);
|
||||
spin_unlock(&space_info->lock);
|
||||
}
|
||||
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
|
||||
@ -295,5 +299,6 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool
|
||||
bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
|
||||
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
|
||||
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
|
||||
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
|
||||
|
||||
#endif /* BTRFS_SPACE_INFO_H */
|
||||
|
@ -635,6 +635,28 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
|
||||
IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
|
||||
folio_test_checked);
|
||||
|
||||
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \
	const int sectors_per_page = fs_info->sectors_per_page; \
 \
	ASSERT(sectors_per_page < BITS_PER_LONG); \
	*dst = bitmap_read(subpage->bitmaps, \
			   sectors_per_page * btrfs_bitmap_nr_##name, \
			   sectors_per_page); \
}

#define subpage_dump_bitmap(fs_info, folio, name, start, len) \
{ \
	struct btrfs_subpage *subpage = folio_get_private(folio); \
	unsigned long bitmap; \
 \
	GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \
	btrfs_warn(fs_info, \
	"dumping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
		   start, len, folio_pos(folio), \
		   fs_info->sectors_per_page, &bitmap); \
}
|
||||
|
||||
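The GET_SUBPAGE_BITMAP() helper above reads one per-type window of sectors_per_page bits out of the packed subpage bitmap. A minimal userspace sketch of that read, assuming the window fits inside the first unsigned long word (the ASSERT above only guarantees sectors_per_page < BITS_PER_LONG, so this is a simplification of bitmap_read()):

#include <limits.h>
#include <stdio.h>

/*
 * Simplified bitmap_read(): extract nbits bits starting at bit 'start'.
 * Assumes the window lies entirely within the first word of the bitmap.
 */
static unsigned long read_bits(const unsigned long *bitmap,
			       unsigned int start, unsigned int nbits)
{
	unsigned long mask = (nbits >= sizeof(unsigned long) * CHAR_BIT) ?
			     ~0UL : ((1UL << nbits) - 1);

	return (bitmap[0] >> start) & mask;
}

int main(void)
{
	/* 4 sectors per page, windows packed as [uptodate][dirty][writeback]... */
	unsigned long bitmaps[1] = { 0xb5aUL };	/* 0b1011'0101'1010 */
	unsigned int sectors_per_page = 4;

	/* Read the second window (index 1), i.e. the "dirty" bits: prints 0x5. */
	printf("dirty=%#lx\n", read_bits(bitmaps, 1 * sectors_per_page,
					 sectors_per_page));
	return 0;
}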
/*
|
||||
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
|
||||
* is cleared.
|
||||
@ -660,6 +682,10 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
|
||||
subpage = folio_get_private(folio);
|
||||
ASSERT(subpage);
|
||||
spin_lock_irqsave(&subpage->lock, flags);
|
||||
if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
|
||||
subpage_dump_bitmap(fs_info, folio, dirty, start, len);
|
||||
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
|
||||
}
|
||||
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
}
|
||||
@ -689,23 +715,17 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
|
||||
nbits = len >> fs_info->sectorsize_bits;
|
||||
spin_lock_irqsave(&subpage->lock, flags);
|
||||
/* Target range should not yet be locked. */
|
||||
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
|
||||
if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
|
||||
subpage_dump_bitmap(fs_info, folio, locked, start, len);
|
||||
btrfs_warn(fs_info, "nr_locked=%u\n", atomic_read(&subpage->nr_locked));
|
||||
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
|
||||
}
|
||||
bitmap_set(subpage->bitmaps, start_bit, nbits);
|
||||
ret = atomic_add_return(nbits, &subpage->nr_locked);
|
||||
ASSERT(ret <= fs_info->sectors_per_page);
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
}
|
||||
|
||||
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
|
||||
{ \
|
||||
const int sectors_per_page = fs_info->sectors_per_page; \
|
||||
\
|
||||
ASSERT(sectors_per_page < BITS_PER_LONG); \
|
||||
*dst = bitmap_read(subpage->bitmaps, \
|
||||
sectors_per_page * btrfs_bitmap_nr_##name, \
|
||||
sectors_per_page); \
|
||||
}
|
||||
|
||||
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
|
||||
struct folio *folio, u64 start, u32 len)
|
||||
{
|
||||
@ -716,6 +736,7 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
|
||||
unsigned long writeback_bitmap;
|
||||
unsigned long ordered_bitmap;
|
||||
unsigned long checked_bitmap;
|
||||
unsigned long locked_bitmap;
|
||||
unsigned long flags;
|
||||
|
||||
ASSERT(folio_test_private(folio) && folio_get_private(folio));
|
||||
@ -728,15 +749,16 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
|
||||
GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
|
||||
GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
|
||||
GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
|
||||
GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &checked_bitmap);
|
||||
GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap);
|
||||
spin_unlock_irqrestore(&subpage->lock, flags);
|
||||
|
||||
dump_page(folio_page(folio, 0), "btrfs subpage dump");
|
||||
btrfs_warn(fs_info,
|
||||
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
|
||||
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
|
||||
start, len, folio_pos(folio),
|
||||
sectors_per_page, &uptodate_bitmap,
|
||||
sectors_per_page, &dirty_bitmap,
|
||||
sectors_per_page, &locked_bitmap,
|
||||
sectors_per_page, &writeback_bitmap,
|
||||
sectors_per_page, &ordered_bitmap,
|
||||
sectors_per_page, &checked_bitmap);
|
||||
|
@ -971,7 +971,7 @@ static int btrfs_fill_super(struct super_block *sb,
|
||||
|
||||
err = open_ctree(sb, fs_devices);
|
||||
if (err) {
|
||||
btrfs_err(fs_info, "open_ctree failed");
|
||||
btrfs_err(fs_info, "open_ctree failed: %d", err);
|
||||
return err;
|
||||
}
|
||||
|
||||
@ -2446,6 +2446,9 @@ static __cold void btrfs_interface_exit(void)
|
||||
static int __init btrfs_print_mod_info(void)
|
||||
{
|
||||
static const char options[] = ""
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
", experimental=on"
|
||||
#endif
|
||||
#ifdef CONFIG_BTRFS_DEBUG
|
||||
", debug=on"
|
||||
#endif
|
||||
@ -2466,7 +2469,17 @@ static int __init btrfs_print_mod_info(void)
|
||||
", fsverity=no"
|
||||
#endif
|
||||
;
|
||||
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
if (btrfs_get_raid1_balancing() == NULL)
|
||||
pr_info("Btrfs loaded%s\n", options);
|
||||
else
|
||||
pr_info("Btrfs loaded%s, raid1_balancing=%s\n",
|
||||
options, btrfs_get_raid1_balancing());
|
||||
#else
|
||||
pr_info("Btrfs loaded%s\n", options);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2524,6 +2537,11 @@ static const struct init_sequence mod_init_seq[] = {
|
||||
}, {
|
||||
.init_func = extent_map_init,
|
||||
.exit_func = extent_map_exit,
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
}, {
|
||||
.init_func = btrfs_raid1_balancing_init,
|
||||
.exit_func = NULL,
|
||||
#endif
|
||||
}, {
|
||||
.init_func = ordered_data_init,
|
||||
.exit_func = ordered_data_exit,
|
||||
|
fs/btrfs/sysfs.c (173 changed lines)
@ -1305,7 +1305,74 @@ static ssize_t btrfs_temp_fsid_show(struct kobject *kobj,
|
||||
}
|
||||
BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show);
|
||||
|
||||
static const char * const btrfs_read_policy_name[] = { "pid" };
|
||||
static const char *btrfs_read_policy_name[] = {
|
||||
"pid",
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
"round-robin",
|
||||
"devid",
|
||||
#endif
|
||||
};
|
||||
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
/* Global module configuration parameters */
|
||||
static char *raid1_balancing;
|
||||
char *btrfs_get_raid1_balancing(void)
|
||||
{
|
||||
return raid1_balancing;
|
||||
}
|
||||
|
||||
/* Set perm 0, disable sys/module/btrfs/parameter/raid1_balancing interface */
|
||||
module_param(raid1_balancing, charp, 0);
|
||||
MODULE_PARM_DESC(raid1_balancing,
|
||||
"Global read policy; pid (default), round-robin:[min_contiguous_read], devid:[[devid]|[latest-gen]|[oldest-gen]]");
|
||||
#endif
|
||||
|
||||
int btrfs_read_policy_to_enum(const char *str, s64 *value)
|
||||
{
|
||||
char param[32] = {'\0'};
|
||||
char *__maybe_unused value_str;
|
||||
int index;
|
||||
bool found = false;
|
||||
|
||||
if (!str || strlen(str) == 0)
|
||||
return 0;
|
||||
|
||||
strcpy(param, str);
|
||||
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
/* Separate value from input in policy:value format. */
|
||||
if ((value_str = strchr(param, ':'))) {
|
||||
*value_str = '\0';
|
||||
value_str++;
|
||||
if (value && kstrtou64(value_str, 10, value) != 0)
|
||||
return -EINVAL;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (index = 0; index < BTRFS_NR_READ_POLICY; index++) {
|
||||
if (sysfs_streq(param, btrfs_read_policy_name[index])) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found)
|
||||
return index;
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
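btrfs_read_policy_to_enum() above accepts either "policy" or "policy:value" (the value part only with CONFIG_BTRFS_EXPERIMENTAL). A hedged userspace sketch of the same split-and-match step; the policy table, return codes, and strtoll() are stand-ins for the kernel helpers:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char *const policy_names[] = { "pid", "round-robin", "devid" };

/* Returns the policy index, or -1 on error; *value is -1 if none was given. */
static int read_policy_to_enum(const char *str, long long *value)
{
	char param[32];
	char *sep;

	*value = -1;
	if (!str || strlen(str) >= sizeof(param))
		return -1;
	strcpy(param, str);

	/* Split "policy:value" at the colon, if present. */
	sep = strchr(param, ':');
	if (sep) {
		*sep = '\0';
		*value = strtoll(sep + 1, NULL, 10);
	}

	for (size_t i = 0; i < sizeof(policy_names) / sizeof(policy_names[0]); i++)
		if (strcmp(param, policy_names[i]) == 0)
			return (int)i;
	return -1;
}

int main(void)
{
	long long value;
	int idx = read_policy_to_enum("round-robin:65536", &value);

	printf("policy=%d value=%lld\n", idx, value);	/* policy=1 value=65536 */
	return 0;
}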
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
int __init btrfs_raid1_balancing_init(void)
|
||||
{
|
||||
if (btrfs_read_policy_to_enum(raid1_balancing, NULL) == -EINVAL) {
|
||||
btrfs_err(NULL, "Invalid raid1_balancing %s", raid1_balancing);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
||||
struct kobj_attribute *a, char *buf)
|
||||
@ -1316,14 +1383,25 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
|
||||
if (policy == i)
|
||||
ret += sysfs_emit_at(buf, ret, "%s[%s]",
|
||||
(ret == 0 ? "" : " "),
|
||||
btrfs_read_policy_name[i]);
|
||||
else
|
||||
ret += sysfs_emit_at(buf, ret, "%s%s",
|
||||
(ret == 0 ? "" : " "),
|
||||
btrfs_read_policy_name[i]);
|
||||
if (ret != 0)
|
||||
ret += sysfs_emit_at(buf, ret, " ");
|
||||
|
||||
if (i == policy)
|
||||
ret += sysfs_emit_at(buf, ret, "[");
|
||||
|
||||
ret += sysfs_emit_at(buf, ret, "%s", btrfs_read_policy_name[i]);
|
||||
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
if (i == BTRFS_READ_POLICY_RR)
|
||||
ret += sysfs_emit_at(buf, ret, ":%d",
|
||||
fs_devices->rr_min_contiguous_read);
|
||||
|
||||
if (i == BTRFS_READ_POLICY_DEVID)
|
||||
ret += sysfs_emit_at(buf, ret, ":%llu",
|
||||
fs_devices->read_devid);
|
||||
#endif
|
||||
if (i == policy)
|
||||
ret += sysfs_emit_at(buf, ret, "]");
|
||||
}
|
||||
|
||||
ret += sysfs_emit_at(buf, ret, "\n");
|
||||
@ -1336,21 +1414,78 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
|
||||
const char *buf, size_t len)
|
||||
{
|
||||
struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj);
|
||||
int i;
|
||||
int index;
|
||||
s64 value = -1;
|
||||
|
||||
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
|
||||
if (sysfs_streq(buf, btrfs_read_policy_name[i])) {
|
||||
if (i != READ_ONCE(fs_devices->read_policy)) {
|
||||
WRITE_ONCE(fs_devices->read_policy, i);
|
||||
btrfs_info(fs_devices->fs_info,
|
||||
"read policy set to '%s'",
|
||||
btrfs_read_policy_name[i]);
|
||||
index = btrfs_read_policy_to_enum(buf, &value);
|
||||
if (index == -EINVAL)
|
||||
return -EINVAL;
|
||||
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
if (index == BTRFS_READ_POLICY_RR) {
|
||||
if (value != -1) {
|
||||
u32 sectorsize = fs_devices->fs_info->sectorsize;
|
||||
|
||||
if (!IS_ALIGNED(value, sectorsize)) {
|
||||
u64 temp_value = round_up(value, sectorsize);
|
||||
|
||||
btrfs_warn(fs_devices->fs_info,
|
||||
"read_policy: min contiguous read %lld should be multiples of the sectorsize %u, rounded to %llu",
|
||||
value, sectorsize, temp_value);
|
||||
value = temp_value;
|
||||
}
|
||||
return len;
|
||||
} else {
|
||||
value = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
|
||||
}
|
||||
|
||||
if (index != READ_ONCE(fs_devices->read_policy) ||
|
||||
value != READ_ONCE(fs_devices->rr_min_contiguous_read)) {
|
||||
WRITE_ONCE(fs_devices->read_policy, index);
|
||||
WRITE_ONCE(fs_devices->rr_min_contiguous_read, value);
|
||||
atomic_set(&fs_devices->total_reads, 0);
|
||||
|
||||
btrfs_info(fs_devices->fs_info, "read policy set to '%s:%lld'",
|
||||
btrfs_read_policy_name[index], value);
|
||||
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
if (index == BTRFS_READ_POLICY_DEVID) {
|
||||
|
||||
if (value != -1) {
|
||||
BTRFS_DEV_LOOKUP_ARGS(args);
|
||||
|
||||
/* Validate input devid */
|
||||
args.devid = value;
|
||||
if (btrfs_find_device(fs_devices, &args) == NULL)
|
||||
return -EINVAL;
|
||||
} else {
|
||||
/* Set default devid to the devid of the latest device */
|
||||
value = fs_devices->latest_dev->devid;
|
||||
}
|
||||
|
||||
if (index != READ_ONCE(fs_devices->read_policy) ||
|
||||
(value != READ_ONCE(fs_devices->read_devid))) {
|
||||
WRITE_ONCE(fs_devices->read_policy, index);
|
||||
WRITE_ONCE(fs_devices->read_devid, value);
|
||||
|
||||
btrfs_info(fs_devices->fs_info, "read policy set to '%s:%llu'",
|
||||
btrfs_read_policy_name[index], value);
|
||||
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
#endif
|
||||
if (index != READ_ONCE(fs_devices->read_policy)) {
|
||||
WRITE_ONCE(fs_devices->read_policy, index);
|
||||
btrfs_info(fs_devices->fs_info, "read policy set to '%s'",
|
||||
btrfs_read_policy_name[index]);
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
|
||||
|
||||
|
@ -47,5 +47,10 @@ void btrfs_sysfs_del_qgroups(struct btrfs_fs_info *fs_info);
|
||||
int btrfs_sysfs_add_qgroups(struct btrfs_fs_info *fs_info);
|
||||
void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_qgroup *qgroup);
|
||||
int btrfs_read_policy_to_enum(const char *str, s64 *value);
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
int __init btrfs_raid1_balancing_init(void);
|
||||
char *btrfs_get_raid1_balancing(void);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -30,6 +30,7 @@ const char *test_error[] = {
|
||||
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
|
||||
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
|
||||
[TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
|
||||
[TEST_ALLOC_TRANSACTION] = "cannot allocate transaction",
|
||||
};
|
||||
|
||||
static const struct super_operations btrfs_test_super_ops = {
|
||||
@ -142,6 +143,11 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
|
||||
fs_info->nodesize = nodesize;
|
||||
fs_info->sectorsize = sectorsize;
|
||||
fs_info->sectorsize_bits = ilog2(sectorsize);
|
||||
|
||||
/* CRC32C csum size. */
|
||||
fs_info->csum_size = 4;
|
||||
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) /
|
||||
fs_info->csum_size;
|
||||
set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
|
||||
|
||||
test_mnt->mnt_sb->s_fs_info = fs_info;
|
||||
@ -247,6 +253,15 @@ void btrfs_free_dummy_block_group(struct btrfs_block_group *cache)
|
||||
kfree(cache);
|
||||
}
|
||||
|
||||
void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info)
|
||||
{
|
||||
memset(trans, 0, sizeof(*trans));
|
||||
trans->fs_info = fs_info;
|
||||
xa_init(&trans->delayed_refs.head_refs);
|
||||
xa_init(&trans->delayed_refs.dirty_extents);
|
||||
spin_lock_init(&trans->delayed_refs.lock);
|
||||
}
|
||||
|
||||
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_fs_info *fs_info)
|
||||
{
|
||||
@ -295,6 +310,9 @@ int btrfs_run_sanity_tests(void)
|
||||
ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
|
||||
if (ret)
|
||||
goto out;
|
||||
ret = btrfs_test_delayed_refs(sectorsize, nodesize);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
ret = btrfs_test_extent_map();
|
||||
|
@ -6,6 +6,8 @@
|
||||
#ifndef BTRFS_TESTS_H
|
||||
#define BTRFS_TESTS_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
|
||||
int btrfs_run_sanity_tests(void);
|
||||
|
||||
@ -25,12 +27,14 @@ enum {
|
||||
TEST_ALLOC_EXTENT_MAP,
|
||||
TEST_ALLOC_CHUNK_MAP,
|
||||
TEST_ALLOC_IO_CONTEXT,
|
||||
TEST_ALLOC_TRANSACTION,
|
||||
};
|
||||
|
||||
extern const char *test_error[];
|
||||
|
||||
struct btrfs_root;
|
||||
struct btrfs_trans_handle;
|
||||
struct btrfs_transaction;
|
||||
|
||||
int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
|
||||
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
|
||||
@ -40,6 +44,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
|
||||
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
|
||||
int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
|
||||
int btrfs_test_extent_map(void);
|
||||
int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize);
|
||||
struct inode *btrfs_new_test_inode(void);
|
||||
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
|
||||
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
|
||||
@ -49,6 +54,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt
|
||||
void btrfs_free_dummy_block_group(struct btrfs_block_group *cache);
|
||||
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans,
|
||||
struct btrfs_fs_info *fs_info);
|
||||
void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info);
|
||||
struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info);
|
||||
#else
|
||||
static inline int btrfs_run_sanity_tests(void)
|
||||
|
fs/btrfs/tests/delayed-refs-tests.c (new file, 1015 lines; diff not shown because it is too large)
@ -795,8 +795,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
|
||||
if (num_bytes)
|
||||
btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL);
|
||||
if (delayed_refs_bytes)
|
||||
btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info,
|
||||
delayed_refs_bytes);
|
||||
btrfs_space_info_free_bytes_may_use(trans_rsv->space_info, delayed_refs_bytes);
|
||||
reserve_fail:
|
||||
btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
|
||||
return ERR_PTR(ret);
|
||||
|
@ -227,7 +227,21 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
|
||||
delayed_refs->qgroup_to_skip = 0;
|
||||
}
|
||||
|
||||
bool __cold abort_should_print_stack(int error);
|
||||
/*
|
||||
* We want the transaction abort to print stack trace only for errors where the
|
||||
* cause could be a bug, eg. due to ENOSPC, and not for common errors that are
|
||||
* caused by external factors.
|
||||
*/
|
||||
static inline bool btrfs_abort_should_print_stack(int error)
|
||||
{
|
||||
switch (error) {
|
||||
case -EIO:
|
||||
case -EROFS:
|
||||
case -ENOMEM:
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Call btrfs_abort_transaction as early as possible when an error condition is
|
||||
@ -240,7 +254,7 @@ do { \
|
||||
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
|
||||
&((trans)->fs_info->fs_state))) { \
|
||||
__first = true; \
|
||||
if (WARN(abort_should_print_stack(error), \
|
||||
if (WARN(btrfs_abort_should_print_stack(error), \
|
||||
KERN_ERR \
|
||||
"BTRFS: Transaction aborted (error %d)\n", \
|
||||
(error))) { \
|
||||
|
@ -973,6 +973,105 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int btrfs_check_system_chunk_array(struct btrfs_fs_info *fs_info,
|
||||
const struct btrfs_super_block *sb)
|
||||
{
|
||||
struct extent_buffer *dummy;
|
||||
u32 array_size;
|
||||
u32 cur_offset = 0;
|
||||
u32 len;
|
||||
int ret = 0;
|
||||
|
||||
/*
|
||||
* We allocated a dummy extent, just to use extent buffer accessors.
|
||||
* There will be unused space after BTRFS_SUPER_INFO_SIZE, but
|
||||
* that's fine, we will not go beyond system chunk array anyway.
|
||||
*/
|
||||
dummy = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
|
||||
if (!dummy)
|
||||
return -ENOMEM;
|
||||
set_extent_buffer_uptodate(dummy);
|
||||
write_extent_buffer(dummy, sb, 0, BTRFS_SUPER_INFO_SIZE);
|
||||
|
||||
array_size = btrfs_super_sys_array_size(sb);
|
||||
if (array_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
|
||||
btrfs_crit(fs_info,
|
||||
"superblock syschunk too large, have %u expect <=%u",
|
||||
array_size, BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
while (cur_offset < array_size) {
|
||||
struct btrfs_disk_key *disk_key;
|
||||
struct btrfs_key key;
|
||||
struct btrfs_chunk *chunk;
|
||||
u32 num_stripes;
|
||||
u64 type;
|
||||
|
||||
len = sizeof(*disk_key);
|
||||
if (cur_offset + len > array_size)
|
||||
goto out_short_read;
|
||||
disk_key = (struct btrfs_disk_key *)(sb->sys_chunk_array + cur_offset);
|
||||
btrfs_disk_key_to_cpu(&key, disk_key);
|
||||
cur_offset += len;
|
||||
|
||||
if (key.type != BTRFS_CHUNK_ITEM_KEY) {
|
||||
btrfs_crit(fs_info,
|
||||
"unexpected item type %u in sys_array at offset %u",
|
||||
(u32)key.type, cur_offset);
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* At least one btrfs_chunk with one stripe must be present,
|
||||
* exact stripe count check comes afterwards
|
||||
*/
|
||||
len = btrfs_chunk_item_size(1);
|
||||
if (cur_offset + len > array_size)
|
||||
goto out_short_read;
|
||||
|
||||
chunk = (struct btrfs_chunk *)
|
||||
(offsetof(struct btrfs_super_block, sys_chunk_array) +
|
||||
cur_offset);
|
||||
num_stripes = btrfs_chunk_num_stripes(dummy, chunk);
|
||||
if (!num_stripes) {
|
||||
btrfs_crit(fs_info,
|
||||
"invalid number of stripes %u in sys_array at offset %u",
|
||||
num_stripes, cur_offset);
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
}
|
||||
type = btrfs_chunk_type(dummy, chunk);
|
||||
if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
|
||||
btrfs_err(fs_info,
|
||||
"invalid chunk type %llu in sys_array at offset %u",
|
||||
type, cur_offset);
|
||||
ret = -EUCLEAN;
|
||||
goto out;
|
||||
}
|
||||
|
||||
len = btrfs_chunk_item_size(num_stripes);
|
||||
if (cur_offset + len > array_size)
|
||||
goto out_short_read;
|
||||
|
||||
ret = btrfs_check_chunk_valid(dummy, chunk, key.offset);
|
||||
if (ret)
|
||||
goto out;
|
||||
cur_offset += len;
|
||||
}
|
||||
out:
|
||||
free_extent_buffer_stale(dummy);
|
||||
return ret;
|
||||
|
||||
out_short_read:
|
||||
btrfs_crit(fs_info,
|
||||
"sys_array too short to read %u bytes at offset %u array size %u",
|
||||
len, cur_offset, array_size);
|
||||
free_extent_buffer_stale(dummy);
|
||||
return ret;
|
||||
}
|
||||
|
||||
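The checker above walks sys_chunk_array as a sequence of (disk key, chunk item) records and verifies, before every read, that the next record still fits inside array_size. A hedged, self-contained sketch of that bounds-check pattern; the record sizes and the num_stripes offset are made-up stand-ins, since the real sizes come from the btrfs on-disk structures:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define KEY_SIZE	17U	/* stand-in for sizeof(struct btrfs_disk_key) */
#define CHUNK_HDR_SIZE	48U	/* stand-in for a chunk item with no stripes */
#define STRIPE_SIZE	32U	/* stand-in for sizeof(struct btrfs_stripe) */

/* Returns 0 if every record fits, -1 on a short read or bad stripe count. */
static int check_chunk_array(const uint8_t *array, uint32_t array_size)
{
	uint32_t cur = 0;

	while (cur < array_size) {
		uint32_t num_stripes;

		if (cur + KEY_SIZE > array_size)
			return -1;			/* key does not fit */
		cur += KEY_SIZE;

		/* Need at least a one-stripe chunk before trusting num_stripes. */
		if (cur + CHUNK_HDR_SIZE + STRIPE_SIZE > array_size)
			return -1;
		/* Offset 0 within the chunk is illustrative only. */
		memcpy(&num_stripes, array + cur, sizeof(num_stripes));
		if (num_stripes == 0)
			return -1;

		if (cur + CHUNK_HDR_SIZE + (uint64_t)num_stripes * STRIPE_SIZE > array_size)
			return -1;			/* full chunk does not fit */
		cur += CHUNK_HDR_SIZE + num_stripes * STRIPE_SIZE;
	}
	return 0;
}

int main(void)
{
	uint8_t array[1024] = { 0 };

	/* An all-zero array fails immediately: num_stripes reads as 0. */
	printf("%d\n", check_chunk_array(array, sizeof(array)));
	return 0;
}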
/*
|
||||
* Enhanced version of chunk item checker.
|
||||
*
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <uapi/linux/btrfs_tree.h>
|
||||
#include "fs.h"
|
||||
|
||||
struct extent_buffer;
|
||||
struct btrfs_chunk;
|
||||
@ -68,6 +69,8 @@ int btrfs_check_node(struct extent_buffer *node);
|
||||
|
||||
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
|
||||
struct btrfs_chunk *chunk, u64 logical);
|
||||
int btrfs_check_system_chunk_array(struct btrfs_fs_info *fs_info,
|
||||
const struct btrfs_super_block *sb);
|
||||
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
|
||||
int btrfs_verify_level_key(struct extent_buffer *eb,
|
||||
const struct btrfs_tree_parent_check *check);
|
||||
|
@ -13,8 +13,8 @@
|
||||
#include <linux/list_sort.h>
|
||||
#include <linux/namei.h>
|
||||
#include "misc.h"
|
||||
#include "ctree.h"
|
||||
#include "disk-io.h"
|
||||
#include "extent-tree.h"
|
||||
#include "transaction.h"
|
||||
#include "volumes.h"
|
||||
#include "raid56.h"
|
||||
@ -48,6 +48,7 @@ struct btrfs_io_geometry {
|
||||
u64 raid56_full_stripe_start;
|
||||
int max_errors;
|
||||
enum btrfs_map_op op;
|
||||
bool use_rst;
|
||||
};
|
||||
|
||||
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
|
||||
@ -1327,7 +1328,14 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
|
||||
fs_devices->latest_dev = latest_dev;
|
||||
fs_devices->total_rw_bytes = 0;
|
||||
fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
fs_devices->rr_min_contiguous_read = BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ;
|
||||
fs_devices->read_devid = latest_dev->devid;
|
||||
fs_devices->read_policy =
|
||||
btrfs_read_policy_to_enum(btrfs_get_raid1_balancing(), NULL);
|
||||
#else
|
||||
fs_devices->read_policy = BTRFS_READ_POLICY_PID;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -5959,6 +5967,88 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
|
||||
return len;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
static int btrfs_read_preferred(struct btrfs_chunk_map *map, int first,
|
||||
int num_stripe)
|
||||
{
|
||||
int last = first + num_stripe;
|
||||
int stripe_index;
|
||||
|
||||
for (stripe_index = first; stripe_index < last; stripe_index++) {
|
||||
struct btrfs_device *device = map->stripes[stripe_index].dev;
|
||||
|
||||
if (device->devid == READ_ONCE(device->fs_devices->read_devid))
|
||||
return stripe_index;
|
||||
}
|
||||
|
||||
/* If no read-preferred device, use first stripe */
|
||||
return first;
|
||||
}
|
||||
|
||||
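A hedged sketch of the devid read policy in btrfs_read_preferred() above: scan the mirror stripes for the one whose device id matches the configured read_devid and fall back to the first stripe when none matches. The structures are simplified stand-ins:

#include <stdint.h>
#include <stdio.h>

struct stripe { uint64_t devid; };

/* Pick the stripe whose devid matches, else the first one in the range. */
static int read_preferred(const struct stripe *stripes, int first, int num,
			  uint64_t read_devid)
{
	for (int i = first; i < first + num; i++)
		if (stripes[i].devid == read_devid)
			return i;
	return first;
}

int main(void)
{
	struct stripe map[] = { { .devid = 3 }, { .devid = 1 }, { .devid = 2 } };

	printf("%d\n", read_preferred(map, 0, 3, 2));	/* prints 2 */
	return 0;
}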
struct stripe_mirror {
|
||||
u64 devid;
|
||||
int num;
|
||||
};
|
||||
|
||||
static int btrfs_cmp_devid(const void *a, const void *b)
|
||||
{
|
||||
const struct stripe_mirror *s1 = (struct stripe_mirror *)a;
|
||||
const struct stripe_mirror *s2 = (struct stripe_mirror *)b;
|
||||
|
||||
if (s1->devid < s2->devid)
|
||||
return -1;
|
||||
if (s1->devid > s2->devid)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * btrfs_read_rr.
 *
 * Select a stripe for reading using a round-robin algorithm:
 *
 * 1. Compute the read cycle as the total sectors read divided by the minimum
 *    sectors per device.
 * 2. Determine the stripe number for the current read by taking the modulus
 *    of the read cycle with the total number of stripes:
 *
 *        stripe index = (total sectors / min sectors per dev) % num stripes
 *
 * The calculated stripe index is then used to select the corresponding device
 * from the list of devices, which is ordered by devid.
 */
static int btrfs_read_rr(struct btrfs_chunk_map *map, int first, int num_stripe)
|
||||
{
|
||||
struct stripe_mirror stripes[BTRFS_RAID1_MAX_MIRRORS] = {0};
|
||||
struct btrfs_fs_devices *fs_devices;
|
||||
struct btrfs_device *device;
|
||||
int read_cycle;
|
||||
int index;
|
||||
int ret_stripe;
|
||||
int total_reads;
|
||||
int reads_per_dev = 0;
|
||||
|
||||
device = map->stripes[first].dev;
|
||||
|
||||
fs_devices = device->fs_devices;
|
||||
reads_per_dev = fs_devices->rr_min_contiguous_read >> SECTOR_SHIFT;
|
||||
index = 0;
|
||||
for (int i = first; i < first + num_stripe; i++) {
|
||||
stripes[index].devid = map->stripes[i].dev->devid;
|
||||
stripes[index].num = i;
|
||||
index++;
|
||||
}
|
||||
sort(stripes, num_stripe, sizeof(struct stripe_mirror),
|
||||
btrfs_cmp_devid, NULL);
|
||||
|
||||
total_reads = atomic_inc_return(&fs_devices->total_reads);
|
||||
read_cycle = total_reads / reads_per_dev;
|
||||
ret_stripe = stripes[read_cycle % num_stripe].num;
|
||||
|
||||
return ret_stripe;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
||||
struct btrfs_chunk_map *map, int first,
|
||||
int dev_replace_is_ongoing)
|
||||
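And a hedged standalone model of the round-robin policy described in the comment above: mirrors are ordered by devid, a shared read counter is bumped for every read, and the counter divided by the per-device batch size picks the mirror for the current cycle. Plain C stands in for the atomics and btrfs structures:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct stripe_mirror {
	uint64_t devid;
	int num;		/* original stripe index */
};

static int cmp_devid(const void *a, const void *b)
{
	const struct stripe_mirror *s1 = a, *s2 = b;

	if (s1->devid < s2->devid)
		return -1;
	return s1->devid > s2->devid;
}

/* stripe index = (total_reads / reads_per_dev) % num_stripes, in devid order */
static int read_rr(struct stripe_mirror *stripes, int num_stripes,
		   uint64_t total_reads, uint64_t reads_per_dev)
{
	uint64_t read_cycle = total_reads / reads_per_dev;

	qsort(stripes, num_stripes, sizeof(*stripes), cmp_devid);
	return stripes[read_cycle % num_stripes].num;
}

int main(void)
{
	/* Two mirrors; batch 16 reads per device (e.g. 64K / 4K sectors). */
	struct stripe_mirror mirrors[] = { { 2, 0 }, { 1, 1 } };

	for (uint64_t reads = 0; reads < 48; reads += 16)
		printf("read %llu -> stripe %d\n",
		       (unsigned long long)reads,
		       read_rr(mirrors, 2, reads, 16));
	return 0;
}

With the defaults in this patch the batch corresponds to rr_min_contiguous_read bytes worth of sectors on one device before the next mirror takes over.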
@ -5988,6 +6078,14 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
|
||||
case BTRFS_READ_POLICY_PID:
|
||||
preferred_mirror = first + (current->pid % num_stripes);
|
||||
break;
|
||||
#ifdef CONFIG_BTRFS_EXPERIMENTAL
|
||||
case BTRFS_READ_POLICY_RR:
|
||||
preferred_mirror = btrfs_read_rr(map, first, num_stripes);
|
||||
break;
|
||||
case BTRFS_READ_POLICY_DEVID:
|
||||
preferred_mirror = btrfs_read_preferred(map, first, num_stripes);
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (dev_replace_is_ongoing &&
|
||||
@ -6346,8 +6444,7 @@ static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
{
	dst->dev = map->stripes[io_geom->stripe_index].dev;

	if (io_geom->op == BTRFS_MAP_READ &&
	    btrfs_need_stripe_tree_update(fs_info, map->type))
	if (io_geom->op == BTRFS_MAP_READ && io_geom->use_rst)
		return btrfs_get_raid_extent_offset(fs_info, logical, length,
						    map->type,
						    io_geom->stripe_index, dst);
@ -6362,7 +6459,7 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
				const struct btrfs_io_stripe *smap,
				const struct btrfs_chunk_map *map,
				int num_alloc_stripes,
				enum btrfs_map_op op, int mirror_num)
				struct btrfs_io_geometry *io_geom)
{
	if (!smap)
		return false;
@ -6370,10 +6467,10 @@ static bool is_single_device_io(struct btrfs_fs_info *fs_info,
	if (num_alloc_stripes != 1)
		return false;

	if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
	if (io_geom->use_rst && io_geom->op != BTRFS_MAP_READ)
		return false;

	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && io_geom->mirror_num > 1)
		return false;

	return true;
@ -6579,6 +6676,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
	io_geom.raid56_full_stripe_start = (u64)-1;
	max_len = btrfs_max_io_len(map, map_offset, &io_geom);
	*length = min_t(u64, map->chunk_len - map_offset, max_len);
	io_geom.use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);

	if (dev_replace->replace_task != current)
		down_read(&dev_replace->rwsem);
@ -6647,8 +6745,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
	 * physical block information on the stack instead of allocating an
	 * I/O context structure.
	 */
	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
				io_geom.mirror_num)) {
	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, &io_geom)) {
		ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
		if (mirror_num_ret)
			*mirror_num_ret = io_geom.mirror_num;
@ -6662,6 +6759,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		goto out;
	}
	bioc->map_type = map->type;
	bioc->use_rst = io_geom.use_rst;

	/*
	 * For RAID56 full map, we need to make sure the stripes[] follows the
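The comment in the btrfs_map_block() hunk above describes the single-device fast path: when the mapping resolves to one stripe, the caller's on-stack btrfs_io_stripe is filled and no btrfs_io_context is allocated. A toy model of that decision, with made-up structures standing in for the kernel ones:

#include <stdbool.h>
#include <stdlib.h>

/* Toy stand-ins for struct btrfs_io_stripe and struct btrfs_io_context;
 * this is a sketch, not kernel code. */
struct io_stripe { int devid; unsigned long long physical; };
struct io_context { int num_stripes; struct io_stripe stripes[]; };

/* Rough equivalent of is_single_device_io(): one stripe and, when a RAID
 * stripe tree is in use, only reads may take the fast path. */
static bool single_device_io(int num_alloc_stripes, bool use_rst, bool is_read)
{
	return num_alloc_stripes == 1 && (!use_rst || is_read);
}

static int map_block(struct io_stripe *smap, struct io_context **bioc_ret,
		     int num_alloc_stripes, bool use_rst, bool is_read)
{
	if (smap && single_device_io(num_alloc_stripes, use_rst, is_read)) {
		smap->devid = 0;	/* fill the caller's stack stripe */
		*bioc_ret = NULL;	/* no heap allocation on the fast path */
		return 0;
	}
	*bioc_ret = malloc(sizeof(**bioc_ret) +
			   num_alloc_stripes * sizeof(struct io_stripe));
	return *bioc_ret ? 0 : -1;
}
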
@ -7002,16 +7100,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
		warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
		if (ret)
			return ret;
	}

	map = btrfs_find_chunk_map(fs_info, logical, 1);

	/* already mapped? */
@ -7274,11 +7362,9 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
	u8 *array_ptr;
	unsigned long sb_array_offset;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur_offset;
	u64 type;
	struct btrfs_key key;

	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
@ -7301,10 +7387,17 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
	cur_offset = 0;

	while (cur_offset < array_size) {
		u32 num_stripes;

		disk_key = (struct btrfs_disk_key *)array_ptr;
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

		/*
		 * The super block should have passed
		 * btrfs_check_system_chunk_array(), thus we only do
		 * ASSERT() for those sanity checks.
		 */
		ASSERT(cur_offset + len <= array_size);

		btrfs_disk_key_to_cpu(&key, disk_key);

@ -7312,44 +7405,24 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
		sb_array_offset += len;
		cur_offset += len;

		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
			btrfs_err(fs_info,
				  "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}
		ASSERT(key.type == BTRFS_CHUNK_ITEM_KEY);

		chunk = (struct btrfs_chunk *)sb_array_offset;
		/*
		 * At least one btrfs_chunk with one stripe must be present,
		 * exact stripe count check comes afterwards
		 */
		len = btrfs_chunk_item_size(1);
		if (cur_offset + len > array_size)
			goto out_short_read;
		ASSERT(cur_offset + btrfs_chunk_item_size(1) <= array_size);

		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
		if (!num_stripes) {
			btrfs_err(fs_info,
				  "invalid number of stripes %u in sys_array at offset %u",
				  num_stripes, cur_offset);
			ret = -EIO;
			break;
		}
		/* Should have at least one stripe. */
		ASSERT(num_stripes);

		type = btrfs_chunk_type(sb, chunk);
		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
			btrfs_err(fs_info,
				  "invalid chunk type %llu in sys_array at offset %u",
				  type, cur_offset);
			ret = -EIO;
			break;
		}
		/* Only system chunks are allowed in system chunk array. */
		ASSERT(btrfs_chunk_type(sb, chunk) & BTRFS_BLOCK_GROUP_SYSTEM);

		len = btrfs_chunk_item_size(num_stripes);
		if (cur_offset + len > array_size)
			goto out_short_read;
		ASSERT(cur_offset + len <= array_size);

		ret = read_one_chunk(&key, sb, chunk);
		if (ret)
@ -7362,13 +7435,6 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return ret;

out_short_read:
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
		  len, cur_offset);
	clear_extent_buffer_uptodate(sb);
	free_extent_buffer_stale(sb);
	return -EIO;
}

/*
@ -7568,8 +7634,6 @@ int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
	struct btrfs_device *device;
	int ret = 0;

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;
@ -7798,7 +7862,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)

	if (!dev->dev_stats_valid)
		return;
	btrfs_err_rl_in_rcu(dev->fs_info,
	btrfs_debug_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   btrfs_dev_name(dev),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
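The btrfs_read_sys_array() hunks above walk the packed system chunk array, where each entry is a disk key immediately followed by a chunk item whose size depends on its stripe count. A standalone sketch of that walk; the structures and sizes below are simplified stand-ins, not the real btrfs on-disk format:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified stand-ins for the on-disk structures parsed above. */
struct disk_key { uint64_t objectid; uint8_t type; uint64_t offset; };
struct chunk_item { uint64_t length; uint16_t num_stripes; };
struct stripe_item { uint64_t devid; uint64_t offset; };

/* Analogue of btrfs_chunk_item_size(): header plus per-stripe entries. */
static size_t chunk_item_size(unsigned int num_stripes)
{
	return sizeof(struct chunk_item) + num_stripes * sizeof(struct stripe_item);
}

/* Walk "key + chunk" pairs packed back to back in a byte array. */
static int walk_sys_array(const uint8_t *array, size_t array_size)
{
	size_t cur = 0;

	while (cur < array_size) {
		struct disk_key key;
		struct chunk_item chunk;

		if (cur + sizeof(key) > array_size)
			return -1;		/* short read */
		memcpy(&key, array + cur, sizeof(key));
		cur += sizeof(key);

		if (cur + chunk_item_size(1) > array_size)
			return -1;
		memcpy(&chunk, array + cur, sizeof(chunk));
		if (!chunk.num_stripes)
			return -1;		/* at least one stripe required */
		if (cur + chunk_item_size(chunk.num_stripes) > array_size)
			return -1;

		printf("chunk item at offset %zu with %u stripes\n",
		       cur, (unsigned int)chunk.num_stripes);
		cur += chunk_item_size(chunk.num_stripes);
	}
	return 0;
}

int main(void)
{
	uint8_t buf[256];
	struct disk_key key = { .objectid = 0, .type = 0, .offset = 0 };
	struct chunk_item chunk = { .length = 4 * 1024 * 1024, .num_stripes = 1 };
	struct stripe_item stripe = { .devid = 1, .offset = 0 };
	size_t pos = 0;

	/* Serialize one key + one single-stripe chunk item. */
	memcpy(buf + pos, &key, sizeof(key));		pos += sizeof(key);
	memcpy(buf + pos, &chunk, sizeof(chunk));	pos += sizeof(chunk);
	memcpy(buf + pos, &stripe, sizeof(stripe));	pos += sizeof(stripe);

	return walk_sys_array(buf, pos);
}
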
|
@ -296,6 +296,8 @@ enum btrfs_chunk_allocation_policy {
	BTRFS_CHUNK_ALLOC_ZONED,
};

#define BTRFS_DEFAULT_RR_MIN_CONTIGUOUS_READ (SZ_256K)
#define BTRFS_RAID1_MAX_MIRRORS (4)
/*
 * Read policies for mirrored block group profiles, read picks the stripe based
 * on these policies.
@ -303,6 +305,12 @@ enum btrfs_chunk_allocation_policy {
enum btrfs_read_policy {
	/* Use process PID to choose the stripe */
	BTRFS_READ_POLICY_PID,
#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/* Balancing raid1 reads across all striped devices (round-robin) */
	BTRFS_READ_POLICY_RR,
	/* Read from the specific device */
	BTRFS_READ_POLICY_DEVID,
#endif
	BTRFS_NR_READ_POLICY,
};

@ -431,6 +439,14 @@ struct btrfs_fs_devices {
	enum btrfs_read_policy read_policy;

#ifdef CONFIG_BTRFS_EXPERIMENTAL
	/* IO stat, read counter. */
	atomic_t total_reads;
	/* Min contiguous reads before switching to next device. */
	int rr_min_contiguous_read;

	/* Device to be used for reading in case of RAID1. */
	u64 read_devid;

	/* Checksum mode - offload it or do it synchronously. */
	enum btrfs_offload_csum_mode offload_csum_mode;
#endif
@ -485,6 +501,7 @@ struct btrfs_io_context {
	struct bio *orig_bio;
	atomic_t error;
	u16 max_errors;
	bool use_rst;

	u64 logical;
	u64 size;
|
154
fs/btrfs/zoned.c
@ -741,12 +741,23 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
	 * we add the pages one by one to a bio, and cannot increase the
	 * metadata reservation even if it increases the number of extents, it
	 * is safe to stick with the limit.
	 *
	 * If there is no zoned device in the filesystem, we have
	 * max_zone_append_sectors = 0. That will cause
	 * fs_info->max_zone_append_size and fs_info->max_extent_size to be
	 * 0 in the following lines. Set the maximum value to avoid that.
	 */
	fs_info->max_zone_append_size = ALIGN_DOWN(
		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
		     (u64)lim->max_sectors << SECTOR_SHIFT,
		     (u64)lim->max_segments << PAGE_SHIFT),
		fs_info->sectorsize);
	if (lim->features & BLK_FEAT_ZONED)
		fs_info->max_zone_append_size = ALIGN_DOWN(
			min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
			     (u64)lim->max_sectors << SECTOR_SHIFT,
			     (u64)lim->max_segments << PAGE_SHIFT),
			fs_info->sectorsize);
	else
		fs_info->max_zone_append_size = ALIGN_DOWN(
			min((u64)lim->max_sectors << SECTOR_SHIFT,
			    (u64)lim->max_segments << PAGE_SHIFT),
			fs_info->sectorsize);
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
		fs_info->max_extent_size = fs_info->max_zone_append_size;
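The hunk above derives max_zone_append_size from max_zone_append_sectors only when the block layer reports BLK_FEAT_ZONED, and otherwise falls back to the ordinary request limits so the value does not collapse to zero. A worked example of that arithmetic with made-up queue limits, assuming 512-byte sectors and 4 KiB pages; the macros below are simplified re-definitions for the sketch, not the kernel ones:

#include <stdio.h>

#define SECTOR_SHIFT 9			/* 512-byte sectors */
#define PAGE_SHIFT   12			/* 4 KiB pages */
#define ALIGN_DOWN(x, a) ((x) - ((x) % (a)))

static unsigned long long min3u(unsigned long long a, unsigned long long b,
				unsigned long long c)
{
	unsigned long long m = a < b ? a : b;

	return m < c ? m : c;
}

int main(void)
{
	/* Hypothetical queue limits, not taken from any real device. */
	unsigned long long max_zone_append_sectors = 128;	/* 64 KiB */
	unsigned long long max_sectors = 2560;			/* 1280 KiB */
	unsigned long long max_segments = 128;			/* 512 KiB of pages */
	unsigned long long sectorsize = 4096;

	unsigned long long zoned = ALIGN_DOWN(
		min3u(max_zone_append_sectors << SECTOR_SHIFT,
		      max_sectors << SECTOR_SHIFT,
		      max_segments << PAGE_SHIFT),
		sectorsize);
	unsigned long long regular = ALIGN_DOWN(
		min3u(max_sectors << SECTOR_SHIFT,
		      max_segments << PAGE_SHIFT,
		      max_segments << PAGE_SHIFT),
		sectorsize);

	printf("zoned device:     max_zone_append_size = %llu\n", zoned);	/* 65536 */
	printf("non-zoned device: max_zone_append_size = %llu\n", regular);	/* 524288 */
	return 0;
}
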
@ -1671,6 +1682,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
		return -EINVAL;
	}

	/* Reject non SINGLE data profiles without RST. */
	if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
	    (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
	    !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	if (cache->alloc_offset > cache->zone_capacity) {
		btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@ -2651,3 +2671,127 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
}

/*
 * Reset the zones of unused block groups from @space_info->bytes_zone_unusable.
 *
 * @space_info:	the space to work on
 * @num_bytes:	targeting reclaim bytes
 *
 * This one resets the zones of a block group, so we can reuse the region
 * without removing the block group. On the other hand, btrfs_delete_unused_bgs()
 * just removes a block group and frees up the underlying zones. So, we still
 * need to allocate a new block group to reuse the zones.
 *
 * Resetting is faster than deleting/recreating a block group. It is similar
 * to freeing the logical space on the regular mode. However, we cannot change
 * the block group's profile with this operation.
 */
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = space_info->fs_info;
	const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	while (num_bytes > 0) {
		struct btrfs_chunk_map *map;
		struct btrfs_block_group *bg = NULL;
		bool found = false;
		u64 reclaimed = 0;

		/*
		 * Here, we choose a fully zone_unusable block group. It's
		 * technically possible to reset a partly zone_unusable block
		 * group, which still has some free space left. However,
		 * handling that needs to cope with the allocation side, which
		 * makes the logic more complex. So, let's handle the easy case
		 * for now.
		 */
		spin_lock(&fs_info->unused_bgs_lock);
		list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
			if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
				continue;

			/*
			 * Use trylock to avoid locking order violation. In
			 * btrfs_reclaim_bgs_work(), the lock order is
			 * &bg->lock -> &fs_info->unused_bgs_lock. We skip a
			 * block group if we cannot take its lock.
			 */
			if (!spin_trylock(&bg->lock))
				continue;
			if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
				spin_unlock(&bg->lock);
				continue;
			}
			spin_unlock(&bg->lock);
			found = true;
			break;
		}
		if (!found) {
			spin_unlock(&fs_info->unused_bgs_lock);
			return 0;
		}

		list_del_init(&bg->bg_list);
		btrfs_put_block_group(bg);
		spin_unlock(&fs_info->unused_bgs_lock);

		/*
		 * Since the block group is fully zone_unusable and we cannot
		 * allocate from this block group anymore, we don't need to set
		 * this block group read-only.
		 */

		down_read(&fs_info->dev_replace.rwsem);
		map = bg->physical_map;
		for (int i = 0; i < map->num_stripes; i++) {
			struct btrfs_io_stripe *stripe = &map->stripes[i];
			unsigned int nofs_flags;
			int ret;

			nofs_flags = memalloc_nofs_save();
			ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
					       stripe->physical >> SECTOR_SHIFT,
					       zone_size_sectors);
			memalloc_nofs_restore(nofs_flags);

			if (ret) {
				up_read(&fs_info->dev_replace.rwsem);
				return ret;
			}
		}
		up_read(&fs_info->dev_replace.rwsem);

		spin_lock(&space_info->lock);
		spin_lock(&bg->lock);
		ASSERT(!btrfs_is_block_group_used(bg));
		if (bg->ro) {
			spin_unlock(&bg->lock);
			spin_unlock(&space_info->lock);
			continue;
		}

		reclaimed = bg->alloc_offset;
		bg->zone_unusable = bg->length - bg->zone_capacity;
		bg->alloc_offset = 0;
		/*
		 * This holds because we currently reset fully used then freed
		 * block group.
		 */
		ASSERT(reclaimed == bg->zone_capacity);
		bg->free_space_ctl->free_space += reclaimed;
		space_info->bytes_zone_unusable -= reclaimed;
		spin_unlock(&bg->lock);
		btrfs_return_free_space(space_info, reclaimed);
		spin_unlock(&space_info->lock);

		if (num_bytes <= reclaimed)
			break;
		num_bytes -= reclaimed;
	}

	return 0;
}
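The reclaim loop in btrfs_reset_unused_block_groups() above picks fully zone_unusable block groups and resets their zones until the requested byte count is satisfied. A toy userspace model of that selection and accounting, not the kernel implementation; the structures and sizes are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the reclaim loop above: "reset" fully zone_unusable groups
 * until the requested number of bytes has been made allocatable again. */
struct bg {
	unsigned long long length, zone_capacity, alloc_offset, zone_unusable;
	bool used, ro;
};

static bool fully_unusable(const struct bg *bg)
{
	return !bg->used && !bg->ro && bg->zone_unusable >= bg->length;
}

static unsigned long long reset_unused(struct bg *bgs, int nr,
				       unsigned long long num_bytes)
{
	unsigned long long total = 0;

	for (int i = 0; i < nr && num_bytes > 0; i++) {
		unsigned long long reclaimed;

		if (!fully_unusable(&bgs[i]))
			continue;
		reclaimed = bgs[i].alloc_offset;	/* equals zone_capacity here */
		bgs[i].zone_unusable = bgs[i].length - bgs[i].zone_capacity;
		bgs[i].alloc_offset = 0;
		total += reclaimed;
		num_bytes = num_bytes > reclaimed ? num_bytes - reclaimed : 0;
	}
	return total;
}

int main(void)
{
	struct bg bgs[] = {
		/* Fully written then freed: eligible for a reset. */
		{ .length = 256 << 20, .zone_capacity = 256 << 20,
		  .alloc_offset = 256 << 20, .zone_unusable = 256 << 20 },
		/* Still partly in use: skipped by the easy case. */
		{ .length = 256 << 20, .zone_capacity = 256 << 20,
		  .alloc_offset = 128 << 20, .zone_unusable = 128 << 20,
		  .used = true },
	};

	printf("reclaimed %llu bytes\n", reset_unused(bgs, 2, 300ULL << 20));
	return 0;
}
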
|
@ -96,6 +96,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info, bool do_finish);
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info);
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes);
#else /* CONFIG_BLK_DEV_ZONED */

static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
@ -265,6 +266,12 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,

static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { }

static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info,
						  u64 num_bytes)
{
	return 0;
}

#endif

static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
|
@ -100,7 +100,8 @@ struct find_free_extent_ctl;
	EM( ALLOC_CHUNK, "ALLOC_CHUNK") \
	EM( ALLOC_CHUNK_FORCE, "ALLOC_CHUNK_FORCE") \
	EM( RUN_DELAYED_IPUTS, "RUN_DELAYED_IPUTS") \
	EMe(COMMIT_TRANS, "COMMIT_TRANS")
	EM( COMMIT_TRANS, "COMMIT_TRANS") \
	EMe(RESET_ZONES, "RESET_ZONES")

/*
 * First define the enums in the above macros to be exported to userspace via