Btrfs: track dirty block groups on their own list

Currently, any time we try to update the block groups on disk,
btrfs_write_dirty_block_groups walks _all_ block groups and checks whether the
->dirty flag is set on each one.  This function can get called several times
during a commit.  So if you have several terabytes of data you will be a very
sad panda, as we will loop through _all_ of the block groups several times.
This makes the commit take a while, which slows down the rest of the file
system operations.

This patch introduces a per-transaction dirty list that a block group gets
added to when it is dirtied for the first time.  Then we simply update any
block groups that have been dirtied since the last time we called
btrfs_write_dirty_block_groups.  This also allows us to simplify how we write
the free space cache out.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
This commit is contained in:
Josef Bacik 2014-11-17 15:45:48 -05:00 committed by Chris Mason
parent e7070be198
commit ce93ec548c
5 changed files with 72 additions and 124 deletions

View File

@ -1238,7 +1238,6 @@ enum btrfs_disk_cache_state {
BTRFS_DC_ERROR = 1,
BTRFS_DC_CLEAR = 2,
BTRFS_DC_SETUP = 3,
BTRFS_DC_NEED_WRITE = 4,
};
struct btrfs_caching_control {
@ -1276,7 +1275,6 @@ struct btrfs_block_group_cache {
unsigned long full_stripe_len;
unsigned int ro:1;
unsigned int dirty:1;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
@ -1314,6 +1312,9 @@ struct btrfs_block_group_cache {
struct list_head ro_list;
atomic_t trimming;
/* For dirty block groups */
struct list_head dirty_list;
};
/* delayed seq elem */

View File

@ -74,8 +74,9 @@ enum {
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@ -3315,120 +3316,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_group_cache *cache;
int err = 0;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
struct btrfs_path *path;
u64 last = 0;
if (list_empty(&cur_trans->dirty_bgs))
return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
while (1) {
cache = btrfs_lookup_first_block_group(root->fs_info, last);
while (cache) {
if (cache->disk_cache_state == BTRFS_DC_CLEAR)
break;
cache = next_block_group(root, cache);
}
if (!cache) {
if (last == 0)
break;
last = 0;
continue;
}
err = cache_save_setup(cache, trans, path);
last = cache->key.objectid + cache->key.offset;
/*
* We don't need the lock here since we are protected by the transaction
* commit. We want to do the cache_save_setup first and then run the
* delayed refs to make sure we have the best chance at doing this all
* in one shot.
*/
while (!list_empty(&cur_trans->dirty_bgs)) {
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
dirty_list);
list_del_init(&cache->dirty_list);
if (cache->disk_cache_state == BTRFS_DC_CLEAR)
cache_save_setup(cache, trans, path);
if (!ret)
ret = btrfs_run_delayed_refs(trans, root,
(unsigned long) -1);
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
btrfs_write_out_cache(root, trans, cache, path);
if (!ret)
ret = write_one_cache_group(trans, root, path, cache);
btrfs_put_block_group(cache);
}
while (1) {
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
if (err) /* File system offline */
goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
while (cache) {
if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
btrfs_put_block_group(cache);
goto again;
}
if (cache->dirty)
break;
cache = next_block_group(root, cache);
}
if (!cache) {
if (last == 0)
break;
last = 0;
continue;
}
if (cache->disk_cache_state == BTRFS_DC_SETUP)
cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
cache->dirty = 0;
last = cache->key.objectid + cache->key.offset;
err = write_one_cache_group(trans, root, path, cache);
btrfs_put_block_group(cache);
if (err) /* File system offline */
goto out;
}
while (1) {
/*
* I don't think this is needed since we're just marking our
* preallocated extent as written, but just in case it can't
* hurt.
*/
if (last == 0) {
err = btrfs_run_delayed_refs(trans, root,
(unsigned long)-1);
if (err) /* File system offline */
goto out;
}
cache = btrfs_lookup_first_block_group(root->fs_info, last);
while (cache) {
/*
* Really this shouldn't happen, but it could if we
* couldn't write the entire preallocated extent and
* splitting the extent resulted in a new block.
*/
if (cache->dirty) {
btrfs_put_block_group(cache);
goto again;
}
if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
break;
cache = next_block_group(root, cache);
}
if (!cache) {
if (last == 0)
break;
last = 0;
continue;
}
err = btrfs_write_out_cache(root, trans, cache, path);
/*
* If we didn't have an error then the cache state is still
* NEED_WRITE, so we can set it to WRITTEN.
*/
if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
cache->disk_cache_state = BTRFS_DC_WRITTEN;
last = cache->key.objectid + cache->key.offset;
btrfs_put_block_group(cache);
}
out:
btrfs_free_path(path);
return err;
return ret;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@ -5375,8 +5298,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
btrfs_free_reserved_data_space(inode, num_bytes);
}
static int update_block_group(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc)
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_fs_info *info = root->fs_info;
@ -5414,6 +5338,14 @@ static int update_block_group(struct btrfs_root *root,
if (!alloc && cache->cached == BTRFS_CACHE_NO)
cache_block_group(cache, 1);
spin_lock(&trans->transaction->dirty_bgs_lock);
if (list_empty(&cache->dirty_list)) {
list_add_tail(&cache->dirty_list,
&trans->transaction->dirty_bgs);
btrfs_get_block_group(cache);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@ -5424,7 +5356,6 @@ static int update_block_group(struct btrfs_root *root,
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
cache->dirty = 1;
old_val = btrfs_block_group_used(&cache->item);
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
@ -6103,7 +6034,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
ret = update_block_group(root, bytenr, num_bytes, 0);
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@ -7063,7 +6994,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
ret = update_block_group(root, ins->objectid, ins->offset, 1);
ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@ -7152,7 +7083,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
ret = update_block_group(root, ins->objectid, root->nodesize, 1);
ret = update_block_group(trans, root, ins->objectid, root->nodesize,
1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@ -9005,6 +8937,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
INIT_LIST_HEAD(&cache->dirty_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
@ -9068,9 +9001,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
cache->disk_cache_state = BTRFS_DC_CLEAR;
if (btrfs_test_opt(root, SPACE_CACHE))
cache->dirty = 1;
cache->disk_cache_state = BTRFS_DC_CLEAR;
}
read_extent_buffer(leaf, &cache->item,
@ -9461,6 +9393,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
}
spin_lock(&trans->transaction->dirty_bgs_lock);
if (!list_empty(&block_group->dirty_list)) {
list_del_init(&block_group->dirty_list);
btrfs_put_block_group(block_group);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);

View File

@ -1243,6 +1243,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
root = root->fs_info->tree_root;
@ -1266,9 +1267,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
path, block_group->key.objectid);
if (ret) {
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);
dcs = BTRFS_DC_ERROR;
ret = 0;
#ifdef DEBUG
btrfs_err(root->fs_info,
@ -1277,6 +1276,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
#endif
}
spin_lock(&block_group->lock);
block_group->disk_cache_state = dcs;
spin_unlock(&block_group->lock);
iput(inode);
return ret;
}

View File

@ -248,6 +248,8 @@ static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
spin_lock_init(&cur_trans->dirty_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@ -1028,7 +1030,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
if (old_root_bytenr == root->node->start &&
old_root_used == btrfs_root_used(&root->root_item))
old_root_used == btrfs_root_used(&root->root_item) &&
(!extent_root ||
list_empty(&trans->transaction->dirty_bgs)))
break;
btrfs_set_root_node(&root->root_item, root->node);
@ -1047,6 +1051,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret)
return ret;
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret)
return ret;
}
return 0;
@ -1067,10 +1074,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
int ret;
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret)
return ret;
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
0, &eb);
@ -1990,6 +1993,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
ASSERT(list_empty(&cur_trans->dirty_bgs));
update_super_roots(root);
btrfs_set_super_log_root(root->fs_info->super_copy, 0);

View File

@ -58,6 +58,8 @@ struct btrfs_transaction {
struct list_head pending_chunks;
struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
spinlock_t dirty_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};