Btrfs: remove empty block groups automatically

One problem that has plagued us is that a user will use up all of his space with
data, remove a bunch of that data, and then try to create a bunch of small files
and run out of space.  This happens because all the chunks were allocated for
data since the metadata requirements were so low.  But now there's a bunch of
empty data block groups and not enough metadata space to do anything.  This
patch solves this problem by automatically deleting empty block groups.  If we
notice the used count go down to 0 when deleting or on mount notice that a block
group has a used count of 0 then we will queue it to be deleted.

When the cleaner thread runs we will double check to make sure the block group
is still empty and then we will delete it.  This patch has the side effect of no
longer having a bunch of BUG_ON()'s in the chunk delete code, which will be
helpful for both this and relocate.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
This commit is contained in:
Josef Bacik 2014-09-18 11:20:02 -04:00 committed by Chris Mason
parent 8407f55326
commit 47ab2a6c68
6 changed files with 250 additions and 73 deletions

View File

@ -1298,8 +1298,8 @@ struct btrfs_block_group_cache {
*/ */
struct list_head cluster_list; struct list_head cluster_list;
/* For delayed block group creation */ /* For delayed block group creation or deletion of empty block groups */
struct list_head new_bg_list; struct list_head bg_list;
}; };
/* delayed seq elem */ /* delayed seq elem */
@ -1568,6 +1568,7 @@ struct btrfs_fs_info {
int do_barriers; int do_barriers;
int closing; int closing;
int log_root_recovering; int log_root_recovering;
int open;
u64 total_pinned; u64 total_pinned;
@ -1717,6 +1718,9 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */ /* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work; struct work_struct async_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
}; };
struct btrfs_subvolume_writers { struct btrfs_subvolume_writers {
@ -3344,6 +3348,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
u64 size); u64 size);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans, int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 group_start); struct btrfs_root *root, u64 group_start);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root); struct btrfs_root *root);
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);

View File

@ -1769,6 +1769,7 @@ static int cleaner_kthread(void *arg)
} }
btrfs_run_delayed_iputs(root); btrfs_run_delayed_iputs(root);
btrfs_delete_unused_bgs(root->fs_info);
again = btrfs_clean_one_deleted_snapshot(root); again = btrfs_clean_one_deleted_snapshot(root);
mutex_unlock(&root->fs_info->cleaner_mutex); mutex_unlock(&root->fs_info->cleaner_mutex);
@ -2230,6 +2231,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock); rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->delalloc_root_mutex);
@ -2239,6 +2241,7 @@ int open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
btrfs_mapping_init(&fs_info->mapping_tree); btrfs_mapping_init(&fs_info->mapping_tree);
btrfs_init_block_rsv(&fs_info->global_block_rsv, btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL); BTRFS_BLOCK_RSV_GLOBAL);
@ -2977,6 +2980,8 @@ int open_ctree(struct super_block *sb,
fs_info->update_uuid_tree_gen = 1; fs_info->update_uuid_tree_gen = 1;
} }
fs_info->open = 1;
return 0; return 0;
fail_qgroup: fail_qgroup:
@ -3688,6 +3693,7 @@ void close_ctree(struct btrfs_root *root)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping); invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info); btrfs_stop_all_workers(fs_info);
fs_info->open = 0;
free_root_pointers(fs_info, 1); free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode); iput(fs_info->btree_inode);

View File

@ -5433,6 +5433,20 @@ static int update_block_group(struct btrfs_root *root,
spin_unlock(&cache->space_info->lock); spin_unlock(&cache->space_info->lock);
} else { } else {
old_val -= num_bytes; old_val -= num_bytes;
/*
* No longer have used bytes in this block group, queue
* it for deletion.
*/
if (old_val == 0) {
spin_lock(&info->unused_bgs_lock);
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
list_add_tail(&cache->bg_list,
&info->unused_bgs);
}
spin_unlock(&info->unused_bgs_lock);
}
btrfs_set_block_group_used(&cache->item, old_val); btrfs_set_block_group_used(&cache->item, old_val);
cache->pinned += num_bytes; cache->pinned += num_bytes;
cache->space_info->bytes_pinned += num_bytes; cache->space_info->bytes_pinned += num_bytes;
@ -8855,6 +8869,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
} }
up_write(&info->commit_root_sem); up_write(&info->commit_root_sem);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
block_group = list_first_entry(&info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
list_del_init(&block_group->bg_list);
btrfs_put_block_group(block_group);
}
spin_unlock(&info->unused_bgs_lock);
spin_lock(&info->block_group_cache_lock); spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache, block_group = rb_entry(n, struct btrfs_block_group_cache,
@ -8989,7 +9013,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
init_rwsem(&cache->data_rwsem); init_rwsem(&cache->data_rwsem);
INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->new_bg_list); INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache); btrfs_init_free_space_ctl(cache);
return cache; return cache;
@ -9130,8 +9154,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
__link_block_group(space_info, cache); __link_block_group(space_info, cache);
set_avail_alloc_bits(root->fs_info, cache->flags); set_avail_alloc_bits(root->fs_info, cache->flags);
if (btrfs_chunk_readonly(root, cache->key.objectid)) if (btrfs_chunk_readonly(root, cache->key.objectid)) {
set_block_group_ro(cache, 1); set_block_group_ro(cache, 1);
} else if (btrfs_block_group_used(&cache->item) == 0) {
spin_lock(&info->unused_bgs_lock);
/* Should always be true but just in case. */
if (list_empty(&cache->bg_list)) {
btrfs_get_block_group(cache);
list_add_tail(&cache->bg_list,
&info->unused_bgs);
}
spin_unlock(&info->unused_bgs_lock);
}
} }
list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@ -9172,10 +9206,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_key key; struct btrfs_key key;
int ret = 0; int ret = 0;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
new_bg_list) { list_del_init(&block_group->bg_list);
list_del_init(&block_group->new_bg_list);
if (ret) if (ret)
continue; continue;
@ -9261,7 +9293,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
__link_block_group(cache->space_info, cache); __link_block_group(cache->space_info, cache);
list_add_tail(&cache->new_bg_list, &trans->new_bgs); list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type); set_avail_alloc_bits(extent_root->fs_info, type);
@ -9430,6 +9462,101 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
return ret; return ret;
} }
/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
struct btrfs_root *root = fs_info->extent_root;
struct btrfs_trans_handle *trans;
int ret = 0;
if (!fs_info->open)
return;
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
u64 start, end;
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
space_info = block_group->space_info;
list_del_init(&block_group->bg_list);
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved ||
btrfs_block_group_used(&block_group->item) ||
block_group->ro) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
* the ro check in case balance is currently acting on
* this block group.
*/
spin_unlock(&block_group->lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&block_group->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = set_block_group_ro(block_group, 0);
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
goto next;
}
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
btrfs_set_block_group_rw(root, block_group);
ret = PTR_ERR(trans);
goto next;
}
/*
* We could have pending pinned extents for this block group,
* just delete them, we don't care about them anymore.
*/
start = block_group->key.objectid;
end = start + block_group->key.offset - 1;
clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY, GFP_NOFS);
clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
/* Reset pinned so btrfs_put_block_group doesn't complain */
block_group->pinned = 0;
/*
* Btrfs_remove_chunk will abort the transaction if things go
* horribly wrong.
*/
ret = btrfs_remove_chunk(trans, root,
block_group->key.objectid);
btrfs_end_transaction(trans, root);
next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
}
int btrfs_init_space_info(struct btrfs_fs_info *fs_info) int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{ {
struct btrfs_space_info *space_info; struct btrfs_space_info *space_info;

View File

@ -45,7 +45,7 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
spin_lock_init(&cache->lock); spin_lock_init(&cache->lock);
INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->new_bg_list); INIT_LIST_HEAD(&cache->bg_list);
btrfs_init_free_space_ctl(cache); btrfs_init_free_space_ctl(cache);

View File

@ -2568,23 +2568,114 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
return ret; return ret;
} }
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 chunk_offset)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
struct btrfs_root *extent_root = root->fs_info->extent_root;
struct map_lookup *map;
u64 dev_extent_len = 0;
u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
u64 chunk_tree = root->fs_info->chunk_root->objectid;
int i, ret = 0;
/* Just in case */
root = root->fs_info->chunk_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
if (!em || em->start > chunk_offset ||
em->start + em->len < chunk_offset) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doens't
* do anything we still error out.
*/
ASSERT(0);
if (em)
free_extent_map(em);
return -EINVAL;
}
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
if (device->bytes_used > 0) {
lock_chunks(root);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += dev_extent_len;
spin_unlock(&root->fs_info->free_chunk_lock);
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
}
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
}
}
ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
}
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
}
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for the tree */
free_extent_map(em);
out:
/* once for us */
free_extent_map(em);
return ret;
}
static int btrfs_relocate_chunk(struct btrfs_root *root, static int btrfs_relocate_chunk(struct btrfs_root *root,
u64 chunk_tree, u64 chunk_objectid, u64 chunk_tree, u64 chunk_objectid,
u64 chunk_offset) u64 chunk_offset)
{ {
struct extent_map_tree *em_tree;
struct btrfs_root *extent_root; struct btrfs_root *extent_root;
struct btrfs_trans_handle *trans; struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct extent_map *em;
struct map_lookup *map;
u64 dev_extent_len = 0;
int ret; int ret;
int i;
root = root->fs_info->chunk_root; root = root->fs_info->chunk_root;
extent_root = root->fs_info->extent_root; extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
ret = btrfs_can_relocate(extent_root, chunk_offset); ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret) if (ret)
@ -2606,63 +2697,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
* step two, delete the device extents and the * step two, delete the device extents and the
* chunk tree entries * chunk tree entries
*/ */
read_lock(&em_tree->lock); ret = btrfs_remove_chunk(trans, root, chunk_offset);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
read_unlock(&em_tree->lock);
BUG_ON(!em || em->start > chunk_offset ||
em->start + em->len < chunk_offset);
map = (struct map_lookup *)em->bdev;
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
ret = btrfs_free_dev_extent(trans, device,
map->stripes[i].physical,
&dev_extent_len);
BUG_ON(ret);
if (device->bytes_used > 0) {
lock_chunks(root);
btrfs_device_set_bytes_used(device,
device->bytes_used - dev_extent_len);
spin_lock(&root->fs_info->free_chunk_lock);
root->fs_info->free_chunk_space += dev_extent_len;
spin_unlock(&root->fs_info->free_chunk_lock);
btrfs_clear_space_info_full(root->fs_info);
unlock_chunks(root);
}
if (map->stripes[i].dev) {
ret = btrfs_update_device(trans, map->stripes[i].dev);
BUG_ON(ret);
}
}
ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
chunk_offset);
BUG_ON(ret);
trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
BUG_ON(ret);
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
BUG_ON(ret);
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for the tree */
free_extent_map(em);
/* once for us */
free_extent_map(em);
btrfs_end_transaction(trans, root); btrfs_end_transaction(trans, root);
return 0; return ret;
} }
static int btrfs_relocate_sys_chunks(struct btrfs_root *root) static int btrfs_relocate_sys_chunks(struct btrfs_root *root)

View File

@ -463,6 +463,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, struct btrfs_root *extent_root,
u64 chunk_offset, u64 chunk_size); u64 chunk_offset, u64 chunk_size);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 chunk_offset);
static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
{ {