From b2acdddfad13c38a1e8b927d83c3cf321f63601a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 7 Oct 2015 17:23:23 +0800 Subject: [PATCH 01/21] Btrfs: add missing brelse when superblock checksum fails Looks like oversight, call brelse() when checksum fails. Further down the code, in the non error path, we do call brelse() and so we don't see brelse() in the goto error paths. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 974be09e7556..216f97765b56 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2668,6 +2668,7 @@ int open_ctree(struct super_block *sb, if (btrfs_check_super_csum(bh->b_data)) { printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); err = -EINVAL; + brelse(bh); goto fail_alloc; } From be7bd730841e69fe8f70120098596f648cd1f3ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 22 Oct 2015 15:05:09 -0400 Subject: [PATCH 02/21] Btrfs: igrab inode in writepage We hit this panic on a few of our boxes this week where we have an ordered_extent with an NULL inode. We do an igrab() of the inode in writepages, but weren't doing it in writepage which can be called directly from the VM on dirty pages. If the inode has been unlinked then we could have I_FREEING set which means igrab() would return NULL and we get this panic. Fix this by trying to igrab in btrfs_writepage, and if it returns NULL then just redirty the page and return AOP_WRITEPAGE_ACTIVATE; so the VM knows it wasn't successful. Thanks, Signed-off-by: Josef Bacik Reviewed-by: Liu Bo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a70c5790f8f5..e8d9f9c8d00d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8534,15 +8534,28 @@ int btrfs_readpage(struct file *file, struct page *page) static int btrfs_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; - + struct inode *inode = page->mapping->host; + int ret; if (current->flags & PF_MEMALLOC) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; } + + /* + * If we are under memory pressure we will call this directly from the + * VM, we need to make sure we have the inode referenced for the ordered + * extent. If not just return like we didn't do anything. + */ + if (!igrab(inode)) { + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; + } tree = &BTRFS_I(page->mapping->host)->io_tree; - return extent_write_full_page(tree, page, btrfs_get_extent, wbc); + ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc); + btrfs_add_delayed_iput(inode); + return ret; } static int btrfs_writepages(struct address_space *mapping, From c5ca87819d7f10625b0940b356adb8ac0f1080d7 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Thu, 19 Nov 2015 17:26:22 +0800 Subject: [PATCH 03/21] btrfs: Support convert to -d dup for btrfs-convert Since we will add support for -d dup for non-mixed filesystem, kernel need to support converting to this raid-type. This patch remove limitation of above case. Tested by following script: (combination of dup conversion with fsck): export TEST_DEV='/dev/vdc' export TEST_DIR='/var/ltf/tester/mnt' do_dup_test() { local m_from="$1" local d_from="$2" local m_to="$3" local d_to="$4" echo "Convert from -m $m_from -d $d_from to -m $m_to -d $d_to" umount "$TEST_DIR" &>/dev/null ./mkfs.btrfs -f -m "$m_from" -d "$d_from" "$TEST_DEV" >/dev/null || return 1 mount "$TEST_DEV" "$TEST_DIR" || return 1 cp -a /sbin/* "$TEST_DIR" [[ "$m_from" != "$m_to" ]] && { ./btrfs balance start -f -mconvert="$m_to" "$TEST_DIR" || return 1 } [[ "$d_from" != "$d_to" ]] && { local opt=() [[ "$d_to" == single ]] && opt+=("-f") ./btrfs balance start "${opt[@]}" -dconvert="$d_to" "$TEST_DIR" || return 1 } umount "$TEST_DIR" || return 1 ./btrfsck "$TEST_DEV" || return 1 echo return 0 } test_all() { for m_from in single dup; do for d_from in single dup; do for m_to in single dup; do for d_to in single dup; do do_dup_test "$m_from" "$d_from" "$m_to" "$d_to" || return 1 done done done done } test_all Signed-off-by: Zhao Lei Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a23399e8e3ab..33b02505d110 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3723,14 +3723,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - /* allow dup'ed data chunks only in mixed mode */ - if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - btrfs_err(fs_info, "dup for data is not allowed"); - ret = -EINVAL; - goto out; - } - /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | From 8089fe62c6603860f6796ca80519b92391292f21 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 19 Nov 2015 14:15:51 +0100 Subject: [PATCH 04/21] btrfs: put delayed item hook into inode Inodes for delayed iput allocate a trivial helper structure, let's place the list hook directly into the inode and save a kmalloc (killing a __GFP_NOFAIL as a bonus) at the cost of increasing size of btrfs_inode. The inode can be put into the delayed_iputs list more than once and we have to keep the count. This means we can't use the list_splice to process a bunch of inodes because we'd lost track of the count if the inode is put into the delayed iputs again while it's processed. Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 4 ++++ fs/btrfs/inode.c | 54 +++++++++++++++++++----------------------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc13fae2..61205e3bbefa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -192,6 +192,10 @@ struct btrfs_inode { /* File creation time. */ struct timespec i_otime; + /* Hook into fs_info->delayed_iputs */ + struct list_head delayed_iput; + long delayed_iput_count; + struct inode vfs_inode; }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8d9f9c8d00d..255d0f966b9f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3106,55 +3106,47 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } -struct delayed_iput { - struct list_head list; - struct inode *inode; -}; - -/* JDM: If this is fs-wide, why can't we add a pointer to - * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct delayed_iput *delayed; + struct btrfs_inode *binode = BTRFS_I(inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); - delayed->inode = inode; - spin_lock(&fs_info->delayed_iput_lock); - list_add_tail(&delayed->list, &fs_info->delayed_iputs); + if (binode->delayed_iput_count == 0) { + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + } else { + binode->delayed_iput_count++; + } spin_unlock(&fs_info->delayed_iput_lock); } void btrfs_run_delayed_iputs(struct btrfs_root *root) { - LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - struct delayed_iput *delayed; - int empty; - - spin_lock(&fs_info->delayed_iput_lock); - empty = list_empty(&fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); - if (empty) - return; down_read(&fs_info->delayed_iput_sem); - spin_lock(&fs_info->delayed_iput_lock); - list_splice_init(&fs_info->delayed_iputs, &list); - spin_unlock(&fs_info->delayed_iput_lock); + while (!list_empty(&fs_info->delayed_iputs)) { + struct btrfs_inode *inode; - while (!list_empty(&list)) { - delayed = list_entry(list.next, struct delayed_iput, list); - list_del(&delayed->list); - iput(delayed->inode); - kfree(delayed); + inode = list_first_entry(&fs_info->delayed_iputs, + struct btrfs_inode, delayed_iput); + if (inode->delayed_iput_count) { + inode->delayed_iput_count--; + list_move_tail(&inode->delayed_iput, + &fs_info->delayed_iputs); + } else { + list_del_init(&inode->delayed_iput); + } + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + spin_lock(&fs_info->delayed_iput_lock); } - + spin_unlock(&fs_info->delayed_iput_lock); up_read(&root->fs_info->delayed_iput_sem); } @@ -9037,6 +9029,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; + ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -9061,6 +9054,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); return inode; From 35b3ad50baa4a5fc2ae616c0513d2987bfb52a85 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 30 Nov 2015 16:51:29 +0100 Subject: [PATCH 05/21] btrfs: better packing of btrfs_delayed_extent_op btrfs_delayed_extent_op can be packed in a better way, it's 40 bytes now and has 8 unused bytes. Reducing the level type to u8 makes it possible to squeeze it to the padding byte after key. The bitfields were switched to bool as there's space to store the full byte without increasing the whole structure, besides that the generated assembly is smaller. struct btrfs_delayed_extent_op { struct btrfs_disk_key key; /* 0 17 */ u8 level; /* 17 1 */ bool update_key; /* 18 1 */ bool update_flags; /* 19 1 */ bool is_data; /* 20 1 */ /* XXX 3 bytes hole, try to pack */ u64 flags_to_set; /* 24 8 */ /* size: 32, cachelines: 1, members: 6 */ /* sum members: 29, holes: 1, sum holes: 3 */ /* last cacheline: 32 bytes */ }; The final size is 32 bytes which gives +26 object per slab page. text data bss dec hex filename 938811 43670 23144 1005625 f5839 fs/btrfs/btrfs.ko.before 938747 43670 23144 1005561 f57f9 fs/btrfs/btrfs.ko.after Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 4 ++-- fs/btrfs/delayed-ref.h | 8 ++++---- fs/btrfs/extent-tree.c | 15 ++++++--------- fs/btrfs/extent-tree.h | 0 4 files changed, 12 insertions(+), 15 deletions(-) delete mode 100644 fs/btrfs/extent-tree.h diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e06dd75ad13f..914ac13bd92f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -493,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, memcpy(&existing_ref->extent_op->key, &ref->extent_op->key, sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = 1; + existing_ref->extent_op->update_key = true; } if (ref->extent_op->update_flags) { existing_ref->extent_op->flags_to_set |= ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = 1; + existing_ref->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(ref->extent_op); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 00ed02cbf3e9..c24b653c7343 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -75,11 +75,11 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; + u8 level; + bool update_key; + bool update_flags; + bool is_data; u64 flags_to_set; - int level; - unsigned int update_key:1; - unsigned int update_flags:1; - unsigned int is_data:1; }; /* diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c4661db2b72a..d796b53850d0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2988,9 +2988,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return -ENOMEM; extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, @@ -7980,12 +7980,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - if (skinny_metadata) - extent_op->update_key = 0; - else - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; + extent_op->update_key = skinny_metadata ? false : true; + extent_op->update_flags = true; + extent_op->is_data = false; extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h deleted file mode 100644 index e69de29bb2d1..000000000000 From f5cdedd73fa71b74dcc42f2a11a5735d89ce7c4f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 30 Nov 2015 17:27:06 +0100 Subject: [PATCH 06/21] btrfs: handle invalid num_stripes in sys_array We can handle the special case of num_stripes == 0 directly inside btrfs_read_sys_array. The BUG_ON in btrfs_chunk_item_size is there to catch other unhandled cases where we fail to validate external data. A crafted or corrupted image crashes at mount time: BTRFS: device fsid 9006933e-2a9a-44f0-917f-514252aeec2c devid 1 transid 7 /dev/loop0 BTRFS info (device loop0): disk space caching is enabled BUG: failure at fs/btrfs/ctree.h:337/btrfs_chunk_item_size()! Kernel panic - not syncing: BUG! CPU: 0 PID: 313 Comm: mount Not tainted 4.2.5-00657-ge047887-dirty #25 Stack: 637af890 60062489 602aeb2e 604192ba 60387961 00000011 637af8a0 6038a835 637af9c0 6038776b 634ef32b 00000000 Call Trace: [<6001c86d>] show_stack+0xfe/0x15b [<6038a835>] dump_stack+0x2a/0x2c [<6038776b>] panic+0x13e/0x2b3 [<6020f099>] btrfs_read_sys_array+0x25d/0x2ff [<601cfbbe>] open_ctree+0x192d/0x27af [<6019c2c1>] btrfs_mount+0x8f5/0xb9a [<600bc9a7>] mount_fs+0x11/0xf3 [<600d5167>] vfs_kern_mount+0x75/0x11a [<6019bcb0>] btrfs_mount+0x2e4/0xb9a [<600bc9a7>] mount_fs+0x11/0xf3 [<600d5167>] vfs_kern_mount+0x75/0x11a [<600d710b>] do_mount+0xa35/0xbc9 [<600d7557>] SyS_mount+0x95/0xc8 [<6001e884>] handle_syscall+0x6b/0x8e Reported-by: Jiri Slaby Reported-by: Vegard Nossum CC: stable@vger.kernel.org # 3.19+ Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 33b02505d110..dc6697cfae0c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6504,6 +6504,14 @@ int btrfs_read_sys_array(struct btrfs_root *root) goto out_short_read; num_stripes = btrfs_chunk_num_stripes(sb, chunk); + if (!num_stripes) { + printk(KERN_ERR + "BTRFS: invalid number of stripes %u in sys_array at offset %u\n", + num_stripes, cur_offset); + ret = -EIO; + break; + } + len = btrfs_chunk_item_size(num_stripes); if (cur_offset + len > array_size) goto out_short_read; From 93a3d46780b0f207f76608078eb965cf7b83902c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 30 Nov 2015 17:27:09 +0100 Subject: [PATCH 07/21] btrfs: verbose error when we find an unexpected item in sys_array Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dc6697cfae0c..0577d7f4c442 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6520,6 +6520,9 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (ret) break; } else { + printk(KERN_ERR + "BTRFS: unexpected item type %u in sys_array at offset %u\n", + (u32)key.type, cur_offset); ret = -EIO; break; } From 0de270fa83d5a45664e3f2428d432145037ac432 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 1 Dec 2015 18:09:12 +0100 Subject: [PATCH 08/21] btrfs: drop duplicate prefix from scrub workqueues The helper btrfs_alloc_workqueue will add the "btrfs-" prefix. Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b091d94ceef6..1d1f639b7c79 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3735,27 +3735,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + btrfs_alloc_workqueue("scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + btrfs_alloc_workqueue("scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("btrfs-scrubparity", flags, + btrfs_alloc_workqueue("scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; From 100d57025cce6bf568a10660c0d884bcc64c580e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 8 Dec 2015 14:39:32 +0100 Subject: [PATCH 09/21] btrfs: don't use slab cache for struct btrfs_delalloc_work Although we prefer to use separate caches for various structs, it seems better not to do that for struct btrfs_delalloc_work. Objects of this type are allocated rarely, when transaction commit calls btrfs_start_delalloc_roots, requesting delayed iputs. The objects are temporary (with some IO involved) but still allocated and freed within __start_delalloc_inodes. Memory allocation failure is handled. The slab cache is empty most of the time (observed on several systems), so if we need to allocate a new slab object, the first one has to allocate a full page. In a potential case of low memory conditions this might fail with higher probability compared to using the generic slab caches. Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 255d0f966b9f..0214a812fc0e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -77,7 +77,6 @@ static const struct file_operations btrfs_dir_file_operations; static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; -static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; @@ -9159,8 +9158,6 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) kmem_cache_destroy(btrfs_free_space_cachep); - if (btrfs_delalloc_work_cachep) - kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) @@ -9195,13 +9192,6 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; - btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", - sizeof(struct btrfs_delalloc_work), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL); - if (!btrfs_delalloc_work_cachep) - goto fail; - return 0; fail: btrfs_destroy_cachep(); @@ -9446,7 +9436,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, { struct btrfs_delalloc_work *work; - work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; @@ -9465,7 +9455,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) { wait_for_completion(&work->completion); - kmem_cache_free(btrfs_delalloc_work_cachep, work); + kfree(work); } /* From 28f0779a3fd6ef015303780f0b9a92b24728bc4b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 19 Nov 2015 11:42:24 +0100 Subject: [PATCH 10/21] btrfs tests: replace whole ops structure for free space tests Preparatory work for making btrfs_free_space_op constant. In test_steal_space_from_bitmap_to_extent, we substitute use_bitmap with own version thus preventing constification. We can rework it so we replace the whole structure with the correct function pointers. Signed-off-by: David Sterba --- fs/btrfs/tests/free-space-tests.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8b72b005bfb9..46c2f292ea94 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -445,9 +445,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int ret; u64 offset; u64 max_extent_size; - - bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, - struct btrfs_free_space *); + struct btrfs_free_space_op test_free_space_ops = { + .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, + .use_bitmap = test_use_bitmap, + }; + struct btrfs_free_space_op *orig_free_space_ops; test_msg("Running space stealing from bitmap to extent\n"); @@ -469,8 +471,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that forces use of bitmaps as soon as we have at least 1 * extent entry. */ - use_bitmap_op = cache->free_space_ctl->op->use_bitmap; - cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; /* * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ @@ -877,7 +879,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) if (ret) return ret; - cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + cache->free_space_ctl->op = orig_free_space_ops; __btrfs_remove_free_space_cache(cache->free_space_ctl); return 0; From 20e5506baf3fd651e245bc970d8c11a734ee1b8a Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 19 Nov 2015 11:42:28 +0100 Subject: [PATCH 11/21] btrfs: constify remaining structs with function pointers * struct extent_io_ops * struct btrfs_free_space_op Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/free-space-cache.h | 2 +- fs/btrfs/inode-map.c | 4 ++-- fs/btrfs/inode.c | 4 ++-- fs/btrfs/tests/free-space-tests.c | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfe99bec49de..ed8a3b7d3565 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2016,7 +2016,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_space_op = { +static const struct btrfs_free_space_op free_space_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index f251865eb6f3..33178c490ace 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -37,7 +37,7 @@ struct btrfs_free_space_ctl { int total_bitmaps; int unit; u64 start; - struct btrfs_free_space_op *op; + const struct btrfs_free_space_op *op; void *private; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 767a6056ac45..53014e963617 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_ino_op = { +static const struct btrfs_free_space_op free_ino_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, return false; } -static struct btrfs_free_space_op pinned_free_ino_op = { +static const struct btrfs_free_space_op pinned_free_ino_op = { .recalc_thresholds = pinned_recalc_thresholds, .use_bitmap = pinned_use_bitmap, }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0214a812fc0e..c4cfba79ab47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -74,7 +74,7 @@ static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; -static struct extent_io_ops btrfs_extent_io_ops; +static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -10020,7 +10020,7 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static struct extent_io_ops btrfs_extent_io_ops = { +static const struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, .merge_bio_hook = btrfs_merge_bio_hook, diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 46c2f292ea94..69a11e63d668 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -445,11 +445,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int ret; u64 offset; u64 max_extent_size; - struct btrfs_free_space_op test_free_space_ops = { + const struct btrfs_free_space_op test_free_space_ops = { .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, .use_bitmap = test_use_bitmap, }; - struct btrfs_free_space_op *orig_free_space_ops; + const struct btrfs_free_space_op *orig_free_space_ops; test_msg("Running space stealing from bitmap to extent\n"); From 4d4ab6d6bc05ba65169de9a5391e6ccbe09d8719 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 19 Nov 2015 11:42:31 +0100 Subject: [PATCH 12/21] btrfs: constify static arrays There are a few statically initialized arrays that can be made const. The remaining (like file_system_type, sysfs attributes or prop handlers) do not allow that due to type mismatch when passed to the APIs or because the structures are modified through other members. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/btrfs/super.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 35489e7129a7..99059fcb563e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,7 +174,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4 }; +static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c4cfba79ab47..3e1398b123aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -83,7 +83,7 @@ struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index da94138eb85e..fd429d7f40ab 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5286,7 +5286,7 @@ out_unlock: static int btrfs_ioctl_get_supported_features(struct file *file, void __user *arg) { - static struct btrfs_ioctl_feature_flags features[3] = { + static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), INIT_FEATURE_FLAGS(SAFE_SET), INIT_FEATURE_FLAGS(SAFE_CLEAR) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 24154e422945..b813fd76f03f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -309,7 +309,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, {Opt_subvolid, "subvolid=%s"}, From e4058b54d1e442b6b3eca949f0d63d49ba2b020d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 27 Nov 2015 16:31:35 +0100 Subject: [PATCH 13/21] btrfs: cleanup, use enum values for btrfs_path reada Replace the integers by enums for better readability. The value 2 does not have any meaning since a717531942f488209dded30f6bc648167bcefa72 "Btrfs: do less aggressive btree readahead" (2009-01-22). Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 9 ++++----- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 12 ++++++------ fs/btrfs/file-item.c | 4 ++-- fs/btrfs/inode-map.c | 2 +- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/relocation.c | 12 ++++++------ fs/btrfs/scrub.c | 2 +- fs/btrfs/volumes.c | 6 +++--- fs/btrfs/xattr.c | 2 +- 11 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b8e235c4b6d..be1be0422ff4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root, u64 target; u64 nread = 0; u64 gen; - int direction = path->reada; struct extent_buffer *eb; u32 nr; u32 blocksize; @@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root, nr = slot; while (1) { - if (direction < 0) { + if (path->reada == READA_BACK) { if (nr == 0) break; nr--; - } else if (direction > 0) { + } else if (path->reada == READA_FORWARD) { nr++; if (nr >= nritems) break; } - if (path->reada < 0 && objectid) { + if (path->reada == READA_BACK && objectid) { btrfs_node_key(node, &disk_key, nr); if (btrfs_disk_key_objectid(&disk_key) != objectid) break; @@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); free_extent_buffer(tmp); - if (p->reada) + if (p->reada != READA_NONE) reada_for_search(root, p, level, slot, key->objectid); btrfs_release_path(p); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 99059fcb563e..a1cf4c813578 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -590,6 +590,7 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ +enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d796b53850d0..9e3639b21567 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -438,7 +438,7 @@ static noinline void caching_thread(struct btrfs_work *work) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; key.objectid = last; key.offset = 0; @@ -2115,7 +2115,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, @@ -2141,7 +2141,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, @@ -2254,7 +2254,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 1); @@ -6438,7 +6438,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -9688,7 +9688,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 58ece6558430..a67e1c828d0f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, } if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) - path->reada = 2; + path->reada = READA_FORWARD; WARN_ON(bio->bi_vcnt <= 0); @@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (search_commit) { path->skip_locking = 1; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 53014e963617..1951ad69382f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -48,7 +48,7 @@ static int caching_kthread(void *data) /* Since the commit root is read-only, we can safely skip locking. */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.offset = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3e1398b123aa..dc2c1bedbafa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3342,7 +3342,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = -ENOMEM; goto out; } - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -4308,7 +4308,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; /* * We want to drop from the next block forward in case this new size is @@ -5744,7 +5744,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); @@ -6775,7 +6775,7 @@ again: * Chances are we'll be called again, so go ahead and do * readahead */ - path->reada = 1; + path->reada = READA_FORWARD; } ret = btrfs_lookup_file_extent(trans, root, path, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fd429d7f40ab..6e896415a734 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3478,7 +3478,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; } - path->reada = 2; + path->reada = READA_FORWARD; /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b4ca5454ef1a..ef6d8fc85853 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -708,8 +708,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = 1; - path2->reada = 2; + path1->reada = READA_FORWARD; + path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { @@ -2130,7 +2130,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -3527,7 +3527,7 @@ static int find_data_references(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3917,7 +3917,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; ret = prepare_to_relocate(rc); if (ret) { @@ -4343,7 +4343,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1d1f639b7c79..143f2b7a8edf 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3507,7 +3507,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0577d7f4c442..e95d24967fa0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1102,7 +1102,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = device->devid; key.offset = start; @@ -1271,7 +1271,7 @@ again: goto out; } - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -4260,7 +4260,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; lock_chunks(root); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 1fcd7b6e7564..28b4e7e71e61 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; /* search for our xattrs */ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); From dccabfad20880bc6c8be21b538df4293506b99f8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 27 Nov 2015 16:31:38 +0100 Subject: [PATCH 14/21] btrfs: use smaller type for btrfs_path reada The possible values for reada are all positive and bounded, we can later save some bytes by storing it in u8. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a1cf4c813578..e5f9b96fc86f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -596,7 +596,7 @@ struct btrfs_path { int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ int locks[BTRFS_MAX_LEVEL]; - int reada; + u8 reada; /* keep some upper locks as we walk down */ int lowest_level; From 7853f15b2aeeb01c587168fc3f7f0ff76a3c9bfd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 27 Nov 2015 16:31:42 +0100 Subject: [PATCH 15/21] btrfs: use smaller type for btrfs_path lowest_level The level is 0..7, we can use smaller type. The size of btrfs_path is now 136 bytes from 144, which is +2 objects that fit into a 4k slab. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5f9b96fc86f..09ee92d9670c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -598,7 +598,7 @@ struct btrfs_path { int locks[BTRFS_MAX_LEVEL]; u8 reada; /* keep some upper locks as we walk down */ - int lowest_level; + u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks From 4fb72bf2e913ca3798afd9e226e2416918936e49 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 27 Nov 2015 16:31:45 +0100 Subject: [PATCH 16/21] btrfs: use smaller type for btrfs_path locks The values of btrfs_path::locks are 0 to 4, fit into a u8. Let's see: * overall size of btrfs_path drops down from 136 to 112 (-24 bytes), * better packing in a slab page +6 objects * the whole structure now fits to 2 cachelines * slight decrease in code size: text data bss dec hex filename 938731 43670 23144 1005545 f57e9 fs/btrfs/btrfs.ko.before 938203 43670 23144 1005017 f55d9 fs/btrfs/btrfs.ko.after (and the generated assembly does not change much) The main purpose is to decrease the size of the structure without affecting performance. The byte access is usually well behaving accross arches, the locks are not accessed frequently and sometimes just compared to zero. Note for further size reduction attempts: the slots could be made u16 but this might generate worse code on some arches (non-byte and non-int access). Also the range of operations on slots is wider compared to locks and the potential performance drop should be evaluated first. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 09ee92d9670c..5e0fe0914d54 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -595,7 +595,7 @@ struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ - int locks[BTRFS_MAX_LEVEL]; + u8 locks[BTRFS_MAX_LEVEL]; u8 reada; /* keep some upper locks as we walk down */ u8 lowest_level; From a1ee736268448d74af25737568e383acb84c3c18 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 10 Nov 2015 18:53:56 +0100 Subject: [PATCH 17/21] btrfs: do an allocation earlier during snapshot creation We can allocate pending_snapshot earlier and do not have to do cleanup in case of failure. Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6e896415a734..fa25091e10ab 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -655,22 +655,20 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) + return -ENOMEM; + atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - goto out; + goto dec_and_free; btrfs_wait_ordered_extents(root, -1); - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - goto out; - } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -686,7 +684,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto free; + goto dec_and_free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -737,11 +735,11 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -free: - kfree(pending_snapshot); -out: +dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshoted)) wake_up_atomic_t(&root->will_be_snapshoted); + kfree(pending_snapshot); + return ret; } From b0c0ea6338d5018e02d27c5315084fb1a5d099f6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 10 Nov 2015 18:54:00 +0100 Subject: [PATCH 18/21] btrfs: allocate root item at snapshot ioctl time The actual snapshot creation is delayed until transaction commit. If we cannot get enough memory for the root item there, we have to fail the whole transaction commit which is bad. So we'll allocate the memory at the ioctl call and pass it along with the pending_snapshot struct. The potential ENOMEM will be returned to the caller of snapshot ioctl. Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 +++++++++ fs/btrfs/transaction.c | 9 +++------ fs/btrfs/transaction.h | 1 + 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fa25091e10ab..27fc660b0ff3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -659,6 +659,13 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!pending_snapshot) return -ENOMEM; + pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), + GFP_NOFS); + if (!pending_snapshot->root_item) { + ret = -ENOMEM; + goto free_pending; + } + atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); btrfs_wait_for_no_snapshoting_writes(root); @@ -738,6 +745,8 @@ fail: dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshoted)) wake_up_atomic_t(&root->will_be_snapshoted); +free_pending: + kfree(pending_snapshot->root_item); kfree(pending_snapshot); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index be8eae80ff65..2074106122d9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1325,11 +1325,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, return 0; } - new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); - if (!new_root_item) { - pending->error = -ENOMEM; - goto root_item_alloc_fail; - } + ASSERT(pending->root_item); + new_root_item = pending->root_item; pending->error = btrfs_find_free_objectid(tree_root, &objectid); if (pending->error) @@ -1562,7 +1559,7 @@ clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); -root_item_alloc_fail: + pending->root_item = NULL; btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 64c8221b6165..b6f9a3c94468 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -137,6 +137,7 @@ struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; struct btrfs_root *root; + struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; /* block reservation for the operation */ From 8546b570511f428838129c00e701eda481cd7c13 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 10 Nov 2015 18:54:03 +0100 Subject: [PATCH 19/21] btrfs: preallocate path for snapshot creation at ioctl time We can also preallocate btrfs_path that's used during pending snapshot creation and avoid another late ENOMEM failure. Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 +++- fs/btrfs/transaction.c | 9 ++++----- fs/btrfs/transaction.h | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 27fc660b0ff3..3d9b27d2176e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -661,7 +661,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), GFP_NOFS); - if (!pending_snapshot->root_item) { + pending_snapshot->path = btrfs_alloc_path(); + if (!pending_snapshot->root_item || !pending_snapshot->path) { ret = -ENOMEM; goto free_pending; } @@ -747,6 +748,7 @@ dec_and_free: wake_up_atomic_t(&root->will_be_snapshoted); free_pending: kfree(pending_snapshot->root_item); + btrfs_free_path(pending_snapshot->path); kfree(pending_snapshot); return ret; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2074106122d9..be463b7f1f30 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1319,11 +1319,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; - path = btrfs_alloc_path(); - if (!path) { - pending->error = -ENOMEM; - return 0; - } + ASSERT(pending->path); + path = pending->path; ASSERT(pending->root_item); new_root_item = pending->root_item; @@ -1561,6 +1558,8 @@ no_free_objectid: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); + pending->path = NULL; + return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b6f9a3c94468..72be51f7ca2f 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -140,6 +140,7 @@ struct btrfs_pending_snapshot { struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; + struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; From ca8a51b3a979d57b082b14eda38602b7f52d81d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sat, 10 Oct 2015 17:59:53 +0200 Subject: [PATCH 20/21] btrfs: statfs: report zero available if metadata are exhausted There is one ENOSPC case that's very confusing. There's Available greater than zero but no file operation succeds (besides removing files). This happens when the metadata are exhausted and there's no possibility to allocate another chunk. In this scenario it's normal that there's still some space in the data chunk and the calculation in df reflects that in the Avail value. To at least give some clue about the ENOSPC situation, let statfs report zero value in Avail, even if there's still data space available. Current: /dev/sdb1 4.0G 3.3G 719M 83% /mnt/test New: /dev/sdb1 4.0G 3.3G 0 100% /mnt/test We calculate the remaining metadata space minus global reserve. If this is (supposedly) smaller than zero, there's no space. But this does not hold in practice, the exhausted state happens where's still some positive delta. So we apply some guesswork and compare the delta to a 4M threshold. (Practically observed delta was 2M.) We probably cannot calculate the exact threshold value because this depends on the internal reservations requested by various operations, so some operations that consume a few metadata will succeed even if the Avail is zero. But this is better than the other way around. Signed-off-by: David Sterba --- fs/btrfs/super.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b813fd76f03f..5099e4633e50 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1956,6 +1956,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * there are other factors that may change the result (like a new metadata * chunk). * + * If metadata is exhausted, f_bavail will be 0. + * * FIXME: not accurate for mixed block groups, total and free/used are ok, * available appears slightly larger. */ @@ -1967,11 +1969,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_space_info *found; u64 total_used = 0; u64 total_free_data = 0; + u64 total_free_meta = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)fs_info->fsid; unsigned factor = 1; struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; + u64 thresh = 0; /* * holding chunk_muext to avoid allocating new chunks, holding @@ -1997,6 +2001,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) } } } + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + total_free_meta += found->disk_total - found->disk_used; total_used += found->disk_used; } @@ -2019,6 +2025,24 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; + /* + * We calculate the remaining metadata space minus global reserve. If + * this is (supposedly) smaller than zero, there's no space. But this + * does not hold in practice, the exhausted state happens where's still + * some positive delta. So we apply some guesswork and compare the + * delta to a 4M threshold. (Practically observed delta was ~2M.) + * + * We probably cannot calculate the exact threshold value because this + * depends on the internal reservations requested by various + * operations, so some operations that consume a few metadata will + * succeed even if the Avail is zero. But this is better than the other + * way around. + */ + thresh = 4 * 1024 * 1024; + + if (total_free_meta - thresh < block_rsv->size) + buf->f_bavail = 0; + buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_namelen = BTRFS_NAME_LEN; From ee592d07716139ef7c3ea8049936cbc3aafbc533 Mon Sep 17 00:00:00 2001 From: Sam Tygier Date: Wed, 6 Jan 2016 08:46:12 +0000 Subject: [PATCH 21/21] Btrfs: Check metadata redundancy on balance When converting a filesystem via balance check that metadata mode is at least as redundant as the data mode. For example give warning when: -dconvert=raid1 -mconvert=single Signed-off-by: Sam Tygier [ minor message reformatting ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e95d24967fa0..574a717fcb6e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3748,6 +3748,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); + if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < + btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + btrfs_warn(fs_info, + "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", + bctl->meta.target, bctl->data.target); + } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { fs_info->num_tolerated_disk_barrier_failures = min( btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),