mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-04 04:02:26 +00:00
763748b238
Whenever we want to create a new dir index item (when creating an inode, creating a hard link, or renaming a file) we reserve 1 unit of metadata space for it in a transaction (that's 256K for a node/leaf size of 16K), and then create a delayed insertion item for it to be added later to the subvolume's tree. That unit of metadata is kept until the delayed item is inserted into the subvolume tree, which may take a while to happen (in the worst case, it's done only when the transaction commits). If we have multiple dir index items to insert for the same directory, say N index items, and they all fit in a single leaf of metadata, then we are holding N units of reserved metadata space when all we need is 1 unit. This change addresses that: whenever a new delayed dir index item is added, we release the unit of metadata the caller reserved when it started the transaction if adding that new dir index item does not result in touching one more metadata leaf; otherwise the reservation is kept by transferring it from the transaction block reserve to the delayed items block reserve, just like before. Given that with a leaf size of 16K we can have a few hundred dir index items in a single leaf (the exact value depends on file name lengths), this reduces pressure on metadata reservation by releasing unnecessary space much sooner.
The following fs_mark test showed some improvement when creating many files in parallel on a machine running a non-debug kernel (Debian's default kernel config) with 12 cores:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/nvme0n1
  MNT=/mnt/nvme0n1
  MOUNT_OPTIONS="-o ssd"
  FILES=100000
  THREADS=$(nproc --all)

  echo "performance" | \
      tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  mkfs.btrfs -f $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  OPTS="-S 0 -L 10 -n $FILES -s 0 -t $THREADS -k"
  for ((i = 1; i <= $THREADS; i++)); do
      OPTS="$OPTS -d $MNT/d$i"
  done

  fs_mark $OPTS

  umount $MNT

Before:

  FSUse%        Count         Size    Files/sec     App Overhead
       2      1200000            0     225991.3          5465891
       4      2400000            0     345728.1          5512106
       4      3600000            0     346959.5          5557653
       8      4800000            0     329643.0          5587548
       8      6000000            0     312657.4          5606717
       8      7200000            0     281707.5          5727985
      12      8400000            0      88309.8          5020422
      12      9600000            0      85835.9          5207496
      16     10800000            0      81039.2          5404964
      16     12000000            0      58548.6          5842468

After:

  FSUse%        Count         Size    Files/sec     App Overhead
       2      1200000            0     230604.5          5778375
       4      2400000            0     348908.3          5508072
       4      3600000            0     357028.7          5484337
       6      4800000            0     342898.3          5565703
       6      6000000            0     314670.8          5751555
       8      7200000            0     282548.2          5778177
      12      8400000            0      90844.9          5306819
      12      9600000            0      86963.1          5304689
      16     10800000            0      89113.2          5455248
      16     12000000            0      86693.5          5518933

The "after" results are after applying this patch and all the other patches in the same patchset, which is comprised of the following changes:

  btrfs: balance btree dirty pages and delayed items after a rename
  btrfs: free the path earlier when creating a new inode
  btrfs: balance btree dirty pages and delayed items after clone and dedupe
  btrfs: add assertions when deleting batches of delayed items
  btrfs: deal with deletion errors when deleting delayed items
  btrfs: refactor the delayed item deletion entry point
  btrfs: improve batch deletion of delayed dir index items
  btrfs: assert that delayed item is a dir index item when adding it
  btrfs: improve batch insertion of delayed dir index items
  btrfs: do not BUG_ON() on failure to reserve metadata for delayed item
  btrfs: set delayed item type when initializing it
  btrfs: reduce amount of reserved metadata for delayed item insertion

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
155 lines
4.6 KiB
C
155 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2011 Fujitsu. All rights reserved.
|
|
* Written by Miao Xie <miaox@cn.fujitsu.com>
|
|
*/
|
|
|
|
#ifndef BTRFS_DELAYED_INODE_H
|
|
#define BTRFS_DELAYED_INODE_H
|
|
|
|
#include <linux/rbtree.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/list.h>
|
|
#include <linux/wait.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/refcount.h>
|
|
#include "ctree.h"
|
|
|
|
/*
 * Types of the delayed item.
 *
 * NOTE(review): presumably the values stored in the ins_or_del field of
 * struct btrfs_delayed_item - confirm against the users of that field.
 */
#define BTRFS_DELAYED_INSERTION_ITEM	1
#define BTRFS_DELAYED_DELETION_ITEM	2
|
|
|
|
struct btrfs_delayed_root {
|
|
spinlock_t lock;
|
|
struct list_head node_list;
|
|
/*
|
|
* Used for delayed nodes which is waiting to be dealt with by the
|
|
* worker. If the delayed node is inserted into the work queue, we
|
|
* drop it from this list.
|
|
*/
|
|
struct list_head prepare_list;
|
|
atomic_t items; /* for delayed items */
|
|
atomic_t items_seq; /* for delayed items */
|
|
int nodes; /* for delayed nodes */
|
|
wait_queue_head_t wait;
|
|
};
|
|
|
|
/*
 * Delayed node state flags.
 *
 * NOTE(review): the small consecutive values suggest these are bit numbers
 * for the 'flags' field of struct btrfs_delayed_node - confirm against the
 * callers before relying on that.
 */
#define BTRFS_DELAYED_NODE_IN_LIST	0
#define BTRFS_DELAYED_NODE_INODE_DIRTY	1
#define BTRFS_DELAYED_NODE_DEL_IREF	2
|
|
|
|
struct btrfs_delayed_node {
|
|
u64 inode_id;
|
|
u64 bytes_reserved;
|
|
struct btrfs_root *root;
|
|
/* Used to add the node into the delayed root's node list. */
|
|
struct list_head n_list;
|
|
/*
|
|
* Used to add the node into the prepare list, the nodes in this list
|
|
* is waiting to be dealt with by the async worker.
|
|
*/
|
|
struct list_head p_list;
|
|
struct rb_root_cached ins_root;
|
|
struct rb_root_cached del_root;
|
|
struct mutex mutex;
|
|
struct btrfs_inode_item inode_item;
|
|
refcount_t refs;
|
|
u64 index_cnt;
|
|
unsigned long flags;
|
|
int count;
|
|
/*
|
|
* The size of the next batch of dir index items to insert (if this
|
|
* node is from a directory inode). Protected by @mutex.
|
|
*/
|
|
u32 curr_index_batch_size;
|
|
/*
|
|
* Number of leaves reserved for inserting dir index items (if this
|
|
* node belongs to a directory inode). This may be larger then the
|
|
* actual number of leaves we end up using. Protected by @mutex.
|
|
*/
|
|
u32 index_item_leaves;
|
|
};
|
|
|
|
struct btrfs_delayed_item {
|
|
struct rb_node rb_node;
|
|
struct btrfs_key key;
|
|
struct list_head tree_list; /* used for batch insert/delete items */
|
|
struct list_head readdir_list; /* used for readdir items */
|
|
u64 bytes_reserved;
|
|
struct btrfs_delayed_node *delayed_node;
|
|
refcount_t refs;
|
|
int ins_or_del;
|
|
u32 data_len;
|
|
char data[];
|
|
};
|
|
|
|
static inline void btrfs_init_delayed_root(
|
|
struct btrfs_delayed_root *delayed_root)
|
|
{
|
|
atomic_set(&delayed_root->items, 0);
|
|
atomic_set(&delayed_root->items_seq, 0);
|
|
delayed_root->nodes = 0;
|
|
spin_lock_init(&delayed_root->lock);
|
|
init_waitqueue_head(&delayed_root->wait);
|
|
INIT_LIST_HEAD(&delayed_root->node_list);
|
|
INIT_LIST_HEAD(&delayed_root->prepare_list);
|
|
}
|
|
|
|
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
|
|
const char *name, int name_len,
|
|
struct btrfs_inode *dir,
|
|
struct btrfs_disk_key *disk_key, u8 type,
|
|
u64 index);
|
|
|
|
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *dir, u64 index);
|
|
|
|
int btrfs_inode_delayed_dir_index_count(struct btrfs_inode *inode);
|
|
|
|
/*
 * Running and balancing delayed items.
 * NOTE(review): the meaning of @nr is not visible from this header - see the
 * definition of btrfs_run_delayed_items_nr().
 */
int btrfs_run_delayed_items(struct btrfs_trans_handle *trans);
int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, int nr);

void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info);
|
|
|
|
int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
|
|
struct btrfs_inode *inode);
|
|
/* Used for evicting the inode. */
|
|
void btrfs_remove_delayed_node(struct btrfs_inode *inode);
|
|
void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode);
|
|
int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
|
|
|
|
|
|
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
|
|
struct btrfs_root *root,
|
|
struct btrfs_inode *inode);
|
|
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
|
|
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
|
|
|
|
/* Used when dropping a dead root. */
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);

/* Used when cleaning up the transaction. */
void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info);
|
|
|
|
/* Used for readdir() */
|
|
bool btrfs_readdir_get_delayed_items(struct inode *inode,
|
|
struct list_head *ins_list,
|
|
struct list_head *del_list);
|
|
void btrfs_readdir_put_delayed_items(struct inode *inode,
|
|
struct list_head *ins_list,
|
|
struct list_head *del_list);
|
|
int btrfs_should_delete_dir_index(struct list_head *del_list,
|
|
u64 index);
|
|
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
|
|
struct list_head *ins_list);
|
|
|
|
/* for init */
|
|
int __init btrfs_delayed_inode_init(void);
|
|
void __cold btrfs_delayed_inode_exit(void);
|
|
|
|
/* for debugging */
|
|
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info);
|
|
|
|
#endif
|