for-6.10-rc7-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmaRcQgACgkQxWXV+ddt
 WDvAGxAAknJAiREp/AmzhSwkhr+nSnqex0t+VVgsOaMTu0BEHO0xhoXc3l0QuSwS
 u2AIqmOYyzr/UQVXCuatBqAE+5T4njtYAYIWwE825yquAtHNyuok9+Sjhfvxrwgs
 HmNAN4Vvl2Fwds7xbWE8ug18QlssuRTIX8hk7ZtS6xo49g0tsbRX9KlzIPpsULD3
 BOZa+2NJwC1PGVeNPf3p06rfiUkKfmFYgdDybe2zJ17uwsRz1CFSsaEEB35ys1f0
 xYOS4epfcie03EGyZmYctuNxatUkk/J/1lTH4Z9JHwvPBvLK1U97SyJ11Wz2VQC/
 8ar8gUDRYtjWdf6vn6AWBM4MseaYm9LDMlPhbSfvpDcWiclGTE64IOP4gKKr3mCh
 WzlNSIR9I+tYgrhvcsCEzd7lvrSVHa7clwfooYgkEx0wl5lgbN0llAdtJWG3eeLn
 3stxje2FqqXsFNj5N9SrPy7f7t6xF2i8vwk4qh6EpRuT4yuatb+nWzDm9EuTT/Bc
 P+zM1KFp7Blk7Zw/Tpw0O9qjt1whStY2xrqcMzg539WVo45MmuFEFzmGBRwZsH55
 QPGLIjXPpt728AgMdhBFEG0DtWaiA3AOI/C5nYOtLu92aZVBmbaX7/d/GpJv3Vvd
 Ihvr9s1c49YvTZsIS0T0tkq/7LXZi/SToRJDjhP5HCrRGf7A30Y=
 =gtsF
 -----END PGP SIGNATURE-----

Merge tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "Fix a regression in extent map shrinker behaviour.

  In the past weeks we got reports from users that there are huge
  latency spikes or freezes. This was bisected to newly added shrinker
  of extent maps (it was added to fix a build up of the structures in
  memory).

  I'm assuming that the freezes would happen to many users after release,
  so I'd like to get it merged now so that it's in 6.10. Although the diff
  size is not small, the changes are relatively straightforward, the
  reporters verified the fixes, and we did testing on our side.

  The fixes:

   - adjust behaviour under memory pressure and check lock or scheduling
     conditions, bail out if needed

   - synchronize tracking of the scanning progress so inode ranges are
     not skipped or work duplicated

   - do a delayed iput when scanning a root so evicting an inode does
     not slow things down in case of lots of dirty data; this also
     fixes a lockdep warning, as a deadlock could happen when writing
     the dirty data would need to start a transaction"

* tag 'for-6.10-rc7-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: avoid races when tracking progress for extent map shrinking
  btrfs: stop extent map shrinker if reschedule is needed
  btrfs: use delayed iput during extent map shrinking
This commit is contained in:
Linus Torvalds 2024-07-12 12:08:42 -07:00
commit 975f3b6da1
4 changed files with 107 additions and 37 deletions

View File

@ -2856,6 +2856,8 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
spin_lock_init(&fs_info->extent_map_shrinker_lock);
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;

View File

@ -1028,7 +1028,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
return ret;
}
static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_to_scan)
/*
 * Per-call state for one run of the extent map shrinker. It is filled in by
 * btrfs_free_extent_maps() and passed down to btrfs_scan_root() and
 * btrfs_scan_inode() so that progress is tracked locally by each task.
 */
struct btrfs_em_shrink_ctx {
/* Maximum number of extent maps to visit during this run. */
long nr_to_scan;
/* Extent maps visited so far; scanning stops once this reaches nr_to_scan. */
long scanned;
/* Number of the last inode scanned; a root scan resumes at last_ino + 1. */
u64 last_ino;
/*
 * ID of the root to resume from: the current root if it still has inodes
 * left to scan, otherwise the current root's ID plus one.
 */
u64 last_root;
};
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
@ -1057,14 +1064,25 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
write_lock(&tree->lock);
/*
* We want to be fast because we can be called from any path trying to
* allocate memory, so if the lock is busy we don't want to spend time
* waiting for it - either some task is about to do IO for the inode or
* we may have another task shrinking extent maps, here in this code, so
* skip this inode.
*/
if (!write_trylock(&tree->lock)) {
up_read(&inode->i_mmap_lock);
return 0;
}
node = rb_first_cached(&tree->map);
while (node) {
struct extent_map *em;
em = rb_entry(node, struct extent_map, rb_node);
node = rb_next(node);
(*scanned)++;
ctx->scanned++;
if (em->flags & EXTENT_FLAG_PINNED)
goto next;
@ -1085,16 +1103,18 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
free_extent_map(em);
nr_dropped++;
next:
if (*scanned >= nr_to_scan)
if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
* Restart if we had to reschedule, and any extent maps that were
* pinned before may have become unpinned after we released the
* lock and took it again.
* Stop if we need to reschedule or there's contention on the
* lock. This is to avoid slowing other tasks trying to take the
* lock and because the shrinker might be called during a memory
* allocation path and we want to avoid taking a very long time
* and slowing down all sorts of tasks.
*/
if (cond_resched_rwlock_write(&tree->lock))
node = rb_first_cached(&tree->map);
if (need_resched() || rwlock_needbreak(&tree->lock))
break;
}
write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
@ -1102,25 +1122,30 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
return nr_dropped;
}
static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_scan)
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
u64 min_ino = fs_info->extent_map_shrinker_last_ino + 1;
u64 min_ino = ctx->last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, scanned, nr_to_scan);
nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
fs_info->extent_map_shrinker_last_ino = btrfs_ino(inode);
iput(&inode->vfs_inode);
ctx->last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode);
if (*scanned >= nr_to_scan)
if (ctx->scanned >= ctx->nr_to_scan)
break;
/*
* We may be called from memory allocation paths, so we don't
* want to take too much time and slow down tasks.
*/
if (need_resched())
break;
cond_resched();
inode = btrfs_find_first_inode(root, min_ino);
}
@ -1132,14 +1157,14 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root);
ctx->last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, reset the last scanned inode number to 0
* so that when processing the next root we start from its first inode.
*/
fs_info->extent_map_shrinker_last_ino = 0;
fs_info->extent_map_shrinker_last_root = btrfs_root_id(root) + 1;
ctx->last_ino = 0;
ctx->last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
@ -1147,19 +1172,41 @@ static long btrfs_scan_root(struct btrfs_root *root, long *scanned, long nr_to_s
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
{
const u64 start_root_id = fs_info->extent_map_shrinker_last_root;
u64 next_root_id = start_root_id;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
long scanned = 0;
ctx.scanned = 0;
ctx.nr_to_scan = nr_to_scan;
/*
* In case we have multiple tasks running this shrinker, make the next
* one start from the next inode in case it starts before we finish.
*/
spin_lock(&fs_info->extent_map_shrinker_lock);
ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
fs_info->extent_map_shrinker_last_ino++;
ctx.last_root = fs_info->extent_map_shrinker_last_root;
spin_unlock(&fs_info->extent_map_shrinker_lock);
start_root_id = ctx.last_root;
next_root_id = ctx.last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
nr, ctx.last_root,
ctx.last_ino);
}
while (scanned < nr_to_scan) {
/*
* We may be called from memory allocation paths, so we don't want to
* take too much time and slow down tasks, so stop if we need to reschedule.
*/
while (ctx.scanned < ctx.nr_to_scan && !need_resched()) {
struct btrfs_root *root;
unsigned long count;
@ -1171,8 +1218,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
fs_info->extent_map_shrinker_last_root = 0;
fs_info->extent_map_shrinker_last_ino = 0;
ctx.last_root = 0;
ctx.last_ino = 0;
cycled = true;
continue;
}
@ -1186,15 +1233,33 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
continue;
if (is_fstree(btrfs_root_id(root)))
nr_dropped += btrfs_scan_root(root, &scanned, nr_to_scan);
nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
}
/*
* In case of multiple tasks running this extent map shrinking code this
* isn't perfect but it's simple and silences things like KCSAN. It's
* not possible to know which task made more progress because we can
* cycle back to the first root and first inode if it's not the first
* time the shrinker ran, see the above logic. Also a task that started
* later may finish earlier than another task and have made less progress. So
* make this simple and update to the progress of the last task that
* finished, with the occasional possibility of having two consecutive
* runs of the shrinker process the same inodes.
*/
spin_lock(&fs_info->extent_map_shrinker_lock);
fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
fs_info->extent_map_shrinker_last_root = ctx.last_root;
spin_unlock(&fs_info->extent_map_shrinker_lock);
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
nr, ctx.last_root,
ctx.last_ino);
}
return nr_dropped;

View File

@ -630,6 +630,7 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
spinlock_t extent_map_shrinker_lock;
u64 extent_map_shrinker_last_root;
u64 extent_map_shrinker_last_ino;

View File

@ -2556,9 +2556,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_count,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr,
u64 last_root_id, u64 last_ino),
TP_ARGS(fs_info, nr_to_scan, nr),
TP_ARGS(fs_info, nr_to_scan, nr, last_root_id, last_ino),
TP_STRUCT__entry_btrfs(
__field( long, nr_to_scan )
@ -2570,8 +2571,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TP_fast_assign_btrfs(fs_info,
__entry->nr_to_scan = nr_to_scan;
__entry->nr = nr;
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
__entry->last_root_id = last_root_id;
__entry->last_ino = last_ino;
),
TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
@ -2581,9 +2582,10 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr,
u64 last_root_id, u64 last_ino),
TP_ARGS(fs_info, nr_dropped, nr),
TP_ARGS(fs_info, nr_dropped, nr, last_root_id, last_ino),
TP_STRUCT__entry_btrfs(
__field( long, nr_dropped )
@ -2595,8 +2597,8 @@ TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
TP_fast_assign_btrfs(fs_info,
__entry->nr_dropped = nr_dropped;
__entry->nr = nr;
__entry->last_root_id = fs_info->extent_map_shrinker_last_root;
__entry->last_ino = fs_info->extent_map_shrinker_last_ino;
__entry->last_root_id = last_root_id;
__entry->last_ino = last_ino;
),
TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",