From b19ee72722087c1d441193ce3b6b3d937ae16bf2 Mon Sep 17 00:00:00 2001 From: liuderong Date: Wed, 18 Sep 2024 18:06:20 +0800 Subject: [PATCH 01/40] f2fs: introduce f2fs_get_section_mtime When segs_per_sec is larger than 1, section may contain invalid segments, mtime should be the average value of each valid blocks, so introduce f2fs_get_section_mtime to record the average mtime of all valid blocks in a section. Signed-off-by: liuderong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/gc.c | 17 ++++------------- fs/f2fs/segment.c | 35 ++++++++++++++++++++++++++++++----- fs/f2fs/segment.h | 2 ++ 4 files changed, 38 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 33f5449dc22d..60cf1e96a976 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3783,6 +3783,8 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi); unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); +unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno); #define DEF_FRAGMENT_SIZE 4 #define MIN_FRAGMENT_SIZE 1 diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9322a7200e31..e40bdd12e36d 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -361,20 +361,15 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; unsigned char u; - unsigned int i; unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); - for (i = 0; i < usable_segs_per_sec; i++) - mtime += get_seg_entry(sbi, start + i)->mtime; + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); vblocks = get_valid_blocks(sbi, segno, true); - - mtime = div_u64(mtime, usable_segs_per_sec); vblocks = div_u64(vblocks, usable_segs_per_sec); u = BLKS_TO_SEGS(sbi, vblocks * 100); @@ -519,10 +514,7 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, struct victim_sel_policy *p, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; - unsigned int i; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (p->gc_mode == GC_AT && @@ -530,9 +522,8 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, return; } - for (i = 0; i < SEGS_PER_SEC(sbi); i++) - mtime += get_seg_entry(sbi, start + i)->mtime; - mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); + mtime = f2fs_get_section_mtime(sbi, segno); + f2fs_bug_on(sbi, mtime == INVALID_MTIME); /* Handle if the system time has changed by the user */ if (mtime < sit_i->min_mtime) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1766254279d2..d91fbd1b27ba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -5430,6 +5430,35 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi) return SEGS_PER_SEC(sbi); } +unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); + unsigned int secno = 0, start = 0; + unsigned int total_valid_blocks = 0; + unsigned long long mtime = 0; + unsigned int i = 0; + + secno = GET_SEC_FROM_SEG(sbi, segno); + start = GET_SEG_FROM_SEC(sbi, secno); + + if (!__is_large_section(sbi)) + return get_seg_entry(sbi, start + i)->mtime; + + for (i = 0; i < usable_segs_per_sec; i++) { + /* for large section, only check the mtime of valid segments */ + struct seg_entry *se = get_seg_entry(sbi, start+i); + + mtime += se->mtime * se->valid_blocks; + total_valid_blocks += se->valid_blocks; + } + + if (total_valid_blocks == 0) + return INVALID_MTIME; + + return div_u64(mtime, total_valid_blocks); +} + /* * Update min, max modified time for cost-benefit GC algorithm */ @@ -5443,13 +5472,9 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { - unsigned int i; unsigned long long mtime = 0; - for (i = 0; i < SEGS_PER_SEC(sbi); i++) - mtime += get_seg_entry(sbi, segno + i)->mtime; - - mtime = div_u64(mtime, SEGS_PER_SEC(sbi)); + mtime = f2fs_get_section_mtime(sbi, segno); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 71adb4a43bec..e9cc73093417 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ #define F2FS_MIN_META_SEGMENTS 8 /* SB + 2 (CP + SIT + NAT) + SSA */ +#define INVALID_MTIME ULLONG_MAX /* no valid blocks in a segment/section */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) #define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) From 527a4ded09b9266a5fb100e80e05b101b53053fd Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 7 Oct 2024 13:46:37 +0200 Subject: [PATCH 02/40] f2fs: Use struct_size() to improve f2fs_acl_clone() Use struct_size() to calculate the number of bytes to allocate for a cloned acl. Signed-off-by: Thorsten Blum Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 8bffdeccdbc3..1fbc0607363b 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -296,9 +296,8 @@ static struct posix_acl *f2fs_acl_clone(const struct posix_acl *acl, struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } From 26413ce18e85de3dda2cd3d72c3c3e8ab8f4f996 Mon Sep 17 00:00:00 2001 From: Qi Han Date: Sun, 29 Sep 2024 02:00:10 -0600 Subject: [PATCH 03/40] f2fs: compress: fix inconsistent update of i_blocks in release_compress_blocks and reserve_compress_blocks After release a file and subsequently reserve it, the FSCK flag is set when the file is deleted, as shown in the following backtrace: F2FS-fs (dm-48): Inconsistent i_blocks, ino:401231, iblocks:1448, sectors:1472 fs_rec_info_write_type+0x58/0x274 f2fs_rec_info_write+0x1c/0x2c set_sbi_flag+0x74/0x98 dec_valid_block_count+0x150/0x190 f2fs_truncate_data_blocks_range+0x2d4/0x3cc f2fs_do_truncate_blocks+0x2fc/0x5f0 f2fs_truncate_blocks+0x68/0x100 f2fs_truncate+0x80/0x128 f2fs_evict_inode+0x1a4/0x794 evict+0xd4/0x280 iput+0x238/0x284 do_unlinkat+0x1ac/0x298 __arm64_sys_unlinkat+0x48/0x68 invoke_syscall+0x58/0x11c For clusters of the following type, i_blocks are decremented by 1 and i_compr_blocks are incremented by 7 in release_compress_blocks, while updates to i_blocks and i_compr_blocks are skipped in reserve_compress_blocks. raw node: D D D D D D D D after compress: C D D D D D D D after reserve: C D D D D D D D Let's update i_blocks and i_compr_blocks properly in reserve_compress_blocks. Fixes: eb8fbaa53374 ("f2fs: compress: fix to check unreleased compressed cluster") Signed-off-by: Qi Han Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 321d8ffbab6e..adc7d64a6f47 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -3792,7 +3792,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, to_reserved = cluster_size - compr_blocks - reserved; /* for the case all blocks in cluster were reserved */ - if (to_reserved == 1) { + if (reserved && to_reserved == 1) { dn->ofs_in_node += cluster_size; goto next; } From d5c367ef8287fb4d235c46a2f8c8d68715f3a0ca Mon Sep 17 00:00:00 2001 From: Qi Han Date: Wed, 18 Sep 2024 02:44:00 -0600 Subject: [PATCH 04/40] f2fs: fix f2fs_bug_on when uninstalling filesystem call f2fs_evict_inode. creating a large files during checkpoint disable until it runs out of space and then delete it, then remount to enable checkpoint again, and then unmount the filesystem triggers the f2fs_bug_on as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/inode.c:896! CPU: 2 UID: 0 PID: 1286 Comm: umount Not tainted 6.11.0-rc7-dirty #360 Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:f2fs_evict_inode+0x58c/0x610 Call Trace: __die_body+0x15/0x60 die+0x33/0x50 do_trap+0x10a/0x120 f2fs_evict_inode+0x58c/0x610 do_error_trap+0x60/0x80 f2fs_evict_inode+0x58c/0x610 exc_invalid_op+0x53/0x60 f2fs_evict_inode+0x58c/0x610 asm_exc_invalid_op+0x16/0x20 f2fs_evict_inode+0x58c/0x610 evict+0x101/0x260 dispose_list+0x30/0x50 evict_inodes+0x140/0x190 generic_shutdown_super+0x2f/0x150 kill_block_super+0x11/0x40 kill_f2fs_super+0x7d/0x140 deactivate_locked_super+0x2a/0x70 cleanup_mnt+0xb3/0x140 task_work_run+0x61/0x90 The root cause is: creating large files during disable checkpoint period results in not enough free segments, so when writing back root inode will failed in f2fs_enable_checkpoint. When umount the file system after enabling checkpoint, the root inode is dirty in f2fs_evict_inode function, which triggers BUG_ON. The steps to reproduce are as follows: dd if=/dev/zero of=f2fs.img bs=1M count=55 mount f2fs.img f2fs_dir -o checkpoint=disable:10% dd if=/dev/zero of=big bs=1M count=50 sync rm big mount -o remount,checkpoint=enable f2fs_dir umount f2fs_dir Let's redirty inode when there is not free segments during checkpoint is disable. Signed-off-by: Qi Han Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1ed86df343a5..10780e37fc7b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -775,8 +775,10 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; - if (!f2fs_is_checkpoint_ready(sbi)) + if (!f2fs_is_checkpoint_ready(sbi)) { + f2fs_mark_inode_dirty_sync(inode, true); return -ENOSPC; + } /* * We need to balance fs here to prevent from producing dirty node pages From b7d0a97b28083084ebdd8e5c6bccd12e6ec18faa Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 12 Oct 2024 00:44:50 +0800 Subject: [PATCH 05/40] f2fs: fix null-ptr-deref in f2fs_submit_page_bio() There's issue as follows when concurrently installing the f2fs.ko module and mounting the f2fs file system: KASAN: null-ptr-deref in range [0x0000000000000020-0x0000000000000027] RIP: 0010:__bio_alloc+0x2fb/0x6c0 [f2fs] Call Trace: f2fs_submit_page_bio+0x126/0x8b0 [f2fs] __get_meta_page+0x1d4/0x920 [f2fs] get_checkpoint_version.constprop.0+0x2b/0x3c0 [f2fs] validate_checkpoint+0xac/0x290 [f2fs] f2fs_get_valid_checkpoint+0x207/0x950 [f2fs] f2fs_fill_super+0x1007/0x39b0 [f2fs] mount_bdev+0x183/0x250 legacy_get_tree+0xf4/0x1e0 vfs_get_tree+0x88/0x340 do_new_mount+0x283/0x5e0 path_mount+0x2b2/0x15b0 __x64_sys_mount+0x1fe/0x270 do_syscall_64+0x5f/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Above issue happens as the biset of the f2fs file system is not initialized before register "f2fs_fs_type". To address above issue just register "f2fs_fs_type" at the last in init_f2fs_fs(). Ensure that all f2fs file system resources are initialized. Fixes: f543805fcd60 ("f2fs: introduce private bioset") Signed-off-by: Ye Bin Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 87ab5696bd48..8d4ecb2e855e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4991,9 +4991,6 @@ static int __init init_f2fs_fs(void) err = f2fs_init_shrinker(); if (err) goto free_sysfs; - err = register_filesystem(&f2fs_fs_type); - if (err) - goto free_shrinker; f2fs_create_root_stats(); err = f2fs_init_post_read_processing(); if (err) @@ -5016,7 +5013,12 @@ static int __init init_f2fs_fs(void) err = f2fs_create_casefold_cache(); if (err) goto free_compress_cache; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_casefold_cache; return 0; +free_casefold_cache: + f2fs_destroy_casefold_cache(); free_compress_cache: f2fs_destroy_compress_cache(); free_compress_mempool: @@ -5031,8 +5033,6 @@ static int __init init_f2fs_fs(void) f2fs_destroy_post_read_processing(); free_root_stats: f2fs_destroy_root_stats(); - unregister_filesystem(&f2fs_fs_type); -free_shrinker: f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); @@ -5056,6 +5056,7 @@ static int __init init_f2fs_fs(void) static void __exit exit_f2fs_fs(void) { + unregister_filesystem(&f2fs_fs_type); f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); @@ -5064,7 +5065,6 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); - unregister_filesystem(&f2fs_fs_type); f2fs_exit_shrinker(); f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); From 1acd73edbbfef2c3c5b43cba4006a7797eca7050 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 15 Oct 2024 11:43:39 +0800 Subject: [PATCH 06/40] f2fs: fix to account dirty data in __get_secs_required() It will trigger system panic w/ testcase in [1]: ------------[ cut here ]------------ kernel BUG at fs/f2fs/segment.c:2752! RIP: 0010:new_curseg+0xc81/0x2110 Call Trace: f2fs_allocate_data_block+0x1c91/0x4540 do_write_page+0x163/0xdf0 f2fs_outplace_write_data+0x1aa/0x340 f2fs_do_write_data_page+0x797/0x2280 f2fs_write_single_data_page+0x16cd/0x2190 f2fs_write_cache_pages+0x994/0x1c80 f2fs_write_data_pages+0x9cc/0xea0 do_writepages+0x194/0x7a0 filemap_fdatawrite_wbc+0x12b/0x1a0 __filemap_fdatawrite_range+0xbb/0xf0 file_write_and_wait_range+0xa1/0x110 f2fs_do_sync_file+0x26f/0x1c50 f2fs_sync_file+0x12b/0x1d0 vfs_fsync_range+0xfa/0x230 do_fsync+0x3d/0x80 __x64_sys_fsync+0x37/0x50 x64_sys_call+0x1e88/0x20d0 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e The root cause is if checkpoint_disabling and lfs_mode are both on, it will trigger OPU for all overwritten data, it may cost more free segment than expected, so f2fs must account those data correctly to calculate cosumed free segments later, and return ENOSPC earlier to avoid run out of free segment during block allocation. [1] https://lore.kernel.org/fstests/20241015025106.3203676-1-chao@kernel.org/ Fixes: 4354994f097d ("f2fs: checkpoint disabling") Cc: Daniel Rosenberg Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9cc73093417..55a01da6c4be 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -561,18 +561,21 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) } static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, - unsigned int node_blocks, unsigned int dent_blocks) + unsigned int node_blocks, unsigned int data_blocks, + unsigned int dent_blocks) { - unsigned segno, left_blocks; + unsigned int segno, left_blocks, blocks; int i; - /* check current node sections in the worst case. */ - for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) { + /* check current data/node sections in the worst case. */ + for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); - if (node_blocks > left_blocks) + + blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks; + if (blocks > left_blocks) return false; } @@ -586,8 +589,9 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, } /* - * calculate needed sections for dirty node/dentry - * and call has_curseg_enough_space + * calculate needed sections for dirty node/dentry and call + * has_curseg_enough_space, please note that, it needs to account + * dirty data as well in lfs mode when checkpoint is disabled. */ static inline void __get_secs_required(struct f2fs_sb_info *sbi, unsigned int *lower_p, unsigned int *upper_p, bool *curseg_p) @@ -596,19 +600,30 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi, get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int total_data_blocks = 0; unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int data_secs = 0; unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int data_blocks = 0; + + if (f2fs_lfs_mode(sbi) && + unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA); + data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi); + data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi); + } if (lower_p) - *lower_p = node_secs + dent_secs; + *lower_p = node_secs + dent_secs + data_secs; if (upper_p) *upper_p = node_secs + dent_secs + - (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0); + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) + + (data_blocks ? 1 : 0); if (curseg_p) *curseg_p = has_curseg_enough_space(sbi, - node_blocks, dent_blocks); + node_blocks, data_blocks, dent_blocks); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, From 6babe00ccd34fc65b78ef8b99754e32b4385f23d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Oct 2024 16:13:37 +0800 Subject: [PATCH 07/40] f2fs: fix to do sanity check on node blkaddr in truncate_node() syzbot reports a f2fs bug as below: ------------[ cut here ]------------ kernel BUG at fs/f2fs/segment.c:2534! RIP: 0010:f2fs_invalidate_blocks+0x35f/0x370 fs/f2fs/segment.c:2534 Call Trace: truncate_node+0x1ae/0x8c0 fs/f2fs/node.c:909 f2fs_remove_inode_page+0x5c2/0x870 fs/f2fs/node.c:1288 f2fs_evict_inode+0x879/0x15c0 fs/f2fs/inode.c:856 evict+0x4e8/0x9b0 fs/inode.c:723 f2fs_handle_failed_inode+0x271/0x2e0 fs/f2fs/inode.c:986 f2fs_create+0x357/0x530 fs/f2fs/namei.c:394 lookup_open fs/namei.c:3595 [inline] open_last_lookups fs/namei.c:3694 [inline] path_openat+0x1c03/0x3590 fs/namei.c:3930 do_filp_open+0x235/0x490 fs/namei.c:3960 do_sys_openat2+0x13e/0x1d0 fs/open.c:1415 do_sys_open fs/open.c:1430 [inline] __do_sys_openat fs/open.c:1446 [inline] __se_sys_openat fs/open.c:1441 [inline] __x64_sys_openat+0x247/0x2a0 fs/open.c:1441 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0010:f2fs_invalidate_blocks+0x35f/0x370 fs/f2fs/segment.c:2534 The root cause is: on a fuzzed image, blkaddr in nat entry may be corrupted, then it will cause system panic when using it in f2fs_invalidate_blocks(), to avoid this, let's add sanity check on nat blkaddr in truncate_node(). Reported-by: syzbot+33379ce4ac76acf7d0c7@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/0000000000009a6cd706224ca720@google.com/ Cc: stable@vger.kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59b13ff243fa..af36c6d6542b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -905,6 +905,16 @@ static int truncate_node(struct dnode_of_data *dn) if (err) return err; + if (ni.blk_addr != NEW_ADDR && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC_ENHANCE)) { + f2fs_err_ratelimited(sbi, + "nat entry is corrupted, run fsck to fix it, ino:%u, " + "nid:%u, blkaddr:%u", ni.ino, ni.nid, ni.blk_addr); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); + return -EFSCORRUPTED; + } + /* Deallocate node address */ f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); From 2d56b4e39192fb9693284ce8aa3416b517d785b7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Oct 2024 16:24:20 +0800 Subject: [PATCH 08/40] f2fs: multidevice: add stats in debugfs This patch adds per-device stats in debugfs, the examples are as below: mkfs.f2fs -f -c /dev/vdc /dev/vdb mount /dev/vdb /mnt/f2fs/ cat /sys/kernel/debug/f2fs/status Multidevice stats: [seg: inuse dirty full free prefree] #0 5 0 0 4007 0 #1 1 0 0 8191 0 mkfs.f2fs -f -s 2 -c /dev/vdc /dev/vdb mount /dev/vdb /mnt/f2fs cat /sys/kernel/debug/f2fs/status Multidevice stats: [seg: inuse dirty full free prefree] [sec: inuse dirty full free prefree] #0 5 0 0 4005 0 5 0 0 2000 0 #1 1 0 0 8191 0 1 0 0 4095 0 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/f2fs/f2fs.h | 14 +++++++ 2 files changed, 121 insertions(+) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 546b8ba91261..278c8855ac0a 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -60,6 +60,70 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_DEBUG_FS +static void update_multidevice_stats(struct f2fs_sb_info *sbi) +{ + struct f2fs_stat_info *si = F2FS_STAT(sbi); + struct f2fs_dev_stats *dev_stats = si->dev_stats; + int i, j; + + if (!f2fs_is_multi_device(sbi)) + return; + + memset(dev_stats, 0, sizeof(struct f2fs_dev_stats) * sbi->s_ndevs); + for (i = 0; i < sbi->s_ndevs; i++) { + unsigned int start_segno, end_segno; + block_t start_blk, end_blk; + + if (i == 0) { + start_blk = MAIN_BLKADDR(sbi); + end_blk = FDEV(i).end_blk + 1 - SEG0_BLKADDR(sbi); + } else { + start_blk = FDEV(i).start_blk; + end_blk = FDEV(i).end_blk + 1; + } + + start_segno = GET_SEGNO(sbi, start_blk); + end_segno = GET_SEGNO(sbi, end_blk); + + for (j = start_segno; j < end_segno; j++) { + unsigned int seg_blks, sec_blks; + + seg_blks = get_seg_entry(sbi, j)->valid_blocks; + + /* update segment stats */ + if (IS_CURSEG(sbi, j)) + dev_stats[i].devstats[0][DEVSTAT_INUSE]++; + else if (seg_blks == BLKS_PER_SEG(sbi)) + dev_stats[i].devstats[0][DEVSTAT_FULL]++; + else if (seg_blks != 0) + dev_stats[i].devstats[0][DEVSTAT_DIRTY]++; + else if (!test_bit(j, FREE_I(sbi)->free_segmap)) + dev_stats[i].devstats[0][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[0][DEVSTAT_PREFREE]++; + + if (!__is_large_section(sbi) || + (j % SEGS_PER_SEC(sbi)) != 0) + continue; + + sec_blks = get_sec_entry(sbi, j)->valid_blocks; + + /* update section stats */ + if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j))) + dev_stats[i].devstats[1][DEVSTAT_INUSE]++; + else if (sec_blks == BLKS_PER_SEC(sbi)) + dev_stats[i].devstats[1][DEVSTAT_FULL]++; + else if (sec_blks != 0) + dev_stats[i].devstats[1][DEVSTAT_DIRTY]++; + else if (!test_bit(GET_SEC_FROM_SEG(sbi, j), + FREE_I(sbi)->free_secmap)) + dev_stats[i].devstats[1][DEVSTAT_FREE]++; + else + dev_stats[i].devstats[1][DEVSTAT_PREFREE]++; + } + } +} + static void update_general_status(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); @@ -214,6 +278,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->valid_blks[type] += blks; } + update_multidevice_stats(sbi); + for (i = 0; i < MAX_CALL_TYPE; i++) si->cp_call_count[i] = atomic_read(&sbi->cp_call_count[i]); @@ -498,6 +564,36 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); + if (f2fs_is_multi_device(sbi)) { + seq_puts(s, "Multidevice stats:\n"); + seq_printf(s, " [seg: %8s %8s %8s %8s %8s]", + "inuse", "dirty", "full", "free", "prefree"); + if (__is_large_section(sbi)) + seq_printf(s, " [sec: %8s %8s %8s %8s %8s]\n", + "inuse", "dirty", "full", "free", "prefree"); + else + seq_puts(s, "\n"); + + for (i = 0; i < sbi->s_ndevs; i++) { + seq_printf(s, " #%-2d %8u %8u %8u %8u %8u", i, + si->dev_stats[i].devstats[0][DEVSTAT_INUSE], + si->dev_stats[i].devstats[0][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[0][DEVSTAT_FULL], + si->dev_stats[i].devstats[0][DEVSTAT_FREE], + si->dev_stats[i].devstats[0][DEVSTAT_PREFREE]); + if (!__is_large_section(sbi)) { + seq_puts(s, "\n"); + continue; + } + seq_printf(s, " %8u %8u %8u %8u %8u\n", + si->dev_stats[i].devstats[1][DEVSTAT_INUSE], + si->dev_stats[i].devstats[1][DEVSTAT_DIRTY], + si->dev_stats[i].devstats[1][DEVSTAT_FULL], + si->dev_stats[i].devstats[1][DEVSTAT_FREE], + si->dev_stats[i].devstats[1][DEVSTAT_PREFREE]); + } + seq_puts(s, "\n"); + } seq_printf(s, "CP calls: %d (BG: %d)\n", si->cp_call_count[TOTAL_CALL], si->cp_call_count[BACKGROUND]); @@ -665,6 +761,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; + struct f2fs_dev_stats *dev_stats; unsigned long flags; int i; @@ -672,6 +769,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) if (!si) return -ENOMEM; + dev_stats = f2fs_kzalloc(sbi, sizeof(struct f2fs_dev_stats) * + sbi->s_ndevs, GFP_KERNEL); + if (!dev_stats) { + kfree(si); + return -ENOMEM; + } + + si->dev_stats = dev_stats; + si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -724,6 +830,7 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) list_del(&si->stat_list); raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags); + kfree(si->dev_stats); kfree(si); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 60cf1e96a976..dd47dbf6d9e6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3937,6 +3937,19 @@ void f2fs_destroy_recovery_cache(void); * debug.c */ #ifdef CONFIG_F2FS_STAT_FS +enum { + DEVSTAT_INUSE, + DEVSTAT_DIRTY, + DEVSTAT_FULL, + DEVSTAT_FREE, + DEVSTAT_PREFREE, + DEVSTAT_MAX, +}; + +struct f2fs_dev_stats { + unsigned int devstats[2][DEVSTAT_MAX]; /* 0: segs, 1: secs */ +}; + struct f2fs_stat_info { struct list_head stat_list; struct f2fs_sb_info *sbi; @@ -4000,6 +4013,7 @@ struct f2fs_stat_info { unsigned int block_count[2]; unsigned int inplace_count; unsigned long long base_mem, cache_mem, page_mem; + struct f2fs_dev_stats *dev_stats; }; static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) From fa08972bcb7baaf5f1f4fdf251dc08bdd3ab1cf0 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 15 Oct 2024 09:54:27 -0700 Subject: [PATCH 09/40] f2fs: decrease spare area for pinned files for zoned devices Now we reclaim too much space before allocating pinned space for zoned devices. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 ++- fs/f2fs/gc.h | 1 + fs/f2fs/segment.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index adc7d64a6f47..348ef73bf8dd 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1790,7 +1790,8 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, map.m_len = sec_blks; next_alloc: - if (has_not_enough_free_secs(sbi, 0, + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? + ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 2914b678bf8f..5c1eaf55e127 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -35,6 +35,7 @@ #define LIMIT_BOOST_ZONED_GC 25 /* percentage over total user space of boosted gc for zoned devices */ #define DEF_MIGRATION_WINDOW_GRANULARITY_ZONED 3 #define BOOST_GC_MULTIPLE 5 +#define ZONED_PIN_SEC_REQUIRED_COUNT 1 #define DEF_GC_FAILED_PINNED_FILES 2048 #define MAX_GC_FAILED_PINNED_FILES USHRT_MAX diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d91fbd1b27ba..859d70bbc5e7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3237,7 +3237,8 @@ int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1); + err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), + true, ZONED_PIN_SEC_REQUIRED_COUNT); f2fs_up_write(&sbi->gc_lock); gc_required = false; From 128d333f0dff2fbe41c546581c6f151e9d68cd4c Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Thu, 17 Oct 2024 10:31:53 -0700 Subject: [PATCH 10/40] f2fs: introduce device aliasing file F2FS should understand how the device aliasing file works and support deleting the file after use. A device aliasing file can be created by mkfs.f2fs tool and it can map the whole device with an extent, not using node blocks. The file space should be pinned and normally used for read-only usages. Signed-off-by: Daeho Jeong Signed-off-by: Chao Yu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 44 +++++++++++++++++++++++++++++ fs/f2fs/data.c | 5 ++++ fs/f2fs/extent_cache.c | 45 +++++++++++++++++++++++++++++- fs/f2fs/f2fs.h | 5 ++++ fs/f2fs/file.c | 45 +++++++++++++++++++++++++++--- fs/f2fs/inode.c | 19 ++++++++++++- fs/f2fs/super.c | 4 +++ fs/f2fs/sysfs.c | 2 ++ include/uapi/linux/f2fs.h | 1 + 9 files changed, 164 insertions(+), 6 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 68a0885fb5e6..fb7d2ee022bc 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -943,3 +943,47 @@ NVMe Zoned Namespace devices can start before the zone-capacity and span across zone-capacity boundary. Such spanning segments are also considered as usable segments. All blocks past the zone-capacity are considered unusable in these segments. + +Device aliasing feature +----------------------- + +f2fs can utilize a special file called a "device aliasing file." This file allows +the entire storage device to be mapped with a single, large extent, not using +the usual f2fs node structures. This mapped area is pinned and primarily intended +for holding the space. + +Essentially, this mechanism allows a portion of the f2fs area to be temporarily +reserved and used by another filesystem or for different purposes. Once that +external usage is complete, the device aliasing file can be deleted, releasing +the reserved space back to F2FS for its own use. + + + +# ls /dev/vd* +/dev/vdb (32GB) /dev/vdc (32GB) +# mkfs.ext4 /dev/vdc +# mkfs.f2fs -c /dev/vdc@vdc.file /dev/vdb +# mount /dev/vdb /mnt/f2fs +# ls -l /mnt/f2fs +vdc.file +# df -h +/dev/vdb 64G 33G 32G 52% /mnt/f2fs + +# mount -o loop /dev/vdc /mnt/ext4 +# df -h +/dev/vdb 64G 33G 32G 52% /mnt/f2fs +/dev/loop7 32G 24K 30G 1% /mnt/ext4 +# umount /mnt/ext4 + +# f2fs_io getflags /mnt/f2fs/vdc.file +get a flag on /mnt/f2fs/vdc.file ret=0, flags=nocow(pinned),immutable +# f2fs_io setflags noimmutable /mnt/f2fs/vdc.file +get a flag on noimmutable ret=0, flags=800010 +set a flag on /mnt/f2fs/vdc.file ret=0, flags=noimmutable +# rm /mnt/f2fs/vdc.file +# df -h +/dev/vdb 64G 753M 64G 2% /mnt/f2fs + +So, the key idea is, user can do any file operations on /dev/vdc, and +reclaim the space after the use, while the space is counted as /data. +That doesn't require modifying partition size and filesystem format. diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 94f7b084f601..90fa8ab85194 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3441,6 +3441,11 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { + if (IS_DEVICE_ALIASING(inode)) { + err = -ENODATA; + goto out; + } + if (locked) { err = f2fs_reserve_block(&dn, index); goto out; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 62ac440d9416..019c1f7b7fa5 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -24,6 +24,7 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; struct extent_info ei; + int devi; get_read_extent_info(&ei, i_ext); @@ -38,7 +39,36 @@ bool sanity_check_extent_cache(struct inode *inode, struct page *ipage) ei.blk, ei.fofs, ei.len); return false; } - return true; + + if (!IS_DEVICE_ALIASING(inode)) + return true; + + for (devi = 0; devi < sbi->s_ndevs; devi++) { + if (FDEV(devi).start_blk != ei.blk || + FDEV(devi).end_blk != ei.blk + ei.len - 1) + continue; + + if (devi == 0) { + f2fs_warn(sbi, + "%s: inode (ino=%lx) is an alias of meta device", + __func__, inode->i_ino); + return false; + } + + if (bdev_is_zoned(FDEV(devi).bdev)) { + f2fs_warn(sbi, + "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] maps to zoned block device", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; + } + return true; + } + + f2fs_warn(sbi, "%s: device alias inode (ino=%lx)'s extent info " + "[%u, %u, %u] is inconsistent w/ any devices", + __func__, inode->i_ino, ei.blk, ei.fofs, ei.len); + return false; } static void __set_extent_info(struct extent_info *ei, @@ -76,6 +106,9 @@ static bool __init_may_extent_tree(struct inode *inode, enum extent_type type) static bool __may_extent_tree(struct inode *inode, enum extent_type type) { + if (IS_DEVICE_ALIASING(inode) && type == EX_READ) + return true; + /* * for recovered files during mount do not create extents * if shrinker is not registered. @@ -401,6 +434,11 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) if (atomic_read(&et->node_cnt) || !ei.len) goto skip; + if (IS_DEVICE_ALIASING(inode)) { + et->largest = ei; + goto skip; + } + en = __attach_extent_node(sbi, et, &ei, NULL, &et->root.rb_root.rb_node, true); if (en) { @@ -463,6 +501,11 @@ static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } + if (IS_DEVICE_ALIASING(inode)) { + ret = false; + goto out; + } + en = __lookup_extent_node(&et->root, et->cached_en, pgofs); if (!en) goto out; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dd47dbf6d9e6..f3ef4dc50992 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -213,6 +213,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_CASEFOLD 0x00001000 #define F2FS_FEATURE_COMPRESSION 0x00002000 #define F2FS_FEATURE_RO 0x00004000 +#define F2FS_FEATURE_DEVICE_ALIAS 0x00008000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -3046,6 +3047,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define F2FS_DEVICE_ALIAS_FL 0x80000000 /* File for aliasing a device */ #define F2FS_QUOTA_DEFAULT_FL (F2FS_NOATIME_FL | F2FS_IMMUTABLE_FL) @@ -3061,6 +3063,8 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) /* Flags that are appropriate for non-directories/regular files. */ #define F2FS_OTHER_FLMASK (F2FS_NODUMP_FL | F2FS_NOATIME_FL) +#define IS_DEVICE_ALIASING(inode) (F2FS_I(inode)->i_flags & F2FS_DEVICE_ALIAS_FL) + static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) @@ -4526,6 +4530,7 @@ F2FS_FEATURE_FUNCS(sb_chksum, SB_CHKSUM); F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); +F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 348ef73bf8dd..75a8b22da664 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -725,6 +725,11 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) trace_f2fs_truncate_blocks_enter(inode, from); + if (IS_DEVICE_ALIASING(inode) && from) { + err = -EINVAL; + goto out_err; + } + free_from = (pgoff_t)F2FS_BLK_ALIGN(from); if (free_from >= max_file_blocks(inode)) @@ -739,6 +744,21 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } + if (IS_DEVICE_ALIASING(inode)) { + struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ]; + struct extent_info ei = et->largest; + unsigned int i; + + for (i = 0; i < ei.len; i++) + f2fs_invalidate_blocks(sbi, ei.blk + i); + + dec_valid_block_count(sbi, inode, ei.len); + f2fs_update_time(sbi, REQ_TIME); + + f2fs_put_page(ipage, 1); + goto out; + } + if (f2fs_has_inline_data(inode)) { f2fs_truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); @@ -774,7 +794,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); - +out_err: trace_f2fs_truncate_blocks_exit(inode, err); return err; } @@ -992,7 +1012,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return -EPERM; if ((attr->ia_valid & ATTR_SIZE)) { - if (!f2fs_is_compress_backend_ready(inode)) + if (!f2fs_is_compress_backend_ready(inode) || + IS_DEVICE_ALIASING(inode)) return -EOPNOTSUPP; if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED) && !IS_ALIGNED(attr->ia_size, @@ -1861,7 +1882,7 @@ static long f2fs_fallocate(struct file *file, int mode, return -EIO; if (!f2fs_is_checkpoint_ready(F2FS_I_SB(inode))) return -ENOSPC; - if (!f2fs_is_compress_backend_ready(inode)) + if (!f2fs_is_compress_backend_ready(inode) || IS_DEVICE_ALIASING(inode)) return -EOPNOTSUPP; /* f2fs only support ->fallocate for regular file */ @@ -3297,6 +3318,9 @@ int f2fs_pin_file_control(struct inode *inode, bool inc) struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + if (IS_DEVICE_ALIASING(inode)) + return -EINVAL; + if (fi->i_gc_failures >= sbi->gc_pin_file_threshold) { f2fs_warn(sbi, "%s: Enable GC = ino %lx after %x GC trials", __func__, inode->i_ino, fi->i_gc_failures); @@ -3327,6 +3351,9 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + if (!pin && IS_DEVICE_ALIASING(inode)) + return -EOPNOTSUPP; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -3392,6 +3419,12 @@ static int f2fs_ioc_get_pin_file(struct file *filp, unsigned long arg) return put_user(pin, (u32 __user *)arg); } +static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) +{ + return put_user(IS_DEVICE_ALIASING(file_inode(filp)) ? 1 : 0, + (u32 __user *)arg); +} + int f2fs_precache_extents(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -4491,6 +4524,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_decompress_file(filp); case F2FS_IOC_COMPRESS_FILE: return f2fs_ioc_compress_file(filp); + case F2FS_IOC_GET_DEV_ALIAS_FILE: + return f2fs_ioc_get_dev_alias_file(filp, arg); default: return -ENOTTY; } @@ -4766,7 +4801,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter, else return 0; - map.m_may_create = true; + if (!IS_DEVICE_ALIASING(inode)) + map.m_may_create = true; if (dio) { map.m_seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); @@ -5203,6 +5239,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_SET_COMPRESS_OPTION: case F2FS_IOC_DECOMPRESS_FILE: case F2FS_IOC_COMPRESS_FILE: + case F2FS_IOC_GET_DEV_ALIAS_FILE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 10780e37fc7b..282fd320bdb3 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -372,6 +372,19 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (IS_DEVICE_ALIASING(inode)) { + if (!f2fs_sb_has_device_alias(sbi)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but the feature is off", + __func__, inode->i_ino); + return false; + } + if (!f2fs_is_pinned_file(inode)) { + f2fs_warn(sbi, "%s: inode (ino=%lx) has device alias flag, but is not pinned", + __func__, inode->i_ino); + return false; + } + } + return true; } @@ -825,7 +838,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); f2fs_remove_dirty_inode(inode); - f2fs_destroy_extent_tree(inode); + if (!IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; @@ -881,6 +895,9 @@ void f2fs_evict_inode(struct inode *inode) goto retry; } + if (IS_DEVICE_ALIASING(inode)) + f2fs_destroy_extent_tree(inode); + if (err) { f2fs_update_inode_page(inode); if (dquot_initialize_needed(inode)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8d4ecb2e855e..aa14c8fce7d9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -834,6 +834,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: + if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) { + f2fs_err(sbi, "device aliasing requires extent cache"); + return -EINVAL; + } clear_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noinline_data: diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c56e8c873935..e51304bc65ea 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1313,6 +1313,7 @@ F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM); F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); F2FS_SB_FEATURE_RO_ATTR(readonly, RO); +F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS); static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_encryption), @@ -1329,6 +1330,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_casefold), ATTR_LIST(sb_compression), ATTR_LIST(sb_readonly), + ATTR_LIST(sb_device_alias), NULL, }; ATTRIBUTE_GROUPS(f2fs_sb_feat); diff --git a/include/uapi/linux/f2fs.h b/include/uapi/linux/f2fs.h index 955d440be104..f7aaf8d23e20 100644 --- a/include/uapi/linux/f2fs.h +++ b/include/uapi/linux/f2fs.h @@ -43,6 +43,7 @@ #define F2FS_IOC_DECOMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 23) #define F2FS_IOC_COMPRESS_FILE _IO(F2FS_IOCTL_MAGIC, 24) #define F2FS_IOC_START_ATOMIC_REPLACE _IO(F2FS_IOCTL_MAGIC, 25) +#define F2FS_IOC_GET_DEV_ALIAS_FILE _IOR(F2FS_IOCTL_MAGIC, 26, __u32) /* * should be same as XFS_IOC_GOINGDOWN. From 5bc5aae843128aefb1c55d769d057c92dd8a32c9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Oct 2024 14:26:36 +0800 Subject: [PATCH 11/40] f2fs: zone: introduce first_zoned_segno in f2fs_sb_info first_zoned_segno() returns a fixed value, let's cache it in structure f2fs_sb_info to avoid redundant calculation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 10 ---------- fs/f2fs/super.c | 13 +++++++++++++ 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f3ef4dc50992..3c6f3cce5779 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1759,6 +1759,7 @@ struct f2fs_sb_info { unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ + unsigned int first_zoned_segno; /* first zoned segno */ /* For write statistics */ u64 sectors_written_start; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 859d70bbc5e7..216762a10e5a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2711,7 +2711,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi, if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) segno = 0; else - segno = max(first_zoned_segno(sbi), *newseg); + segno = max(sbi->first_zoned_segno, *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } #endif @@ -2723,7 +2723,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi, if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { /* Write only to sequential zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { - hint = GET_SEC_FROM_SEG(sbi, first_zoned_segno(sbi)); + hint = GET_SEC_FROM_SEG(sbi, sbi->first_zoned_segno); secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); } else secno = find_first_zero_bit(free_i->free_secmap, diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 55a01da6c4be..8bf09fb67841 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -974,13 +974,3 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) dcc->discard_wake = true; wake_up_interruptible_all(&dcc->discard_wait_queue); } - -static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi) -{ - int devi; - - for (devi = 0; devi < sbi->s_ndevs; devi++) - if (bdev_is_zoned(FDEV(devi).bdev)) - return GET_SEGNO(sbi, FDEV(devi).start_blk); - return 0; -} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index aa14c8fce7d9..80a53dbf1c38 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4221,6 +4221,16 @@ static void f2fs_record_error_work(struct work_struct *work) f2fs_record_stop_reason(sbi); } +static inline unsigned int get_first_zoned_segno(struct f2fs_sb_info *sbi) +{ + int devi; + + for (devi = 0; devi < sbi->s_ndevs; devi++) + if (bdev_is_zoned(FDEV(devi).bdev)) + return GET_SEGNO(sbi, FDEV(devi).start_blk); + return 0; +} + static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); @@ -4621,6 +4631,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* For write statistics */ sbi->sectors_written_start = f2fs_get_sectors_written(sbi); + /* get segno of first zoned block device */ + sbi->first_zoned_segno = get_first_zoned_segno(sbi); + /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); if (__exist_node_summaries(sbi)) From c3af1f13476ec23fd99c98d060a89be28c1e8871 Mon Sep 17 00:00:00 2001 From: LongPing Wei Date: Mon, 21 Oct 2024 10:31:47 +0800 Subject: [PATCH 12/40] f2fs: fix the wrong f2fs_bug_on condition in f2fs_do_replace_block This f2fs_bug_on was introduced by commit 2c1905042c8c ("f2fs: check segment type in __f2fs_replace_block") when there were only 6 curseg types. After commit d0b9e42ab615 ("f2fs: introduce inmem curseg") was introduced, the condition should be changed to checking curseg->seg_type. Fixes: d0b9e42ab615 ("f2fs: introduce inmem curseg") Signed-off-by: LongPing Wei Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 216762a10e5a..96e7abaf42c8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3978,8 +3978,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } } - f2fs_bug_on(sbi, !IS_DATASEG(type)); curseg = CURSEG_I(sbi, type); + f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type)); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); From 43563069e1c1df417d2eed6eca8a22fc6b04691d Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Mon, 21 Oct 2024 12:48:01 +0800 Subject: [PATCH 13/40] f2fs: check curseg->inited before write_sum_page in change_curseg In the __f2fs_init_atgc_curseg->get_atssr_segment calling, curseg->segno is NULL_SEGNO, indicating that there is no summary block that needs to be written. Fixes: 093749e296e2 ("f2fs: support age threshold based garbage collection") Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 96e7abaf42c8..bda7156a2d9d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2926,7 +2926,8 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) struct f2fs_summary_block *sum_node; struct page *sum_page; - write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); + if (curseg->inited) + write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); From 84b5bb8bf0f6a78c232a20c2eecdbb8112ac2703 Mon Sep 17 00:00:00 2001 From: Qi Han Date: Fri, 25 Oct 2024 03:18:23 -0600 Subject: [PATCH 14/40] f2fs: modify f2fs_is_checkpoint_ready logic to allow more data to be written with the CP disable When the free segment is used up during CP disable, many write or ioctl operations will get ENOSPC error codes, even if there are still many blocks available. We can reproduce it in the following steps: dd if=/dev/zero of=f2fs.img bs=1M count=65 mkfs.f2fs -f f2fs.img mount f2fs.img f2fs_dir -o checkpoint=disable:10% cd f2fs_dir i=1 ; while [[ $i -lt 50 ]] ; do (file_name=./2M_file$i ; dd \ if=/dev/random of=$file_name bs=1M count=2); i=$((i+1)); done sync i=1 ; while [[ $i -lt 50 ]] ; do (file_name=./2M_file$i ; truncate \ -s 1K $file_name); i=$((i+1)); done sync dd if=/dev/zero of=./file bs=1M count=20 In f2fs_need_SSR() function, it is allowed to use SSR to allocate blocks when CP is disabled, so in f2fs_is_checkpoint_ready function, can we judge the number of invalid blocks when free segment is not enough, and return ENOSPC only if the number of invalid blocks is also not enough. Signed-off-by: Qi Han Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 8bf09fb67841..0f706cfc754a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -654,12 +654,30 @@ static inline bool has_enough_free_secs(struct f2fs_sb_info *sbi, return !has_not_enough_free_secs(sbi, freed, needed); } +static inline bool has_enough_free_blks(struct f2fs_sb_info *sbi) +{ + unsigned int total_free_blocks = 0; + unsigned int avail_user_block_count; + + spin_lock(&sbi->stat_lock); + + avail_user_block_count = get_available_block_count(sbi, NULL, true); + total_free_blocks = avail_user_block_count - (unsigned int)valid_user_blocks(sbi); + + spin_unlock(&sbi->stat_lock); + + return total_free_blocks > 0; +} + static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi) { if (likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (likely(has_enough_free_secs(sbi, 0, 0))) return true; + if (!f2fs_lfs_mode(sbi) && + likely(has_enough_free_blks(sbi))) + return true; return false; } From e63ce120b41ac5a904758a0231d43c6b328cd8fa Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Sun, 27 Oct 2024 16:06:08 +0200 Subject: [PATCH 15/40] f2fs: fix typos Fix typos: datas -> data. Via codespell. Signed-off-by: Andrew Kreimer Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 278c8855ac0a..468828288a4a 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -694,9 +694,9 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4d in files:%4d\n", + seq_printf(s, " - data: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - quota datas: %4d in quota files:%4d\n", + seq_printf(s, " - quota data: %4d in quota files:%4d\n", si->ndirty_qdata, si->nquota_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); From 0c3a38a4b442893f8baca72e44a2a27d52d6cc75 Mon Sep 17 00:00:00 2001 From: Zeng Heng Date: Wed, 23 Oct 2024 17:48:50 +0800 Subject: [PATCH 16/40] f2fs: Fix not used variable 'index' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following compilation warning: fs/f2fs/data.c:2391:10: warning: variable ‘index’ set but not used [-Wunused-but-set-variable] 2391 | pgoff_t index; Only define and set the variable index when the CONFIG_F2FS_FS_COMPRESSION is enabled. Fixes: db92e6c729d8 ("f2fs: convert f2fs_mpage_readpages() to use folio") Signed-off-by: Zeng Heng Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 90fa8ab85194..306b86b0595d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2385,10 +2385,10 @@ static int f2fs_mpage_readpages(struct inode *inode, .nr_cpages = 0, }; pgoff_t nc_cluster_idx = NULL_CLUSTER; + pgoff_t index; #endif unsigned nr_pages = rac ? readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; - pgoff_t index; int ret = 0; map.m_pblk = 0; @@ -2406,9 +2406,9 @@ static int f2fs_mpage_readpages(struct inode *inode, prefetchw(&folio->flags); } +#ifdef CONFIG_F2FS_FS_COMPRESSION index = folio_index(folio); -#ifdef CONFIG_F2FS_FS_COMPRESSION if (!f2fs_compressed_file(inode)) goto read_single_page; From f10a890308a7cd8794e21f646f09827c6cb4bf5d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 22 Oct 2024 16:36:23 +0800 Subject: [PATCH 17/40] f2fs: fix to avoid potential deadlock in f2fs_record_stop_reason() syzbot reports deadlock issue of f2fs as below: ====================================================== WARNING: possible circular locking dependency detected 6.12.0-rc3-syzkaller-00087-gc964ced77262 #0 Not tainted ------------------------------------------------------ kswapd0/79 is trying to acquire lock: ffff888011824088 (&sbi->sb_lock){++++}-{3:3}, at: f2fs_down_write fs/f2fs/f2fs.h:2199 [inline] ffff888011824088 (&sbi->sb_lock){++++}-{3:3}, at: f2fs_record_stop_reason+0x52/0x1d0 fs/f2fs/super.c:4068 but task is already holding lock: ffff88804bd92610 (sb_internal#2){.+.+}-{0:0}, at: f2fs_evict_inode+0x662/0x15c0 fs/f2fs/inode.c:842 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (sb_internal#2){.+.+}-{0:0}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5825 percpu_down_read include/linux/percpu-rwsem.h:51 [inline] __sb_start_write include/linux/fs.h:1716 [inline] sb_start_intwrite+0x4d/0x1c0 include/linux/fs.h:1899 f2fs_evict_inode+0x662/0x15c0 fs/f2fs/inode.c:842 evict+0x4e8/0x9b0 fs/inode.c:725 f2fs_evict_inode+0x1a4/0x15c0 fs/f2fs/inode.c:807 evict+0x4e8/0x9b0 fs/inode.c:725 dispose_list fs/inode.c:774 [inline] prune_icache_sb+0x239/0x2f0 fs/inode.c:963 super_cache_scan+0x38c/0x4b0 fs/super.c:223 do_shrink_slab+0x701/0x1160 mm/shrinker.c:435 shrink_slab+0x1093/0x14d0 mm/shrinker.c:662 shrink_one+0x43b/0x850 mm/vmscan.c:4818 shrink_many mm/vmscan.c:4879 [inline] lru_gen_shrink_node mm/vmscan.c:4957 [inline] shrink_node+0x3799/0x3de0 mm/vmscan.c:5937 kswapd_shrink_node mm/vmscan.c:6765 [inline] balance_pgdat mm/vmscan.c:6957 [inline] kswapd+0x1ca3/0x3700 mm/vmscan.c:7226 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 -> #1 (fs_reclaim){+.+.}-{0:0}: lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5825 __fs_reclaim_acquire mm/page_alloc.c:3834 [inline] fs_reclaim_acquire+0x88/0x130 mm/page_alloc.c:3848 might_alloc include/linux/sched/mm.h:318 [inline] prepare_alloc_pages+0x147/0x5b0 mm/page_alloc.c:4493 __alloc_pages_noprof+0x16f/0x710 mm/page_alloc.c:4722 alloc_pages_mpol_noprof+0x3e8/0x680 mm/mempolicy.c:2265 alloc_pages_noprof mm/mempolicy.c:2345 [inline] folio_alloc_noprof+0x128/0x180 mm/mempolicy.c:2352 filemap_alloc_folio_noprof+0xdf/0x500 mm/filemap.c:1010 do_read_cache_folio+0x2eb/0x850 mm/filemap.c:3787 read_mapping_folio include/linux/pagemap.h:1011 [inline] f2fs_commit_super+0x3c0/0x7d0 fs/f2fs/super.c:4032 f2fs_record_stop_reason+0x13b/0x1d0 fs/f2fs/super.c:4079 f2fs_handle_critical_error+0x2ac/0x5c0 fs/f2fs/super.c:4174 f2fs_write_inode+0x35f/0x4d0 fs/f2fs/inode.c:785 write_inode fs/fs-writeback.c:1503 [inline] __writeback_single_inode+0x711/0x10d0 fs/fs-writeback.c:1723 writeback_single_inode+0x1f3/0x660 fs/fs-writeback.c:1779 sync_inode_metadata+0xc4/0x120 fs/fs-writeback.c:2849 f2fs_release_file+0xa8/0x100 fs/f2fs/file.c:1941 __fput+0x23f/0x880 fs/file_table.c:431 task_work_run+0x24f/0x310 kernel/task_work.c:228 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop kernel/entry/common.c:114 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:328 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x168/0x370 kernel/entry/common.c:218 do_syscall_64+0x100/0x230 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&sbi->sb_lock){++++}-{3:3}: check_prev_add kernel/locking/lockdep.c:3161 [inline] check_prevs_add kernel/locking/lockdep.c:3280 [inline] validate_chain+0x18ef/0x5920 kernel/locking/lockdep.c:3904 __lock_acquire+0x1384/0x2050 kernel/locking/lockdep.c:5202 lock_acquire+0x1ed/0x550 kernel/locking/lockdep.c:5825 down_write+0x99/0x220 kernel/locking/rwsem.c:1577 f2fs_down_write fs/f2fs/f2fs.h:2199 [inline] f2fs_record_stop_reason+0x52/0x1d0 fs/f2fs/super.c:4068 f2fs_handle_critical_error+0x2ac/0x5c0 fs/f2fs/super.c:4174 f2fs_evict_inode+0xa61/0x15c0 fs/f2fs/inode.c:883 evict+0x4e8/0x9b0 fs/inode.c:725 f2fs_evict_inode+0x1a4/0x15c0 fs/f2fs/inode.c:807 evict+0x4e8/0x9b0 fs/inode.c:725 dispose_list fs/inode.c:774 [inline] prune_icache_sb+0x239/0x2f0 fs/inode.c:963 super_cache_scan+0x38c/0x4b0 fs/super.c:223 do_shrink_slab+0x701/0x1160 mm/shrinker.c:435 shrink_slab+0x1093/0x14d0 mm/shrinker.c:662 shrink_one+0x43b/0x850 mm/vmscan.c:4818 shrink_many mm/vmscan.c:4879 [inline] lru_gen_shrink_node mm/vmscan.c:4957 [inline] shrink_node+0x3799/0x3de0 mm/vmscan.c:5937 kswapd_shrink_node mm/vmscan.c:6765 [inline] balance_pgdat mm/vmscan.c:6957 [inline] kswapd+0x1ca3/0x3700 mm/vmscan.c:7226 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 other info that might help us debug this: Chain exists of: &sbi->sb_lock --> fs_reclaim --> sb_internal#2 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- rlock(sb_internal#2); lock(fs_reclaim); lock(sb_internal#2); lock(&sbi->sb_lock); Root cause is there will be potential deadlock in between below tasks: Thread A Kswapd - f2fs_ioc_commit_atomic_write - mnt_want_write_file -- down_read lock A - balance_pgdat - __fs_reclaim_acquire -- lock B - shrink_node - prune_icache_sb - dispose_list - f2fs_evict_inode - sb_start_intwrite -- down_read lock A - f2fs_do_sync_file - f2fs_write_inode - f2fs_handle_critical_error - f2fs_record_stop_reason - f2fs_commit_super - read_mapping_folio - filemap_alloc_folio_noprof - fs_reclaim_acquire -- lock B Both threads try to acquire read lock of lock A, then its upcoming write lock grabber will trigger deadlock. Let's always create an asynchronous task in f2fs_handle_critical_error() rather than calling f2fs_record_stop_reason() synchronously to avoid this potential deadlock issue. Fixes: b62e71be2110 ("f2fs: support errors=remount-ro|continue|panic mountoption") Reported-by: syzbot+be4a9983e95a5e25c8d3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/6704d667.050a0220.1e4d62.0081.GAE@google.com Signed-off-by: Chao Yu Reviewed-by: Daejun Park Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/super.c | 13 +++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7f76460b721f..efda9a022981 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -32,7 +32,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, f2fs_build_fault_attr(sbi, 0, 0); if (!end_io) f2fs_flush_merged_writes(sbi); - f2fs_handle_critical_error(sbi, reason, end_io); + f2fs_handle_critical_error(sbi, reason); } /* diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3c6f3cce5779..0ac553627771 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3637,8 +3637,7 @@ int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); -void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, - bool irq_context); +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 80a53dbf1c38..49519439b770 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4159,8 +4159,7 @@ static bool system_going_down(void) || system_state == SYSTEM_RESTART; } -void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, - bool irq_context) +void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) { struct super_block *sb = sbi->sb; bool shutdown = reason == STOP_CP_REASON_SHUTDOWN; @@ -4172,10 +4171,12 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason, if (!f2fs_hw_is_readonly(sbi)) { save_stop_reason(sbi, reason); - if (irq_context && !shutdown) - schedule_work(&sbi->s_error_work); - else - f2fs_record_stop_reason(sbi); + /* + * always create an asynchronous task to record stop_reason + * in order to avoid potential deadlock when running into + * f2fs_record_stop_reason() synchronously. + */ + schedule_work(&sbi->s_error_work); } /* From 9395fb09e897dcd96f601715e63018fc5c69cf03 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 22 Oct 2024 10:01:49 +0800 Subject: [PATCH 18/40] f2fs: fix to parse temperature correctly in f2fs_get_segment_temp() In __get_segment_type(), __get_segment_type_6() may return CURSEG_COLD_DATA_PINNED or CURSEG_ALL_DATA_ATGC log type, but following f2fs_get_segment_temp() can only handle persistent log type, fix it. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 5 +++-- fs/f2fs/file.c | 4 ++-- fs/f2fs/segment.c | 33 +++++++++++++++++++++++++-------- fs/f2fs/segment.h | 4 ---- 4 files changed, 30 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0ac553627771..c13812dc7d3b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1019,7 +1019,7 @@ static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, #define NR_CURSEG_PERSIST_TYPE (NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE) #define NR_CURSEG_TYPE (NR_CURSEG_INMEM_TYPE + NR_CURSEG_PERSIST_TYPE) -enum { +enum log_type { CURSEG_HOT_DATA = 0, /* directory entry blocks */ CURSEG_WARM_DATA, /* data blocks */ CURSEG_COLD_DATA, /* multimedia or GCed data blocks */ @@ -3758,7 +3758,8 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr); -int f2fs_get_segment_temp(int seg_type); +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type seg_type); int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 75a8b22da664..0e7a0195eca8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4858,8 +4858,8 @@ static void f2fs_dio_write_submit_io(const struct iomap_iter *iter, { struct inode *inode = iter->inode; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int seg_type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); - enum temp_type temp = f2fs_get_segment_temp(seg_type); + enum log_type type = f2fs_rw_hint_to_seg_type(sbi, inode->i_write_hint); + enum temp_type temp = f2fs_get_segment_temp(sbi, type); bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi, DATA, temp); submit_bio(bio); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index bda7156a2d9d..d8a0c50b8dfc 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3583,18 +3583,35 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) } } -int f2fs_get_segment_temp(int seg_type) +enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, + enum log_type type) { - if (IS_HOT(seg_type)) - return HOT; - else if (IS_WARM(seg_type)) - return WARM; - return COLD; + struct curseg_info *curseg = CURSEG_I(sbi, type); + enum temp_type temp = COLD; + + switch (curseg->seg_type) { + case CURSEG_HOT_NODE: + case CURSEG_HOT_DATA: + temp = HOT; + break; + case CURSEG_WARM_NODE: + case CURSEG_WARM_DATA: + temp = WARM; + break; + case CURSEG_COLD_NODE: + case CURSEG_COLD_DATA: + temp = COLD; + break; + default: + f2fs_bug_on(sbi, 1); + } + + return temp; } static int __get_segment_type(struct f2fs_io_info *fio) { - int type = 0; + enum log_type type = CURSEG_HOT_DATA; switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: @@ -3610,7 +3627,7 @@ static int __get_segment_type(struct f2fs_io_info *fio) f2fs_bug_on(fio->sbi, true); } - fio->temp = f2fs_get_segment_temp(type); + fio->temp = f2fs_get_segment_temp(fio->sbi, type); return type; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 0f706cfc754a..d440fdb83ca4 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -34,10 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG); } -#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) -#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) -#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) - #define IS_CURSEG(sbi, seg) \ (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ From 296b8cb34e65fa93382cf919be5a056f719c9a26 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Tue, 29 Oct 2024 11:12:49 +0800 Subject: [PATCH 19/40] f2fs: fix to avoid use GC_AT when setting gc_mode as GC_URGENT_LOW or GC_URGENT_MID If gc_mode is set to GC_URGENT_LOW or GC_URGENT_MID, cost benefit GC approach should be used, but if ATGC is enabled at the same time, Age-threshold approach will be selected, which can only do amount of GC and it is much less than the numbers of CB approach. some traces: f2fs_gc-254:48-396 [007] ..... 2311600.684028: f2fs_gc_begin: dev = (254,48), gc_type = Background GC, no_background_GC = 0, nr_free_secs = 0, nodes = 1053, dents = 2, imeta = 18, free_sec:44898, free_seg:44898, rsv_seg:239, prefree_seg:0 f2fs_gc-254:48-396 [007] ..... 2311600.684527: f2fs_get_victim: dev = (254,48), type = No TYPE, policy = (Background GC, LFS-mode, Age-threshold), victim = 10, cost = 4294364975, ofs_unit = 1, pre_victim_secno = -1, prefree = 0, free = 44898 f2fs_gc-254:48-396 [007] ..... 2311600.714835: f2fs_gc_end: dev = (254,48), ret = 0, seg_freed = 0, sec_freed = 0, nodes = 1562, dents = 2, imeta = 18, free_sec:44898, free_seg:44898, rsv_seg:239, prefree_seg:0 f2fs_gc-254:48-396 [007] ..... 2311600.714843: f2fs_background_gc: dev = (254,48), wait_ms = 50, prefree = 0, free = 44898 f2fs_gc-254:48-396 [007] ..... 2311600.771785: f2fs_gc_begin: dev = (254,48), gc_type = Background GC, no_background_GC = 0, nr_free_secs = 0, nodes = 1562, dents = 2, imeta = 18, free_sec:44898, free_seg:44898, rsv_seg:239, prefree_seg: f2fs_gc-254:48-396 [007] ..... 2311600.772275: f2fs_gc_end: dev = (254,48), ret = -61, seg_freed = 0, sec_freed = 0, nodes = 1562, dents = 2, imeta = 18, free_sec:44898, free_seg:44898, rsv_seg:239, prefree_seg:0 Fixes: 0e5e81114de1 ("f2fs: add GC_URGENT_LOW mode in gc_urgent") Fixes: d98af5f45520 ("f2fs: introduce gc_urgent_mid mode") Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 7 +++++-- fs/f2fs/gc.c | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index fdedf1ea944b..513296bb6f29 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -311,10 +311,13 @@ Description: Do background GC aggressively when set. Set to 0 by default. GC approach and turns SSR mode on. gc urgent low(2): lowers the bar of checking I/O idling in order to process outstanding discard commands and GC a - little bit aggressively. uses cost benefit GC approach. + little bit aggressively. always uses cost benefit GC approach, + and will override age-threshold GC approach if ATGC is enabled + at the same time. gc urgent mid(3): does GC forcibly in a period of given gc_urgent_sleep_time and executes a mid level of I/O idling check. - uses cost benefit GC approach. + always uses cost benefit GC approach, and will override + age-threshold GC approach if ATGC is enabled at the same time. What: /sys/fs/f2fs//gc_urgent_sleep_time Date: August 2017 diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e40bdd12e36d..3e1b6d2ff3a7 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -257,6 +257,8 @@ static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type) switch (sbi->gc_mode) { case GC_IDLE_CB: + case GC_URGENT_LOW: + case GC_URGENT_MID: gc_mode = GC_CB; break; case GC_IDLE_GREEDY: From cffaa0976fcc941a618951b56745a817befa8f91 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 29 Oct 2024 08:57:03 +0800 Subject: [PATCH 20/40] f2fs: clean up opened code w/ {get,set}_nid() Just cleanup, no logic change. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index af36c6d6542b..7d904f2bd5b6 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1066,7 +1066,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, int i; int idx = depth - 2; - nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + nid[0] = get_nid(dn->inode_page, offset[0], true); if (!nid[0]) return 0; @@ -1177,7 +1177,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]); + dn.nid = get_nid(page, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1209,13 +1209,10 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) } if (err < 0) goto fail; - if (offset[1] == 0 && - ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { + if (offset[1] == 0 && get_nid(page, offset[0], true)) { lock_page(page); BUG_ON(page->mapping != NODE_MAPPING(sbi)); - f2fs_wait_on_page_writeback(page, NODE, true, true); - ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; - set_page_dirty(page); + set_nid(page, offset[0], 0, true); unlock_page(page); } offset[1] = 0; From 1df2bc3c8252261860787a02f61797a914b99163 Mon Sep 17 00:00:00 2001 From: LongPing Wei Date: Tue, 29 Oct 2024 18:49:07 +0800 Subject: [PATCH 21/40] f2fs: clean up the unused variable additional_reserved_segments additional_reserved_segments was introduced by commit 300a842937fb ("f2fs: fix to reserve space for IO align feature"), and its initialization was deleted by commit 87161a2b0aed ("f2fs: deprecate io_bits"). Signed-off-by: LongPing Wei Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.h | 3 +-- fs/f2fs/sysfs.c | 4 +--- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c13812dc7d3b..119706dbaefa 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1064,7 +1064,6 @@ struct f2fs_sm_info { unsigned int segment_count; /* total # of segments */ unsigned int main_segments; /* # of segments in main area */ unsigned int reserved_segments; /* # of reserved segments */ - unsigned int additional_reserved_segments;/* reserved segs for IO align feature */ unsigned int ovp_segments; /* # of overprovision segments */ /* a threshold to reclaim prefree segments */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index d440fdb83ca4..943be4f1d6d2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -522,8 +522,7 @@ static inline unsigned int free_segments(struct f2fs_sb_info *sbi) static inline unsigned int reserved_segments(struct f2fs_sb_info *sbi) { - return SM_I(sbi)->reserved_segments + - SM_I(sbi)->additional_reserved_segments; + return SM_I(sbi)->reserved_segments; } static inline unsigned int free_sections(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e51304bc65ea..bdbf24db667b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -501,9 +501,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); if (t > (unsigned long)(sbi->user_block_count - - F2FS_OPTION(sbi).root_reserved_blocks - - SEGS_TO_BLKS(sbi, - SM_I(sbi)->additional_reserved_segments))) { + F2FS_OPTION(sbi).root_reserved_blocks)) { spin_unlock(&sbi->stat_lock); return -EINVAL; } From 51d3d952c5084393d89cce0b951bb5f18eb97bb1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 30 Oct 2024 16:17:01 +0800 Subject: [PATCH 22/40] f2fs: fix to convert log type to segment data type correctly This patch introduces a new helper log_type_to_seg_type() to convert log type to segment data type, and uses it to clean up opened codes in build_curseg(), and also it fixes to convert log type before use in do_write_page(). Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d8a0c50b8dfc..edf2a74207b3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3812,10 +3812,35 @@ void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, } } +static int log_type_to_seg_type(enum log_type type) +{ + int seg_type = CURSEG_COLD_DATA; + + switch (type) { + case CURSEG_HOT_DATA: + case CURSEG_WARM_DATA: + case CURSEG_COLD_DATA: + case CURSEG_HOT_NODE: + case CURSEG_WARM_NODE: + case CURSEG_COLD_NODE: + seg_type = (int)type; + break; + case CURSEG_COLD_DATA_PINNED: + case CURSEG_ALL_DATA_ATGC: + seg_type = CURSEG_COLD_DATA; + break; + default: + break; + } + return seg_type; +} + static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio); - bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA); + enum log_type type = __get_segment_type(fio); + int seg_type = log_type_to_seg_type(type); + bool keep_order = (f2fs_lfs_mode(fio->sbi) && + seg_type == CURSEG_COLD_DATA); if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); @@ -4797,12 +4822,7 @@ static int build_curseg(struct f2fs_sb_info *sbi) sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; - if (i < NR_PERSISTENT_LOG) - array[i].seg_type = CURSEG_HOT_DATA + i; - else if (i == CURSEG_COLD_DATA_PINNED) - array[i].seg_type = CURSEG_COLD_DATA; - else if (i == CURSEG_ALL_DATA_ATGC) - array[i].seg_type = CURSEG_COLD_DATA; + array[i].seg_type = log_type_to_seg_type(i); reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); From 744e66cb8779fcbdbe20b0ec3235ef1ee9815261 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 31 Oct 2024 13:59:52 +0800 Subject: [PATCH 23/40] f2fs: remove redundant atomic file check in defragment f2fs_is_atomic_file(inode) is checked in f2fs_defragment_range, so remove the redundant checking in f2fs_ioc_defragment. Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0e7a0195eca8..b619f7e55640 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2883,7 +2883,7 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + if (!S_ISREG(inode->i_mode)) return -EINVAL; if (f2fs_readonly(sbi->sb)) From a7a7c1d423a6351a6541e95c797da5358e5ad1ea Mon Sep 17 00:00:00 2001 From: Xiuhong Wang Date: Tue, 29 Oct 2024 14:15:35 +0800 Subject: [PATCH 24/40] f2fs: fix fiemap failure issue when page size is 16KB After enable 16K page size, an infinite loop may occur in fiemap (fm_length=UINT64_MAX) on a file, such as the 16KB scratch.img during the remount operation in Android. The condition for whether fiemap continues to map is to check whether the number of bytes corresponding to the next map.m_lblk exceeds blks_to_bytes(inode,max_inode_blocks(inode)) if there are HOLE. The latter does not take into account the maximum size of a file with 16KB page size, so the loop cannot be jumped out. The following is the fail trace: When f2fs_map_blocks reaches map.m_lblk=3936, it needs to go to the first direct node block, so the map is 3936 + 4090 = 8026, The next map is the second direct node block, that is, 8026 + 4090 = 12116, The next map is the first indirect node block, that is, 12116 + 4090 * 4090 = 16740216, The next map is the second indirect node block, that is, 16740216 + 4090 * 4090 = 33468316, The next map is the first double indirect node block, that is, 33468316 + 4090 * 4090 * 4090 = 68451397316 Since map.m_lblk represents the address of a block, which is 32 bits, truncation will occur, that is, 68451397316 becomes 4026887876, and the number of bytes corresponding to the block number does not exceed blks_to_bytes(inode,max_inode_blocks(inode)), so the loop will not be jumped out. The next time, it will be considered that it should still be a double indirect node block, that is, 4026887876 + 4090 * 4090 * 4090 = 72444816876, which will be truncated to 3725340140, and the loop will not be jumped out. 156.374871: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 0, start blkaddr = 0x8e00, len = 0x200, flags = 2,seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.374916: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 512, start blkaddr = 0x0, len = 0x0, flags = 0 , seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.374920: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 513, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 ...... 156.385747: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 3935, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385752: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 3936, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385755: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 8026, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385758: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 12116, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385761: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 16740216, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385764: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 33468316, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385767: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 4026887876, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385770: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 3725340140, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385772: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 4026887876, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 156.385775: f2fs_map_blocks: dev = (254,57), ino = 7449, file offset = 3725340140, start blkaddr = 0x0, len = 0x0, flags = 0, seg_type = 8, may_create = 0, multidevice = 0, flag = 1, err = 0 Commit a6a010f5def5 ("f2fs: Restrict max filesize for 16K f2fs") has set the maximum allowed file size to (U32_MAX + 1) * F2FS_BLKSIZE, so max_file_blocks should be used here to limit it, that is, maxbytes defined above. And the max_inode_blocks function is not called by other functions except here, so cleanup it. Signed-off-by: Xiuhong Wang Signed-off-by: Zhiguo Niu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 306b86b0595d..402e00d54c0b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1901,25 +1901,6 @@ static int f2fs_xattr_fiemap(struct inode *inode, return (err < 0 ? err : 0); } -static loff_t max_inode_blocks(struct inode *inode) -{ - loff_t result = ADDRS_PER_INODE(inode); - loff_t leaf_count = ADDRS_PER_BLOCK(inode); - - /* two direct node blocks */ - result += (leaf_count * 2); - - /* two indirect node blocks */ - leaf_count *= NIDS_PER_BLOCK; - result += (leaf_count * 2); - - /* one double indirect node block */ - leaf_count *= NIDS_PER_BLOCK; - result += leaf_count; - - return result; -} - int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -1992,8 +1973,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; - if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, - max_inode_blocks(inode))) + if (blks_to_bytes(inode, start_blk) < maxbytes) goto prep_next; flags |= FIEMAP_EXTENT_LAST; From 7b0033dbc48340a1c1c3f12448ba17d6587ca092 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 4 Nov 2024 10:05:42 +0800 Subject: [PATCH 25/40] f2fs: fix race in concurrent f2fs_stop_gc_thread In my test case, concurrent calls to f2fs shutdown report the following stack trace: Oops: general protection fault, probably for non-canonical address 0xc6cfff63bb5513fc: 0000 [#1] PREEMPT SMP PTI CPU: 0 UID: 0 PID: 678 Comm: f2fs_rep_shutdo Not tainted 6.12.0-rc5-next-20241029-g6fb2fa9805c5-dirty #85 Call Trace: ? show_regs+0x8b/0xa0 ? __die_body+0x26/0xa0 ? die_addr+0x54/0x90 ? exc_general_protection+0x24b/0x5c0 ? asm_exc_general_protection+0x26/0x30 ? kthread_stop+0x46/0x390 f2fs_stop_gc_thread+0x6c/0x110 f2fs_do_shutdown+0x309/0x3a0 f2fs_ioc_shutdown+0x150/0x1c0 __f2fs_ioctl+0xffd/0x2ac0 f2fs_ioctl+0x76/0xe0 vfs_ioctl+0x23/0x60 __x64_sys_ioctl+0xce/0xf0 x64_sys_call+0x2b1b/0x4540 do_syscall_64+0xa7/0x240 entry_SYSCALL_64_after_hwframe+0x76/0x7e The root cause is a race condition in f2fs_stop_gc_thread() called from different f2fs shutdown paths: [CPU0] [CPU1] ---------------------- ----------------------- f2fs_stop_gc_thread f2fs_stop_gc_thread gc_th = sbi->gc_thread gc_th = sbi->gc_thread kfree(gc_th) sbi->gc_thread = NULL < gc_th != NULL > kthread_stop(gc_th->f2fs_gc_task) //UAF The commit c7f114d864ac ("f2fs: fix to avoid use-after-free in f2fs_stop_gc_thread()") attempted to fix this issue by using a read semaphore to prevent races between shutdown and remount threads, but it fails to prevent all race conditions. Fix it by converting to write lock of s_umount in f2fs_do_shutdown(). Fixes: 7950e9ac638e ("f2fs: stop gc/discard thread after fs shutdown") Signed-off-by: Long Li Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index b619f7e55640..c0818a4f87c1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2365,9 +2365,12 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, if (readonly) goto out; - /* grab sb->s_umount to avoid racing w/ remount() */ + /* + * grab sb->s_umount to avoid racing w/ remount() and other shutdown + * paths. + */ if (need_lock) - down_read(&sbi->sb->s_umount); + down_write(&sbi->sb->s_umount); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); @@ -2376,7 +2379,7 @@ int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag, clear_opt(sbi, DISCARD); if (need_lock) - up_read(&sbi->sb->s_umount); + up_write(&sbi->sb->s_umount); f2fs_update_time(sbi, REQ_TIME); out: From 5dd00ebda337b9295e7027691fa70540da369ff2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Nov 2024 09:35:51 +0800 Subject: [PATCH 26/40] f2fs: fix to map blocks correctly for direct write f2fs_map_blocks() supports to map continuous holes or preallocated address, we should avoid setting F2FS_MAP_MAPPED for these cases only, otherwise, it may fail f2fs_iomap_begin(), and make direct write fallbacking to use buffered IO and flush, result in performance regression. Fixes: 9f0f6bf42714 ("f2fs: support to map continuous holes or preallocated address") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202409122103.e45aa13b-oliver.sang@intel.com Cc: Cyril Hrubis Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 402e00d54c0b..b33aca24b9ef 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1676,7 +1676,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) /* reserved delalloc block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) map->m_flags |= F2FS_MAP_DELALLOC; - if (flag != F2FS_GET_BLOCK_DIO || !is_hole) + /* DIO READ and hole case, should not map the blocks. */ + if (!(flag == F2FS_GET_BLOCK_DIO && is_hole && !map->m_may_create)) map->m_flags |= F2FS_MAP_MAPPED; map->m_pblk = blkaddr; From 26e6f59d0bbaac76fa3413462d780bd2b5f9f653 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 4 Nov 2024 09:50:16 +0800 Subject: [PATCH 27/40] f2fs: fix to avoid forcing direct write to use buffered IO on inline_data inode Jinsu Lee reported a performance regression issue, after commit 5c8764f8679e ("f2fs: fix to force buffered IO on inline_data inode"), we forced direct write to use buffered IO on inline_data inode, it will cause performace regression due to memory copy and data flush. It's fine to not force direct write to use buffered IO, as it can convert inline inode before committing direct write IO. Fixes: 5c8764f8679e ("f2fs: fix to force buffered IO on inline_data inode") Reported-by: Jinsu Lee Closes: https://lore.kernel.org/linux-f2fs-devel/af03dd2c-e361-4f80-b2fd-39440766cf6e@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c0818a4f87c1..cb2172a698be 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -883,7 +883,11 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw) return true; if (f2fs_compressed_file(inode)) return true; - if (f2fs_has_inline_data(inode)) + /* + * only force direct read to use buffered IO, for direct write, + * it expects inline data conversion before committing IO. + */ + if (f2fs_has_inline_data(inode) && rw == READ) return true; /* disallow direct IO if any of devices has unaligned blksize */ From acff9409dd40beaca2bd982678d222e2740ad84b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 12 Nov 2024 01:04:58 +0000 Subject: [PATCH 28/40] Revert "f2fs: remove unreachable lazytime mount option parsing" This reverts commit 54f43a10fa257ad4af02a1d157fefef6ebcfa7dc. The above commit broke the lazytime mount, given mount("/dev/vdb", "/mnt/test", "f2fs", 0, "lazytime"); CC: stable@vger.kernel.org # 6.11+ Signed-off-by: Daniel Rosenberg Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 49519439b770..35c4394e4fc6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -150,6 +150,8 @@ enum { Opt_mode, Opt_fault_injection, Opt_fault_type, + Opt_lazytime, + Opt_nolazytime, Opt_quota, Opt_noquota, Opt_usrquota, @@ -226,6 +228,8 @@ static match_table_t f2fs_tokens = { {Opt_mode, "mode=%s"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_fault_type, "fault_type=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_quota, "quota"}, {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, @@ -922,6 +926,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) f2fs_info(sbi, "fault_type options not supported"); break; #endif + case Opt_lazytime: + sb->s_flags |= SB_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~SB_LAZYTIME; + break; #ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: From 789ca0eb47f7782b3230ab0957eabd7967b78cae Mon Sep 17 00:00:00 2001 From: Daniel Yang Date: Sun, 17 Nov 2024 23:01:40 -0800 Subject: [PATCH 29/40] f2fs: replace deprecated strcpy with strscpy strcpy is deprecated. Kernel docs recommend replacing strcpy with strscpy. The function strcpy() return value isn't used so there shouldn't be an issue replacing with the safer alternative strscpy. Signed-off-by: Daniel Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 35c4394e4fc6..7bcd6471af9a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1172,7 +1172,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; } - strcpy(ext[ext_cnt], name); + ret = strscpy(ext[ext_cnt], name); + if (ret < 0) { + kfree(name); + return ret; + } F2FS_OPTION(sbi).compress_ext_cnt++; kfree(name); break; @@ -1201,7 +1205,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; } - strcpy(noext[noext_cnt], name); + ret = strscpy(noext[noext_cnt], name); + if (ret < 0) { + kfree(name); + return ret; + } F2FS_OPTION(sbi).nocompress_ext_cnt++; kfree(name); break; From 3273d8ad947dea925a65a78ca29e5351c960c801 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 Nov 2024 09:25:54 +0800 Subject: [PATCH 30/40] f2fs: fix to do cast in F2FS_{BLK_TO_BYTES, BTYES_TO_BLK} to avoid overflow It missed to cast variable to unsigned long long type before bit shift, which will cause overflow, fix it. Fixes: f7ef9b83b583 ("f2fs: introduce macros to convert bytes and blocks in f2fs") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- include/linux/f2fs_fs.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7bcd6471af9a..0097a5770fa3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3344,7 +3344,7 @@ loff_t max_file_blocks(struct inode *inode) * fit within U32_MAX + 1 data units. */ - result = min(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096)); + result = umin(result, F2FS_BYTES_TO_BLK(((loff_t)U32_MAX + 1) * 4096)); return result; } diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index b0b821edfd97..3b2ad444c002 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -24,10 +24,10 @@ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ #define COMPRESS_ADDR ((block_t)-2) /* used as compressed data flag */ -#define F2FS_BYTES_TO_BLK(bytes) ((bytes) >> F2FS_BLKSIZE_BITS) -#define F2FS_BLK_TO_BYTES(blk) ((blk) << F2FS_BLKSIZE_BITS) +#define F2FS_BYTES_TO_BLK(bytes) ((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS) +#define F2FS_BLK_TO_BYTES(blk) ((unsigned long long)(blk) << F2FS_BLKSIZE_BITS) #define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) -#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1)) +#define F2FS_BLK_ALIGN(x) (F2FS_BYTES_TO_BLK((x) + F2FS_BLKSIZE - 1)) /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 From 7461f37094180200cb2f98e60ef99a0cea97beec Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 Nov 2024 09:25:55 +0800 Subject: [PATCH 31/40] f2fs: clean up w/ F2FS_{BLK_TO_BYTES,BTYES_TO_BLK} f2fs doesn't support different blksize in one instance, so bytes_to_blks() and blks_to_bytes() are equal to F2FS_BYTES_TO_BLK and F2FS_BLK_TO_BYTES, let's use F2FS_BYTES_TO_BLK/F2FS_BLK_TO_BYTES instead for cleanup. Reviewed-by: Zhiguo Niu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 68 +++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b33aca24b9ef..0e8390cbdb5b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1819,16 +1819,6 @@ bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len) return true; } -static inline u64 bytes_to_blks(struct inode *inode, u64 bytes) -{ - return (bytes >> inode->i_blkbits); -} - -static inline u64 blks_to_bytes(struct inode *inode, u64 blks) -{ - return (blks << inode->i_blkbits); -} - static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { @@ -1854,7 +1844,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = blks_to_bytes(inode, ni.blk_addr); + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + sizeof(__le32) * (DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)); @@ -1886,7 +1876,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, return err; } - phys = blks_to_bytes(inode, ni.blk_addr); + phys = F2FS_BLK_TO_BYTES(ni.blk_addr); len = inode->i_sb->s_blocksize; f2fs_put_page(page, 1); @@ -1948,16 +1938,16 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (bytes_to_blks(inode, len) == 0) - len = blks_to_bytes(inode, 1); + if (F2FS_BYTES_TO_BLK(len) == 0) + len = F2FS_BLKSIZE; - start_blk = bytes_to_blks(inode, start); - last_blk = bytes_to_blks(inode, start + len - 1); + start_blk = F2FS_BYTES_TO_BLK(start); + last_blk = F2FS_BYTES_TO_BLK(start + len - 1); next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; - map.m_len = bytes_to_blks(inode, len); + map.m_len = F2FS_BYTES_TO_BLK(len); map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; @@ -1974,7 +1964,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; - if (blks_to_bytes(inode, start_blk) < maxbytes) + if (F2FS_BLK_TO_BYTES(start_blk) < maxbytes) goto prep_next; flags |= FIEMAP_EXTENT_LAST; @@ -2011,14 +2001,14 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } else if (compr_appended) { unsigned int appended_blks = cluster_size - count_in_cluster + 1; - size += blks_to_bytes(inode, appended_blks); + size += F2FS_BLK_TO_BYTES(appended_blks); start_blk += appended_blks; compr_cluster = false; } else { - logical = blks_to_bytes(inode, start_blk); + logical = F2FS_BLK_TO_BYTES(start_blk); phys = __is_valid_data_blkaddr(map.m_pblk) ? - blks_to_bytes(inode, map.m_pblk) : 0; - size = blks_to_bytes(inode, map.m_len); + F2FS_BLK_TO_BYTES(map.m_pblk) : 0; + size = F2FS_BLK_TO_BYTES(map.m_len); flags = 0; if (compr_cluster) { @@ -2026,13 +2016,13 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, count_in_cluster += map.m_len; if (count_in_cluster == cluster_size) { compr_cluster = false; - size += blks_to_bytes(inode, 1); + size += F2FS_BLKSIZE; } } else if (map.m_flags & F2FS_MAP_DELALLOC) { flags = FIEMAP_EXTENT_UNWRITTEN; } - start_blk += bytes_to_blks(inode, size); + start_blk += F2FS_BYTES_TO_BLK(size); } prep_next: @@ -2070,7 +2060,7 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, struct readahead_control *rac) { struct bio *bio = *bio_ret; - const unsigned blocksize = blks_to_bytes(inode, 1); + const unsigned int blocksize = F2FS_BLKSIZE; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; @@ -2080,8 +2070,8 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, block_in_file = (sector_t)index; last_block = block_in_file + nr_pages; - last_block_in_file = bytes_to_blks(inode, - f2fs_readpage_limit(inode) + blocksize - 1); + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -2181,7 +2171,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct bio *bio = *bio_ret; unsigned int start_idx = cc->cluster_idx << cc->log_cluster_size; sector_t last_block_in_file; - const unsigned blocksize = blks_to_bytes(inode, 1); + const unsigned int blocksize = F2FS_BLKSIZE; struct decompress_io_ctx *dic = NULL; struct extent_info ei = {}; bool from_dnode = true; @@ -2190,8 +2180,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); - last_block_in_file = bytes_to_blks(inode, - f2fs_readpage_limit(inode) + blocksize - 1); + last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + + blocksize - 1); /* get rid of pages beyond EOF */ for (i = 0; i < cc->cluster_size; i++) { @@ -3957,7 +3947,7 @@ static int check_swap_activate(struct swap_info_struct *sis, * to be very smart. */ cur_lblock = 0; - last_lblock = bytes_to_blks(inode, i_size_read(inode)); + last_lblock = F2FS_BYTES_TO_BLK(i_size_read(inode)); while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; @@ -4200,8 +4190,8 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, pgoff_t next_pgofs = 0; int err; - map.m_lblk = bytes_to_blks(inode, offset); - map.m_len = bytes_to_blks(inode, offset + length - 1) - map.m_lblk + 1; + map.m_lblk = F2FS_BYTES_TO_BLK(offset); + map.m_len = F2FS_BYTES_TO_BLK(offset + length - 1) - map.m_lblk + 1; map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); @@ -4212,7 +4202,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (err) return err; - iomap->offset = blks_to_bytes(inode, map.m_lblk); + iomap->offset = F2FS_BLK_TO_BYTES(map.m_lblk); /* * When inline encryption is enabled, sometimes I/O to an encrypted file @@ -4232,21 +4222,21 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (WARN_ON_ONCE(map.m_pblk == NEW_ADDR)) return -EINVAL; - iomap->length = blks_to_bytes(inode, map.m_len); + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_MAPPED; iomap->flags |= IOMAP_F_MERGED; iomap->bdev = map.m_bdev; - iomap->addr = blks_to_bytes(inode, map.m_pblk); + iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk); } else { if (flags & IOMAP_WRITE) return -ENOTBLK; if (map.m_pblk == NULL_ADDR) { - iomap->length = blks_to_bytes(inode, next_pgofs) - - iomap->offset; + iomap->length = F2FS_BLK_TO_BYTES(next_pgofs) - + iomap->offset; iomap->type = IOMAP_HOLE; } else if (map.m_pblk == NEW_ADDR) { - iomap->length = blks_to_bytes(inode, map.m_len); + iomap->length = F2FS_BLK_TO_BYTES(map.m_len); iomap->type = IOMAP_UNWRITTEN; } else { f2fs_bug_on(F2FS_I_SB(inode), 1); From 77569f785c8624fa4189795fb52e635a973672e5 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Fri, 8 Nov 2024 09:25:56 +0800 Subject: [PATCH 32/40] f2fs: fix to adjust appropriate length for fiemap If user give a file size as "length" parameter for fiemap operations, but if this size is non-block size aligned, it will show 2 segments fiemap results even this whole file is contiguous on disk, such as the following results: ./f2fs_io fiemap 0 19034 ylog/analyzer.py Fiemap: offset = 0 len = 19034 logical addr. physical addr. length flags 0 0000000000000000 0000000020baa000 0000000000004000 00001000 1 0000000000004000 0000000020bae000 0000000000001000 00001001 after this patch: ./f2fs_io fiemap 0 19034 ylog/analyzer.py Fiemap: offset = 0 len = 19034 logical addr. physical addr. length flags 0 0000000000000000 00000000315f3000 0000000000005000 00001001 Signed-off-by: Zhiguo Niu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +++--- include/linux/f2fs_fs.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0e8390cbdb5b..69f1cb0490ee 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1938,12 +1938,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - if (F2FS_BYTES_TO_BLK(len) == 0) - len = F2FS_BLKSIZE; - start_blk = F2FS_BYTES_TO_BLK(start); last_blk = F2FS_BYTES_TO_BLK(start + len - 1); + if (len & F2FS_BLKSIZE_MASK) + len = round_up(len, F2FS_BLKSIZE); + next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3b2ad444c002..c24f8bc01045 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -24,6 +24,7 @@ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ #define COMPRESS_ADDR ((block_t)-2) /* used as compressed data flag */ +#define F2FS_BLKSIZE_MASK (F2FS_BLKSIZE - 1) #define F2FS_BYTES_TO_BLK(bytes) ((unsigned long long)(bytes) >> F2FS_BLKSIZE_BITS) #define F2FS_BLK_TO_BYTES(blk) ((unsigned long long)(blk) << F2FS_BLKSIZE_BITS) #define F2FS_BLK_END_BYTES(blk) (F2FS_BLK_TO_BYTES(blk + 1) - 1) From 6787a82245857271133b63ae7f72f1dc9f29e985 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 8 Nov 2024 09:25:57 +0800 Subject: [PATCH 33/40] f2fs: fix to requery extent which cross boundary of inquiry dd if=/dev/zero of=file bs=4k count=5 xfs_io file -c "fiemap -v 2 16384" file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..31]: 139272..139303 32 0x1000 1: [32..39]: 139304..139311 8 0x1001 xfs_io file -c "fiemap -v 0 16384" file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..31]: 139272..139303 32 0x1000 xfs_io file -c "fiemap -v 0 16385" file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..39]: 139272..139311 40 0x1001 There are two problems: - continuous extent is split to two - FIEMAP_EXTENT_LAST is missing in last extent The root cause is: if upper boundary of inquiry crosses extent, f2fs_map_blocks() will truncate length of returned extent to F2FS_BYTES_TO_BLK(len), and also, it will stop to query latter extent or hole to make sure current extent is last or not. In order to fix this issue, once we found an extent locates in the end of inquiry range by f2fs_map_blocks(), we need to expand inquiry range to requiry. Cc: stable@vger.kernel.org Fixes: 7f63eb77af7b ("f2fs: report unwritten area in f2fs_fiemap") Reported-by: Zhiguo Niu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 69f1cb0490ee..ee5614324df0 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1896,7 +1896,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct f2fs_map_blocks map; - sector_t start_blk, last_blk; + sector_t start_blk, last_blk, blk_len, max_len; pgoff_t next_pgofs; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; @@ -1940,14 +1940,13 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, start_blk = F2FS_BYTES_TO_BLK(start); last_blk = F2FS_BYTES_TO_BLK(start + len - 1); - - if (len & F2FS_BLKSIZE_MASK) - len = round_up(len, F2FS_BLKSIZE); + blk_len = last_blk - start_blk + 1; + max_len = F2FS_BYTES_TO_BLK(maxbytes) - start_blk; next: memset(&map, 0, sizeof(map)); map.m_lblk = start_blk; - map.m_len = F2FS_BYTES_TO_BLK(len); + map.m_len = blk_len; map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; @@ -1970,6 +1969,17 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= FIEMAP_EXTENT_LAST; } + /* + * current extent may cross boundary of inquiry, increase len to + * requery. + */ + if (!compr_cluster && (map.m_flags & F2FS_MAP_MAPPED) && + map.m_lblk + map.m_len - 1 == last_blk && + blk_len != max_len) { + blk_len = max_len; + goto next; + } + compr_appended = false; /* In a case of compressed cluster, append this to the last extent */ if (compr_cluster && ((map.m_flags & F2FS_MAP_DELALLOC) || From a35749b1ed64ec7f4df0364fcc6002082d366486 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 18 Nov 2024 10:45:35 -0800 Subject: [PATCH 34/40] f2fs: adjust unusable cap before checkpoint=disable mode The unusable cap value must be adjusted before checking whether checkpoint=disable is feasible. Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0097a5770fa3..e9d2267b8788 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2496,6 +2496,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } } + adjust_unusable_cap_perc(sbi); if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); @@ -2540,7 +2541,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); limit_reserve_root(sbi); - adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; restore_checkpoint: From 1015035609e4ccb32e967015a600e01158377dfc Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 21 Nov 2024 16:26:56 +0800 Subject: [PATCH 35/40] f2fs: fix changing cursegs if recovery fails on zoned device Fsync data recovery attempts to check and fix write pointer consistency of cursegs and all other zones. If the write pointers of cursegs are unaligned, cursegs are changed to new sections. If recovery fails, zone write pointers are still checked and fixed, but the latest checkpoint cannot be written back. Additionally, retry- mount skips recovery and rolls back to reuse the old cursegs whose zones are already finished. This can lead to unaligned write later. This patch addresses the issue by leaving writer pointers untouched if recovery fails. When retry-mount is performed, cursegs and other zones are checked and fixed after skipping recovery. Signed-off-by: Song Feng Signed-off-by: Yongpeng Yang Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +-- fs/f2fs/recovery.c | 9 ++------- fs/f2fs/segment.c | 29 +++++++++++++++++++---------- fs/f2fs/super.c | 15 ++++++--------- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 119706dbaefa..b65b023a588a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3775,8 +3775,7 @@ void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc); void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi); -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi); +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi); int f2fs_build_segment_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi); int __init f2fs_create_segment_manager_caches(void); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e4d81b8705d1..f35be2c48e3c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -899,13 +899,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) * and the f2fs is not read only, check and fix zoned block devices' * write pointer consistency. */ - if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) { - int err2 = f2fs_fix_curseg_write_pointer(sbi); - - if (!err2) - err2 = f2fs_check_write_pointer(sbi); - if (err2) - err = err2; + if (!err) { + err = f2fs_check_and_fix_write_pointer(sbi); ret = err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index edf2a74207b3..4236040e3994 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -5246,7 +5246,7 @@ static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, return 0; } -static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) +static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) { struct curseg_info *cs = CURSEG_I(sbi, type); struct f2fs_dev_info *zbd; @@ -5351,12 +5351,12 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) return 0; } -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) +static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; for (i = 0; i < NR_PERSISTENT_LOG; i++) { - ret = fix_curseg_write_pointer(sbi, i); + ret = do_fix_curseg_write_pointer(sbi, i); if (ret) return ret; } @@ -5379,7 +5379,7 @@ static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, return check_zone_write_pointer(args->sbi, args->fdev, zone); } -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +static int check_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; struct check_zone_write_pointer_args args; @@ -5399,6 +5399,20 @@ int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) return 0; } +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) +{ + int ret; + + if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb)) + return 0; + + f2fs_notice(sbi, "Checking entire write pointers"); + ret = fix_curseg_write_pointer(sbi); + if (!ret) + ret = check_write_pointer(sbi); + return ret; +} + /* * Return the number of usable blocks in a segment. The number of blocks * returned is always equal to the number of blocks in a segment for @@ -5435,12 +5449,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( return BLKS_PER_SEG(sbi); } #else -int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi) -{ - return 0; -} - -int f2fs_check_write_pointer(struct f2fs_sb_info *sbi) +int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { return 0; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e9d2267b8788..41cc13b43ce1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4774,16 +4774,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, - * check zoned block devices' write pointer consistency. + * write pointer consistency of cursegs and other zones are already + * checked and fixed during recovery. However, if recovery fails, + * write pointers are left untouched, and retry-mount should check + * them here. */ - if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) { - int err2; - - f2fs_notice(sbi, "Checking entire write pointers"); - err2 = f2fs_check_write_pointer(sbi); - if (err2) - err = err2; - } + if (skip_recovery) + err = f2fs_check_and_fix_write_pointer(sbi); if (err) goto free_meta; From f88c7904b5c7e35ab8037e2a59e10d80adf6fd7e Mon Sep 17 00:00:00 2001 From: Sheng Yong Date: Thu, 21 Nov 2024 16:26:57 +0800 Subject: [PATCH 36/40] f2fs: clear SBI_POR_DOING before initing inmem curseg SBI_POR_DOING can be cleared after recovery is completed, so that changes made before recovery can be persistent, and subsequent errors can be recorded into cp/sb. Signed-off-by: Song Feng Signed-off-by: Yongpeng Yang Signed-off-by: Sheng Yong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 41cc13b43ce1..c0670cd61956 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4784,13 +4784,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_meta; + /* f2fs_recover_fsync_data() cleared this already */ + clear_sbi_flag(sbi, SBI_POR_DOING); + err = f2fs_init_inmem_curseg(sbi); if (err) goto sync_free_meta; - /* f2fs_recover_fsync_data() cleared this already */ - clear_sbi_flag(sbi, SBI_POR_DOING); - if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) From 81520c684ca67aea6a589461a3caebb9b11dcc90 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 20 Nov 2024 14:58:50 +0800 Subject: [PATCH 37/40] f2fs: print message if fscorrupted was found in f2fs_new_node_page() If fs corruption occurs in f2fs_new_node_page(), let's print more information about corrupted metadata into kernel log. Meanwhile, it updates to record ERROR_INCONSISTENT_NAT instead of ERROR_INVALID_BLKADDR if blkaddr in nat entry is not NULL_ADDR which means nat bitmap and nat entry is inconsistent. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7d904f2bd5b6..0b900a7a48e5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1338,7 +1338,12 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) err = -EFSCORRUPTED; dec_valid_node_count(sbi, dn->inode, !ofs); set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); + f2fs_warn_ratelimited(sbi, + "f2fs_new_node_page: inconsistent nat entry, " + "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", + new_ni.ino, new_ni.nid, new_ni.blk_addr, + new_ni.version, new_ni.flag); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT); goto fail; } #endif From 3fc5d5a182f6a1f8bd4dc775feb54c369dd2c343 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Nov 2024 09:57:50 +0800 Subject: [PATCH 38/40] f2fs: fix to shrink read extent node in batches We use rwlock to protect core structure data of extent tree during its shrink, however, if there is a huge number of extent nodes in extent tree, during shrink of extent tree, it may hold rwlock for a very long time, which may trigger kernel hang issue. This patch fixes to shrink read extent node in batches, so that, critical region of the rwlock can be shrunk to avoid its extreme long time hold. Reported-by: Xiuhong Wang Closes: https://lore.kernel.org/linux-f2fs-devel/20241112110627.1314632-1-xiuhong.wang@unisoc.com/ Signed-off-by: Xiuhong Wang Signed-off-by: Zhiguo Niu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 69 +++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 019c1f7b7fa5..b7a6817b44b0 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -379,21 +379,22 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et) + struct extent_tree *et, unsigned int nr_shrink) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = atomic_read(&et->node_cnt); + unsigned int count; node = rb_first_cached(&et->root); - while (node) { + + for (count = 0; node && count < nr_shrink; count++) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); __release_extent_node(sbi, et, en); node = next; } - return count - atomic_read(&et->node_cnt); + return count; } static void __drop_largest_extent(struct extent_tree *et, @@ -622,6 +623,30 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, return en; } +static unsigned int __destroy_extent_node(struct inode *inode, + enum extent_type type) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; + unsigned int nr_shrink = type == EX_READ ? + READ_EXTENT_CACHE_SHRINK_NUMBER : + AGE_EXTENT_CACHE_SHRINK_NUMBER; + unsigned int node_cnt = 0; + + if (!et || !atomic_read(&et->node_cnt)) + return 0; + + while (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, nr_shrink); + write_unlock(&et->lock); + } + + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + + return node_cnt; +} + static void __update_extent_tree_range(struct inode *inode, struct extent_info *tei, enum extent_type type) { @@ -760,9 +785,6 @@ static void __update_extent_tree_range(struct inode *inode, } } - if (is_inode_flag_set(inode, FI_NO_EXTENT)) - __free_extent_tree(sbi, et); - if (et->largest_updated) { et->largest_updated = false; updated = true; @@ -780,6 +802,9 @@ static void __update_extent_tree_range(struct inode *inode, out_read_extent_cache: write_unlock(&et->lock); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __destroy_extent_node(inode, EX_READ); + if (updated) f2fs_mark_inode_dirty_sync(inode, true); } @@ -942,10 +967,14 @@ static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink list_for_each_entry_safe(et, next, &eti->zombie_list, list) { if (atomic_read(&et->node_cnt)) { write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et); + node_cnt += __free_extent_tree(sbi, et, + nr_shrink - node_cnt - tree_cnt); write_unlock(&et->lock); } - f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + + if (atomic_read(&et->node_cnt)) + goto unlock_out; + list_del_init(&et->list); radix_tree_delete(&eti->extent_tree_root, et->ino); kmem_cache_free(extent_tree_slab, et); @@ -1084,23 +1113,6 @@ unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE); } -static unsigned int __destroy_extent_node(struct inode *inode, - enum extent_type type) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; - unsigned int node_cnt = 0; - - if (!et || !atomic_read(&et->node_cnt)) - return 0; - - write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et); - write_unlock(&et->lock); - - return node_cnt; -} - void f2fs_destroy_extent_node(struct inode *inode) { __destroy_extent_node(inode, EX_READ); @@ -1109,7 +1121,6 @@ void f2fs_destroy_extent_node(struct inode *inode) static void __drop_extent_tree(struct inode *inode, enum extent_type type) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et = F2FS_I(inode)->extent_tree[type]; bool updated = false; @@ -1117,7 +1128,6 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) return; write_lock(&et->lock); - __free_extent_tree(sbi, et); if (type == EX_READ) { set_inode_flag(inode, FI_NO_EXTENT); if (et->largest.len) { @@ -1126,6 +1136,9 @@ static void __drop_extent_tree(struct inode *inode, enum extent_type type) } } write_unlock(&et->lock); + + __destroy_extent_node(inode, type); + if (updated) f2fs_mark_inode_dirty_sync(inode, true); } From 009a8241a8e5a14ea2dd0b8db42dbf283527dd44 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 22 Nov 2024 14:50:05 +0800 Subject: [PATCH 39/40] f2fs: add a sysfs node to limit max read extent count per-inode Quoted: "at this time, there are still 1086911 extent nodes in this zombie extent tree that need to be cleaned up. crash_arm64_sprd_v8.0.3++> extent_tree.node_cnt ffffff80896cc500 node_cnt = { counter = 1086911 }, " As reported by Xiuhong, there will be a huge number of extent nodes in extent tree, it may potentially cause: - slab memory fragments - extreme long time shrink on extent tree - low mapping efficiency Let's add a sysfs node to limit max read extent count for each inode, by default, value of this threshold is 10240, it can be updated according to user's requirement. Reported-by: Xiuhong Wang Closes: https://lore.kernel.org/linux-f2fs-devel/20241112110627.1314632-1-xiuhong.wang@unisoc.com/ Signed-off-by: Xiuhong Wang Signed-off-by: Zhiguo Niu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/extent_cache.c | 5 ++++- fs/f2fs/f2fs.h | 4 ++++ fs/f2fs/sysfs.c | 10 ++++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 513296bb6f29..3e1630c70d8a 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -822,3 +822,9 @@ Description: It controls the valid block ratio threshold not to trigger excessiv for zoned deivces. The initial value of it is 95(%). F2FS will stop the background GC thread from intiating GC for sections having valid blocks exceeding the ratio. + +What: /sys/fs/f2fs//max_read_extent_count +Date: November 2024 +Contact: "Chao Yu" +Description: It controls max read extent count for per-inode, the value of threshold + is 10240 by default. diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index b7a6817b44b0..347b3b647834 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -717,7 +717,9 @@ static void __update_extent_tree_range(struct inode *inode, } if (end < org_end && (type != EX_READ || - org_end - end >= F2FS_MIN_EXTENT_LEN)) { + (org_end - end >= F2FS_MIN_EXTENT_LEN && + atomic_read(&et->node_cnt) < + sbi->max_read_extent_count))) { if (parts) { __set_extent_info(&ei, end, org_end - end, @@ -1212,6 +1214,7 @@ void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi) sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD; sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD; sbi->last_age_weight = LAST_AGE_WEIGHT; + sbi->max_read_extent_count = DEF_MAX_READ_EXTENT_COUNT; } int __init f2fs_create_extent_cache(void) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b65b023a588a..6f2cbf4c5740 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -635,6 +635,9 @@ enum { #define DEF_HOT_DATA_AGE_THRESHOLD 262144 #define DEF_WARM_DATA_AGE_THRESHOLD 2621440 +/* default max read extent count per inode */ +#define DEF_MAX_READ_EXTENT_COUNT 10240 + /* extent cache type */ enum extent_type { EX_READ, @@ -1619,6 +1622,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct extent_tree_info extent_tree[NR_EXTENT_CACHES]; atomic64_t allocated_data_blocks; /* for block age extent_cache */ + unsigned int max_read_extent_count; /* max read extent count per inode */ /* The threshold used for hot and warm data seperation*/ unsigned int hot_data_age_threshold; diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index bdbf24db667b..6b99dc49f776 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -787,6 +787,13 @@ static ssize_t __sbi_store(struct f2fs_attr *a, return count; } + if (!strcmp(a->attr.name, "max_read_extent_count")) { + if (t > UINT_MAX) + return -EINVAL; + *ui = (unsigned int)t; + return count; + } + if (!strcmp(a->attr.name, "ipu_policy")) { if (t >= BIT(F2FS_IPU_MAX)) return -EINVAL; @@ -1052,6 +1059,8 @@ F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block); F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold); F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); +/* read extent cache */ +F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); #ifdef CONFIG_BLK_DEV_ZONED F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); @@ -1242,6 +1251,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(hot_data_age_threshold), ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), + ATTR_LIST(max_read_extent_count), NULL, }; ATTRIBUTE_GROUPS(f2fs); From bc8aeb04fd80cb8cfae3058445c84410fd0beb5e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 21 Nov 2024 22:17:16 +0800 Subject: [PATCH 40/40] f2fs: fix to drop all discards after creating snapshot on lvm device Piergiorgio reported a bug in bugzilla as below: ------------[ cut here ]------------ WARNING: CPU: 2 PID: 969 at fs/f2fs/segment.c:1330 RIP: 0010:__submit_discard_cmd+0x27d/0x400 [f2fs] Call Trace: __issue_discard_cmd+0x1ca/0x350 [f2fs] issue_discard_thread+0x191/0x480 [f2fs] kthread+0xcf/0x100 ret_from_fork+0x31/0x50 ret_from_fork_asm+0x1a/0x30 w/ below testcase, it can reproduce this bug quickly: - pvcreate /dev/vdb - vgcreate myvg1 /dev/vdb - lvcreate -L 1024m -n mylv1 myvg1 - mount /dev/myvg1/mylv1 /mnt/f2fs - dd if=/dev/zero of=/mnt/f2fs/file bs=1M count=20 - sync - rm /mnt/f2fs/file - sync - lvcreate -L 1024m -s -n mylv1-snapshot /dev/myvg1/mylv1 - umount /mnt/f2fs The root cause is: it will update discard_max_bytes of mounted lvm device to zero after creating snapshot on this lvm device, then, __submit_discard_cmd() will pass parameter @nr_sects w/ zero value to __blkdev_issue_discard(), it returns a NULL bio pointer, result in panic. This patch changes as below for fixing: 1. Let's drop all remained discards in f2fs_unfreeze() if snapshot of lvm device is created. 2. Checking discard_max_bytes before submitting discard during __submit_discard_cmd(). Cc: stable@vger.kernel.org Fixes: 35ec7d574884 ("f2fs: split discard command in prior to block layer") Reported-by: Piergiorgio Sartor Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219484 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 +++++++++------- fs/f2fs/super.c | 12 ++++++++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4236040e3994..eade36c5ef13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1290,16 +1290,18 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, wait_list, issued); return 0; } - - /* - * Issue discard for conventional zones only if the device - * supports discard. - */ - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; } #endif + /* + * stop issuing discard for any of below cases: + * 1. device is conventional zone, but it doesn't support discard. + * 2. device is regulare device, after snapshot it doesn't support + * discard. + */ + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; + trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c0670cd61956..fc7d463dee15 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1760,6 +1760,18 @@ static int f2fs_freeze(struct super_block *sb) static int f2fs_unfreeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + /* + * It will update discard_max_bytes of mounted lvm device to zero + * after creating snapshot on this lvm device, let's drop all + * remained discards. + * We don't need to disable real-time discard because discard_max_bytes + * will recover after removal of snapshot. + */ + if (test_opt(sbi, DISCARD) && !f2fs_hw_support_discard(sbi)) + f2fs_issue_discard_timeout(sbi); + clear_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); return 0; }