From d431a2cd28e8b7a91474d496e9226ef06a31c6eb Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 25 Sep 2024 21:47:39 +0200 Subject: [PATCH 01/33] ext4: Call ext4_journal_stop(handle) only once in ext4_dio_write_iter() An ext4_journal_stop(handle) call was immediately used after a return value check for a ext4_orphan_add() call in this function implementation. Thus call such a function only once instead directly before the check. This issue was transformed by using the Coccinelle software. Signed-off-by: Markus Elfring Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/cf895072-43cf-412c-bced-8268498ad13e@web.de Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f14aed14b9cf..23005f1345a8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -564,12 +564,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - ext4_journal_stop(handle); + if (ret) + goto out; } if (ilock_shared && !unwritten) From 902cc179c931a033cd7f4242353aa2733bf8524c Mon Sep 17 00:00:00 2001 From: Jeongjun Park Date: Thu, 3 Oct 2024 21:53:37 +0900 Subject: [PATCH 02/33] ext4: supress data-race warnings in ext4_free_inodes_{count,set}() find_group_other() and find_group_orlov() read *_lo, *_hi with ext4_free_inodes_count without additional locking. This can cause data-race warning, but since the lock is held for most writes and free inodes value is generally not a problem even if it is incorrect, it is more appropriate to use READ_ONCE()/WRITE_ONCE() than to add locking. ================================================================== BUG: KCSAN: data-race in ext4_free_inodes_count / ext4_free_inodes_set write to 0xffff88810404300e of 2 bytes by task 6254 on cpu 1: ext4_free_inodes_set+0x1f/0x80 fs/ext4/super.c:405 __ext4_new_inode+0x15ca/0x2200 fs/ext4/ialloc.c:1216 ext4_symlink+0x242/0x5a0 fs/ext4/namei.c:3391 vfs_symlink+0xca/0x1d0 fs/namei.c:4615 do_symlinkat+0xe3/0x340 fs/namei.c:4641 __do_sys_symlinkat fs/namei.c:4657 [inline] __se_sys_symlinkat fs/namei.c:4654 [inline] __x64_sys_symlinkat+0x5e/0x70 fs/namei.c:4654 x64_sys_call+0x1dda/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:267 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x76/0x7e read to 0xffff88810404300e of 2 bytes by task 6257 on cpu 0: ext4_free_inodes_count+0x1c/0x80 fs/ext4/super.c:349 find_group_other fs/ext4/ialloc.c:594 [inline] __ext4_new_inode+0x6ec/0x2200 fs/ext4/ialloc.c:1017 ext4_symlink+0x242/0x5a0 fs/ext4/namei.c:3391 vfs_symlink+0xca/0x1d0 fs/namei.c:4615 do_symlinkat+0xe3/0x340 fs/namei.c:4641 __do_sys_symlinkat fs/namei.c:4657 [inline] __se_sys_symlinkat fs/namei.c:4654 [inline] __x64_sys_symlinkat+0x5e/0x70 fs/namei.c:4654 x64_sys_call+0x1dda/0x2d60 arch/x86/include/generated/asm/syscalls_64.h:267 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x54/0x120 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x76/0x7e Cc: stable@vger.kernel.org Signed-off-by: Jeongjun Park Reviewed-by: Andreas Dilger Link: https://patch.msgid.link/20241003125337.47283-1-aha310510@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16a4ce704460..8337c4999f90 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -346,9 +346,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb, __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg) { - return le16_to_cpu(bg->bg_free_inodes_count_lo) | + return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); + (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, @@ -402,9 +402,9 @@ void ext4_free_group_clusters_set(struct super_block *sb, void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count) { - bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count)); if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) - bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); + WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16)); } void ext4_used_dirs_set(struct super_block *sb, From 76486b104168ae59703190566e372badf433314b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 5 Oct 2024 00:15:56 +0200 Subject: [PATCH 03/33] ext4: avoid remount errors with 'abort' mount option When we remount filesystem with 'abort' mount option while changing other mount options as well (as is LTP test doing), we can return error from the system call after commit d3476f3dad4a ("ext4: don't set SB_RDONLY after filesystem errors") because the application of mount option changes detects shutdown filesystem and refuses to do anything. The behavior of application of other mount options in presence of 'abort' mount option is currently rather arbitary as some mount option changes are handled before 'abort' and some after it. Move aborting of the filesystem to the end of remount handling so all requested changes are properly applied before the filesystem is shutdown to have a reasonably consistent behavior. Fixes: d3476f3dad4a ("ext4: don't set SB_RDONLY after filesystem errors") Reported-by: Jan Stancek Link: https://lore.kernel.org/all/Zvp6L+oFnfASaoHl@t14s Signed-off-by: Jan Kara Tested-by: Jan Stancek Link: https://patch.msgid.link/20241004221556.19222-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8337c4999f90..8a0af97e5d90 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6518,9 +6518,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (test_opt2(sb, ABORT)) - ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); - sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); @@ -6689,6 +6686,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb)) ext4_stop_mmpd(sbi); + /* + * Handle aborting the filesystem as the last thing during remount to + * avoid obsure errors during remount when some option changes fail to + * apply due to shutdown filesystem. + */ + if (test_opt2(sb, ABORT)) + ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); + return 0; restore_opts: From fdfa648ab9393de8c1be21cad17c17bdced3a68a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 8 Oct 2024 20:01:34 +0800 Subject: [PATCH 04/33] ext4: show the default enabled prefetch_block_bitmaps option After commit 21175ca434c5 ("ext4: make prefetch_block_bitmaps default"), we enable 'prefetch_block_bitmaps' by default, but this is not shown in the '/proc/fs/ext4/sdx/options' procfs interface. This makes it impossible to distinguish whether the feature is enabled by default or not, so 'prefetch_block_bitmaps' is shown in the 'options' procfs interface when prefetch_block_bitmaps is enabled by default. This makes it easy to notice changes to the default mount options between versions through the '/proc/fs/ext4/sdx/options' procfs interface. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20241008120134.3758097-1-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8a0af97e5d90..e7d3c2e209ba 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3030,6 +3030,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PUTS("mb_optimize_scan=1"); } + if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) + SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + ext4_show_quota_options(seq, sb); return 0; } From 40eb3104cf416188c4bd8fd7946f6c388409c286 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 8 Oct 2024 20:11:52 +0800 Subject: [PATCH 05/33] ext4: WARN if a full dir leaf block has only one dentry The maximum length of a filename is 255 and the minimum block size is 1024, so it is always guaranteed that the number of entries is greater than or equal to 2 when do_split() is called. So unless ext4_dx_add_entry() and make_indexed_dir() or some other functions are buggy, 'split == 0' will not occur. Setting 'continued' to 0 in this case masks the problem that the file system has become corrupted, even though it prevents possible out-of-bounds access. Hence WARN_ON_ONCE() is used to check if 'split' is 0, and if it is then warns and returns an error to abort split. Suggested-by: Theodore Ts'o Link: https://lore.kernel.org/r/20240823160518.GA424729@mit.edu Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://patch.msgid.link/20241008121152.3771906-1-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 790db7eac6c2..08d15cd2b594 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2000,8 +2000,17 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, else split = count/2; + if (WARN_ON_ONCE(split == 0)) { + /* Should never happen, but avoid out-of-bounds access below */ + ext4_error_inode_block(dir, (*bh)->b_blocknr, 0, + "bad indexed directory? hash=%08x:%08x count=%d move=%u", + hinfo->hash, hinfo->minor_hash, count, move); + err = -EFSCORRUPTED; + goto out; + } + hash2 = map[split].hash; - continued = split > 0 ? hash2 == map[split - 1].hash : 0; + continued = hash2 == map[split - 1].hash; dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n", (unsigned long)dx_get_block(frame->at), hash2, split, count-split)); @@ -2043,10 +2052,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, return de; journal_error: + ext4_std_error(dir->i_sb, err); +out: brelse(*bh); brelse(bh2); *bh = NULL; - ext4_std_error(dir->i_sb, err); return ERR_PTR(err); } From 4a622e4d477bb12ad5ed4abbc7ad1365de1fa347 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 23 Oct 2024 00:25:37 -0400 Subject: [PATCH 06/33] ext4: fix FS_IOC_GETFSMAP handling The original implementation ext4's FS_IOC_GETFSMAP handling only worked when the range of queried blocks included at least one free (unallocated) block range. This is because how the metadata blocks were emitted was as a side effect of ext4_mballoc_query_range() calling ext4_getfsmap_datadev_helper(), and that function was only called when a free block range was identified. As a result, this caused generic/365 to fail. Fix this by creating a new function ext4_getfsmap_meta_helper() which gets called so that blocks before the first free block range in a block group can get properly reported. Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/fsmap.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/mballoc.c | 18 ++++++++++++---- fs/ext4/mballoc.h | 1 + 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index df853c4d3a8c..383c6edea6dd 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -185,6 +185,56 @@ static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) return fmr->fmr_physical + fmr->fmr_length; } +static int ext4_getfsmap_meta_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb, fs_start, fs_end; + int error; + + fs_start = fsb = (EXT4_C2B(sbi, start) + + ext4_group_first_block_no(sb, agno)); + fs_end = fs_start + EXT4_C2B(sbi, len); + + /* Return relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical < info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + continue; + } + if (p->fmr_physical <= fs_start || + p->fmr_physical + p->fmr_length <= fs_end) { + /* Emit the retained free extent record if present */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, + &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + fsb = p->fmr_physical + p->fmr_length; + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + list_del(&p->fmr_list); + kfree(p); + continue; + } + } + if (info->gfi_next_fsblk < fsb) + info->gfi_next_fsblk = fsb; + + return 0; +} + + /* Transform a blockgroup's free record into a fsmap */ static int ext4_getfsmap_datadev_helper(struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, @@ -539,6 +589,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb, error = ext4_mballoc_query_range(sb, info->gfi_agno, EXT4_B2C(sbi, info->gfi_low.fmr_physical), EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_meta_helper, ext4_getfsmap_datadev_helper, info); if (error) goto err; @@ -560,7 +611,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb, /* Report any gaps at the end of the bg */ info->gfi_last = true; - error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); + error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1, + 0, info); if (error) goto err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d73e38323879..92f49d7eb3c0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6999,13 +6999,14 @@ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, + ext4_grpblk_t first, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; - ext4_grpblk_t next; + ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; @@ -7016,10 +7017,19 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = max(e4b.bd_info->bb_first_free, start); + start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - + if (meta_formatter && start != first) { + if (start > end) + start = end; + ext4_unlock_group(sb, group); + error = meta_formatter(sb, group, first, start - first, + priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index d8553f1498d3..f8280de3e882 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -259,6 +259,7 @@ ext4_mballoc_query_range( ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t end, + ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv); From c7f9a6fa405248bbd2a398c91d4df30b399f84aa Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 30 Aug 2024 15:17:13 +0800 Subject: [PATCH 07/33] ext4: simplify if condition The if condition !A || A && B can be simplified to !A || B. ./fs/ext4/fast_commit.c:362:21-23: WARNING !A || A && B is equivalent to !A || B. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=9837 Signed-off-by: Jiapeng Chong Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240830071713.40565-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b33664f6ce2a..3dee94a38a68 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -357,9 +357,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl } spin_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - if (has_transaction && - (!is_ineligible || - (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid)))) + if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); spin_unlock(&sbi->s_fc_lock); From a9cdf82a47addae72b119a063fc834923e7838fe Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Wed, 4 Sep 2024 15:53:00 +0800 Subject: [PATCH 08/33] fs: ext4: Don't use CMA for buffer_head cma_alloc() keep failed in our system which thanks to a jh->bh->b_page can not be migrated out of CMA area[1] as the jh has one cp_transaction pending on it because of j_free > j_max_transaction_buffers[2][3][4][5][6]. We temporarily solve this by launching jbd2_log_do_checkpoint forcefully somewhere. Since journal is common mechanism to all JFSs and cp_transaction has a little fewer opportunity to be launched, the cma_alloc() could be affected under the same scenario. This patch would like to have buffer_head of ext4 not use CMA pages when doing sb_getblk. [1] crash_arm64_v8.0.4++> kmem -p|grep ffffff808f0aa150(sb->s_bdev->bd_inode->i_mapping) fffffffe01a51c00 e9470000 ffffff808f0aa150 3 2 8000000008020 lru,private fffffffe03d189c0 174627000 ffffff808f0aa150 4 2 2004000000008020 lru,private fffffffe03d88e00 176238000 ffffff808f0aa150 3f9 2 2008000000008020 lru,private fffffffe03d88e40 176239000 ffffff808f0aa150 6 2 2008000000008020 lru,private fffffffe03d88e80 17623a000 ffffff808f0aa150 5 2 2008000000008020 lru,private fffffffe03d88ec0 17623b000 ffffff808f0aa150 1 2 2008000000008020 lru,private fffffffe03d88f00 17623c000 ffffff808f0aa150 0 2 2008000000008020 lru,private fffffffe040e6540 183995000 ffffff808f0aa150 3f4 2 2004000000008020 lru,private [2] page -> buffer_head crash_arm64_v8.0.4++> struct page.private fffffffe01a51c00 -x private = 0xffffff802fca0c00 [3] buffer_head -> journal_head crash_arm64_v8.0.4++> struct buffer_head.b_private 0xffffff802fca0c00 b_private = 0xffffff8041338e10, [4] journal_head -> b_cp_transaction crash_arm64_v8.0.4++> struct journal_head.b_cp_transaction 0xffffff8041338e10 -x b_cp_transaction = 0xffffff80410f1900, [5] transaction_t -> journal crash_arm64_v8.0.4++> struct transaction_t.t_journal 0xffffff80410f1900 -x t_journal = 0xffffff80e70f3000, [6] j_free & j_max_transaction_buffers crash_arm64_v8.0.4++> struct journal_t.j_free,j_max_transaction_buffers 0xffffff80e70f3000 -x j_free = 0x3f1, j_max_transaction_buffers = 0x100, Suggested-by: Theodore Ts'o Signed-off-by: Zhaoyang Huang Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240904075300.1148836-1-zhaoyang.huang@unisoc.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 54bdd4884fe6..5d20275526c3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -856,7 +856,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); - bh = sb_getblk(inode->i_sb, map.m_pblk); + /* + * Since bh could introduce extra ref count such as referred by + * journal_head etc. Try to avoid using __GFP_MOVABLE here + * as it may fail the migration when journal_head remains. + */ + bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk, + inode->i_sb->s_blocksize); + if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { From 150c174a6053efc215b7a10b7fbcc869039bb6c3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 4 Sep 2024 10:46:57 +0200 Subject: [PATCH 09/33] ext4: return error on syncfs after shutdown This is the logic behavior and one that we would like to verify using a generic fstest similar to xfs/546. Link: https://lore.kernel.org/fstests/20240830152648.GE6216@frogsfrogsfrogs/ Suggested-by: Darrick J. Wong Signed-off-by: Amir Goldstein Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240904084657.1062243-1-amir73il@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e7d3c2e209ba..e4bdb896e202 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6304,7 +6304,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(ext4_forced_shutdown(sb))) - return 0; + return -EIO; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); From 667de03a3b5eab4ccf532c6e399fe3488a1db58b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 5 Sep 2024 19:32:29 +0300 Subject: [PATCH 10/33] ext4: mark ctx_*_flags() with __maybe_unused When ctx_set_flags() is unused, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y: .../ext4/super.c:2120:1: error: unused function 'ctx_set_flags' [-Werror,-Wunused-function] 2120 | EXT4_SET_CTX(flags); /* set only */ | ^~~~~~~~~~~~~~~~~~~ Fix this by marking ctx_*_flags() with __maybe_unused (mark both for the sake of symmetry). See also commit 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build"). Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20240905163229.140522-1-andriy.shevchenko@linux.intel.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e4bdb896e202..dce37f784b59 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2096,16 +2096,16 @@ static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param, } #define EXT4_SET_CTX(name) \ -static inline void ctx_set_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name |= flag; \ } #define EXT4_CLEAR_CTX(name) \ -static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \ - unsigned long flag) \ +static inline __maybe_unused \ +void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \ { \ ctx->mask_s_##name |= flag; \ ctx->vals_s_##name &= ~flag; \ From a90825898becb730377b157884c82a725f1d3ffa Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 6 Sep 2024 14:14:01 +0800 Subject: [PATCH 11/33] ext4: don't pass full mapping flags to ext4_es_insert_extent() When converting a delalloc extent in ext4_es_insert_extent(), since we only want to pass the info of whether the quota has already been claimed if the allocation is a direct allocation from ext4_map_create_blocks(), there is no need to pass full mapping flags, so changes to just pass whether the EXT4_GET_BLOCKS_DELALLOC_RESERVE bit is set. Suggested-by: Jan Kara Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240906061401.2980330-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 4 ++-- fs/ext4/extents_status.c | 8 ++++---- fs/ext4/extents_status.h | 3 ++- fs/ext4/inode.c | 6 +++--- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 34e25eee6521..c144fe43a29f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3138,7 +3138,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN, 0); + EXTENT_STATUS_WRITTEN, false); } /* FIXME!! we need to try to merge to left or right after zero-out */ @@ -4158,7 +4158,7 @@ insert_hole: /* Put just found gap into cache to speed up subsequent requests */ ext_debug(inode, " -> %u:%u\n", hole_start, len); ext4_es_insert_extent(inode, hole_start, len, ~0, - EXTENT_STATUS_HOLE, 0); + EXTENT_STATUS_HOLE, false); /* Update hole_len to reflect hole size after lblk */ if (hole_start != lblk) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c786691dabd3..ae29832aab1e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -848,7 +848,7 @@ out: */ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status, int flags) + unsigned int status, bool delalloc_reserve_used) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; @@ -863,8 +863,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n", - lblk, len, pblk, status, flags, inode->i_ino); + es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n", + lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino); if (!len) return; @@ -945,7 +945,7 @@ error: resv_used += pending; if (resv_used) ext4_da_update_reserve_space(inode, resv_used, - flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); + delalloc_reserve_used); if (err1 || err2 || err3 < 0) goto retry; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 4424232de298..8f9c008d11e8 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -135,7 +135,8 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status, int flags); + unsigned int status, + bool delalloc_reserve_used); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5d20275526c3..9b06c9e0a7c5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -483,7 +483,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, 0); + map->m_pblk, status, false); return retval; } @@ -563,8 +563,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, flags); + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, + status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); return retval; } From 2f3d93e210b9c2866c8b3662adae427d5bf511ec Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 6 Sep 2024 17:17:46 +0800 Subject: [PATCH 12/33] ext4: fix race in buffer_head read fault injection When I enabled ext4 debug for fault injection testing, I encountered the following warning: EXT4-fs error (device sda): ext4_read_inode_bitmap:201: comm fsstress: Cannot read inode bitmap - block_group = 8, inode_bitmap = 1051 WARNING: CPU: 0 PID: 511 at fs/buffer.c:1181 mark_buffer_dirty+0x1b3/0x1d0 The root cause of the issue lies in the improper implementation of ext4's buffer_head read fault injection. The actual completion of buffer_head read and the buffer_head fault injection are not atomic, which can lead to the uptodate flag being cleared on normally used buffer_heads in race conditions. [CPU0] [CPU1] [CPU2] ext4_read_inode_bitmap ext4_read_bh() ext4_read_inode_bitmap if (buffer_uptodate(bh)) return bh jbd2_journal_commit_transaction __jbd2_journal_refile_buffer __jbd2_journal_unfile_buffer __jbd2_journal_temp_unlink_buffer ext4_simulate_fail_bh() clear_buffer_uptodate mark_buffer_dirty WARN_ON_ONCE(!buffer_uptodate(bh)) The best approach would be to perform fault injection in the IO completion callback function, rather than after IO completion. However, the IO completion callback function cannot get the fault injection code in sb. Fix it by passing the result of fault injection into the bh read function, we simulate faults within the bh read function itself. This requires adding an extra parameter to the bh read functions that need fault injection. Fixes: 46f870d690fe ("ext4: simulate various I/O and checksum errors when reading metadata") Signed-off-by: Long Li Link: https://patch.msgid.link/20240906091746.510163-1-leo.lilong@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 4 ++-- fs/ext4/ext4.h | 12 ++---------- fs/ext4/extents.c | 2 +- fs/ext4/ialloc.c | 5 +++-- fs/ext4/indirect.c | 2 +- fs/ext4/inode.c | 4 ++-- fs/ext4/mmp.c | 2 +- fs/ext4/move_extent.c | 2 +- fs/ext4/resize.c | 2 +- fs/ext4/super.c | 23 +++++++++++++++-------- 10 files changed, 29 insertions(+), 29 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 591fb3f710be..8042ad873808 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -550,7 +550,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? REQ_RAHEAD : 0), - ext4_end_bitmap_read); + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO)); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); @@ -577,7 +578,6 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 44b0d418143c..bbffb76d9a90 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1865,14 +1865,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb, return false; } -static inline void ext4_simulate_fail_bh(struct super_block *sb, - struct buffer_head *bh, - unsigned long code) -{ - if (!IS_ERR(bh) && ext4_simulate_fail(sb, code)) - clear_buffer_uptodate(bh); -} - /* * Error number codes for s_{first,last}_error_errno * @@ -3100,9 +3092,9 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io); + bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c144fe43a29f..1c8123866d81 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -568,7 +568,7 @@ __read_extent_tree_block(const char *function, unsigned int line, if (!bh_uptodate_or_lock(bh)) { trace_ext4_ext_load_extent(inode, pblk, _RET_IP_); - err = ext4_read_bh(bh, 0, NULL); + err = ext4_read_bh(bh, 0, NULL, false); if (err < 0) goto errout; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 7f1a5f90dbbd..21d228073d79 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -193,8 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * submit the buffer_head for reading */ trace_ext4_load_inode_bitmap(sb, block_group); - ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO); + ext4_read_bh(bh, REQ_META | REQ_PRIO, + ext4_end_bitmap_read, + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO)); if (!buffer_uptodate(bh)) { put_bh(bh); ext4_error_err(sb, EIO, "Cannot read inode bitmap - " diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 7404f0935c90..7de327fa7b1c 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -170,7 +170,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, } if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { put_bh(bh); goto failure; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b06c9e0a7c5..569887741bf1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4504,10 +4504,10 @@ make_io: * Read the block from disk. */ trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL, + ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)); blk_finish_plug(&plug); wait_on_buffer(bh); - ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index bd946d0c71b7..d64c04ed061a 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -94,7 +94,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } lock_buffer(*bh); - ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL); + ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false); if (ret) goto warn_exit; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b64661ea6e0e..898443e98efc 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -213,7 +213,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) unlock_buffer(bh); continue; } - ext4_read_bh_nowait(bh, 0, NULL); + ext4_read_bh_nowait(bh, 0, NULL, false); nr++; } while (block++, (bh = bh->b_this_page) != head); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a2704f064361..72f77f78ae8d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1300,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block) if (unlikely(!bh)) return NULL; if (!bh_uptodate_or_lock(bh)) { - if (ext4_read_bh(bh, 0, NULL) < 0) { + if (ext4_read_bh(bh, 0, NULL, false) < 0) { brelse(bh); return NULL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dce37f784b59..cb0ce57bd896 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -161,8 +161,14 @@ MODULE_ALIAS("ext3"); static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { + if (simu_fail) { + clear_buffer_uptodate(bh); + unlock_buffer(bh); + return; + } + /* * buffer's verified bit is no longer valid after reading from * disk again due to write out error, clear it to make sure we @@ -176,7 +182,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, } void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, - bh_end_io_t *end_io) + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -184,10 +190,11 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, unlock_buffer(bh); return; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); } -int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io) +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, + bh_end_io_t *end_io, bool simu_fail) { BUG_ON(!buffer_locked(bh)); @@ -196,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io return 0; } - __ext4_read_bh(bh, op_flags, end_io); + __ext4_read_bh(bh, op_flags, end_io, simu_fail); wait_on_buffer(bh); if (buffer_uptodate(bh)) @@ -208,10 +215,10 @@ int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait) { lock_buffer(bh); if (!wait) { - ext4_read_bh_nowait(bh, op_flags, NULL); + ext4_read_bh_nowait(bh, op_flags, NULL, false); return 0; } - return ext4_read_bh(bh, op_flags, NULL); + return ext4_read_bh(bh, op_flags, NULL, false); } /* @@ -266,7 +273,7 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) if (likely(bh)) { if (trylock_buffer(bh)) - ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL); + ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false); brelse(bh); } } From 813f85360404028cd8340ae7f7f77abc68c86fbc Mon Sep 17 00:00:00 2001 From: "j.xia" Date: Thu, 19 Sep 2024 10:03:41 +0800 Subject: [PATCH 13/33] ext4: pass write-hint for buffered IO Commit 449813515d3e ("block, fs: Restore the per-bio/request data lifetime fields") restored write-hint support in ext4. But that is applicable only for direct IO. This patch supports passing write-hint for buffered IO from ext4 file system to block layer by filling bi_write_hint of struct bio in io_submit_add_bh(). Signed-off-by: j.xia Link: https://patch.msgid.link/20240919020341.2657646-1-j.xia@samsung.com Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ad5543866d21..bb8b17c394d4 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -417,8 +417,10 @@ static void io_submit_add_bh(struct ext4_io_submit *io, submit_and_retry: ext4_io_submit(io); } - if (io->io_bio == NULL) + if (io->io_bio == NULL) { io_submit_init_bio(io, bh); + io->io_bio->bi_write_hint = inode->i_write_hint; + } if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); From 25f51ea8ac8144af2fefb743c3439547610368ff Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Thu, 19 Sep 2024 16:25:39 +0800 Subject: [PATCH 14/33] ext4: disambiguate the return value of ext4_dio_write_end_io() The commit 91562895f803 ("ext4: properly sync file size update after O_SYNC direct IO") causes confusion about the meaning of the return value of ext4_dio_write_end_io(). Specifically, when the ext4_handle_inode_extension() operation succeeds, ext4_dio_write_end_io() directly returns count instead of 0. This does not cause a bug in the current kernel, but the semantics of the return value of the ext4_dio_write_end_io() function are wrong, which is likely to introduce bugs in the future code evolution. Signed-off-by: Jinliang Zheng Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240919082539.381626-1-alexjlzheng@tencent.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 23005f1345a8..8e9dfed1c34c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -392,8 +392,9 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) - return size; - return ext4_handle_inode_extension(inode, pos, size, size); + return 0; + error = ext4_handle_inode_extension(inode, pos, size, size); + return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { From c7fc0366c65628fd69bfc310affec4918199aae2 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 19 Sep 2024 12:07:40 -0400 Subject: [PATCH 15/33] ext4: partial zero eof block on unaligned inode size extension Using mapped writes, it's technically possible to expose stale post-eof data on a truncate up operation. Consider the following example: $ xfs_io -fc "pwrite 0 2k" -c "mmap 0 4k" -c "mwrite 2k 2k" \ -c "truncate 8k" -c "pread -v 2k 16" ... 00000800: 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 XXXXXXXXXXXXXXXX ... This shows that the post-eof data written via mwrite lands within EOF after a truncate up. While this is deliberate of the test case, behavior is somewhat unpredictable because writeback does post-eof zeroing, and writeback can occur at any time in the background. For example, an fsync inserted between the mwrite and truncate causes the subsequent read to instead return zeroes. This basically means that there is a race window in this situation between any subsequent extending operation and writeback that dictates whether post-eof data is exposed to the file or zeroed. To prevent this problem, perform partial block zeroing as part of the various inode size extending operations that are susceptible to it. For truncate extension, zero around the original eof similar to how truncate down does partial zeroing of the new eof. For extension via writes and fallocate related operations, zero the newly exposed range of the file to cover any partial zeroing that must occur at the original and new eof blocks. Signed-off-by: Brian Foster Link: https://patch.msgid.link/20240919160741.208162-2-bfoster@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 7 ++++++- fs/ext4/inode.c | 53 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1c8123866d81..a07a98a4b97a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int depth = 0; struct ext4_map_blocks map; unsigned int credits; - loff_t epos; + loff_t epos, old_size = i_size_read(inode); BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); map.m_lblk = offset; @@ -4541,6 +4541,11 @@ retry: if (ext4_update_inode_size(inode, epos) & 0x1) inode_set_mtime_to_ts(inode, inode_get_ctime(inode)); + if (epos > old_size) { + pagecache_isize_extended(inode, old_size, epos); + ext4_zero_partial_blocks(handle, inode, + old_size, epos - old_size); + } } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 569887741bf1..aadd8bb3c679 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1314,8 +1314,10 @@ static int ext4_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock @@ -1430,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file, folio_unlock(folio); folio_put(folio); - if (old_size < pos && !verity) + if (old_size < pos && !verity) { pagecache_isize_extended(inode, old_size, pos); + ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); + } if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); @@ -2992,7 +2996,8 @@ static int ext4_da_do_write_end(struct address_space *mapping, struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; - loff_t new_i_size; + loff_t new_i_size, zero_len = 0; + handle_t *handle; if (unlikely(!folio_buffers(folio))) { folio_unlock(folio); @@ -3036,19 +3041,22 @@ static int ext4_da_do_write_end(struct address_space *mapping, folio_unlock(folio); folio_put(folio); - if (old_size < pos) + if (pos > old_size) { pagecache_isize_extended(inode, old_size, pos); - - if (disksize_changed) { - handle_t *handle; - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) - return PTR_ERR(handle); - ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); + zero_len = pos - old_size; } + if (!disksize_changed && !zero_len) + return copied; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (zero_len) + ext4_zero_partial_blocks(handle, inode, old_size, zero_len); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + return copied; } @@ -5433,6 +5441,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_size != inode->i_size) { + /* attach jbd2 jinode for EOF folio tail zeroing */ + if (attr->ia_size & (inode->i_sb->s_blocksize - 1) || + oldsize & (inode->i_sb->s_blocksize - 1)) { + error = ext4_inode_attach_jinode(inode); + if (error) + goto err_out; + } + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); @@ -5443,12 +5459,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, orphan = 1; } /* - * Update c/mtime on truncate up, ext4_truncate() will - * update c/mtime in shrink case below + * Update c/mtime and tail zero the EOF folio on + * truncate up. ext4_truncate() handles the shrink case + * below. */ - if (!shrink) + if (!shrink) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + if (oldsize & (inode->i_sb->s_blocksize - 1)) + ext4_block_truncate_page(handle, + inode->i_mapping, oldsize); + } if (shrink) ext4_fc_track_range(handle, inode, From 52aecaee1c26409ebafe080293e0841884f6e9fb Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 19 Sep 2024 12:07:41 -0400 Subject: [PATCH 16/33] mm: zero range of eof folio exposed by inode size extension On some filesystems, it is currently possible to create a transient data inconsistency between pagecache and on-disk state. For example, on a 1k block size ext4 filesystem: $ xfs_io -fc "pwrite 0 2k" -c "mmap 0 4k" -c "mwrite 2k 2k" \ -c "truncate 8k" -c "fiemap -v" -c "pread -v 2k 16" ... EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..3]: 17410..17413 4 0x1 1: [4..15]: hole 12 00000800: 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 XXXXXXXXXXXXXXXX $ umount ; mount $ xfs_io -c "pread -v 2k 16" 00000800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ This allocates and writes two 1k blocks, map writes to the post-eof portion of the (4k) eof folio, extends the file, and then shows that the post-eof data is not cleared before the file size is extended. The result is pagecache with a clean and uptodate folio over a hole that returns non-zero data. Once reclaimed, pagecache begins to return valid data. Some filesystems avoid this problem by flushing the EOF folio before inode size extension. This triggers writeback time partial post-eof zeroing. XFS explicitly zeroes newly exposed file ranges via iomap_zero_range(), but this includes a hack to flush dirty but hole-backed folios, which means writeback actually does the zeroing in this particular case as well. bcachefs explicitly flushes the eof folio on truncate extension to the same effect, but doesn't handle the analogous write extension case (i.e., replace "truncate 8k" with "pwrite 4k 4k" in the above example command to reproduce the same problem on bcachefs). btrfs doesn't seem to support subpage block sizes. The two main options to avoid this behavior are to either flush or do the appropriate zeroing during size extending operations. Zeroing is only required when the size change exposes ranges of the file that haven't been directly written, such as a write or truncate that starts beyond the current eof. The pagecache_isize_extended() helper is already used for this particular scenario. It currently cleans any pte's for the eof folio to ensure preexisting mappings fault and allow the filesystem to take action based on the updated inode size. This is required to ensure the folio is fully backed by allocated blocks, for example, but this also happens to be the same scenario zeroing is required. Update pagecache_isize_extended() to zero the post-eof range of the eof folio if it is dirty at the time of the size change, since writeback now won't have the chance. If non-dirty, the folio has either not been written or the post-eof portion was zeroed by writeback. Signed-off-by: Brian Foster Link: https://patch.msgid.link/20240919160741.208162-3-bfoster@redhat.com Signed-off-by: Theodore Ts'o --- mm/truncate.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/truncate.c b/mm/truncate.c index 0668cd340a46..6e7f3cfb982d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -797,6 +797,21 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) */ if (folio_mkclean(folio)) folio_mark_dirty(folio); + + /* + * The post-eof range of the folio must be zeroed before it is exposed + * to the file. Writeback normally does this, but since i_size has been + * increased we handle it here. + */ + if (folio_test_dirty(folio)) { + unsigned int offset, end; + + offset = from - folio_pos(folio); + end = min_t(unsigned int, to - folio_pos(folio), + folio_size(folio)); + folio_zero_segment(folio, offset, end); + } + folio_unlock(folio); folio_put(folio); } From 5ad585bcfe24d840f6d324951e4c18b0014c0178 Mon Sep 17 00:00:00 2001 From: Yu Jiaoliang Date: Fri, 20 Sep 2024 10:14:40 +0800 Subject: [PATCH 17/33] ext4: use ERR_CAST to return an error-valued pointer Instead of directly casting and returning an error-valued pointer, use ERR_CAST to make the error handling more explicit and improve code clarity. Signed-off-by: Yu Jiaoliang Link: https://patch.msgid.link/20240920021440.1959243-1-yujiaoliang@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 08d15cd2b594..1012781ae9b4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1747,7 +1747,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, #endif frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) - return (struct buffer_head *) frame; + return ERR_CAST(frame); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT_HTREE); @@ -1952,7 +1952,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - return (struct ext4_dir_entry_2 *) bh2; + return ERR_CAST(bh2); } BUFFER_TRACE(*bh, "get_write_access"); From 4309a94da7937775613bba2aa794e13c9e837cdd Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Sep 2024 08:59:37 +0800 Subject: [PATCH 18/33] jbd2: remove redundant judgments for check v1 checksum 'need_check_commit_time' is only used by v2/v3 checksum, so there isn't need to add 'need_check_commit_time' judegement for v1 checksum logic. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240930005942.626942-2-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 667f67342c52..5efbca6a98c4 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -619,7 +619,6 @@ static int do_one_pass(journal_t *journal, if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && jbd2_has_feature_checksum(journal) && - !need_check_commit_time && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, From 4c199241b662c7c992dbb2d5176b48a77d8c9291 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Sep 2024 08:59:38 +0800 Subject: [PATCH 19/33] jbd2: unified release of buffer_head in do_one_pass() Now buffer_head free is very fragmented in do_one_pass(), unified release of buffer_head in do_one_pass() Signed-off-by: Ye Bin Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240930005942.626942-3-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 5efbca6a98c4..0adf0cb31a03 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -493,7 +493,7 @@ static int do_one_pass(journal_t *journal, int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; - struct buffer_head * bh; + struct buffer_head *bh = NULL; unsigned int sequence; int blocktype; int tag_bytes = journal_tag_bytes(journal); @@ -552,6 +552,8 @@ static int do_one_pass(journal_t *journal, * record. */ jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); + brelse(bh); + bh = NULL; err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -567,20 +569,16 @@ static int do_one_pass(journal_t *journal, tmp = (journal_header_t *)bh->b_data; - if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) { - brelse(bh); + if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) break; - } blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); jbd2_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); - if (sequence != next_commit_ID) { - brelse(bh); + if (sequence != next_commit_ID) break; - } /* OK, we have a valid descriptor block which matches * all of the sequence number checks. What are we going @@ -603,7 +601,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum recovering block %lu in log\n", next_log_block); err = -EFSBADCRC; - brelse(bh); goto failed; } need_check_commit_time = true; @@ -622,16 +619,12 @@ static int do_one_pass(journal_t *journal, !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, - &crc32_sum)) { - put_bh(bh); + &crc32_sum)) break; - } - put_bh(bh); continue; } next_log_block += count_tags(journal, bh); wrap(journal, next_log_block); - put_bh(bh); continue; } @@ -701,7 +694,6 @@ static int do_one_pass(journal_t *journal, "JBD2: Out of memory " "during recovery.\n"); err = -ENOMEM; - brelse(bh); brelse(obh); goto failed; } @@ -733,7 +725,6 @@ static int do_one_pass(journal_t *journal, break; } - brelse(bh); continue; case JBD2_COMMIT_BLOCK: @@ -781,7 +772,6 @@ static int do_one_pass(journal_t *journal, pr_err("JBD2: Invalid checksum found in transaction %u\n", next_commit_ID); err = -EFSBADCRC; - brelse(bh); goto failed; } ignore_crc_mismatch: @@ -791,7 +781,6 @@ static int do_one_pass(journal_t *journal, */ jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - brelse(bh); goto done; } @@ -811,7 +800,6 @@ static int do_one_pass(journal_t *journal, if (info->end_transaction) { journal->j_failed_commit = info->end_transaction; - brelse(bh); break; } @@ -847,7 +835,6 @@ static int do_one_pass(journal_t *journal, if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = next_commit_ID; - brelse(bh); break; } } @@ -856,7 +843,6 @@ static int do_one_pass(journal_t *journal, last_trans_commit_time = commit_time; head_block = next_log_block; } - brelse(bh); next_commit_ID++; continue; @@ -875,14 +861,11 @@ static int do_one_pass(journal_t *journal, /* If we aren't in the REVOKE pass, then we can * just skip over this block. */ - if (pass != PASS_REVOKE) { - brelse(bh); + if (pass != PASS_REVOKE) continue; - } err = scan_revoke_records(journal, bh, next_commit_ID, info); - brelse(bh); if (err) goto failed; continue; @@ -890,12 +873,12 @@ static int do_one_pass(journal_t *journal, default: jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); - brelse(bh); goto done; } } done: + brelse(bh); /* * We broke out of the log scan loop: either we came to the * known end of the log or we found an unexpected block in the @@ -931,6 +914,7 @@ static int do_one_pass(journal_t *journal, return success; failed: + brelse(bh); return err; } From a805ae3ab9dc75079686cd2889fd563915e90b9d Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Sep 2024 08:59:39 +0800 Subject: [PATCH 20/33] jbd2: refactor JBD2_COMMIT_BLOCK process in do_one_pass() To make JBD2_COMMIT_BLOCK process more clean, no functional change. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240930005942.626942-4-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 55 ++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0adf0cb31a03..0d697979d83e 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -728,6 +728,11 @@ static int do_one_pass(journal_t *journal, continue; case JBD2_COMMIT_BLOCK: + if (pass != PASS_SCAN) { + next_commit_ID++; + continue; + } + /* How to differentiate between interrupted commit * and journal corruption ? * @@ -790,8 +795,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. */ - if (pass == PASS_SCAN && - jbd2_has_feature_checksum(journal)) { + if (jbd2_has_feature_checksum(journal)) { struct commit_header *cbh = (struct commit_header *)bh->b_data; unsigned found_chksum = @@ -815,34 +819,33 @@ static int do_one_pass(journal_t *journal, goto chksum_error; crc32_sum = ~0; + goto chksum_ok; } - if (pass == PASS_SCAN && - !jbd2_commit_block_csum_verify(journal, - bh->b_data)) { - if (jbd2_commit_block_csum_verify_partial( - journal, - bh->b_data)) { - pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", - next_commit_ID, next_log_block); - goto chksum_ok; - } - chksum_error: - if (commit_time < last_trans_commit_time) - goto ignore_crc_mismatch; - info->end_transaction = next_commit_ID; - info->head_block = head_block; - if (!jbd2_has_feature_async_commit(journal)) { - journal->j_failed_commit = - next_commit_ID; - break; - } + if (jbd2_commit_block_csum_verify(journal, bh->b_data)) + goto chksum_ok; + + if (jbd2_commit_block_csum_verify_partial(journal, + bh->b_data)) { + pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n", + next_commit_ID, next_log_block); + goto chksum_ok; } - if (pass == PASS_SCAN) { - chksum_ok: - last_trans_commit_time = commit_time; - head_block = next_log_block; + +chksum_error: + if (commit_time < last_trans_commit_time) + goto ignore_crc_mismatch; + info->end_transaction = next_commit_ID; + info->head_block = head_block; + + if (!jbd2_has_feature_async_commit(journal)) { + journal->j_failed_commit = next_commit_ID; + break; } + +chksum_ok: + last_trans_commit_time = commit_time; + head_block = next_log_block; next_commit_ID++; continue; From ac626a3d52ac097cb22d8450fadd8706b3f2533c Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Sep 2024 08:59:40 +0800 Subject: [PATCH 21/33] jbd2: factor out jbd2_do_replay() Factor out jbd2_do_replay() no funtional change. Signed-off-by: Ye Bin Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240930005942.626942-5-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 219 +++++++++++++++++++++++---------------------- 1 file changed, 110 insertions(+), 109 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 0d697979d83e..046744d6239c 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -485,6 +485,105 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return tag->t_checksum == cpu_to_be16(csum32); } +static __always_inline int jbd2_do_replay(journal_t *journal, + struct recovery_info *info, + struct buffer_head *bh, + unsigned long *next_log_block, + unsigned int next_commit_ID, + int *success, int *block_error) +{ + char *tagp; + int flags; + int err; + int tag_bytes = journal_tag_bytes(journal); + int descr_csum_size = 0; + unsigned long io_block; + journal_block_tag_t tag; + struct buffer_head *obh; + struct buffer_head *nbh; + + if (jbd2_journal_has_csum_v2or3(journal)) + descr_csum_size = sizeof(struct jbd2_journal_block_tail); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while (tagp - bh->b_data + tag_bytes <= + journal->j_blocksize - descr_csum_size) { + + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); + + io_block = (*next_log_block)++; + wrap(journal, *next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but report failure at the end. */ + *success = err; + pr_err("JBD2: IO error %d recovering block %lu in log\n", + err, io_block); + } else { + unsigned long long blocknr; + + J_ASSERT(obh != NULL); + blocknr = read_tag_block(journal, &tag); + + /* If the block has been revoked, then we're all done here. */ + if (jbd2_journal_test_revoke(journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Look for block corruption */ + if (!jbd2_block_tag_csum_verify(journal, &tag, + (journal_block_tag3_t *)tagp, + obh->b_data, next_commit_ID)) { + brelse(obh); + *success = -EFSBADCRC; + pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", + blocknr, io_block); + *block_error = 1; + goto skip_write; + } + + /* Find a buffer for the new data being restored */ + nbh = __getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + if (nbh == NULL) { + pr_err("JBD2: Out of memory during recovery.\n"); + brelse(obh); + return -ENOMEM; + } + + lock_buffer(nbh); + memcpy(nbh->b_data, obh->b_data, journal->j_blocksize); + if (flags & JBD2_FLAG_ESCAPE) { + *((__be32 *)nbh->b_data) = + cpu_to_be32(JBD2_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + set_buffer_uptodate(nbh); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + ++info->nr_replays; + unlock_buffer(nbh); + brelse(obh); + brelse(nbh); + } + +skip_write: + tagp += tag_bytes; + if (!(flags & JBD2_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JBD2_FLAG_LAST_TAG) + break; + } + + return 0; +} + static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { @@ -496,9 +595,7 @@ static int do_one_pass(journal_t *journal, struct buffer_head *bh = NULL; unsigned int sequence; int blocktype; - int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - int descr_csum_size = 0; int block_error = 0; bool need_check_commit_time = false; __u64 last_trans_commit_time = 0, commit_time; @@ -528,12 +625,6 @@ static int do_one_pass(journal_t *journal, */ while (1) { - int flags; - char * tagp; - journal_block_tag_t tag; - struct buffer_head * obh; - struct buffer_head * nbh; - cond_resched(); /* If we already know where to stop the log traversal, @@ -587,11 +678,7 @@ static int do_one_pass(journal_t *journal, switch(blocktype) { case JBD2_DESCRIPTOR_BLOCK: /* Verify checksum first */ - if (jbd2_journal_has_csum_v2or3(journal)) - descr_csum_size = - sizeof(struct jbd2_journal_block_tail); - if (descr_csum_size > 0 && - !jbd2_descriptor_block_csum_verify(journal, + if (!jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { /* * PASS_SCAN can see stale blocks due to lazy @@ -628,102 +715,16 @@ static int do_one_pass(journal_t *journal, continue; } - /* A descriptor block: we can now write all of - * the data blocks. Yay, useful work is finally - * getting done here! */ - - tagp = &bh->b_data[sizeof(journal_header_t)]; - while ((tagp - bh->b_data + tag_bytes) - <= journal->j_blocksize - descr_csum_size) { - unsigned long io_block; - - memcpy(&tag, tagp, sizeof(tag)); - flags = be16_to_cpu(tag.t_flags); - - io_block = next_log_block++; - wrap(journal, next_log_block); - err = jread(&obh, journal, io_block); - if (err) { - /* Recover what we can, but - * report failure at the end. */ - success = err; - printk(KERN_ERR - "JBD2: IO error %d recovering " - "block %lu in log\n", - err, io_block); - } else { - unsigned long long blocknr; - - J_ASSERT(obh != NULL); - blocknr = read_tag_block(journal, - &tag); - - /* If the block has been - * revoked, then we're all done - * here. */ - if (jbd2_journal_test_revoke - (journal, blocknr, - next_commit_ID)) { - brelse(obh); - ++info->nr_revoke_hits; - goto skip_write; - } - - /* Look for block corruption */ - if (!jbd2_block_tag_csum_verify( - journal, &tag, (journal_block_tag3_t *)tagp, - obh->b_data, be32_to_cpu(tmp->h_sequence))) { - brelse(obh); - success = -EFSBADCRC; - printk(KERN_ERR "JBD2: Invalid " - "checksum recovering " - "data block %llu in " - "journal block %lu\n", - blocknr, io_block); - block_error = 1; - goto skip_write; - } - - /* Find a buffer for the new - * data being restored */ - nbh = __getblk(journal->j_fs_dev, - blocknr, - journal->j_blocksize); - if (nbh == NULL) { - printk(KERN_ERR - "JBD2: Out of memory " - "during recovery.\n"); - err = -ENOMEM; - brelse(obh); - goto failed; - } - - lock_buffer(nbh); - memcpy(nbh->b_data, obh->b_data, - journal->j_blocksize); - if (flags & JBD2_FLAG_ESCAPE) { - *((__be32 *)nbh->b_data) = - cpu_to_be32(JBD2_MAGIC_NUMBER); - } - - BUFFER_TRACE(nbh, "marking dirty"); - set_buffer_uptodate(nbh); - mark_buffer_dirty(nbh); - BUFFER_TRACE(nbh, "marking uptodate"); - ++info->nr_replays; - unlock_buffer(nbh); - brelse(obh); - brelse(nbh); - } - - skip_write: - tagp += tag_bytes; - if (!(flags & JBD2_FLAG_SAME_UUID)) - tagp += 16; - - if (flags & JBD2_FLAG_LAST_TAG) - break; - } + /* + * A descriptor block: we can now write all of the + * data blocks. Yay, useful work is finally getting + * done here! + */ + err = jbd2_do_replay(journal, info, bh, &next_log_block, + next_commit_ID, &success, + &block_error); + if (err) + goto failed; continue; From 0f67827bf44fdf2a4426cc918ee8cfc1274f8efa Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Sep 2024 08:59:41 +0800 Subject: [PATCH 22/33] jbd2: remove useless 'block_error' variable The judgement 'if (block_error && success == 0)' is never valid. Just remove useless 'block_error' variable. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20240930005942.626942-6-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 046744d6239c..4f1e9ca34503 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -490,7 +490,7 @@ static __always_inline int jbd2_do_replay(journal_t *journal, struct buffer_head *bh, unsigned long *next_log_block, unsigned int next_commit_ID, - int *success, int *block_error) + int *success) { char *tagp; int flags; @@ -542,7 +542,6 @@ static __always_inline int jbd2_do_replay(journal_t *journal, *success = -EFSBADCRC; pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", blocknr, io_block); - *block_error = 1; goto skip_write; } @@ -596,7 +595,6 @@ static int do_one_pass(journal_t *journal, unsigned int sequence; int blocktype; __u32 crc32_sum = ~0; /* Transactional Checksums */ - int block_error = 0; bool need_check_commit_time = false; __u64 last_trans_commit_time = 0, commit_time; @@ -721,8 +719,7 @@ static int do_one_pass(journal_t *journal, * done here! */ err = jbd2_do_replay(journal, info, bh, &next_log_block, - next_commit_ID, &success, - &block_error); + next_commit_ID, &success); if (err) goto failed; @@ -913,8 +910,6 @@ chksum_ok: success = err; } - if (block_error && success == 0) - success = -EIO; return success; failed: From 22d26f9b0c3ee2ef75daf2feee8f0a9eac385939 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 30 Sep 2024 08:59:42 +0800 Subject: [PATCH 23/33] jbd2: remove the 'success' parameter from the jbd2_do_replay() function Keep 'success' internally to track if any error happened and then return it at the end in do_one_pass(). If jbd2_do_replay() return -ENOMEM then stop replay journal. Signed-off-by: Ye Bin Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20240930005942.626942-7-yebin@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/recovery.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 4f1e9ca34503..9192be7c19d8 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -489,12 +489,11 @@ static __always_inline int jbd2_do_replay(journal_t *journal, struct recovery_info *info, struct buffer_head *bh, unsigned long *next_log_block, - unsigned int next_commit_ID, - int *success) + unsigned int next_commit_ID) { char *tagp; int flags; - int err; + int ret = 0; int tag_bytes = journal_tag_bytes(journal); int descr_csum_size = 0; unsigned long io_block; @@ -508,6 +507,7 @@ static __always_inline int jbd2_do_replay(journal_t *journal, tagp = &bh->b_data[sizeof(journal_header_t)]; while (tagp - bh->b_data + tag_bytes <= journal->j_blocksize - descr_csum_size) { + int err; memcpy(&tag, tagp, sizeof(tag)); flags = be16_to_cpu(tag.t_flags); @@ -517,7 +517,7 @@ static __always_inline int jbd2_do_replay(journal_t *journal, err = jread(&obh, journal, io_block); if (err) { /* Recover what we can, but report failure at the end. */ - *success = err; + ret = err; pr_err("JBD2: IO error %d recovering block %lu in log\n", err, io_block); } else { @@ -539,7 +539,7 @@ static __always_inline int jbd2_do_replay(journal_t *journal, (journal_block_tag3_t *)tagp, obh->b_data, next_commit_ID)) { brelse(obh); - *success = -EFSBADCRC; + ret = -EFSBADCRC; pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n", blocknr, io_block); goto skip_write; @@ -580,7 +580,7 @@ skip_write: break; } - return 0; + return ret; } static int do_one_pass(journal_t *journal, @@ -719,9 +719,12 @@ static int do_one_pass(journal_t *journal, * done here! */ err = jbd2_do_replay(journal, info, bh, &next_log_block, - next_commit_ID, &success); - if (err) - goto failed; + next_commit_ID); + if (err) { + if (err == -ENOMEM) + goto failed; + success = err; + } continue; From 867b73909ae0fb6a44552f22ad589c0511c9f2a4 Mon Sep 17 00:00:00 2001 From: R Sundar Date: Mon, 7 Oct 2024 22:50:06 +0530 Subject: [PATCH 24/33] ext4: use string choices helpers Use string choice helpers for better readability and to fix cocci warning Reported-by: kernel test robot Reported-by: Julia Lawall Closes: https://lore.kernel.org/r/202410062256.BoynX3c2-lkp@intel.com/ Signed-off-by: R Sundar Link: https://patch.msgid.link/20241007172006.83339-1-prosunofficial@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 92f49d7eb3c0..9fc4b6177129 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6056,7 +6056,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, } out_dbg: - mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } From 27349b4d2ed072cabedd5115f0542b3b7b538aa8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 11 Oct 2024 22:43:44 +0300 Subject: [PATCH 25/33] ext4: cleanup variable name in ext4_fc_del() The variables "&EXT4_SB(inode->i_sb)->s_fc_lock" and "&sbi->s_fc_lock" are the same lock. This function uses a mix of both, which is a bit unsightly and confuses Smatch. Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/96008557-8ff4-44cc-b5e3-ce242212f1a3@stanley.mountain Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 3dee94a38a68..26c4fc37edcf 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -291,9 +291,9 @@ void ext4_fc_del(struct inode *inode) return; restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); + spin_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); + spin_unlock(&sbi->s_fc_lock); return; } From abe1ac7ca84236513a3d8ede02cc47584437f24f Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Sat, 12 Oct 2024 16:55:30 +0800 Subject: [PATCH 26/33] jbd2: make b_frozen_data allocation always succeed The b_frozen_data allocation should not be failed during journal committing process, otherwise jbd2 will abort. Since commit 490c1b444ce653d("jbd2: do not fail journal because of frozen_buffer allocation failure") already added '__GFP_NOFAIL' flag in do_get_write_access(), just add '__GFP_NOFAIL' flag for all allocations in jbd2_journal_write_metadata_buffer(), like 'new_bh' allocation does. Besides, remove all error handling branches for do_get_write_access(). Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20241012085530.2147846-1-chengzhihao@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 4 ---- fs/jbd2/journal.c | 8 +------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 4305a1ac808a..9153ff3a08e7 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -662,10 +662,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) JBUFFER_TRACE(jh, "ph3: write metadata"); escape = jbd2_journal_write_metadata_buffer(commit_transaction, jh, &wbuf[bufs], blocknr); - if (escape < 0) { - jbd2_journal_abort(journal, escape); - continue; - } jbd2_file_log_bh(&io_bufs, wbuf[bufs]); /* Record the new block's tag in the current descriptor diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 97f487c3d8fc..29d30eddf727 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -318,7 +318,6 @@ static inline void jbd2_data_do_escape(char *data) * * * Return value: - * <0: Error * =0: Finished OK without escape * =1: Finished OK with escape */ @@ -386,12 +385,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, goto escape_done; spin_unlock(&jh_in->b_state_lock); - tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); - if (!tmp) { - brelse(new_bh); - free_buffer_head(new_bh); - return -ENOMEM; - } + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL); spin_lock(&jh_in->b_state_lock); if (jh_in->b_frozen_data) { jbd2_free(tmp, bh_in->b_size); From 97f5ec3b166db4e47ee2c0bdb0deb027413d4f2a Mon Sep 17 00:00:00 2001 From: Nicolas Bretz Date: Sun, 13 Oct 2024 20:41:43 -0700 Subject: [PATCH 27/33] ext4: prevent delalloc to nodelalloc on remount Implemented the suggested solution mentioned in the bug https://bugzilla.kernel.org/show_bug.cgi?id=218820 Preventing the disabling of delayed allocation mode on remount. delalloc to nodelalloc not permitted anymore nodelalloc to delalloc permitted, not affected Signed-off-by: Nicolas Bretz Link: https://patch.msgid.link/20241014034143.59779-1-bretznic@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cb0ce57bd896..33a81ef3c08a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6528,6 +6528,13 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } + if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) && + !test_opt(sb, DELALLOC)) { + ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount"); + err = -EINVAL; + goto restore_opts; + } + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); From 6a0c5887a54318eaaafb00825ebb10d1f7c8f0cf Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 21 Oct 2024 12:00:57 +0200 Subject: [PATCH 28/33] ext4: use str_yes_no() helper function Remove hard-coded strings by using the str_yes_no() helper function. Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20241021100056.5521-2-thorsten.blum@linux.dev Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9fc4b6177129..b25a27c86696 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5711,7 +5711,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); - mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); From 2bd9077b6261b8f1281d0fa74d51afe090319263 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 1 Nov 2024 14:45:23 -0600 Subject: [PATCH 29/33] jbd2: avoid dozens of -Wflex-array-member-not-at-end warnings -Wflex-array-member-not-at-end was introduced in GCC-14, and we are getting ready to enable it, globally. Use the `DEFINE_RAW_FLEX()` helper for an on-stack definition of a flexible structure (`struct shash_desc`) where the size of the flexible-array member (`__ctx`) is known at compile-time, and refactor the rest of the code, accordingly. So, with this, fix 77 of the following warnings: include/linux/jbd2.h:1800:35: warning: structure containing a flexible array member is not at the end of another structure [-Wflex-array-member-not-at-end] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Jan Kara Link: https://patch.msgid.link/ZyU94w0IALVhc9Jy@kspp Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 8aef9bb6ad57..50f7ea8714bf 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1796,22 +1796,21 @@ static inline unsigned long jbd2_log_space_left(journal_t *journal) static inline u32 jbd2_chksum(journal_t *journal, u32 crc, const void *address, unsigned int length) { - struct { - struct shash_desc shash; - char ctx[JBD_MAX_CHECKSUM_SIZE]; - } desc; + DEFINE_RAW_FLEX(struct shash_desc, desc, __ctx, + DIV_ROUND_UP(JBD_MAX_CHECKSUM_SIZE, + sizeof(*((struct shash_desc *)0)->__ctx))); int err; BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) > JBD_MAX_CHECKSUM_SIZE); - desc.shash.tfm = journal->j_chksum_driver; - *(u32 *)desc.ctx = crc; + desc->tfm = journal->j_chksum_driver; + *(u32 *)desc->__ctx = crc; - err = crypto_shash_update(&desc.shash, address, length); + err = crypto_shash_update(desc, address, length); BUG_ON(err); - return *(u32 *)desc.ctx; + return *(u32 *)desc->__ctx; } /* Return most recent uncommitted transaction */ From de183b2baf90f0acc1854a3998c14b8b228f9643 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 5 Nov 2024 11:18:14 +0100 Subject: [PATCH 30/33] ext4: annotate struct fname with __counted_by() Add the __counted_by compiler attribute to the flexible array member name to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20241105101813.10864-2-thorsten.blum@linux.dev Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ef6a3c8f3a9a..233479647f1b 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -418,7 +418,7 @@ struct fname { __u32 inode; __u8 name_len; __u8 file_type; - char name[]; + char name[] __counted_by(name_len); }; /* From d5e9836e13a53ef36af702d87ab20d1a126b0fb8 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 5 Nov 2024 11:33:54 +0100 Subject: [PATCH 31/33] ext4: use struct_size() to improve ext4_htree_store_dirent() Inline and use struct_size() to calculate the number of bytes to allocate for new_fn and remove the local variable len. Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20241105103353.11590-2-thorsten.blum@linux.dev Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 233479647f1b..02d47a64e8d1 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -471,14 +471,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; struct dir_private_info *info; - int len; info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ - len = sizeof(struct fname) + ent_name->len + 1; - new_fn = kzalloc(len, GFP_KERNEL); + new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1), + GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; From e06a8c24f6445c2f1b5255caa4f63b38e31c43fa Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Wed, 6 Nov 2024 14:47:41 +0100 Subject: [PATCH 32/33] ext4: prevent an infinite loop in the lazyinit thread Use ktime_get_ns instead of ktime_get_real_ns when computing the lr_timeout not to be affected by system time jumps. Use a boolean instead of the MAX_JIFFY_OFFSET value to determine whether the next_wakeup value has been set. Comparing elr->lr_next_sched to MAX_JIFFY_OFFSET can cause the lazyinit thread to loop indefinitely. Co-developed-by: Lukas Skupinski Signed-off-by: Lukas Skupinski Signed-off-by: Mathieu Othacehe Reviewed-by: Jan Kara Link: https://patch.msgid.link/20241106134741.26948-2-othacehe@gnu.org Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 33a81ef3c08a..a09f4621b10d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3719,12 +3719,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = 1; if (!ret) { - start_time = ktime_get_real_ns(); + start_time = ktime_get_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); trace_ext4_lazy_itable_init(sb, group); if (elr->lr_timeout == 0) { - elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * + elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) * EXT4_SB(elr->lr_super)->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; @@ -3784,8 +3784,9 @@ static int ext4_lazyinit_thread(void *arg) cont_thread: while (true) { - next_wakeup = MAX_JIFFY_OFFSET; + bool next_wakeup_initialized = false; + next_wakeup = 0; mutex_lock(&eli->li_list_mtx); if (list_empty(&eli->li_request_list)) { mutex_unlock(&eli->li_list_mtx); @@ -3798,8 +3799,11 @@ cont_thread: lr_request); if (time_before(jiffies, elr->lr_next_sched)) { - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } continue; } if (down_read_trylock(&elr->lr_super->s_umount)) { @@ -3827,16 +3831,18 @@ cont_thread: elr->lr_next_sched = jiffies + get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ); } - if (time_before(elr->lr_next_sched, next_wakeup)) + if (!next_wakeup_initialized || + time_before(elr->lr_next_sched, next_wakeup)) { next_wakeup = elr->lr_next_sched; + next_wakeup_initialized = true; + } } mutex_unlock(&eli->li_list_mtx); try_to_freeze(); cur = jiffies; - if ((time_after_eq(cur, next_wakeup)) || - (MAX_JIFFY_OFFSET == next_wakeup)) { + if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) { cond_resched(); continue; } From 3e7c69cdb053f9edea95502853f35952ab6cbf06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Mart=C3=ADn=20G=C3=B3mez?= Date: Thu, 7 Nov 2024 15:45:38 +0100 Subject: [PATCH 33/33] jbd2: Fix comment describing journal_init_common() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The code indicates that journal_init_common() fills the journal_t object it returns while the comment incorrectly states that only a few fields are initialised. Also, the comment claims that journal structures could be created from scratch which isn't possible as journal_init_common() calls journal_load_superblock() which loads and checks journal superblock from disk. Signed-off-by: Daniel Martín Gómez Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://patch.msgid.link/20241107144538.3544-1-dalme@riseup.net Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 29d30eddf727..7e49d912b091 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1512,9 +1512,10 @@ static int journal_load_superblock(journal_t *journal) * destroy journal_t structures, and to initialise and read existing * journal blocks from disk. */ -/* First: create and setup a journal_t object in memory. We initialise - * very few fields yet: that has to wait until we have created the - * journal structures from from scratch, or loaded them from disk. */ +/* The journal_init_common() function creates and fills a journal_t object + * in memory. It calls journal_load_superblock() to load the on-disk journal + * superblock and initialize the journal_t object. + */ static journal_t *journal_init_common(struct block_device *bdev, struct block_device *fs_dev,