From 1fd95c05d8f742abfe906620780aee4dbe1a2db0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 2 Sep 2021 11:36:01 -0400 Subject: [PATCH 01/13] ext4: add error checking to ext4_ext_replay_set_iblocks() If the call to ext4_map_blocks() fails due to an corrupted file system, ext4_ext_replay_set_iblocks() can get stuck in an infinite loop. This could be reproduced by running generic/526 with a file system that has inline_data and fast_commit enabled. The system will repeatedly log to the console: EXT4-fs warning (device dm-3): ext4_block_to_path:105: block 1074800922 > max in inode 131076 and the stack that it gets stuck in is: ext4_block_to_path+0xe3/0x130 ext4_ind_map_blocks+0x93/0x690 ext4_map_blocks+0x100/0x660 skip_hole+0x47/0x70 ext4_ext_replay_set_iblocks+0x223/0x440 ext4_fc_replay_inode+0x29e/0x3b0 ext4_fc_replay+0x278/0x550 do_one_pass+0x646/0xc10 jbd2_journal_recover+0x14a/0x270 jbd2_journal_load+0xc4/0x150 ext4_load_journal+0x1f3/0x490 ext4_fill_super+0x22d4/0x2c00 With this patch, generic/526 still fails, but system is no longer locking up in a tight loop. It's likely the root casue is that fast_commit replay is corrupting file systems with inline_data, and we probably need to add better error handling in the fast commit replay code path beyond what is done here, which essentially just breaks the infinite loop without reporting the to the higher levels of the code. Fixes: 8016E29F4362 ("ext4: fast commit recovery path") Cc: stable@kernel.org Cc: Harshad Shirwadkar Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb1dd4f024f2..e57019cc3601 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5913,7 +5913,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) } /* Check if *cur is a hole and if it is, skip it */ -static void skip_hole(struct inode *inode, ext4_lblk_t *cur) +static int skip_hole(struct inode *inode, ext4_lblk_t *cur) { int ret; struct ext4_map_blocks map; @@ -5922,9 +5922,12 @@ static void skip_hole(struct inode *inode, ext4_lblk_t *cur) map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; if (ret != 0) - return; + return 0; *cur = *cur + map.m_len; + return 0; } /* Count number of blocks used by this inode and update i_blocks */ @@ -5973,7 +5976,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) * iblocks by total number of differences found. */ cur = 0; - skip_hole(inode, &cur); + ret = skip_hole(inode, &cur); + if (ret < 0) + goto out; path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; @@ -5992,8 +5997,12 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); - skip_hole(inode, &cur); - + ret = skip_hole(inode, &cur); + if (ret < 0) { + ext4_ext_drop_refs(path); + kfree(path); + break; + } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { ext4_ext_drop_refs(path); From 4df031ff5876d94b48dd9ee486ba5522382a06b2 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 16 Jul 2021 20:20:21 +0800 Subject: [PATCH 02/13] ext4: check and update i_disksize properly After commit 3da40c7b0898 ("ext4: only call ext4_truncate when size <= isize"), i_disksize could always be updated to i_size in ext4_setattr(), and we could sure that i_disksize <= i_size since holding inode lock and if i_disksize < i_size there are delalloc writes pending in the range upto i_size. If the end of the current write is <= i_size, there's no need to touch i_disksize since writeback will push i_disksize upto i_size eventually. So we can switch to check i_size instead of i_disksize in ext4_da_write_end() when write to the end of the file. we also could remove ext4_mark_inode_dirty() together because we defer inode dirtying to generic_write_end() or ext4_da_write_inline_data_end(). Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-2-yi.zhang@huawei.com --- fs/ext4/inode.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c33c795c4a7..76d9e25fe62c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3084,35 +3084,37 @@ static int ext4_da_write_end(struct file *file, end = start + copied - 1; /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range upto i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize upto i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block (ext4_da_should_update_i_disksize() + * check), we need to update i_disksize here as neither + * ext4_writepage() nor certain ext4_writepages() paths not + * allocating blocks update i_disksize. + * + * Note that we defer inode dirtying to generic_write_end() / + * ext4_da_write_inline_data_end(). */ new_i_size = pos + copied; - if (copied && new_i_size > EXT4_I(inode)->i_disksize) { + if (copied && new_i_size > inode->i_size) { if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) { + ext4_da_should_update_i_disksize(page, end)) ext4_update_i_disksize(inode, new_i_size); - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ret = ext4_mark_inode_dirty(handle, inode); - } } if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) - ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, + ret = ext4_da_write_inline_data_end(inode, pos, len, copied, page); else - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - copied = ret2; - if (ret2 < 0) - ret = ret2; + copied = ret; ret2 = ext4_journal_stop(handle); if (unlikely(ret2 && !ret)) ret = ret2; From 55ce2f649b9e88111270333a8127e23f4f8f42d7 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 16 Jul 2021 20:20:22 +0800 Subject: [PATCH 03/13] ext4: correct the error path of ext4_write_inline_data_end() Current error path of ext4_write_inline_data_end() is not correct. Firstly, it should pass out the error value if ext4_get_inode_loc() return fail, or else it could trigger infinite loop if we inject error here. And then it's better to add inode to orphan list if it return fail in ext4_journal_stop(), otherwise we could not restore inline xattr entry after power failure. Finally, we need to reset the 'ret' value if ext4_write_inline_data_end() return success in ext4_write_end() and ext4_journalled_write_end(), otherwise we could not get the error return value of ext4_journal_stop(). Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-3-yi.zhang@huawei.com --- fs/ext4/inline.c | 15 +++++---------- fs/ext4/inode.c | 7 +++++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 70cb64db33f7..28b666f25ac2 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -733,25 +733,20 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, void *kaddr; struct ext4_iloc iloc; - if (unlikely(copied < len)) { - if (!PageUptodate(page)) { - copied = 0; - goto out; - } - } + if (unlikely(copied < len) && !PageUptodate(page)) + return 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) { ext4_std_error(inode->i_sb, ret); - copied = 0; - goto out; + return ret; } ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, len); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); kunmap_atomic(kaddr); SetPageUptodate(page); /* clear page dirty so that writepages wouldn't work for us. */ @@ -760,7 +755,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); mark_inode_dirty(inode); -out: + return copied; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 76d9e25fe62c..2baa17285096 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1295,6 +1295,7 @@ static int ext4_write_end(struct file *file, goto errout; } copied = ret; + ret = 0; } else copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -1321,13 +1322,14 @@ static int ext4_write_end(struct file *file, if (i_size_changed || inline_data) ret = ext4_mark_inode_dirty(handle, inode); +errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); -errout: + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1410,6 +1412,7 @@ static int ext4_journalled_write_end(struct file *file, goto errout; } copied = ret; + ret = 0; } else if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, page, from, to); @@ -1439,6 +1442,7 @@ static int ext4_journalled_write_end(struct file *file, ret = ret2; } +errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -1446,7 +1450,6 @@ static int ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); -errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; From 6984aef59814fb5c47b0e30c56e101186b5ebf8c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 16 Jul 2021 20:20:23 +0800 Subject: [PATCH 04/13] ext4: factor out write end code of inline file Now that the inline_data file write end procedure are falled into the common write end functions, it is not clear. Factor them out and do some cleanup. This patch also drop ext4_da_write_inline_data_end() and switch to use ext4_write_inline_data_end() instead because we also need to do the same error processing if we failed to write data into inline entry. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-4-yi.zhang@huawei.com --- fs/ext4/ext4.h | 3 -- fs/ext4/inline.c | 119 ++++++++++++++++++++++++----------------------- fs/ext4/inode.c | 73 +++++++++-------------------- 3 files changed, 85 insertions(+), 110 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d71dcac3b97f..ea6c0aae8f1b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3533,9 +3533,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, unsigned flags, struct page **pagep, void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 28b666f25ac2..d30709d42a27 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -729,34 +729,76 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret, no_expand; + handle_t *handle = ext4_journal_current_handle(); + int no_expand; void *kaddr; struct ext4_iloc iloc; + int ret = 0, ret2; if (unlikely(copied < len) && !PageUptodate(page)) - return 0; + copied = 0; - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return ret; + if (likely(copied)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + unlock_page(page); + put_page(page); + ext4_std_error(inode->i_sb, ret); + goto out; + } + ext4_write_lock_xattr(inode, &no_expand); + BUG_ON(!ext4_has_inline_data(inode)); + + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); + + ext4_write_unlock_xattr(inode, &no_expand); + brelse(iloc.bh); + + /* + * It's important to update i_size while still holding page + * lock: page writeout could otherwise come in and zero + * beyond i_size. + */ + ext4_update_inode_size(inode, pos + copied); } + unlock_page(page); + put_page(page); - ext4_write_lock_xattr(inode, &no_expand); - BUG_ON(!ext4_has_inline_data(inode)); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (likely(copied)) + mark_inode_dirty(inode); +out: + /* + * If we didn't copy as much data as expected, we need to trim back + * size of xattr containing inline data. + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); - - ext4_write_unlock_xattr(inode, &no_expand); - brelse(iloc.bh); - mark_inode_dirty(inode); - - return copied; + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; } struct buffer_head * @@ -937,43 +979,6 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, return ret; } -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int ret; - - ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ret; - } - copied = ret; - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. - */ - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - unlock_page(page); - put_page(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - mark_inode_dirty(inode); - - return copied; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2baa17285096..b80f15bba727 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1282,23 +1282,14 @@ static int ext4_write_end(struct file *file, loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1319,10 +1310,9 @@ static int ext4_write_end(struct file *file, * ordering of page lock and transaction start for journaling * filesystems. */ - if (i_size_changed || inline_data) + if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -1394,7 +1384,6 @@ static int ext4_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; int size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); @@ -1403,17 +1392,10 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - ret = 0; - } else if (unlikely(copied < len) && !PageUptodate(page)) { + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, page, from, to); } else { @@ -1436,13 +1418,12 @@ static int ext4_journalled_write_end(struct file *file, if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - if (size_changed || inline_data) { + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } -errout: if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside @@ -3072,7 +3053,7 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret = 0, ret2; + int ret; handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; @@ -3083,6 +3064,12 @@ static int ext4_da_write_end(struct file *file, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; @@ -3102,26 +3089,12 @@ static int ext4_da_write_end(struct file *file, * ext4_da_write_inline_data_end(). */ new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) - ext4_update_i_disksize(inode, new_i_size); - } - - if (write_mode != CONVERT_INLINE_DATA && - ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && - ext4_has_inline_data(inode)) - ret = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - - copied = ret; - ret2 = ext4_journal_stop(handle); - if (unlikely(ret2 && !ret)) - ret = ret2; + if (copied && new_i_size > inode->i_size && + ext4_da_should_update_i_disksize(page, end)) + ext4_update_i_disksize(inode, new_i_size); + copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + ret = ext4_journal_stop(handle); return ret ? ret : copied; } From cc883236b79297f6266ca6f4e7f24f3fd3c736c1 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 16 Jul 2021 20:20:24 +0800 Subject: [PATCH 05/13] ext4: drop unnecessary journal handle in delalloc write After we factor out the inline data write procedure from ext4_da_write_end(), we don't need to start journal handle for the cases of both buffer overwrite and append-write. If we need to update i_disksize, mark_inode_dirty() do start handle and update inode buffer. So we could just remove all the journal handle codes in the delalloc write procedure. After this patch, we could get a lot of performance improvement. Below is the Unixbench comparison data test on my machine with 'Intel Xeon Gold 5120' CPU and nvme SSD backend. Test cmd: ./Run -c 56 -i 3 fstime fsbuffer fsdisk Before this patch: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 422965.0 1068.1 File Copy 256 bufsize 500 maxblocks 1655.0 105077.0 634.9 File Copy 4096 bufsize 8000 maxblocks 5800.0 1429092.0 2464.0 ====== System Benchmarks Index Score (Partial Only) 1186.6 After this patch: System Benchmarks Partial Index BASELINE RESULT INDEX File Copy 1024 bufsize 2000 maxblocks 3960.0 732716.0 1850.3 File Copy 256 bufsize 500 maxblocks 1655.0 184940.0 1117.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 2427152.0 4184.7 ====== System Benchmarks Index Score (Partial Only) 2053.0 Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210716122024.1105856-5-yi.zhang@huawei.com --- fs/ext4/inode.c | 60 +++++-------------------------------------------- 1 file changed, 5 insertions(+), 55 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b80f15bba727..502f60621bad 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2910,19 +2910,6 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -/* We always reserve for an inode update; the superblock could be there too */ -static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) -{ - if (likely(ext4_has_feature_large_file(inode->i_sb))) - return 1; - - if (pos + len <= 0x7fffffffULL) - return 1; - - /* We might need to update the superblock to set LARGE_FILE */ - return 2; -} - static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2931,7 +2918,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page *page; pgoff_t index; struct inode *inode = mapping->host; - handle_t *handle; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -2957,41 +2943,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, return 0; } - /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page - * is being written back. So grab it first before we start - * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. - */ -retry_grab: +retry: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; - unlock_page(page); - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journalling the i_disksize update if writes to the end - * of file which has an already mapped buffer. - */ -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_da_write_credits(inode, pos, len)); - if (IS_ERR(handle)) { - put_page(page); - return PTR_ERR(handle); - } - - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); - ext4_journal_stop(handle); - goto retry_grab; - } /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); @@ -3003,20 +2959,18 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, #endif if (ret < 0) { unlock_page(page); - ext4_journal_stop(handle); + put_page(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - - put_page(page); + goto retry; return ret; } @@ -3053,8 +3007,6 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret; - handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; @@ -3093,9 +3045,7 @@ static int ext4_da_write_end(struct file *file, ext4_da_should_update_i_disksize(page, end)) ext4_update_i_disksize(inode, new_i_size); - copied = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - ret = ext4_journal_stop(handle); - return ret ? ret : copied; + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); } /* From 0add491df4e5e2c8cc6eeeaa6dbcca50f932090c Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 19 Aug 2021 10:49:26 -0400 Subject: [PATCH 06/13] ext4: remove extent cache entries when truncating inline data Conditionally remove all cached extents belonging to an inode when truncating its inline data. It's only necessary to attempt to remove cached extents when a conversion from inline to extent storage has been initiated (!EXT4_STATE_MAY_INLINE_DATA). This avoids unnecessary es lock overhead in the more common inline case. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210819144927.25163-2-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 50a3031bf466..39a1ab129fdc 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "ext4_jbd2.h" #include "ext4.h" @@ -1918,6 +1919,24 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { + /* + * if there's inline data to truncate and this file was + * converted to extents after that inline data was written, + * the extent status cache must be cleared to avoid leaving + * behind stale delayed allocated extent entries + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { +retry: + err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (err) + goto out_error; + } + /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) From 948ca5f30e1df0c11eb5b0f410b9ceb97fa77ad9 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 19 Aug 2021 10:49:27 -0400 Subject: [PATCH 07/13] ext4: enforce buffer head state assertion in ext4_da_map_blocks Remove the code that re-initializes a buffer head with an invalid block number and BH_New and BH_Delay bits when a matching delayed and unwritten block has been found in the extent status cache. Replace it with assertions that verify the buffer head already has this state correctly set. The current code masked an inline data truncation bug that left stale entries in the extent status cache. With this change, generic/130 can be used to reproduce and detect that bug. Signed-off-by: Eric Whitney Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210819144927.25163-3-enwlinux@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8204176256c8..2a076d236ba1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1706,13 +1706,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, } /* - * Delayed extent could be allocated by fallocate. - * So we need to check it. + * the buffer head associated with a delayed and not unwritten + * block found in the extent status cache must contain an + * invalid block number and have its BH_New and BH_Delay bits + * set, reflecting the state assigned when the block was + * initially delayed allocated */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delonly(&es)) { + BUG_ON(bh->b_blocknr != invalid_block); + BUG_ON(!buffer_new(bh)); + BUG_ON(!buffer_delay(bh)); return 0; } From a2c2f0826e2b75560b31daf1cd9a755ab93cf4c6 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 20 Aug 2021 12:45:05 +0800 Subject: [PATCH 08/13] ext4: limit the number of blocks in one ADD_RANGE TLV Now EXT4_FC_TAG_ADD_RANGE uses ext4_extent to track the newly-added blocks, but the limit on the max value of ee_len field is ignored, and it can lead to BUG_ON as shown below when running command "fallocate -l 128M file" on a fast_commit-enabled fs: kernel BUG at fs/ext4/ext4_extents.h:199! invalid opcode: 0000 [#1] SMP PTI CPU: 3 PID: 624 Comm: fallocate Not tainted 5.14.0-rc6+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:ext4_fc_write_inode_data+0x1f3/0x200 Call Trace: ? ext4_fc_write_inode+0xf2/0x150 ext4_fc_commit+0x93b/0xa00 ? ext4_fallocate+0x1ad/0x10d0 ext4_sync_file+0x157/0x340 ? ext4_sync_file+0x157/0x340 vfs_fsync_range+0x49/0x80 do_fsync+0x3d/0x70 __x64_sys_fsync+0x14/0x20 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae Simply fixing it by limiting the number of blocks in one EXT4_FC_TAG_ADD_RANGE TLV. Fixes: aa75f4d3daae ("ext4: main fast-commit commit path") Cc: stable@kernel.org Signed-off-by: Hou Tao Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210820044505.474318-1-houtao1@huawei.com --- fs/ext4/fast_commit.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 8e610a381862..8ea5a81e6554 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -892,6 +892,12 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { + unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? + EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; + + /* Limit the number of blocks in one extent */ + map.m_len = min(max, map.m_len); + fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); From 6fed83957f21eff11c8496e9f24253b03d2bc1dc Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 23 Aug 2021 14:13:58 +0800 Subject: [PATCH 09/13] ext4: fix reserved space counter leakage When ext4_insert_delayed block receives and recovers from an error from ext4_es_insert_delayed_block(), e.g., ENOMEM, it does not release the space it has reserved for that block insertion as it should. One effect of this bug is that s_dirtyclusters_counter is not decremented and remains incorrectly elevated until the file system has been unmounted. This can result in premature ENOSPC returns and apparent loss of free space. Another effect of this bug is that /sys/fs/ext4//delayed_allocation_blocks can remain non-zero even after syncfs has been executed on the filesystem. Besides, add check for s_dirtyclusters_counter when inode is going to be evicted and freed. s_dirtyclusters_counter can still keep non-zero until inode is written back in .evict_inode(), and thus the check is delayed to .destroy_inode(). Fixes: 51865fda28e5 ("ext4: let ext4 maintain extent status tree") Cc: stable@kernel.org Suggested-by: Gao Xiang Signed-off-by: Jeffle Xu Reviewed-by: Eric Whitney Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210823061358.84473-1-jefflexu@linux.alibaba.com --- fs/ext4/inode.c | 5 +++++ fs/ext4/super.c | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a076d236ba1..9df1ab070fa5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1628,6 +1628,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; + bool reserved = false; /* * If the cluster containing lblk is shared with a delayed, @@ -1644,6 +1645,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, @@ -1656,6 +1658,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { allocated = true; } @@ -1666,6 +1669,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) } ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + if (ret && reserved) + ext4_da_release_space(inode, 1); errout: return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index feca816b6bf3..a52f1572daa5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1352,6 +1352,12 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } + + if (EXT4_I(inode)->i_reserved_data_blocks) + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + inode->i_ino, EXT4_I(inode), + EXT4_I(inode)->i_reserved_data_blocks); } static void init_once(void *foo) From 75ca6ad408f459f00b09a64f04c774559848c097 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sat, 5 Jun 2021 10:39:32 +0530 Subject: [PATCH 10/13] ext4: fix loff_t overflow in ext4_max_bitmap_size() We should use unsigned long long rather than loff_t to avoid overflow in ext4_max_bitmap_size() for comparison before returning. w/o this patch sbi->s_bitmap_maxbytes was becoming a negative value due to overflow of upper_limit (with has_huge_files as true) Below is a quick test to trigger it on a 64KB pagesize system. sudo mkfs.ext4 -b 65536 -O ^has_extents,^64bit /dev/loop2 sudo mount /dev/loop2 /mnt sudo echo "hello" > /mnt/hello -> This will error out with "echo: write error: File too large" Signed-off-by: Ritesh Harjani Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Link: https://lore.kernel.org/r/594f409e2c543e90fd836b78188dfa5c575065ba.1622867594.git.riteshh@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a52f1572daa5..9b5b2f63b470 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3030,17 +3030,17 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { - loff_t res = EXT4_NDIR_BLOCKS; + unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; - /* This is calculated to be the largest file size for a dense, block + + /* + * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ - if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field @@ -3083,7 +3083,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; - return res; + return (loff_t)res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, From bb9464e08309f6befe80866f5be51778ca355ee9 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Fri, 24 Sep 2021 17:39:17 +0800 Subject: [PATCH 11/13] ext4: flush s_error_work before journal destroy in ext4_fill_super The error path in ext4_fill_super forget to flush s_error_work before journal destroy, and it may trigger the follow bug since flush_stashed_error_work can run concurrently with journal destroy without any protection for sbi->s_journal. [32031.740193] EXT4-fs (loop66): get root inode failed [32031.740484] EXT4-fs (loop66): mount failed [32031.759805] ------------[ cut here ]------------ [32031.759807] kernel BUG at fs/jbd2/transaction.c:373! [32031.760075] invalid opcode: 0000 [#1] SMP PTI [32031.760336] CPU: 5 PID: 1029268 Comm: kworker/5:1 Kdump: loaded 4.18.0 [32031.765112] Call Trace: [32031.765375] ? __switch_to_asm+0x35/0x70 [32031.765635] ? __switch_to_asm+0x41/0x70 [32031.765893] ? __switch_to_asm+0x35/0x70 [32031.766148] ? __switch_to_asm+0x41/0x70 [32031.766405] ? _cond_resched+0x15/0x40 [32031.766665] jbd2__journal_start+0xf1/0x1f0 [jbd2] [32031.766934] jbd2_journal_start+0x19/0x20 [jbd2] [32031.767218] flush_stashed_error_work+0x30/0x90 [ext4] [32031.767487] process_one_work+0x195/0x390 [32031.767747] worker_thread+0x30/0x390 [32031.768007] ? process_one_work+0x390/0x390 [32031.768265] kthread+0x10d/0x130 [32031.768521] ? kthread_flush_work_fn+0x10/0x10 [32031.768778] ret_from_fork+0x35/0x40 static int start_this_handle(...) BUG_ON(journal->j_flags & JBD2_UNMOUNT); <---- Trigger this Besides, after we enable fast commit, ext4_fc_replay can add work to s_error_work but return success, so the latter journal destroy in ext4_load_journal can trigger this problem too. Fix this problem with two steps: 1. Call ext4_commit_super directly in ext4_handle_error for the case that called from ext4_fc_replay 2. Since it's hard to pair the init and flush for s_error_work, we'd better add a extras flush_work before journal destroy in ext4_fill_super Besides, this patch will call ext4_commit_super in ext4_handle_error for any nojournal case too. But it seems safe since the reason we call schedule_work was that we should save error info to sb through journal if available. Conversely, for the nojournal case, it seems useless delay commit superblock to s_error_work. Fixes: c92dc856848f ("ext4: defer saving error info from atomic context") Fixes: 2d01ddc86606 ("ext4: save error info to sb through journal if available") Cc: stable@kernel.org Signed-off-by: yangerkun Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210924093917.1953239-1-yangerkun@huawei.com --- fs/ext4/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9b5b2f63b470..0049eac9de1a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -660,7 +660,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ - if (continue_fs) + if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -5050,12 +5050,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + /* flush s_error_work before journal destroy. */ + flush_work(&sbi->s_error_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: + /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); From 42cb447410d024e9d54139ae9c21ea132a8c384c Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 14 Sep 2021 19:14:15 +0800 Subject: [PATCH 12/13] ext4: fix potential infinite loop in ext4_dx_readdir() When ext4_htree_fill_tree() fails, ext4_dx_readdir() can run into an infinite loop since if info->last_pos != ctx->pos this will reset the directory scan and reread the failing entry. For example: 1. a dx_dir which has 3 block, block 0 as dx_root block, block 1/2 as leaf block which own the ext4_dir_entry_2 2. block 1 read ok and call_filldir which will fill the dirent and update the ctx->pos 3. block 2 read fail, but we has already fill some dirent, so we will return back to userspace will a positive return val(see ksys_getdents64) 4. the second ext4_dx_readdir will reset the world since info->last_pos != ctx->pos, and will also init the curr_hash which pos to block 1 5. So we will read block1 too, and once block2 still read fail, we can only fill one dirent because the hash of the entry in block1(besides the last one) won't greater than curr_hash 6. this time, we forget update last_pos too since the read for block2 will fail, and since we has got the one entry, ksys_getdents64 can return success 7. Latter we will trapped in a loop with step 4~6 Cc: stable@kernel.org Signed-off-by: yangerkun Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210914111415.3921954-1-yangerkun@huawei.com --- fs/ext4/dir.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ffb295aa891c..74b172a4adda 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -551,7 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; - int ret; + int ret = 0; if (!info) { info = ext4_htree_create_dir_info(file, ctx->pos); @@ -599,7 +599,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) info->curr_minor_hash, &info->next_hash); if (ret < 0) - return ret; + goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; @@ -630,7 +630,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) } finished: info->last_pos = ctx->pos; - return 0; + return ret < 0 ? ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) From f2c77973507fd116c3657df1dc048864a2b16a1c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 10 Sep 2021 16:03:16 +0800 Subject: [PATCH 13/13] ext4: recheck buffer uptodate bit under buffer lock Commit 8e33fadf945a ("ext4: remove an unnecessary if statement in __ext4_get_inode_loc()") forget to recheck buffer's uptodate bit again under buffer lock, which may overwrite the buffer if someone else have already brought it uptodate and changed it. Fixes: 8e33fadf945a ("ext4: remove an unnecessary if statement in __ext4_get_inode_loc()") Cc: stable@kernel.org Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20210910080316.70421-1-yi.zhang@huawei.com --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9df1ab070fa5..b7c561ad38dd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4277,6 +4277,12 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, goto has_buffer; lock_buffer(bh); + if (ext4_buffer_uptodate(bh)) { + /* Someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the