From 0b956de1512e23329aa27f60fe4a8b3e6afc7d6a Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:40 +0530 Subject: [PATCH 01/55] ext4: kill unused function ext4_journalled_write_inline_data Commit 3f079114bf522 ("ext4: Convert data=journal writeback to use ext4_writepages()") added support for writeback of journalled data into ext4_writepages() and killed the function __ext4_journalled_writepage(), which used to call ext4_journalled_write_inline_data() for inline data. This function was left behind by mistake. Hence kill its definition, as no one uses it. Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/122b2a8d5e0650686f23ed6da26ed9e04105562b.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ---- fs/ext4/inline.c | 24 ------------------------ 2 files changed, 28 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8104a21b001a..15abcbe988ec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3484,10 +3484,6 @@ extern int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); -extern struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len,
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 5854bd5a3352..c0b2dc6514b2 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -823,30 +823,6 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, return ret ? ret : copied; } -struct buffer_head * -ext4_journalled_write_inline_data(struct inode *inode, - unsigned len, - struct page *page) -{ - int ret, no_expand; - void *kaddr; - struct ext4_iloc iloc; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - return NULL; - } - - ext4_write_lock_xattr(inode, &no_expand); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, 0, len); - kunmap_atomic(kaddr); - ext4_write_unlock_xattr(inode, &no_expand); - - return iloc.bh; -} - /* * Try to make the page cache and handle ready for the inline data case. * We can call this function in 2 cases:
From 36c9b4504088089185f7743433c914935b518ab2 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:42 +0530 Subject: [PATCH 02/55] ext4: Change remaining tracepoints to use folio ext4_readpage() has been converted to ext4_read_folio(), hence rename the related tracepoint from trace_ext4_readpage(page) to trace_ext4_read_folio(folio). Do the same for trace_ext4_releasepage(page), which becomes trace_ext4_release_folio(folio). As a minor optimization to avoid an extra dereference, and since both of the above functions already dereference folio->mapping->host, change the tracepoint arguments to take (inode, folio). 
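For illustration only, with made-up field values: after this change a caller passes both the inode and the folio, e.g.

	trace_ext4_read_folio(inode, folio);	/* formerly trace_ext4_readpage(&folio->page) */

and the emitted trace line reads something like "dev 8,1 ino 12 folio_index 3", following the TP_printk() format in the diff below.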
Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/caba2b3c0147bed4ea7706767dc1d19cd0e29ab0.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 ++++--- include/trace/events/ext4.h | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 02de439bf1f0..e76a95267178 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3105,7 +3105,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio) int ret = -EAGAIN; struct inode *inode = folio->mapping->host; - trace_ext4_readpage(&folio->page); + trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); @@ -3164,9 +3164,10 @@ static void ext4_journalled_invalidate_folio(struct folio *folio, static bool ext4_release_folio(struct folio *folio, gfp_t wait) { - journal_t *journal = EXT4_JOURNAL(folio->mapping->host); + struct inode *inode = folio->mapping->host; + journal_t *journal = EXT4_JOURNAL(inode); - trace_ext4_releasepage(&folio->page); + trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index ebccf6a6aa1b..54cd509ced0f 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -560,10 +560,10 @@ TRACE_EVENT(ext4_writepages_result, (unsigned long) __entry->writeback_index) ); -DECLARE_EVENT_CLASS(ext4__page_op, - TP_PROTO(struct page *page), +DECLARE_EVENT_CLASS(ext4__folio_op, + TP_PROTO(struct inode *inode, struct folio *folio), - TP_ARGS(page), + TP_ARGS(inode, folio), TP_STRUCT__entry( __field( dev_t, dev ) @@ -573,29 +573,29 @@ DECLARE_EVENT_CLASS(ext4__page_op, ), TP_fast_assign( - __entry->dev = page->mapping->host->i_sb->s_dev; - __entry->ino = page->mapping->host->i_ino; - __entry->index = page->index; + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->index = folio->index; ), - TP_printk("dev %d,%d ino %lu page_index %lu", + TP_printk("dev %d,%d ino %lu folio_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index) ); -DEFINE_EVENT(ext4__page_op, ext4_readpage, +DEFINE_EVENT(ext4__folio_op, ext4_read_folio, - TP_PROTO(struct page *page), + TP_PROTO(struct inode *inode, struct folio *folio), - TP_ARGS(page) + TP_ARGS(inode, folio) ); -DEFINE_EVENT(ext4__page_op, ext4_releasepage, +DEFINE_EVENT(ext4__folio_op, ext4_release_folio, - TP_PROTO(struct page *page), + TP_PROTO(struct inode *inode, struct folio *folio), - TP_ARGS(page) + TP_ARGS(inode, folio) ); DECLARE_EVENT_CLASS(ext4_invalidate_folio_op, From 80be8c5cc9251484ba856e2cf7d15a189326fe9c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:43 +0530 Subject: [PATCH 03/55] ext4: Make mpage_journal_page_buffers use folio This patch converts mpage_journal_page_buffers() to use folio and also removes the PAGE_SIZE assumption. 
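A short worked example of the new length computation, with made-up numbers: for a 16 KiB folio at folio_pos(folio) == 0 on an inode with i_size == 5000,

	len = folio_size(folio);		/* 16384 */
	if (folio_pos(folio) + len > size &&	/* 0 + 16384 > 5000 */
	    !ext4_verity_in_progress(inode))
		len = size - folio_pos(folio);	/* len becomes 5000 */

so only the bytes inside i_size are journalled, regardless of the folio size; the old code could express this only for PAGE_SIZE pages.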
Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/ebc3ac80e6a54b53327740d010ce684a83512021.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e76a95267178..0ff4624178cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2321,11 +2321,11 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -static int ext4_journal_page_buffers(handle_t *handle, struct page *page, - int len) +static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, + size_t len) { - struct buffer_head *page_bufs = page_buffers(page); - struct inode *inode = page->mapping->host; + struct buffer_head *page_bufs = folio_buffers(folio); + struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, @@ -2334,7 +2334,7 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, NULL, write_end_fn); if (ret == 0) ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -2344,22 +2344,20 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, - struct page *page) + struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); - int len; + size_t len = folio_size(folio); - ClearPageChecked(page); + folio_clear_checked(folio); mpd->wbc->nr_to_write--; - if (page->index == size >> PAGE_SHIFT && + if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = size - folio_pos(folio); - return ext4_journal_page_buffers(handle, page, len); + return ext4_journal_folio_buffers(handle, folio, len); } /* @@ -2499,7 +2497,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, - mpd, &folio->page); + mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; @@ -6157,7 +6155,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_journal_page_buffers(handle, &folio->page, len)) + if (ext4_journal_folio_buffers(handle, folio, len)) goto out_error; } else { folio_unlock(folio); From d19500da4b8cfe1791e1583ab237c3213e1bd39c Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Mon, 15 May 2023 16:10:44 +0530 Subject: [PATCH 04/55] ext4: Make ext4_write_inline_data_end() use folio ext4_write_inline_data_end() is completely converted to work with folio. Also all callers of ext4_write_inline_data_end() already works on folio except ext4_da_write_end(). Mostly for consistency and saving few instructions maybe, this patch just converts ext4_da_write_end() to work with folio which makes the last caller of ext4_write_inline_data_end() also converted to work with folio. We then make ext4_write_inline_data_end() take folio instead of page. 
Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/1bcea771720ff451a5a59b3f1bcd5fae51cb7ce7.1684122756.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ++---- fs/ext4/inline.c | 3 +-- fs/ext4/inode.c | 23 ++++++++++++++--------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 15abcbe988ec..ec9cd6252185 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3480,10 +3480,8 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct page **pagep); -extern int ext4_write_inline_data_end(struct inode *inode, - loff_t pos, unsigned len, - unsigned copied, - struct page *page); +int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, + unsigned copied, struct folio *folio); extern int ext4_da_write_inline_data_begin(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c0b2dc6514b2..f48183f91c87 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -741,9 +741,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, - unsigned copied, struct page *page) + unsigned copied, struct folio *folio) { - struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0ff4624178cd..c2868282ad81 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1287,7 +1287,8 @@ static int ext4_write_end(struct file *file, if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* @@ -1395,7 +1396,8 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) - return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; @@ -2942,15 +2944,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, * Check if we should update i_disksize * when write to the end of file but not require block allocation */ -static int ext4_da_should_update_i_disksize(struct page *page, +static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned int idx; int i; - bh = page_buffers(page); + bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) @@ -2970,17 +2972,19 @@ static int ext4_da_write_end(struct file *file, loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; + struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, - len, copied, page, fsdata); + len, copied, &folio->page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) - 
return ext4_write_inline_data_end(inode, pos, len, copied, page); + return ext4_write_inline_data_end(inode, pos, len, copied, + folio); if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; @@ -3004,10 +3008,11 @@ static int ext4_da_write_end(struct file *file, */ new_i_size = pos + copied; if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(page, end)) + ext4_da_should_update_i_disksize(folio, end)) ext4_update_i_disksize(inode, new_i_size); - return generic_write_end(file, mapping, pos, len, copied, page, fsdata); + return generic_write_end(file, mapping, pos, len, copied, &folio->page, + fsdata); } /* From 0dea40aa315d01f5d0ffbd871483ea8a45c71f65 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 16 May 2023 20:27:13 +0100 Subject: [PATCH 05/55] ext4: Call fsverity_verify_folio() Now that fsverity supports working on entire folios, call fsverity_verify_folio() instead of fsverity_verify_page() Reported-by: Eric Biggers Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Eric Biggers Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230516192713.1070469-1-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 6f46823fba61..3e7d160f543f 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && - !fsverity_verify_page(&folio->page)) + !fsverity_verify_folio(folio)) goto set_error_page; folio_mark_uptodate(folio); folio_unlock(folio); From b3916da0d9636285c8a881802956631f9e2ebb5b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:09 +0800 Subject: [PATCH 06/55] ext4: fix wrong unit use in ext4_mb_normalize_request NRL_CHECK_SIZE will compare input req and size, so req and size should be in same unit. Input req "fe_len" is in cluster unit while input size "(8<<20)>>bsbits" is in block unit. Convert "fe_len" to block unit to fix the mismatch. Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20f67a260df5..7d88f76070be 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4283,7 +4283,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; - } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, + } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; From 497885f72d930305d8e61b6b616b22b4da1adf90 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:10 +0800 Subject: [PATCH 07/55] ext4: fix unit mismatch in ext4_mb_new_blocks_simple The "i" returned from mb_find_next_zero_bit is in cluster unit and we need offset "block" corresponding to "i" in block unit. Convert "i" to block unit to fix the unit mismatch. 
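To make the unit conversion concrete, an illustrative sketch with a made-up cluster ratio: with bigalloc and 16 blocks per cluster, EXT4_C2B(sbi, i) == i << 4, so if mb_find_next_zero_bit() returns cluster offset i = 3, the free range starts 48 blocks into the group:

	block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, 3);	/* + 48 blocks */

whereas the old "+ i" would have pointed 45 blocks too early.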
Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7d88f76070be..d8caef7cd9d0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6011,6 +6011,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, { struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); @@ -6039,7 +6040,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, if (i >= max) break; if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + i)) { + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, i))) { blkoff = i + 1; } else break; @@ -6056,7 +6058,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, return 0; } - block = ext4_group_first_block_no(sb, group) + i; + block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); ext4_mb_mark_bb(sb, block, 1, 1); ar->len = 1;
From 99c515e3a860576ba90c11acbc1d6488dfca6463 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:11 +0800 Subject: [PATCH 08/55] ext4: fix wrong unit use in ext4_mb_find_by_goal We need start in block units, while fe_start is in cluster units. Use the ext4_grp_offs_to_block() helper to convert fe_start and get start in block units. Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d8caef7cd9d0..f6dc4f276ca4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2210,8 +2210,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; - start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + - ex.fe_start; + start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++;
From c3defd99d58cbdd132bd197714e5523dac976b66 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:12 +0800 Subject: [PATCH 09/55] ext4: treat stripe in block unit Stripe is used inconsistently, in block units in some code paths and in cluster units in others. A user who is aware of stripe may not be aware of the bigalloc feature, so treat stripe only in block units to fix this. Besides, it is hard to get stripe-aligned blocks (with start and length both aligned to the stripe) if the stripe is not aligned with the cluster size, so just disable stripe and alert the user in this case to simplify the code and avoid the unnecessary work of searching for stripe-aligned blocks, which would rarely succeed. 
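A hypothetical example of the effect of the new mount-time check (numbers made up): with a bigalloc cluster ratio of 16 blocks per cluster,

	/* stripe = 64: 64 % 16 == 0, kept; mballoc uses EXT4_B2C(sbi, 64) = 4 clusters */
	/* stripe = 24: 24 % 16 != 0, s_stripe is reset to 0 and a warning is logged    */

so only stripes that are whole multiples of the cluster size survive the mount.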
Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 +++++++++++------- fs/ext4/super.c | 13 +++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f6dc4f276ca4..7ef95bdde91d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2207,7 +2207,8 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ - if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { + if (max >= ac->ac_g_ex.fe_len && + ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); @@ -2372,7 +2373,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; - ext4_grpblk_t i; + ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); @@ -2384,10 +2385,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; + stripe = EXT4_B2C(sbi, sbi->s_stripe); + i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { - max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); - if (max >= sbi->s_stripe) { + max = mb_find_extent(e4b, i, stripe, &ex); + if (max >= stripe) { ac->ac_found++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; @@ -2395,7 +2398,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, break; } } - i += sbi->s_stripe; + i += stripe; } } @@ -2758,7 +2761,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && sbi->s_stripe && - !(ac->ac_g_ex.fe_len % sbi->s_stripe)) + !(ac->ac_g_ex.fe_len % + EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); @@ -3477,7 +3481,7 @@ int ext4_mb_init(struct super_block *sb) */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( - sbi->s_mb_group_prealloc, sbi->s_stripe); + sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 05fcecc36244..39591fb78c1d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5297,6 +5297,19 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount3; sbi->s_stripe = ext4_get_stripe_size(sbi); + /* + * It's hard to get stripe aligned blocks if stripe is not aligned with + * cluster, just disable stripe and alert user to simpfy code and avoid + * stripe aligned allocation which will rarely successes. + */ + if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 && + sbi->s_stripe % sbi->s_cluster_ratio != 0) { + ext4_msg(sb, KERN_WARNING, + "stripe (%lu) is not aligned with cluster size (%u), " + "stripe is disabled", + sbi->s_stripe, sbi->s_cluster_ratio); + sbi->s_stripe = 0; + } sbi->s_extent_max_zeroout_kb = 32; /* From 1eff590489a213a213c57d96b86f48b32cdf8c3a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:13 +0800 Subject: [PATCH 10/55] ext4: add EXT4_MB_HINT_GOAL_ONLY test in ext4_mb_use_preallocated ext4_mb_use_preallocated will ignore the demand to alloc goal blocks, although the EXT4_MB_HINT_GOAL_ONLY is requested. 
For group pa, ext4_mb_group_or_file will not set EXT4_MB_HINT_GROUP_ALLOC if EXT4_MB_HINT_GOAL_ONLY is set. So we will not alloc goal blocks from group pa if EXT4_MB_HINT_GOAL_ONLY is set. For inode pa, ext4_mb_pa_goal_check is added to check if free extent in found inode pa meets goal blocks when EXT4_MB_HINT_GOAL_ONLY is set. Signed-off-by: Kemeng Shi Suggested-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7ef95bdde91d..839db9c46aea 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4531,6 +4531,37 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, return pa; } +/* + * check if found pa meets EXT4_MB_HINT_GOAL_ONLY + */ +static bool +ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, + struct ext4_prealloc_space *pa) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + ext4_fsblk_t start; + + if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) + return true; + + /* + * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted + * in ext4_mb_normalize_request and will keep same with ac_o_ex + * from ext4_mb_initialize_context. Choose ac_g_ex here to keep + * consistent with ext4_mb_find_by_goal. + */ + start = pa->pa_pstart + + (ac->ac_g_ex.fe_logical - pa->pa_lstart); + if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) + return false; + + if (ac->ac_g_ex.fe_len > pa->pa_len - + EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) + return false; + + return true; +} + /* * search goal blocks in preallocated space */ @@ -4581,7 +4612,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* found preallocated blocks, use them */ spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) { + if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free && + likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); From 95a4c3c7e0342f553fed1ac70d10c2efc6bba3b1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:14 +0800 Subject: [PATCH 11/55] ext4: remove ext4_block_group and ext4_block_group_offset declaration For ext4_block_group and ext4_block_group_offset, there are only declaration without definition. Just remove them. 
Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ec9cd6252185..e6d80325718d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2632,10 +2632,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb, extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); -extern unsigned int ext4_block_group(struct super_block *sb, - ext4_fsblk_t blocknr); -extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, - ext4_fsblk_t blocknr); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); From 19a043bb1fd1b5cb2652ca33536c55e6c0a70df0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:15 +0800 Subject: [PATCH 12/55] ext4: try all groups in ext4_mb_new_blocks_simple ext4_mb_new_blocks_simple ignores the group before goal, so it will fail if free blocks reside in group before goal. Try all groups to avoid unexpected failure. Search finishes either if any free block is found or if no available blocks are found. Simpliy check "i >= max" to distinguish the above cases. Signed-off-by: Kemeng Shi Suggested-by: Theodore Ts'o Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-8-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 839db9c46aea..bcc9622daa4a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6047,7 +6047,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t group; + ext4_group_t group, nr; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; @@ -6061,7 +6061,7 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, ar->len = 0; ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); - for (; group < ext4_get_groups_count(sb); group++) { + for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { *errp = PTR_ERR(bitmap_bh); @@ -6085,10 +6085,13 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, if (i < max) break; + if (++group >= ext4_get_groups_count(sb)) + group = 0; + blkoff = 0; } - if (group >= ext4_get_groups_count(sb) || i >= max) { + if (i >= max) { *errp = -ENOSPC; return 0; } From 11b6890be0084ad4df0e06d89a9fdcc948472c65 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:16 +0800 Subject: [PATCH 13/55] ext4: get block from bh in ext4_free_blocks for fast commit replay ext4_free_blocks will retrieve block from bh if block parameter is zero. Retrieve block before ext4_free_blocks_simple to avoid potentially passing wrong block to ext4_free_blocks_simple. 
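A brief illustration of why the ordering matters, assuming a hypothetical caller: it may pass a mapped bh together with block == 0. Before this change the fast-commit replay path ran

	ext4_free_blocks_simple(inode, block /* == 0 */, count);

before block was filled in from bh->b_blocknr, so replay could free the wrong range; moving the bh handling first lets replay see the real block number.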
Signed-off-by: Kemeng Shi Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-9-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bcc9622daa4a..dadda9e8cf29 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6368,12 +6368,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, sbi = EXT4_SB(sb); - if (sbi->s_mount_state & EXT4_FC_REPLAY) { - ext4_free_blocks_simple(inode, block, count); - return; - } - - might_sleep(); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); @@ -6381,6 +6375,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, block = bh->b_blocknr; } + if (sbi->s_mount_state & EXT4_FC_REPLAY) { + ext4_free_blocks_simple(inode, block, count); + return; + } + + might_sleep(); + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " From ad78b5efe4246e5deba8d44a6ed172b8a00d3113 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:17 +0800 Subject: [PATCH 14/55] ext4: remove unused parameter from ext4_mb_new_blocks_simple() Two cleanups for ext4_mb_new_blocks_simple: Remove unused parameter handle of ext4_mb_new_blocks_simple. Move ext4_mb_new_blocks_simple definition before ext4_mb_new_blocks to remove unnecessary forward declaration of ext4_mb_new_blocks_simple. Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-10-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 137 +++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 70 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dadda9e8cf29..1345c4e84cc3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5786,8 +5786,72 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, return ret; } -static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, - struct ext4_allocation_request *ar, int *errp); +/* + * Simple allocator for Ext4 fast commit replay path. It searches for blocks + * linearly starting at the goal block and also excludes the blocks which + * are going to be in use after fast commit replay. 
+ */ +static ext4_fsblk_t +ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) +{ + struct buffer_head *bitmap_bh; + struct super_block *sb = ar->inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_group_t group, nr; + ext4_grpblk_t blkoff; + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_fsblk_t goal, block; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + goal = ar->goal; + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= ext4_blocks_count(es)) + goal = le32_to_cpu(es->s_first_data_block); + + ar->len = 0; + ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); + for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + *errp = PTR_ERR(bitmap_bh); + pr_warn("Failed to read block bitmap\n"); + return 0; + } + + while (1) { + i = mb_find_next_zero_bit(bitmap_bh->b_data, max, + blkoff); + if (i >= max) + break; + if (ext4_fc_replay_check_excluded(sb, + ext4_group_first_block_no(sb, group) + + EXT4_C2B(sbi, i))) { + blkoff = i + 1; + } else + break; + } + brelse(bitmap_bh); + if (i < max) + break; + + if (++group >= ext4_get_groups_count(sb)) + group = 0; + + blkoff = 0; + } + + if (i >= max) { + *errp = -ENOSPC; + return 0; + } + + block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); + ext4_mb_mark_bb(sb, block, 1, 1); + ar->len = 1; + + return block; +} /* * Main entry point into mballoc to allocate blocks @@ -5812,7 +5876,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) - return ext4_mb_new_blocks_simple(handle, ar, errp); + return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) @@ -6036,73 +6100,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, spin_unlock(&sbi->s_md_lock); } -/* - * Simple allocator for Ext4 fast commit replay path. It searches for blocks - * linearly starting at the goal block and also excludes the blocks which - * are going to be in use after fast commit replay. 
- */ -static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) -{ - struct buffer_head *bitmap_bh; - struct super_block *sb = ar->inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t group, nr; - ext4_grpblk_t blkoff; - ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); - ext4_grpblk_t i = 0; - ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - goal = ar->goal; - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= ext4_blocks_count(es)) - goal = le32_to_cpu(es->s_first_data_block); - - ar->len = 0; - ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); - for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - *errp = PTR_ERR(bitmap_bh); - pr_warn("Failed to read block bitmap\n"); - return 0; - } - - while (1) { - i = mb_find_next_zero_bit(bitmap_bh->b_data, max, - blkoff); - if (i >= max) - break; - if (ext4_fc_replay_check_excluded(sb, - ext4_group_first_block_no(sb, group) + - EXT4_C2B(sbi, i))) { - blkoff = i + 1; - } else - break; - } - brelse(bitmap_bh); - if (i < max) - break; - - if (++group >= ext4_get_groups_count(sb)) - group = 0; - - blkoff = 0; - } - - if (i >= max) { - *errp = -ENOSPC; - return 0; - } - - block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); - ext4_mb_mark_bb(sb, block, 1, 1); - ar->len = 1; - - return block; -} - static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { From 247c3d214c23dfeeeb892e91a82ac1188bdaec9f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:18 +0800 Subject: [PATCH 15/55] ext4: fix wrong unit use in ext4_mb_clear_bb Function ext4_issue_discard need count in cluster. Pass count_clusters instead of count to fix the mismatch. Signed-off-by: Kemeng Shi Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-11-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1345c4e84cc3..474aebfdc1dd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6280,8 +6280,8 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, * them with group lock_held */ if (test_opt(sb, DISCARD)) { - err = ext4_issue_discard(sb, block_group, bit, count, - NULL); + err = ext4_issue_discard(sb, block_group, bit, + count_clusters, NULL); if (err && err != -EOPNOTSUPP) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" From 2ec6d0a5ea72689a79e6f725fd8b443a788ae279 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 3 Jun 2023 23:03:19 +0800 Subject: [PATCH 16/55] ext4: fix wrong unit use in ext4_mb_new_blocks Function ext4_free_blocks_simple needs count in cluster. Function ext4_free_blocks accepts count in block. Convert count to cluster to fix the mismatch. 
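For illustration with made-up numbers: EXT4_NUM_B2C() rounds the block count up to whole clusters, so with a cluster ratio of 16, freeing count = 20 blocks gives

	EXT4_NUM_B2C(sbi, 20) == (20 + 15) >> 4 == 2	/* clusters */

which is the unit ext4_free_blocks_simple() expects; passing the raw block count would be treated as 20 clusters and clear far too many bitmap bits.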
Signed-off-by: Kemeng Shi Cc: stable@kernel.org Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20230603150327.3596033-12-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 474aebfdc1dd..4322eb7559fc 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6373,7 +6373,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } if (sbi->s_mount_state & EXT4_FC_REPLAY) { - ext4_free_blocks_simple(inode, block, count); + ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); return; } From 569f196f1e7a14472f21734170411f75a3179db0 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 30 May 2023 18:03:40 +0530 Subject: [PATCH 17/55] ext4: mballoc: Remove useless setting of ac_criteria There will be changes coming in future patches which will introduce a new criteria for block allocation. This removes the useless setting of ac_criteria. AFAIU, this might be only used to differentiate between whether a preallocated blocks was allocated or was regular allocator called for allocating blocks. Hence this also adds the debug prints to identify what type of block allocation was done in ext4_mb_show_ac(). Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1dbae05617519cb6202f1b299c9d1be3e7cda763.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4322eb7559fc..8a3946f1bdd9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4617,7 +4617,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); - ac->ac_criteria = 10; read_unlock(&ei->i_prealloc_lock); return true; } @@ -4660,7 +4659,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) } if (cpa) { ext4_mb_use_group_pa(ac, cpa); - ac->ac_criteria = 20; return true; } return false; @@ -5434,6 +5432,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); + mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no"); + if (ac->ac_pa) + mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? + "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else From 5730cce35344fba94e2c329d2bc0a170333a059f Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Tue, 30 May 2023 18:03:41 +0530 Subject: [PATCH 18/55] ext4: Remove unused extern variables declaration ext4_mb_stats & ext4_mb_max_to_scan are never used. We use sbi->s_mb_stats and sbi->s_mb_max_to_scan instead. Hence kill these extern declarations. 
Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/928b3142062172533b6d1b5a94de94700590fef3.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- fs/ext4/mballoc.h | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e6d80325718d..2a7e254cbae3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2837,8 +2837,6 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; -extern long ext4_mb_stats; -extern long ext4_mb_max_to_scan; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 6d85ee8674a6..24b666e558f1 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -49,7 +49,7 @@ #define MB_DEFAULT_MIN_TO_SCAN 10 /* - * with 'ext4_mb_stats' allocator will collect stats that will be + * with 's_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ #define MB_DEFAULT_STATS 0 From 4eb7a4a1a33bdaf259fca8528f2546c90ad18f0d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:42 +0530 Subject: [PATCH 19/55] ext4: Convert mballoc cr (criteria) to enum Convert criteria to be an enum so it easier to maintain and update the tracefiles to use enum names. This change also makes it easier to insert new criterias in the future. There is no functional change in this patch. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/5d82fd467bdf70ea45bdaef810af3b146013946c.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 23 +++++++-- fs/ext4/mballoc.c | 96 ++++++++++++++++++------------------- include/trace/events/ext4.h | 16 ++++++- 3 files changed, 82 insertions(+), 53 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2a7e254cbae3..914475cbb060 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -127,6 +127,23 @@ enum SHIFT_DIRECTION { SHIFT_RIGHT, }; +/* + * Number of criterias defined. For each criteria, mballoc has slightly + * different way of finding the required blocks nad usually, higher the + * criteria the slower the allocation. We start at lower criterias and keep + * falling back to higher ones if we are not able to find any blocks. + */ +#define EXT4_MB_NUM_CRS 4 +/* + * All possible allocation criterias for mballoc + */ +enum criteria { + CR0, + CR1, + CR2, + CR3, +}; + /* * Flags used in mballoc's allocation_context flags field. 
* @@ -1544,9 +1561,9 @@ struct ext4_sb_info { atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_cr0_bad_suggestions; atomic_t s_bal_cr1_bad_suggestions; - atomic64_t s_bal_cX_groups_considered[4]; - atomic64_t s_bal_cX_hits[4]; - atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; + atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8a3946f1bdd9..4359280d2fa7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -154,19 +154,19 @@ * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * - * At CR = 0, we look for groups which have the largest_free_order >= the order + * At CR0 , we look for groups which have the largest_free_order >= the order * of the request. We directly look at the largest free order list in the data * structure (1) above where largest_free_order = order of the request. If that * list is empty, we look at remaining list in the increasing order of - * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * largest_free_order. This allows us to perform CR0 lookup in O(1) time. * - * At CR = 1, we only consider groups where average fragment size > request + * At CR1, we only consider groups where average fragment size > request * size. So, we lookup a group which has average fragment size just above or * equal to request size using our average fragment size group lists (data * structure 2) in O(1) time. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in - * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * linear order which requires O(N) search time for each CR0 and CR1 phase. * * The regular allocator (using the buddy cache) supports a few tunables. * @@ -409,7 +409,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr); + ext4_group_t group, enum criteria cr); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, @@ -859,7 +859,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * cr level needs an update. 
*/ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter, *grp; @@ -884,8 +884,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR0]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR0))) { grp = iter; break; } @@ -897,7 +897,7 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (!grp) { /* Increment cr and search again */ - *new_cr = 1; + *new_cr = CR1; } else { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; @@ -909,7 +909,7 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL, *iter; @@ -932,8 +932,8 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], bb_avg_fragment_size_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR1]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR1))) { grp = iter; break; } @@ -947,7 +947,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { - *new_cr = 2; + *new_cr = CR2; } } @@ -955,7 +955,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; - if (ac->ac_criteria >= 2) + if (ac->ac_criteria >= CR2) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; @@ -1000,7 +1000,7 @@ next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) * @ngroups Total number of groups */ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, - int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { *new_cr = ac->ac_criteria; @@ -1009,9 +1009,9 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, return; } - if (*new_cr == 0) { + if (*new_cr == CR0) { ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); - } else if (*new_cr == 1) { + } else if (*new_cr == CR1) { ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); } else { /* @@ -2408,13 +2408,13 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, * for the allocation or not. 
*/ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - BUG_ON(cr < 0 || cr >= 4); + BUG_ON(cr < CR0 || cr >= EXT4_MB_NUM_CRS); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; @@ -2428,7 +2428,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; switch (cr) { - case 0: + case CR0: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ @@ -2447,15 +2447,15 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; return true; - case 1: + case CR1: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; - case 2: + case CR2: if (free >= ac->ac_g_ex.fe_len) return true; break; - case 3: + case CR3: return true; default: BUG(); @@ -2476,7 +2476,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, - ext4_group_t group, int cr) + ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; @@ -2496,7 +2496,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= 2 && free < ac->ac_g_ex.fe_len) + if (cr <= CR2 && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2511,7 +2511,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_get_group_desc(sb, group, NULL); int ret; - /* cr=0/1 is a very optimistic search to find large + /* cr=CR0/CR1 is a very optimistic search to find large * good chunks almost for free. If buddy data is not * ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, @@ -2519,7 +2519,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * and we want to make sure we locate metadata blocks * in the first block group in the flex_bg if possible. */ - if (cr < 2 && + if (cr < CR2 && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2625,7 +2625,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - int cr = -1, new_cr; + enum criteria cr, new_cr; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2683,13 +2683,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? 0 : 1; + cr = ac->ac_2order ? 
CR0 : CR1; /* - * cr == 0 try to get exact allocation, - * cr == 3 try to get anything + * cr == CR0 try to get exact allocation, + * cr == CR3 try to get anything */ repeat: - for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { + for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; /* * searching for the right group start @@ -2716,7 +2716,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > 1 || + (cr > CR1 || prefetch_ios < sbi->s_mb_prefetch_limit)) { unsigned int curr_ios = prefetch_ios; @@ -2758,9 +2758,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } ac->ac_groups_scanned++; - if (cr == 0) + if (cr == CR0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && sbi->s_stripe && + else if (cr == CR1 && sbi->s_stripe && !(ac->ac_g_ex.fe_len % EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); @@ -2801,7 +2801,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = 3; + cr = CR3; goto repeat; } } @@ -2926,36 +2926,36 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); seq_puts(seq, "\tcr0_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[0])); + atomic64_read(&sbi->s_bal_cX_failed[CR0])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr0_bad_suggestions)); seq_puts(seq, "\tcr1_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[1])); + atomic64_read(&sbi->s_bal_cX_failed[CR1])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr1_bad_suggestions)); seq_puts(seq, "\tcr2_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[2])); + atomic64_read(&sbi->s_bal_cX_failed[CR2])); seq_puts(seq, "\tcr3_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[3])); + atomic64_read(&sbi->s_bal_cX_failed[CR3])); seq_printf(seq, 
"\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 54cd509ced0f..51eab82e897c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -120,6 +120,18 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) +TRACE_DEFINE_ENUM(CR0); +TRACE_DEFINE_ENUM(CR1); +TRACE_DEFINE_ENUM(CR2); +TRACE_DEFINE_ENUM(CR3); + +#define show_criteria(cr) \ + __print_symbolic(cr, \ + { CR0, "CR0" }, \ + { CR1, "CR1" }, \ + { CR2, "CR2" }, \ + { CR3, "CR3" }) + TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), @@ -1063,7 +1075,7 @@ TRACE_EVENT(ext4_mballoc_alloc, ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " - "result %u/%d/%u@%u blks %u grps %u cr %u flags %s " + "result %u/%d/%u@%u blks %u grps %u cr %s flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, @@ -1073,7 +1085,7 @@ TRACE_EVENT(ext4_mballoc_alloc, __entry->goal_len, __entry->goal_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical, - __entry->found, __entry->groups, __entry->cr, + __entry->found, __entry->groups, show_criteria(__entry->cr), show_mballoc_flags(__entry->flags), __entry->tail, __entry->buddy ? 1 << __entry->buddy : 0) ); From fdd9a00943a5f6687937628e01506dad697c9140 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:43 +0530 Subject: [PATCH 20/55] ext4: Add per CR extent scanned counter This gives better visibility into the number of extents scanned in each particular CR. For example, this information can be used to see how out block group scanning logic is performing when the BG is fragmented. 
Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/55bb6d80f6e22ed2a5a830aa045572bdffc8b1b9.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 12 ++++++++++++ fs/ext4/mballoc.h | 1 + 3 files changed, 14 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 914475cbb060..34ec65126ea7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1555,6 +1555,7 @@ struct ext4_sb_info { atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_breaks; /* too long searches */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4359280d2fa7..e28731426ee5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2103,6 +2103,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first @@ -2277,6 +2278,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, break; } ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; @@ -2392,6 +2394,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; + ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); @@ -2929,6 +2932,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR0])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR0])); seq_printf(seq, "\t\tbad_suggestions: %u\n", @@ -2938,6 +2942,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR1])); seq_printf(seq, "\t\tbad_suggestions: %u\n", @@ -2947,6 +2952,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR2])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR2])); @@ -2954,6 +2960,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); + seq_printf(seq, "\t\textents_scanned: %u\n", 
atomic_read(&sbi->s_bal_cX_ex_scanned[CR3])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR3])); seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); @@ -4393,7 +4400,12 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); + atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + for (int i=0; i<EXT4_MB_NUM_CRS; i++) { + atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); + } + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 24b666e558f1..acfdc204e15d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -184,6 +184,7 @@ struct ext4_allocation_context { __u16 ac_groups_scanned; __u16 ac_groups_linear_remaining; __u16 ac_found; + __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; __u16 ac_buddy; __u8 ac_status; From 3ef5d263879696027c70548532a94418aad3bd95 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:44 +0530 Subject: [PATCH 21/55] ext4: Add counter to track successful allocation of goal length Track number of allocations where the length of blocks allocated is equal to the length of goal blocks (post normalization). This metric could be useful if making changes to the allocator logic in the future as it could give us visibility into how often we trim our requests. PS: ac_b_ex.fe_len might get modified due to preallocation efforts and hence we use ac_f_ex.fe_len instead since we want to compare how much the allocator was able to actually find. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/343620e2be8a237239ea2613a7a866ee8607e973.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 34ec65126ea7..e4f53947f51f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1558,6 +1558,7 @@ struct ext4_sb_info { atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_cr0_bad_suggestions; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e28731426ee5..64d3760e4bcf 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2965,6 +2965,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic64_read(&sbi->s_bal_cX_failed[CR3])); seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); @@ -4410,6 +4411,8 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); + if (ac->ac_f_ex.fe_len ==
ac->ac_g_ex.fe_len) + atomic_inc(&sbi->s_bal_len_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } From 1b420011210802a7a1b1e99f30bc1d62c352ac71 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:45 +0530 Subject: [PATCH 22/55] ext4: Avoid scanning smaller extents in BG during CR1 When we are inside ext4_mb_complex_scan_group() in CR1, we can be sure that this group has atleast 1 big enough continuous free extent to satisfy our request because (free / fragments) > goal length. Hence, instead of wasting time looping over smaller free extents, only try to consider the free extent if we are sure that it has enough continuous free space to satisfy goal length. This is particularly useful when scanning highly fragmented BGs in CR1 as, without this patch, the allocator might stop scanning early before reaching the big enough free extent (due to ac_found > mb_max_to_scan) which causes us to uncessarily trim the request. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/a5473df4517c53ec940bc9b603ef83a547032a32.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 64d3760e4bcf..704f8e5608f7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2307,7 +2307,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; - int i; + int i, j, freelen; int free; free = e4b->bd_info->bb_free; @@ -2334,6 +2334,23 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } + if (ac->ac_criteria < CR2) { + /* + * In CR1, we are sure that this group will + * have a large enough continuous free extent, so skip + * over the smaller free extents + */ + j = mb_find_next_bit(bitmap, + EXT4_CLUSTERS_PER_GROUP(sb), i); + freelen = j - i; + + if (freelen < ac->ac_g_ex.fe_len) { + i = j; + free -= freelen; + continue; + } + } + mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; From 3c6296046c85333bc52555a670a9093d9e2657bb Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:46 +0530 Subject: [PATCH 23/55] ext4: Don't skip prefetching BLOCK_UNINIT groups Currently, ext4_mb_prefetch() and ext4_mb_prefetch_fini() skip BLOCK_UNINIT groups since fetching their bitmaps doesn't need disk IO. As a consequence, we end not initializing the buddy structures and CR0/1 lists for these BGs, even though it can be done without any disk IO overhead. Hence, don't skip such BGs during prefetch and prefetch_fini. This improves the accuracy of CR0/1 allocation as earlier, we could have essentially empty BLOCK_UNINIT groups being ignored by CR0/1 due to their buddy not being initialized, leading to slower CR2 allocations. With this patch CR0/1 will be able to discover these groups as well, thus improving performance. 
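As background on why this prefetch is free of IO: the block bitmap of a BLOCK_UNINIT group is fully determined by the group's static layout (its own metadata blocks are the only ones in use), so it can be built entirely in memory. The snippet below is only a simplified userspace illustration of that idea, not the kernel's ext4_init_block_bitmap():

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define BLOCKS_PER_GROUP 64	/* toy size; real groups are far larger */

	/*
	 * For a BLOCK_UNINIT group the on-disk bitmap was never written: every
	 * block is free except the group's own metadata, whose location is
	 * known from the filesystem geometry, so the bitmap can be synthesized
	 * in memory without reading anything from disk.
	 */
	static void build_uninit_bitmap(uint8_t *bitmap, unsigned int meta_blocks)
	{
		memset(bitmap, 0, BLOCKS_PER_GROUP / 8);
		for (unsigned int b = 0; b < meta_blocks; b++)
			bitmap[b / 8] |= 1u << (b % 8);	/* metadata in use */
	}

	int main(void)
	{
		uint8_t bitmap[BLOCKS_PER_GROUP / 8];

		build_uninit_bitmap(bitmap, 4);		/* pretend 4 metadata blocks */
		for (unsigned int b = 0; b < BLOCKS_PER_GROUP; b++)
			putchar(bitmap[b / 8] & (1u << (b % 8)) ? 'X' : '.');
		putchar('\n');
		return 0;
	}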
Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/dc3130b8daf45ffe63d8a3c1edcf00eb8ba70e1f.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 704f8e5608f7..fb9780bb1b54 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2589,9 +2589,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && - ext4_free_group_clusters(sb, gdp) > 0 && - !(ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (bh && !IS_ERR(bh)) { if (!buffer_uptodate(bh) && cnt) @@ -2632,9 +2630,7 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && - ext4_free_group_clusters(sb, gdp) > 0 && - !(ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) { + ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } From 4f3d1e4533b0982034f316ace85415d3bc57e3da Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:47 +0530 Subject: [PATCH 24/55] ext4: Ensure ext4_mb_prefetch_fini() is called for all prefetched BGs Before this patch, the call stack in ext4_run_li_request is as follows: /* * nr = no. of BGs we want to fetch (=s_mb_prefetch) * prefetch_ios = no. of BGs not uptodate after * ext4_read_block_bitmap_nowait() */ next_group = ext4_mb_prefetch(sb, group, nr, prefetch_ios); ext4_mb_prefetch_fini(sb, next_group prefetch_ios); ext4_mb_prefetch_fini() will only try to initialize buddies for BGs in range [next_group - prefetch_ios, next_group). This is incorrect since sometimes (prefetch_ios < nr), which causes ext4_mb_prefetch_fini() to incorrectly ignore some of the BGs that might need initialization. This issue is more notable now with the previous patch enabling "fetching" of BLOCK_UNINIT BGs which are marked buffer_uptodate by default. Fix this by passing nr to ext4_mb_prefetch_fini() instead of prefetch_ios so that it considers the right range of groups. Similarly, make sure we don't pass nr=0 to ext4_mb_prefetch_fini() in ext4_mb_regular_allocator() since we might have prefetched BLOCK_UNINIT groups that would need buddy initialization. 
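To make the off-by-range concrete, here is a small worked example with made-up numbers (nr = 32 groups prefetched, of which only prefetch_ios = 5 needed actual reads). It only illustrates the interval handed to ext4_mb_prefetch_fini(); it is not kernel code:

	#include <stdio.h>

	int main(void)
	{
		unsigned int group = 0;		/* first group handed to ext4_mb_prefetch() */
		unsigned int nr = 32;		/* groups requested (s_mb_prefetch)         */
		unsigned int prefetch_ios = 5;	/* of those, only 5 needed disk IO          */
		unsigned int next_group = group + nr;

		/*
		 * Old behaviour: only the last prefetch_ios groups were handed to
		 * ext4_mb_prefetch_fini(), so groups 0..26 never got their buddy
		 * structures initialized.
		 */
		printf("old fini range: [%u, %u)\n", next_group - prefetch_ios, next_group);

		/* Fixed behaviour: every prefetched group is considered. */
		printf("new fini range: [%u, %u)\n", next_group - nr, next_group);
		return 0;
	}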
Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/05e648ae04ec5b754207032823e9c1de9a54f87a.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 4 ---- fs/ext4/super.c | 11 ++++------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index fb9780bb1b54..d2a0259107c2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2734,8 +2734,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) if ((prefetch_grp == group) && (cr > CR1 || prefetch_ios < sbi->s_mb_prefetch_limit)) { - unsigned int curr_ios = prefetch_ios; - nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { nr = 1 << sbi->s_log_groups_per_flex; @@ -2744,8 +2742,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); - if (prefetch_ios == curr_ios) - nr = 0; } /* This now checks without needing the buddy page */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 39591fb78c1d..7f9b087b9b20 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3692,16 +3692,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ext4_group_t group = elr->lr_next_group; unsigned int prefetch_ios = 0; int ret = 0; + int nr = EXT4_SB(sb)->s_mb_prefetch; u64 start_time; if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) { - elr->lr_next_group = ext4_mb_prefetch(sb, group, - EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios); - if (prefetch_ios) - ext4_mb_prefetch_fini(sb, elr->lr_next_group, - prefetch_ios); - trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, - prefetch_ios); + elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); + ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr); + trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr); if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && From 856d865c178b4fbf4c629d5a7d0df9352d123280 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:48 +0530 Subject: [PATCH 25/55] ext4: Abstract out logic to search average fragment list Make the logic of searching average fragment list of a given order reusable by abstracting it out to a differnet function. This will also avoid code duplication in upcoming patches. No functional changes. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/028c11d95b17ce0285f45456709a0ca922df1b83.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 51 ++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d2a0259107c2..7fc99b209546 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -904,6 +904,37 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, } } +/* + * Find a suitable group of given order from the average fragments list. 
+ */ +static struct ext4_group_info * +ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; + rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; + struct ext4_group_info *grp = NULL, *iter; + enum criteria cr = ac->ac_criteria; + + if (list_empty(frag_list)) + return NULL; + read_lock(frag_list_lock); + if (list_empty(frag_list)) { + read_unlock(frag_list_lock); + return NULL; + } + list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { + grp = iter; + break; + } + } + read_unlock(frag_list_lock); + return grp; +} + /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. @@ -912,7 +943,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *grp = NULL, *iter; + struct ext4_group_info *grp = NULL; int i; if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { @@ -922,23 +953,7 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { - if (list_empty(&sbi->s_mb_avg_fragment_size[i])) - continue; - read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); - if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { - read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); - continue; - } - list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], - bb_avg_fragment_size_node) { - if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[CR1]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, CR1))) { - grp = iter; - break; - } - } - read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); + grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); if (grp) break; } From 7e170922f06bf46effa7c57f6035fc463d6edc7e Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:49 +0530 Subject: [PATCH 26/55] ext4: Add allocation criteria 1.5 (CR1_5) CR1_5 aims to optimize allocations which can't be satisfied in CR1. The fact that we couldn't find a group in CR1 suggests that it would be difficult to find a continuous extent to completely satisfy our allocations. So before falling to the slower CR2, in CR1.5 we proactively trim the preallocations so we can find a group with (free / fragments) big enough. This speeds up our allocation at the cost of slightly reduced preallocation. The patch also adds a new sysfs tunable: * /sys/fs/ext4/<dev>/mb_cr1_5_max_trim_order This controls how much CR1.5 can trim a request before falling to CR2. For example, for a request of order 7 and max trim order 2, CR1.5 can trim this upto order 5.
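A short worked example of that trimming bound, in plain userspace C with illustrative numbers (the kernel derives the starting order with fls() and additionally rounds the goal up to a stripe multiple, both omitted here):

	#include <stdio.h>

	int main(void)
	{
		int order = 7;			/* goal request of 2^7 = 128 clusters      */
		int max_trim_order = 2;		/* the new mb_cr1_5_max_trim_order tunable */
		int min_order = order - max_trim_order;

		if (min_order < 0)
			min_order = 0;

		/*
		 * CR1.5 retries the CR1-style average-fragment-size lookup with
		 * successively smaller goals instead of dropping straight to the
		 * linear CR2 scan: here 128, 64 and finally 32 clusters (order 5).
		 */
		for (int i = order; i >= min_order; i--)
			printf("try goal of %d clusters (order %d)\n", 1 << i, i);
		return 0;
	}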
Suggested-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/150fdf65c8e4cc4dba71e020ce0859bcf636a5ff.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 8 ++- fs/ext4/mballoc.c | 135 +++++++++++++++++++++++++++++++++--- fs/ext4/mballoc.h | 13 ++++ fs/ext4/sysfs.c | 2 + include/trace/events/ext4.h | 2 + 5 files changed, 150 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e4f53947f51f..fb9d2e2a9e42 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -133,13 +133,14 @@ enum SHIFT_DIRECTION { * criteria the slower the allocation. We start at lower criterias and keep * falling back to higher ones if we are not able to find any blocks. */ -#define EXT4_MB_NUM_CRS 4 +#define EXT4_MB_NUM_CRS 5 /* * All possible allocation criterias for mballoc */ enum criteria { CR0, CR1, + CR1_5, CR2, CR3, }; @@ -185,6 +186,9 @@ enum criteria { #define EXT4_MB_CR0_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ #define EXT4_MB_CR1_OPTIMIZED 0x00010000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +#define EXT4_MB_CR1_5_OPTIMIZED 0x00020000 + struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1549,6 +1553,7 @@ struct ext4_sb_info { unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; + unsigned int s_mb_cr1_5_max_trim_order; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1563,6 +1568,7 @@ struct ext4_sb_info { atomic_t s_bal_2orders; /* 2^order hits */ atomic_t s_bal_cr0_bad_suggestions; atomic_t s_bal_cr1_bad_suggestions; + atomic_t s_bal_cr1_5_bad_suggestions; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7fc99b209546..cc9ad7469fbb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -165,6 +165,14 @@ * equal to request size using our average fragment size group lists (data * structure 2) in O(1) time. * + * At CR1.5 (aka CR1_5), we aim to optimize allocations which can't be satisfied + * in CR1. The fact that we couldn't find a group in CR1 suggests that there is + * no BG that has average fragment size > goal length. So before falling to the + * slower CR2, in CR1.5 we proactively trim goal length and then use the same + * fragment lists as CR1 to find a BG with a big enough average fragment size. + * This increases the chances of finding a suitable block group in O(1) time and + * results * in faster allocation at the cost of reduced size of allocation. + * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR0 and CR1 phase. * @@ -962,6 +970,91 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; } else { + *new_cr = CR1_5; + } +} + +/* + * We couldn't find a group in CR1 so try to find the highest free fragment + * order we have and proactively trim the goal request length to that order to + * find a suitable group faster. + * + * This optimizes allocation speed at the cost of slightly reduced + * preallocations. 
However, we make sure that we don't trim the request too + * much and fall to CR2 in that case. + */ +static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, + enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *grp = NULL; + int i, order, min_order; + unsigned long num_stripe_clusters = 0; + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_5_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_5_bad_suggestions); + } + + /* + * mb_avg_fragment_size_order() returns order in a way that makes + * retrieving back the length using (1 << order) inaccurate. Hence, use + * fls() instead since we need to know the actual length while modifying + * goal length. + */ + order = fls(ac->ac_g_ex.fe_len); + min_order = order - sbi->s_mb_cr1_5_max_trim_order; + if (min_order < 0) + min_order = 0; + + if (1 << min_order < ac->ac_o_ex.fe_len) + min_order = fls(ac->ac_o_ex.fe_len) + 1; + + if (sbi->s_stripe > 0) { + /* + * We are assuming that stripe size is always a multiple of + * cluster ratio otherwise __ext4_fill_super exists early. + */ + num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); + if (1 << min_order < num_stripe_clusters) + min_order = fls(num_stripe_clusters); + } + + for (i = order; i >= min_order; i--) { + int frag_order; + /* + * Scale down goal len to make sure we find something + * in the free fragments list. Basically, reduce + * preallocations. + */ + ac->ac_g_ex.fe_len = 1 << i; + + if (num_stripe_clusters > 0) { + /* + * Try to round up the adjusted goal to stripe size + * (in cluster units) multiple for efficiency. + * + * XXX: Is s->stripe always a power of 2? In that case + * we can use the faster round_up() variant. 
+ */ + ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, + num_stripe_clusters); + } + + frag_order = mb_avg_fragment_size_order(ac->ac_sb, + ac->ac_g_ex.fe_len); + + grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); + if (grp) + break; + } + + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_5_OPTIMIZED; + } else { + /* Reset goal length to original goal length before falling into CR2 */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; *new_cr = CR2; } } @@ -1028,6 +1121,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); } else if (*new_cr == CR1) { ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + } else if (*new_cr == CR1_5) { + ext4_mb_choose_next_group_cr1_5(ac, new_cr, group, ngroups); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -2351,7 +2446,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, if (ac->ac_criteria < CR2) { /* - * In CR1, we are sure that this group will + * In CR1 and CR1_5, we are sure that this group will * have a large enough continuous free extent, so skip * over the smaller free extents */ @@ -2483,6 +2578,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return true; case CR1: + case CR1_5: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; @@ -2747,7 +2843,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > CR1 || + (cr > CR1_5 || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { @@ -2787,7 +2883,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_groups_scanned++; if (cr == CR0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == CR1 && sbi->s_stripe && + else if ((cr == CR1 || cr == CR1_5) && sbi->s_stripe && !(ac->ac_g_ex.fe_len % EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); @@ -2803,6 +2899,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); + + if (i == ngroups && ac->ac_criteria == CR1_5) + /* Reset goal length to original goal length before + * falling into CR2 */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2972,6 +3073,16 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_cr1_bad_suggestions)); + seq_puts(seq, "\tcr1.5_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1_5])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[CR1_5])); + seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1_5])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[CR1_5])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr1_5_bad_suggestions)); + seq_puts(seq, "\tcr2_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); seq_printf(seq, "\t\tgroups_considered: %llu\n", @@ -3489,6 +3600,8 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = 
MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_cr1_5_max_trim_order = MB_DEFAULT_CR1_5_TRIM_ORDER; + /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file @@ -4392,6 +4505,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && @@ -4435,8 +4549,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); - if (ac->ac_f_ex.fe_len == ac->ac_g_ex.fe_len) + /* did we allocate as much as normalizer originally wanted? */ + if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) atomic_inc(&sbi->s_bal_len_goals); + if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } @@ -4921,7 +5037,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; - if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { + if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { int new_bex_start; int new_bex_end; @@ -4936,14 +5052,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * - * 1. Check if best ex can be kept at end of goal and still - * cover original start + * 1. Check if best ex can be kept at end of goal (before + * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original start * 3. Else, keep the best ex at start of original request. */ new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_g_ex.fe_len); + EXT4_C2B(sbi, ac->ac_orig_goal_len); new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (ac->ac_o_ex.fe_logical >= new_bex_start) goto adjust_bex; @@ -4964,7 +5080,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); + EXT4_C2B(sbi, ac->ac_orig_goal_len))); } pa->pa_lstart = ac->ac_b_ex.fe_logical; @@ -5584,6 +5700,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; + ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index acfdc204e15d..bddc0335c261 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -85,6 +85,13 @@ */ #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 +/* + * The maximum order upto which CR1.5 can trim a particular allocation request. + * Example, if we have an order 7 request and max trim order of 3, CR1.5 can + * trim this upto order 4. + */ +#define MB_DEFAULT_CR1_5_TRIM_ORDER 3 + /* * Number of valid buddy orders */ @@ -179,6 +186,12 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; + /* + * goal len can change in CR1.5, so save the original len. This is + * used while adjusting the PA window and for accounting. 
+ */ + ext4_grpblk_t ac_orig_goal_len; + __u32 ac_groups_considered; __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 3042bc605bbf..4a5c08c8dddb 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(mb_cr1_5_max_trim_order, s_mb_cr1_5_max_trim_order); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -273,6 +274,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(mb_cr1_5_max_trim_order), ATTR_LIST(errors_count), ATTR_LIST(warning_count), ATTR_LIST(msg_count), diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 51eab82e897c..2a3ffa081d2c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -122,6 +122,7 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); TRACE_DEFINE_ENUM(CR0); TRACE_DEFINE_ENUM(CR1); +TRACE_DEFINE_ENUM(CR1_5); TRACE_DEFINE_ENUM(CR2); TRACE_DEFINE_ENUM(CR3); @@ -129,6 +130,7 @@ TRACE_DEFINE_ENUM(CR3); __print_symbolic(cr, \ { CR0, "CR0" }, \ { CR1, "CR1" }, \ + { CR1_5, "CR1.5" } \ { CR2, "CR2" }, \ { CR3, "CR3" }) From f52f3d2b9fbab73c776f4d3386393e9bbc83b87d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 30 May 2023 18:03:50 +0530 Subject: [PATCH 27/55] ext4: Give symbolic names to mballoc criterias mballoc criterias have historically been called by numbers like CR0, CR1... however this makes it confusing to understand what each criteria is about. Change these criterias from numbers to symbolic names and add relevant comments. While we are at it, also reformat and add some comments to ext4_seq_mb_stats_show() for better readability. Additionally, define CR_FAST which signifies the criteria below which we can make quicker decisions like: * quitting early if (free block < requested len) * avoiding to scan free extents smaller than required len. * avoiding to initialize buddy cache and work with existing cache * limiting prefetches Suggested-by: Jan Kara Signed-off-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/a2dc6ec5aea5e5e68cf8e788c2a964ffead9c8b0.1685449706.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 55 ++++++-- fs/ext4/mballoc.c | 265 ++++++++++++++++++++---------------- fs/ext4/mballoc.h | 8 +- fs/ext4/sysfs.c | 4 +- include/trace/events/ext4.h | 24 ++-- 5 files changed, 210 insertions(+), 146 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb9d2e2a9e42..6a1f013d23f7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -135,16 +135,45 @@ enum SHIFT_DIRECTION { */ #define EXT4_MB_NUM_CRS 5 /* - * All possible allocation criterias for mballoc + * All possible allocation criterias for mballoc. Lower are faster. */ enum criteria { - CR0, - CR1, - CR1_5, - CR2, - CR3, + /* + * Used when number of blocks needed is a power of 2. This doesn't + * trigger any disk IO except prefetch and is the fastest criteria. + */ + CR_POWER2_ALIGNED, + + /* + * Tries to lookup in-memory data structures to find the most suitable + * group that satisfies goal request. No disk IO except block prefetch. 
+ */ + CR_GOAL_LEN_FAST, + + /* + * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal length to + * the best available length for faster allocation. + */ + CR_BEST_AVAIL_LEN, + + /* + * Reads each block group sequentially, performing disk IO if necessary, to + * find find_suitable block group. Tries to allocate goal length but might trim + * the request if nothing is found after enough tries. + */ + CR_GOAL_LEN_SLOW, + + /* + * Finds the first free set of blocks and allocates those. This is only + * used in rare cases when CR_GOAL_LEN_SLOW also fails to allocate + * anything. + */ + CR_ANY_FREE, }; +/* criteria below which we use fast block scanning and avoid unnecessary IO */ +#define CR_FAST CR_GOAL_LEN_SLOW + /* * Flags used in mballoc's allocation_context flags field. * @@ -183,11 +212,11 @@ enum criteria { /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 /* Large fragment size list lookup succeeded at least once for cr = 0 */ -#define EXT4_MB_CR0_OPTIMIZED 0x8000 +#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ -#define EXT4_MB_CR1_OPTIMIZED 0x00010000 +#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 /* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ -#define EXT4_MB_CR1_5_OPTIMIZED 0x00020000 +#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -1553,7 +1582,7 @@ struct ext4_sb_info { unsigned long s_mb_last_start; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; - unsigned int s_mb_cr1_5_max_trim_order; + unsigned int s_mb_best_avail_max_trim_order; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1566,9 +1595,9 @@ struct ext4_sb_info { atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - atomic_t s_bal_cr0_bad_suggestions; - atomic_t s_bal_cr1_bad_suggestions; - atomic_t s_bal_cr1_5_bad_suggestions; + atomic_t s_bal_p2_aligned_bad_suggestions; + atomic_t s_bal_goal_fast_bad_suggestions; + atomic_t s_bal_best_avail_bad_suggestions; atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cc9ad7469fbb..74ebe31f8d0f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -154,27 +154,31 @@ * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * - * At CR0 , we look for groups which have the largest_free_order >= the order - * of the request. We directly look at the largest free order list in the data - * structure (1) above where largest_free_order = order of the request. If that - * list is empty, we look at remaining list in the increasing order of - * largest_free_order. This allows us to perform CR0 lookup in O(1) time. + * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order + * >= the order of the request. We directly look at the largest free order list + * in the data structure (1) above where largest_free_order = order of the + * request. If that list is empty, we look at remaining list in the increasing + * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED + * lookup in O(1) time. 
* - * At CR1, we only consider groups where average fragment size > request - * size. So, we lookup a group which has average fragment size just above or - * equal to request size using our average fragment size group lists (data - * structure 2) in O(1) time. + * At CR_GOAL_LEN_FAST, we only consider groups where + * average fragment size > request size. So, we lookup a group which has average + * fragment size just above or equal to request size using our average fragment + * size group lists (data structure 2) in O(1) time. * - * At CR1.5 (aka CR1_5), we aim to optimize allocations which can't be satisfied - * in CR1. The fact that we couldn't find a group in CR1 suggests that there is - * no BG that has average fragment size > goal length. So before falling to the - * slower CR2, in CR1.5 we proactively trim goal length and then use the same - * fragment lists as CR1 to find a BG with a big enough average fragment size. - * This increases the chances of finding a suitable block group in O(1) time and - * results * in faster allocation at the cost of reduced size of allocation. + * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied + * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in + * CR_GOAL_LEN_FAST suggests that there is no BG that has avg + * fragment size > goal length. So before falling to the slower + * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and + * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big + * enough average fragment size. This increases the chances of finding a + * suitable block group in O(1) time and results in faster allocation at the + * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in - * linear order which requires O(N) search time for each CR0 and CR1 phase. + * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and + * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * @@ -359,8 +363,8 @@ * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) - * - cr0 lists lock (cr0) - * - cr1 tree lock (cr1) + * - cr_power2_aligned lists lock (cr_power2_aligned) + * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa @@ -392,7 +396,7 @@ * * - allocation path (ext4_mb_regular_allocator) * group - * cr0/cr1 + * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; @@ -866,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. 
*/ -static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, +static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); @@ -876,8 +880,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (ac->ac_status == AC_STATUS_FOUND) return; - if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) - atomic_inc(&sbi->s_bal_cr0_bad_suggestions); + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) + atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { @@ -892,8 +896,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) - atomic64_inc(&sbi->s_bal_cX_groups_considered[CR0]); - if (likely(ext4_mb_good_group(ac, iter->bb_group, CR0))) { + atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { grp = iter; break; } @@ -905,10 +909,10 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, if (!grp) { /* Increment cr and search again */ - *new_cr = CR1; + *new_cr = CR_GOAL_LEN_FAST; } else { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; } } @@ -947,16 +951,16 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ -static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, +static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; int i; - if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_cr1_bad_suggestions); + atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); } for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); @@ -968,22 +972,22 @@ static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, if (grp) { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; } else { - *new_cr = CR1_5; + *new_cr = CR_BEST_AVAIL_LEN; } } /* - * We couldn't find a group in CR1 so try to find the highest free fragment + * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too - * much and fall to CR2 in that case. + * much and fall to CR_GOAL_LEN_SLOW in that case. 
*/ -static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, +static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); @@ -991,9 +995,9 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, int i, order, min_order; unsigned long num_stripe_clusters = 0; - if (unlikely(ac->ac_flags & EXT4_MB_CR1_5_OPTIMIZED)) { + if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { if (sbi->s_mb_stats) - atomic_inc(&sbi->s_bal_cr1_5_bad_suggestions); + atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); } /* @@ -1003,7 +1007,7 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, * goal length. */ order = fls(ac->ac_g_ex.fe_len); - min_order = order - sbi->s_mb_cr1_5_max_trim_order; + min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; @@ -1051,11 +1055,11 @@ static void ext4_mb_choose_next_group_cr1_5(struct ext4_allocation_context *ac, if (grp) { *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR1_5_OPTIMIZED; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; } else { - /* Reset goal length to original goal length before falling into CR2 */ + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR2; + *new_cr = CR_GOAL_LEN_SLOW; } } @@ -1063,7 +1067,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; - if (ac->ac_criteria >= CR2) + if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; @@ -1117,12 +1121,12 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, return; } - if (*new_cr == CR0) { - ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); - } else if (*new_cr == CR1) { - ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); - } else if (*new_cr == CR1_5) { - ext4_mb_choose_next_group_cr1_5(ac, new_cr, group, ngroups); + if (*new_cr == CR_POWER2_ALIGNED) { + ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups); + } else if (*new_cr == CR_GOAL_LEN_FAST) { + ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups); + } else if (*new_cr == CR_BEST_AVAIL_LEN) { + ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups); } else { /* * TODO: For CR=2, we can arrange groups in an rb tree sorted by @@ -2444,11 +2448,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - if (ac->ac_criteria < CR2) { + if (ac->ac_criteria < CR_FAST) { /* - * In CR1 and CR1_5, we are sure that this group will - * have a large enough continuous free extent, so skip - * over the smaller free extents + * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are + * sure that this group will have a large enough + * continuous free extent, so skip over the smaller free + * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); @@ -2544,7 +2549,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); - BUG_ON(cr < CR0 || cr >= EXT4_MB_NUM_CRS); + BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) return false; 
@@ -2558,7 +2563,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; switch (cr) { - case CR0: + case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ @@ -2577,16 +2582,16 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, return false; return true; - case CR1: - case CR1_5: + case CR_GOAL_LEN_FAST: + case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; - case CR2: + case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; - case CR3: + case CR_ANY_FREE: return true; default: BUG(); @@ -2627,7 +2632,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR2 && free < ac->ac_g_ex.fe_len) + if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2642,15 +2647,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_get_group_desc(sb, group, NULL); int ret; - /* cr=CR0/CR1 is a very optimistic search to find large - * good chunks almost for free. If buddy data is not - * ready, then this optimization makes no sense. But - * we never skip the first block group in a flex_bg, - * since this gets used for metadata block allocation, - * and we want to make sure we locate metadata blocks - * in the first block group in the flex_bg if possible. + /* + * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic + * search to find large good chunks almost for free. If buddy + * data is not ready, then this optimization makes no sense. But + * we never skip the first block group in a flex_bg, since this + * gets used for metadata block allocation, and we want to make + * sure we locate metadata blocks in the first block group in + * the flex_bg if possible. */ - if (cr < CR2 && + if (cr < CR_FAST && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2810,10 +2816,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? CR0 : CR1; + cr = ac->ac_2order ? 
CR_POWER2_ALIGNED : CR_GOAL_LEN_FAST; /* - * cr == CR0 try to get exact allocation, - * cr == CR3 try to get anything + * cr == CR_POWER2_ALIGNED try to get exact allocation, + * cr == CR_ANY_FREE try to get anything */ repeat: for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { @@ -2843,7 +2849,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr > CR1_5 || + (cr >= CR_FAST || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { @@ -2881,9 +2887,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) } ac->ac_groups_scanned++; - if (cr == CR0) + if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); - else if ((cr == CR1 || cr == CR1_5) && sbi->s_stripe && + else if ((cr == CR_GOAL_LEN_FAST || + cr == CR_BEST_AVAIL_LEN) && + sbi->s_stripe && !(ac->ac_g_ex.fe_len % EXT4_B2C(sbi, sbi->s_stripe))) ext4_mb_scan_aligned(ac, &e4b); @@ -2900,9 +2908,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); - if (i == ngroups && ac->ac_criteria == CR1_5) + if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) /* Reset goal length to original goal length before - * falling into CR2 */ + * falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; } @@ -2929,7 +2937,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; - cr = CR3; + cr = CR_ANY_FREE; goto repeat; } } @@ -3045,66 +3053,94 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); - seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + seq_puts( + seq, + "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); - seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); + seq_printf(seq, "\tgroups_scanned: %u\n", + atomic_read(&sbi->s_bal_groups_scanned)); - seq_puts(seq, "\tcr0_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR0])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR0])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR0])); + /* CR_POWER2_ALIGNED stats */ + seq_puts(seq, "\tcr_p2_aligned_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR0])); + atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr0_bad_suggestions)); + atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); - seq_puts(seq, "\tcr1_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1])); + /* CR_GOAL_LEN_FAST stats */ + 
seq_puts(seq, "\tcr_goal_fast_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR1])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1])); + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR1])); + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr1_bad_suggestions)); + atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); - seq_puts(seq, "\tcr1.5_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR1_5])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR1_5])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR1_5])); + /* CR_BEST_AVAIL_LEN stats */ + seq_puts(seq, "\tcr_best_avail_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR1_5])); + atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tbad_suggestions: %u\n", - atomic_read(&sbi->s_bal_cr1_5_bad_suggestions)); + atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); - seq_puts(seq, "\tcr2_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR2])); + /* CR_GOAL_LEN_SLOW stats */ + seq_puts(seq, "\tcr_goal_slow_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR2])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR2])); + atomic64_read( + &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR2])); + atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); - seq_puts(seq, "\tcr3_stats:\n"); - seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR3])); - seq_printf(seq, "\t\tgroups_considered: %llu\n", - atomic64_read(&sbi->s_bal_cX_groups_considered[CR3])); - seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR3])); + /* CR_ANY_FREE stats */ + seq_puts(seq, "\tcr_any_free_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", + atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); + seq_printf( + seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); + seq_printf(seq, "\t\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", - atomic64_read(&sbi->s_bal_cX_failed[CR3])); - seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); + 
atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); + + /* Aggregates */ + seq_printf(seq, "\textents_scanned: %u\n", + atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); - seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); + seq_printf(seq, "\t\tlen_goal_hits: %u\n", + atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); - seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); @@ -3112,8 +3148,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); - seq_printf(seq, "\tdiscarded: %u\n", - atomic_read(&sbi->s_mb_discarded)); + seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } @@ -3600,7 +3635,7 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_cr1_5_max_trim_order = MB_DEFAULT_CR1_5_TRIM_ORDER; + sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; /* * The default group preallocation is 512, which for 4k block diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index bddc0335c261..df6b5e7c2274 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -86,11 +86,11 @@ #define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 /* - * The maximum order upto which CR1.5 can trim a particular allocation request. - * Example, if we have an order 7 request and max trim order of 3, CR1.5 can - * trim this upto order 4. + * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular + * allocation request. Example, if we have an order 7 request and max trim order + * of 3, we can trim this request upto order 4. 
*/ -#define MB_DEFAULT_CR1_5_TRIM_ORDER 3 +#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3 /* * Number of valid buddy orders diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 4a5c08c8dddb..6d332dff79dd 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -223,7 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(mb_cr1_5_max_trim_order, s_mb_cr1_5_max_trim_order); +EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -274,7 +274,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), - ATTR_LIST(mb_cr1_5_max_trim_order), + ATTR_LIST(mb_best_avail_max_trim_order), ATTR_LIST(errors_count), ATTR_LIST(warning_count), ATTR_LIST(msg_count), diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 2a3ffa081d2c..65029dfb92fb 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -120,19 +120,19 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX); { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}, \ { EXT4_FC_REASON_ENCRYPTED_FILENAME, "ENCRYPTED_FILENAME"}) -TRACE_DEFINE_ENUM(CR0); -TRACE_DEFINE_ENUM(CR1); -TRACE_DEFINE_ENUM(CR1_5); -TRACE_DEFINE_ENUM(CR2); -TRACE_DEFINE_ENUM(CR3); +TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED); +TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST); +TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN); +TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW); +TRACE_DEFINE_ENUM(CR_ANY_FREE); -#define show_criteria(cr) \ - __print_symbolic(cr, \ - { CR0, "CR0" }, \ - { CR1, "CR1" }, \ - { CR1_5, "CR1.5" } \ - { CR2, "CR2" }, \ - { CR3, "CR3" }) +#define show_criteria(cr) \ + __print_symbolic(cr, \ + { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" }, \ + { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" }, \ + { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" }, \ + { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" }, \ + { CR_ANY_FREE, "CR_ANY_FREE" }) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), From de25d6e9610a8b30cce9bbb19b50615d02ebca02 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:35 +0800 Subject: [PATCH 28/55] ext4: only update i_reserved_data_blocks on successful block allocation In our fault injection test, we create an ext4 file, migrate it to non-extent based file, then punch a hole and finally trigger a WARN_ON in the ext4_da_update_reserve_space(): EXT4-fs warning (device sda): ext4_da_update_reserve_space:369: ino 14, used 11 with only 10 reserved data blocks When writing back a non-extent based file, if we enable delalloc, the number of reserved blocks will be subtracted from the number of blocks mapped by ext4_ind_map_blocks(), and the extent status tree will be updated. We update the extent status tree by first removing the old extent_status and then inserting the new extent_status. If the block range we remove happens to be in an extent, then we need to allocate another extent_status with ext4_es_alloc_extent(). 
use old to remove to add new |----------|------------|------------| old extent_status The problem is that the allocation of a new extent_status failed due to a fault injection, and __es_shrink() did not get free memory, resulting in a return of -ENOMEM. Then do_writepages() retries after receiving -ENOMEM, we map to the same extent again, and the number of reserved blocks is again subtracted from the number of blocks in that extent. Since the blocks in the same extent are subtracted twice, we end up triggering WARN_ON at ext4_da_update_reserve_space() because used > ei->i_reserved_data_blocks. For non-extent based file, we update the number of reserved blocks after ext4_ind_map_blocks() is executed, which causes a problem that when we call ext4_ind_map_blocks() to create a block, it doesn't always create a block, but we always reduce the number of reserved blocks. So we move the logic for updating reserved blocks to ext4_ind_map_blocks() to ensure that the number of reserved blocks is updated only after we do succeed in allocating some new blocks. Fixes: 5f634d064c70 ("ext4: Fix quota accounting error with fallocate") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/indirect.c | 8 ++++++++ fs/ext4/inode.c | 10 ---------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c68bebe7ff4b..a9f3716119d3 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -651,6 +651,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ext4_update_inode_fsync_trans(handle, inode, 1); count = ar.len; + + /* + * Update reserved blocks/metadata blocks after successful block + * allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, count, 1); + got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c2868282ad81..ef7ec2690b84 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -632,16 +632,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } - - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. We don't - * support fallocate for non extent files. So we can update - * reserve space here. - */ - if ((retval > 0) && - (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) - ext4_da_update_reserve_space(inode, retval, 1); } if (retval > 0) { From 9649eb18c6288f514cacffdd699d5cd999c2f8f6 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:36 +0800 Subject: [PATCH 29/55] ext4: add a new helper to check if es must be kept In the extent status tree, we have extents which we can just drop without issues and extents we must not drop - this depends on the extent's status - currently ext4_es_is_delayed() extents must stay, others may be dropped. A helper function is added to help determine if the current extent can be dropped, although only ext4_es_is_delayed() extents cannot be dropped currently. 
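[ Illustrative note, not part of the original patch: the effect of the helper on the shrinker can be sketched roughly as follows; the real helper and its callers are in the diff below.

	/* reclaim sketch: only entries that are not "must keep" may be dropped */
	if (ext4_es_must_keep(es))
		goto next;	/* e.g. delayed extents: fiemap, bigalloc and
				 * seek_data/hole still rely on this entry */
	/* otherwise the entry is only a cache and may be freed under memory pressure */
]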
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 595abb9e7d74..931520dfda4d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -446,6 +446,19 @@ static void ext4_es_list_del(struct inode *inode) spin_unlock(&sbi->s_es_lock); } +/* + * Returns true if we cannot fail to allocate memory for this extent_status + * entry and cannot reclaim it until its status changes. + */ +static inline bool ext4_es_must_keep(struct extent_status *es) +{ + /* fiemap, bigalloc, and seek_data/hole need to use it. */ + if (ext4_es_is_delayed(es)) + return true; + + return false; +} + static struct extent_status * ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) @@ -458,10 +471,8 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, es->es_len = len; es->es_pblk = pblk; - /* - * We don't count delayed extent because we never try to reclaim them - */ - if (!ext4_es_is_delayed(es)) { + /* We never try to reclaim a must kept extent, so we don't count it. */ + if (!ext4_es_must_keep(es)) { if (!EXT4_I(inode)->i_es_shk_nr++) ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> @@ -479,8 +490,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); - /* Decrease the shrink counter when this es is not delayed */ - if (!ext4_es_is_delayed(es)) { + /* Decrease the shrink counter when we can reclaim the extent. */ + if (!ext4_es_must_keep(es)) { BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); if (!--EXT4_I(inode)->i_es_shk_nr) ext4_es_list_del(inode); @@ -851,7 +862,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), 128, EXT4_I(inode))) goto retry; - if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) + if (err == -ENOMEM && !ext4_es_must_keep(&newes)) err = 0; if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && @@ -1702,11 +1713,8 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, (*nr_to_scan)--; node = rb_next(&es->rb_node); - /* - * We can't reclaim delayed extent from status tree because - * fiemap, bigallic, and seek_data/hole need to use it. - */ - if (ext4_es_is_delayed(es)) + + if (ext4_es_must_keep(es)) goto next; if (ext4_es_is_referenced(es)) { ext4_es_clear_referenced(es); @@ -1770,7 +1778,7 @@ void ext4_clear_inode_es(struct inode *inode) while (node) { es = rb_entry(node, struct extent_status, rb_node); node = rb_next(node); - if (!ext4_es_is_delayed(es)) { + if (!ext4_es_must_keep(es)) { rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); } From 73a2f033656be11298912201ad50615307b4477a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:37 +0800 Subject: [PATCH 30/55] ext4: factor out __es_alloc_extent() and __es_free_extent() Factor out __es_alloc_extent() and __es_free_extent(), which only allocate and free extent_status in these two helpers. The ext4_es_alloc_extent() function is split into __es_alloc_extent() and ext4_es_init_extent(). 
In __es_alloc_extent() we allocate memory using GFP_KERNEL | __GFP_NOFAIL | __GFP_ZERO if the memory allocation cannot fail, otherwise we use GFP_ATOMIC. and the ext4_es_init_extent() is used to initialize extent_status and update related variables after a successful allocation. This is to prepare for the use of pre-allocated extent_status later. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 931520dfda4d..703e9a8de7f2 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -459,14 +459,17 @@ static inline bool ext4_es_must_keep(struct extent_status *es) return false; } -static struct extent_status * -ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, - ext4_fsblk_t pblk) +static inline struct extent_status *__es_alloc_extent(bool nofail) +{ + if (!nofail) + return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); + + return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL); +} + +static void ext4_es_init_extent(struct inode *inode, struct extent_status *es, + ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) { - struct extent_status *es; - es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); - if (es == NULL) - return NULL; es->es_lblk = lblk; es->es_len = len; es->es_pblk = pblk; @@ -481,8 +484,11 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, EXT4_I(inode)->i_es_all_nr++; percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); +} - return es; +static inline void __es_free_extent(struct extent_status *es) +{ + kmem_cache_free(ext4_es_cachep, es); } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) @@ -499,7 +505,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) s_es_stats.es_stats_shk_cnt); } - kmem_cache_free(ext4_es_cachep, es); + __es_free_extent(es); } /* @@ -800,10 +806,12 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes) } } - es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len, - newes->es_pblk); + es = __es_alloc_extent(false); if (!es) return -ENOMEM; + ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len, + newes->es_pblk); + rb_link_node(&es->rb_node, parent, p); rb_insert_color(&es->rb_node, &tree->root); From 95f0b320339a977cf69872eac107122bf536775d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:38 +0800 Subject: [PATCH 31/55] ext4: use pre-allocated es in __es_insert_extent() Pass a extent_status pointer prealloc to __es_insert_extent(). If the pointer is non-null, it is used directly when a new extent_status is needed to avoid memory allocation failures. 
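[ Illustrative note, not part of the original patch: the calling convention this prepares for, as used by the later patches in this series, looks roughly like the sketch below; 'inode' and 'newes' are as in the surrounding functions, and error handling plus the paired remove step are omitted for brevity.

	struct extent_status *prealloc = NULL;
	int err = 0;
retry:
	if (err && !prealloc)	/* previous attempt ran out of memory */
		prealloc = __es_alloc_extent(true);	/* GFP_NOFAIL, taken outside i_es_lock */
	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_insert_extent(inode, &newes, prealloc);
	if (!err && prealloc && !prealloc->es_len)
		__es_free_extent(prealloc);	/* pre-allocated but not consumed */
	write_unlock(&EXT4_I(inode)->i_es_lock);
	if (err)
		goto retry;
]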
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-5-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 703e9a8de7f2..2ac1f6eb4001 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -144,7 +144,8 @@ static struct kmem_cache *ext4_es_cachep; static struct kmem_cache *ext4_pending_cachep; -static int __es_insert_extent(struct inode *inode, struct extent_status *newes); +static int __es_insert_extent(struct inode *inode, struct extent_status *newes, + struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end, int *reserved); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); @@ -766,7 +767,8 @@ static inline void ext4_es_insert_extent_check(struct inode *inode, } #endif -static int __es_insert_extent(struct inode *inode, struct extent_status *newes) +static int __es_insert_extent(struct inode *inode, struct extent_status *newes, + struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node **p = &tree->root.rb_node; @@ -806,7 +808,10 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes) } } - es = __es_alloc_extent(false); + if (prealloc) + es = prealloc; + else + es = __es_alloc_extent(false); if (!es) return -ENOMEM; ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len, @@ -866,7 +871,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, if (err != 0) goto error; retry: - err = __es_insert_extent(inode, &newes); + err = __es_insert_extent(inode, &newes, NULL); if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), 128, EXT4_I(inode))) goto retry; @@ -916,7 +921,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); if (!es || es->es_lblk > end) - __es_insert_extent(inode, &newes); + __es_insert_extent(inode, &newes, NULL); write_unlock(&EXT4_I(inode)->i_es_lock); } @@ -1362,7 +1367,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, orig_es.es_len - len2; ext4_es_store_pblock_status(&newes, block, ext4_es_status(&orig_es)); - err = __es_insert_extent(inode, &newes); + err = __es_insert_extent(inode, &newes, NULL); if (err) { es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; @@ -2016,7 +2021,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, if (err != 0) goto error; retry: - err = __es_insert_extent(inode, &newes); + err = __es_insert_extent(inode, &newes, NULL); if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), 128, EXT4_I(inode))) goto retry; From bda3efaf774fb687c2b7a555aaec3006b14a8857 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:39 +0800 Subject: [PATCH 32/55] ext4: use pre-allocated es in __es_remove_extent() When splitting extent, if the second extent can not be dropped, we return -ENOMEM and use GFP_NOFAIL to preallocate an extent_status outside of i_es_lock and pass it to __es_remove_extent() to be used as the second extent. This ensures that __es_remove_extent() is executed successfully, thus ensuring consistency in the extent status tree. If the second extent is not undroppable, we simply drop it and return 0. Then retry is no longer necessary, remove it. 
Now, __es_remove_extent() will always remove what it should, maybe more. Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-6-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 2ac1f6eb4001..d28a7465f43c 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -147,7 +147,8 @@ static struct kmem_cache *ext4_pending_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes, struct extent_status *prealloc); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved); + ext4_lblk_t end, int *reserved, + struct extent_status *prealloc); static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); @@ -867,7 +868,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, NULL); + err = __es_remove_extent(inode, lblk, end, NULL, NULL); if (err != 0) goto error; retry: @@ -1311,6 +1312,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, * @lblk - first block in range * @end - last block in range * @reserved - number of cluster reservations released + * @prealloc - pre-allocated es to avoid memory allocation failures * * If @reserved is not NULL and delayed allocation is enabled, counts * block/cluster reservations freed by removing range and if bigalloc @@ -1318,7 +1320,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, * error code on failure. */ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t end, int *reserved) + ext4_lblk_t end, int *reserved, + struct extent_status *prealloc) { struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree; struct rb_node *node; @@ -1326,14 +1329,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status orig_es; ext4_lblk_t len1, len2; ext4_fsblk_t block; - int err; + int err = 0; bool count_reserved = true; struct rsvd_count rc; if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC)) count_reserved = false; -retry: - err = 0; es = __es_tree_search(&tree->root, lblk); if (!es) @@ -1367,14 +1368,13 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, orig_es.es_len - len2; ext4_es_store_pblock_status(&newes, block, ext4_es_status(&orig_es)); - err = __es_insert_extent(inode, &newes, NULL); + err = __es_insert_extent(inode, &newes, prealloc); if (err) { + if (!ext4_es_must_keep(&newes)) + return 0; + es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; - if ((err == -ENOMEM) && - __es_shrink(EXT4_SB(inode->i_sb), - 128, EXT4_I(inode))) - goto retry; goto out; } } else { @@ -1474,7 +1474,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, * is reclaimed. 
*/ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, &reserved); + err = __es_remove_extent(inode, lblk, end, &reserved, NULL); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); @@ -2017,7 +2017,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, lblk, NULL); + err = __es_remove_extent(inode, lblk, lblk, NULL, NULL); if (err != 0) goto error; retry: From e9fe2b882bd5b26b987c9ba110c2222796f72af5 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:40 +0800 Subject: [PATCH 33/55] ext4: using nofail preallocation in ext4_es_remove_extent() If __es_remove_extent() returns an error it means that when splitting extent, allocating an extent that must be kept failed, where returning an error directly would cause the extent tree to be inconsistent. So we use GFP_NOFAIL to pre-allocate an extent_status and pass it to __es_remove_extent() to avoid this problem. In addition, since the allocated memory is outside the i_es_lock, the extent_status tree may change and the pre-allocated extent_status is no longer needed, so we release the pre-allocated extent_status when es->es_len is not initialized. Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-7-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d28a7465f43c..60f680f0ba8e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1454,6 +1454,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end; int err = 0; int reserved = 0; + struct extent_status *es = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return 0; @@ -1468,17 +1469,25 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, end = lblk + len - 1; BUG_ON(end < lblk); +retry: + if (err && !es) + es = __es_alloc_extent(true); /* * ext4_clear_inode() depends on us taking i_es_lock unconditionally * so that we are sure __es_shrink() is done with the inode before it * is reclaimed. */ write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, &reserved, NULL); + err = __es_remove_extent(inode, lblk, end, &reserved, es); + if (es && !es->es_len) + __es_free_extent(es); write_unlock(&EXT4_I(inode)->i_es_lock); + if (err) + goto retry; + ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); - return err; + return 0; } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, From 4a2d98447b37bcb68a7f06a1078edcb4f7e6ce7e Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:41 +0800 Subject: [PATCH 34/55] ext4: using nofail preallocation in ext4_es_insert_delayed_block() Similar to in ext4_es_remove_extent(), we use a no-fail preallocation to avoid inconsistencies, except that here we may have to preallocate two extent_status. 
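[ Illustrative note, not part of the original patch: two pre-allocated entries can be needed because the preceding __es_remove_extent(inode, lblk, lblk, ...) may have to split an existing cached extent around lblk (consuming one extent_status), after which __es_insert_extent() adds the delayed block itself (consuming the second):

	before remove:   [wwwwwwwwwww]      one extent_status covers lblk
	after remove:    [wwww]   [wwww]    split around lblk -> first entry consumed
	after insert:    [wwww][D][wwww]    delayed block at lblk -> second entry consumed
]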
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-8-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 60f680f0ba8e..1890df271580 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -2009,7 +2009,10 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated) { struct extent_status newes; - int err = 0; + int err1 = 0; + int err2 = 0; + struct extent_status *es1 = NULL; + struct extent_status *es2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return 0; @@ -2024,29 +2027,37 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); +retry: + if (err1 && !es1) + es1 = __es_alloc_extent(true); + if ((err1 || err2) && !es2) + es2 = __es_alloc_extent(true); write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, lblk, NULL, NULL); - if (err != 0) + err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); + if (err1 != 0) goto error; -retry: - err = __es_insert_extent(inode, &newes, NULL); - if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), - 128, EXT4_I(inode))) - goto retry; - if (err != 0) + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 != 0) goto error; if (allocated) __insert_pending(inode, lblk); + /* es is pre-allocated but not used, free it. */ + if (es1 && !es1->es_len) + __es_free_extent(es1); + if (es2 && !es2->es_len) + __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); + if (err1 || err2) + goto retry; ext4_es_print_tree(inode); ext4_print_pending_tree(inode); - - return err; + return 0; } /* From 2a69c450083db164596c75c0f5b4d9c4c0e18eba Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:42 +0800 Subject: [PATCH 35/55] ext4: using nofail preallocation in ext4_es_insert_extent() Similar to in ext4_es_insert_delayed_block(), we use preallocations that do not fail to avoid inconsistencies, but we do not care about es that are not must be kept, and we return 0 even if such es memory allocation fails. 
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-9-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 1890df271580..807ccc3107ec 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -838,8 +838,11 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; - int err = 0; + int err1 = 0; + int err2 = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct extent_status *es1 = NULL; + struct extent_status *es2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return 0; @@ -867,29 +870,40 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); - write_lock(&EXT4_I(inode)->i_es_lock); - err = __es_remove_extent(inode, lblk, end, NULL, NULL); - if (err != 0) - goto error; retry: - err = __es_insert_extent(inode, &newes, NULL); - if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), - 128, EXT4_I(inode))) - goto retry; - if (err == -ENOMEM && !ext4_es_must_keep(&newes)) - err = 0; + if (err1 && !es1) + es1 = __es_alloc_extent(true); + if ((err1 || err2) && !es2) + es2 = __es_alloc_extent(true); + write_lock(&EXT4_I(inode)->i_es_lock); + + err1 = __es_remove_extent(inode, lblk, end, NULL, es1); + if (err1 != 0) + goto error; + + err2 = __es_insert_extent(inode, &newes, es2); + if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) + err2 = 0; + if (err2 != 0) + goto error; if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & EXTENT_STATUS_WRITTEN || status & EXTENT_STATUS_UNWRITTEN)) __revise_pending(inode, lblk, len); + /* es is pre-allocated but not used, free it. */ + if (es1 && !es1->es_len) + __es_free_extent(es1); + if (es2 && !es2->es_len) + __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); + if (err1 || err2) + goto retry; ext4_es_print_tree(inode); - - return err; + return 0; } /* From ed5d285b3f2a9a37ff778c5e440daf49351fcc4d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:43 +0800 Subject: [PATCH 36/55] ext4: make ext4_es_remove_extent() return void Now ext4_es_remove_extent() never fails, so make it return void. 
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-10-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 34 ++++++---------------------------- fs/ext4/extents_status.c | 12 ++++++------ fs/ext4/extents_status.h | 4 ++-- fs/ext4/inline.c | 12 ++---------- fs/ext4/inode.c | 8 ++------ 5 files changed, 18 insertions(+), 52 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 35703dce23a3..8b87d5f1f741 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4403,15 +4403,8 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); -retry: - err = ext4_es_remove_extent(inode, last_block, - EXT_MAX_BLOCKS - last_block); - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_ATOMIC); - goto retry; - } - if (err) - return err; + ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); + retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { @@ -5363,13 +5356,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); - - ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); if (ret) { @@ -5547,12 +5534,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ext4_free_ext_path(path); } - ret = ext4_es_remove_extent(inode, offset_lblk, - EXT_MAX_BLOCKS - offset_lblk); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); /* * if offset_lblk lies in a hole which is at start of file, use @@ -5610,12 +5592,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!inode_is_locked(inode1)); BUG_ON(!inode_is_locked(inode2)); - *erp = ext4_es_remove_extent(inode1, lblk1, count); - if (unlikely(*erp)) - return 0; - *erp = ext4_es_remove_extent(inode2, lblk2, count); - if (unlikely(*erp)) - return 0; + ext4_es_remove_extent(inode1, lblk1, count); + ext4_es_remove_extent(inode2, lblk2, count); while (count) { struct ext4_extent *ex1, *ex2, tmp_ex; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 807ccc3107ec..b01b3bdab77d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1460,10 +1460,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, * @len - number of blocks to remove * * Reduces block/cluster reservation count and for bigalloc cancels pending - * reservations as needed. Returns 0 on success, error code on failure. + * reservations as needed. 
*/ -int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) +void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) { ext4_lblk_t end; int err = 0; @@ -1471,14 +1471,14 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - return 0; + return; trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); if (!len) - return err; + return; end = lblk + len - 1; BUG_ON(end < lblk); @@ -1501,7 +1501,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); - return 0; + return; } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 4ec30a798260..526a68890aa6 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -133,8 +133,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); +extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f48183f91c87..a4b7e4bc32d4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1939,16 +1939,8 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) * the extent status cache must be cleared to avoid leaving * behind stale delayed allocated extent entries */ - if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { -retry: - err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - if (err == -ENOMEM) { - memalloc_retry_wait(GFP_ATOMIC); - goto retry; - } - if (err) - goto out_error; - } + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) + ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ef7ec2690b84..519f5a065fdc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3986,12 +3986,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); - ret = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ext4_es_remove_extent(inode, first_block, + stop_block - first_block); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, From 8782b020ccbef6c4b62f00c86423f4d37ec60932 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:44 +0800 Subject: [PATCH 37/55] ext4: make ext4_es_insert_delayed_block() return void Now it never fails when inserting a delay extent, so the return value in ext4_es_insert_delayed_block is no longer necessary, let it return void. [ Fixed bug which caused system hangs during bigalloc test runs. See https://lore.kernel.org/r/20230612030405.GH1436857@mit.edu for more details. 
-- TYT ] Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-11-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 10 ++++------ fs/ext4/extents_status.h | 4 ++-- fs/ext4/inode.c | 17 +++++------------ 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index b01b3bdab77d..6a020e600d58 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -2016,11 +2016,9 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk) * @lblk - logical block to be added * @allocated - indicates whether a physical cluster has been allocated for * the logical cluster that contains the block - * - * Returns 0 on success, negative error code on failure. */ -int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated) +void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated) { struct extent_status newes; int err1 = 0; @@ -2029,7 +2027,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - return 0; + return; es_debug("add [%u/1) delayed to extent status tree of inode %lu\n", lblk, inode->i_ino); @@ -2071,7 +2069,7 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, ext4_es_print_tree(inode); ext4_print_pending_tree(inode); - return 0; + return; } /* diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 526a68890aa6..c22edb931f1b 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -249,8 +249,8 @@ extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); -extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, - bool allocated); +extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, + bool allocated); extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_clear_inode_es(struct inode *inode); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 519f5a065fdc..22c87aa56a39 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1630,7 +1630,6 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; - bool reserved = false; /* * If the cluster containing lblk is shared with a delayed, @@ -1646,8 +1645,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ - goto errout; - reserved = true; + return ret; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, @@ -1655,12 +1653,11 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) - goto errout; + return ret; if (ret == 0) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ - goto errout; - reserved = true; + return ret; } else { allocated = true; } @@ -1670,12 +1667,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) } } - ret = ext4_es_insert_delayed_block(inode, lblk, allocated); 
- if (ret && reserved) - ext4_da_release_space(inode, 1); - -errout: - return ret; + ext4_es_insert_delayed_block(inode, lblk, allocated); + return 0; } /* From 6c120399cde6b1b5cf65ce403765c579fb3d3e50 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:45 +0800 Subject: [PATCH 38/55] ext4: make ext4_es_insert_extent() return void Now ext4_es_insert_extent() never return error, so make it return void. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-12-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 5 +++-- fs/ext4/extents_status.c | 14 ++++++-------- fs/ext4/extents_status.h | 6 +++--- fs/ext4/inode.c | 21 ++++++--------------- 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8b87d5f1f741..a902558fde7d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3136,8 +3136,9 @@ static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) if (ee_len == 0) return 0; - return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, - EXTENT_STATUS_WRITTEN); + ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, + EXTENT_STATUS_WRITTEN); + return 0; } /* FIXME!! we need to try to merge to left or right after zero-out */ diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 6a020e600d58..9b5b8951afb4 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -829,12 +829,10 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes, /* * ext4_es_insert_extent() adds information to an inode's extent * status tree. - * - * Return 0 on success, error code on failure. */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status) +void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status) { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; @@ -845,13 +843,13 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es2 = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) - return 0; + return; es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n", lblk, len, pblk, status, inode->i_ino); if (!len) - return 0; + return; BUG_ON(end < lblk); @@ -903,7 +901,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, goto retry; ext4_es_print_tree(inode); - return 0; + return; } /* diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index c22edb931f1b..d9847a4a25db 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -127,9 +127,9 @@ extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len, ext4_fsblk_t pblk, - unsigned int status); +extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned int status); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 22c87aa56a39..43be684dabcb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -567,10 +567,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len 
- 1)) status |= EXTENT_STATUS_DELAYED; - ret = ext4_es_insert_extent(inode, map->m_lblk, - map->m_len, map->m_pblk, status); - if (ret < 0) - retval = ret; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -679,12 +677,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - if (ret < 0) { - retval = ret; - goto out_sem; - } + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } out_sem: @@ -1765,7 +1759,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, set_buffer_new(bh); set_buffer_delay(bh); } else if (retval > 0) { - int ret; unsigned int status; if (unlikely(retval != map->m_len)) { @@ -1778,10 +1771,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status); - if (ret != 0) - retval = ret; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status); } out_unlock: From ab8627e104696b8c1c6953ad5255def5b0821e06 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Apr 2023 11:38:46 +0800 Subject: [PATCH 39/55] ext4: make ext4_zeroout_es() return void After ext4_es_insert_extent() returns void, the return value in ext4_zeroout_es() is also unnecessary, so make it return void too. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230424033846.4732-13-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a902558fde7d..e4115d338f10 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3123,7 +3123,7 @@ void ext4_ext_release(struct super_block *sb) #endif } -static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) +static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) { ext4_lblk_t ee_block; ext4_fsblk_t ee_pblock; @@ -3134,11 +3134,10 @@ static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex) ee_pblock = ext4_ext_pblock(ex); if (ee_len == 0) - return 0; + return; ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock, EXTENT_STATUS_WRITTEN); - return 0; } /* FIXME!! we need to try to merge to left or right after zero-out */ @@ -3288,7 +3287,7 @@ static int ext4_split_extent_at(handle_t *handle, err = ext4_ext_dirty(handle, inode, path + path->p_depth); if (!err) /* update extent status tree */ - err = ext4_zeroout_es(inode, &zero_ex); + ext4_zeroout_es(inode, &zero_ex); /* If we failed at this point, we don't know in which * state the extent tree exactly is so don't try to fix * length of the original extent as it may do even more @@ -3641,9 +3640,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, out: /* If we have gotten a failure, don't zero out status tree */ if (!err) { - err = ext4_zeroout_es(inode, &zero_ex1); - if (!err) - err = ext4_zeroout_es(inode, &zero_ex2); + ext4_zeroout_es(inode, &zero_ex1); + ext4_zeroout_es(inode, &zero_ex2); } return err ? 
err : allocated; } From 4c0cfebdf3c34c9cd2c55844f549fa46b1da3164 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 8 Jun 2023 10:39:35 -0400 Subject: [PATCH 40/55] ext4: clean up mballoc criteria comments Line wrap and slightly clarify the comments describing mballoc's criteria. Define EXT4_MB_NUM_CRS as part of the enum, so that it will automatically get updated when criteria are added or removed. Also fix a potential uninitialized use of the 'cr' variable if CONFIG_EXT4_DEBUG is enabled. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 45 +++++++++++++++++++++++++-------------------- fs/ext4/mballoc.c | 19 +++++++++---------- 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6a1f013d23f7..45a531446ea2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -128,47 +128,52 @@ enum SHIFT_DIRECTION { }; /* - * Number of criterias defined. For each criteria, mballoc has slightly - * different way of finding the required blocks nad usually, higher the - * criteria the slower the allocation. We start at lower criterias and keep - * falling back to higher ones if we are not able to find any blocks. - */ -#define EXT4_MB_NUM_CRS 5 -/* - * All possible allocation criterias for mballoc. Lower are faster. + * For each criteria, mballoc has slightly different way of finding + * the required blocks nad usually, higher the criteria the slower the + * allocation. We start at lower criterias and keep falling back to + * higher ones if we are not able to find any blocks. Lower (earlier) + * criteria are faster. */ enum criteria { /* - * Used when number of blocks needed is a power of 2. This doesn't - * trigger any disk IO except prefetch and is the fastest criteria. + * Used when number of blocks needed is a power of 2. This + * doesn't trigger any disk IO except prefetch and is the + * fastest criteria. */ CR_POWER2_ALIGNED, /* - * Tries to lookup in-memory data structures to find the most suitable - * group that satisfies goal request. No disk IO except block prefetch. + * Tries to lookup in-memory data structures to find the most + * suitable group that satisfies goal request. No disk IO + * except block prefetch. */ CR_GOAL_LEN_FAST, /* - * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal length to - * the best available length for faster allocation. + * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal + * length to the best available length for faster allocation. */ CR_BEST_AVAIL_LEN, /* - * Reads each block group sequentially, performing disk IO if necessary, to - * find find_suitable block group. Tries to allocate goal length but might trim - * the request if nothing is found after enough tries. + * Reads each block group sequentially, performing disk IO if + * necessary, to find find_suitable block group. Tries to + * allocate goal length but might trim the request if nothing + * is found after enough tries. */ CR_GOAL_LEN_SLOW, /* - * Finds the first free set of blocks and allocates those. This is only - * used in rare cases when CR_GOAL_LEN_SLOW also fails to allocate - * anything. + * Finds the first free set of blocks and allocates + * those. This is only used in rare cases when + * CR_GOAL_LEN_SLOW also fails to allocate anything. */ CR_ANY_FREE, + + /* + * Number of criterias defined. 
+ */ + EXT4_MB_NUM_CRS }; /* criteria below which we use fast block scanning and avoid unnecessary IO */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 74ebe31f8d0f..a2475b8c9fb5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1035,11 +1035,9 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context if (num_stripe_clusters > 0) { /* - * Try to round up the adjusted goal to stripe size - * (in cluster units) multiple for efficiency. - * - * XXX: Is s->stripe always a power of 2? In that case - * we can use the faster round_up() variant. + * Try to round up the adjusted goal length to + * stripe size (in cluster units) multiple for + * efficiency. */ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, num_stripe_clusters); @@ -2758,7 +2756,7 @@ static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; - enum criteria cr, new_cr; + enum criteria new_cr, cr = CR_GOAL_LEN_FAST; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; @@ -2815,12 +2813,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) spin_unlock(&sbi->s_md_lock); } - /* Let's just scan groups to find more-less suitable blocks */ - cr = ac->ac_2order ? CR_POWER2_ALIGNED : CR_GOAL_LEN_FAST; /* - * cr == CR_POWER2_ALIGNED try to get exact allocation, - * cr == CR_ANY_FREE try to get anything + * Let's just scan groups to find more-less suitable blocks We + * start with CR_GOAL_LEN_FAST, unless it is power of 2 + * aligned, in which case let's do that faster approach first. */ + if (ac->ac_2order) + cr = CR_POWER2_ALIGNED; repeat: for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; From 310ee0902b8d9d0a13a5a13e94688a5863fa29c2 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 14 Mar 2023 09:07:59 -0400 Subject: [PATCH 41/55] ext4: allow concurrent unaligned dio overwrites We've had reports of significant performance regression of sub-block (unaligned) direct writes due to the added exclusivity restrictions in ext4. The purpose of the exclusivity requirement for unaligned direct writes is to avoid data corruption caused by unserialized partial block zeroing in the iomap dio layer across overlapping writes. XFS has similar requirements for the same underlying reasons, yet doesn't suffer the extreme performance regression that ext4 does. The reason for this is that XFS utilizes IOMAP_DIO_OVERWRITE_ONLY mode, which allows for optimistic submission of concurrent unaligned I/O and kicks back writes that require partial block zeroing such that they can be submitted in a safe, exclusive context. Since ext4 already performs most of these checks pre-submission, it can support something similar without necessarily relying on the iomap flag and associated retry mechanism. Update the dio write submission path to allow concurrent submission of unaligned direct writes that are purely overwrite and so will not require block zeroing. To improve readability of the various related checks, move the unaligned I/O handling down into ext4_dio_write_checks(), where the dio draining and force wait logic can immediately follow the locking requirement checks. Finally, the IOMAP_DIO_OVERWRITE_ONLY flag is set to enable a warning check as a precaution should the ext4 overwrite logic ever become inconsistent with the zeroing expectations of iomap dio. 
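[ Illustrative note, not part of the original patch: with the variable names used in ext4_dio_write_checks(), the resulting flag selection can be summarised roughly as

	if (ilock_shared && unaligned_io)
		dio_flags = IOMAP_DIO_OVERWRITE_ONLY;	/* pure overwrite, safe to run concurrently */
	else if (!ilock_shared && (unaligned_io || extend))
		dio_flags = IOMAP_DIO_FORCE_WAIT;	/* exclusive path: drain and serialise */

  i.e. an unaligned write stays under the shared lock only when it is a pure overwrite of already-written blocks; everything else takes the exclusive path. ]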
The performance improvement of sub-block direct write I/O is shown in the following fio test on a 64xcpu guest vm: Test: fio --name=test --ioengine=libaio --direct=1 --group_reporting --overwrite=1 --thread --size=10G --filename=/mnt/fio --readwrite=write --ramp_time=10s --runtime=60s --numjobs=8 --blocksize=2k --iodepth=256 --allow_file_create=0 v6.2: write: IOPS=4328, BW=8724KiB/s v6.2 (patched): write: IOPS=801k, BW=1565MiB/s Signed-off-by: Brian Foster Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230314130759.642710-1-bfoster@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 86 +++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 40 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d101b3b0c7da..b18a63e8dbbc 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -444,13 +444,14 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, - bool *unwritten) + bool *unwritten, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; + bool overwrite, unaligned_io; restart: ret = ext4_generic_write_checks(iocb, from); @@ -459,16 +460,20 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, offset = iocb->ki_pos; count = ret; - if (ext4_extending_io(inode, offset, count)) - *extend = true; + + unaligned_io = ext4_unaligned_io(inode, from, offset); + *extend = ext4_extending_io(inode, offset, count); + overwrite = ext4_overwrite_io(inode, offset, count, unwritten); + /* - * Determine whether the IO operation will overwrite allocated - * and initialized blocks. - * We need exclusive i_rwsem for changing security info - * in file_modified(). + * Determine whether we need to upgrade to an exclusive lock. This is + * required to change security info in file_modified(), for extending + * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten + * extents (as partial block zeroing may be required). */ - if (*ilock_shared && (!IS_NOSEC(inode) || *extend || - !ext4_overwrite_io(inode, offset, count, unwritten))) { + if (*ilock_shared && + ((!IS_NOSEC(inode) || *extend || !overwrite || + (unaligned_io && *unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -479,6 +484,32 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, goto restart; } + /* + * Now that locking is settled, determine dio flags and exclusivity + * requirements. Unaligned writes are allowed under shared lock so long + * as they are pure overwrites. Set the iomap overwrite only flag as an + * added precaution in this case. Even though this is unnecessary, we + * can detect and warn on unexpected -EAGAIN if an unsafe unaligned + * write is ever submitted. + * + * Otherwise, concurrent unaligned writes risk data corruption due to + * partial block zeroing in the dio layer, and so the I/O must occur + * exclusively. The inode lock is already held exclusive if the write is + * non-overwrite or extending, so drain all outstanding dio and set the + * force wait dio flag. 
+ */ + if (*ilock_shared && unaligned_io) { + *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; + } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + if (unaligned_io && (!overwrite || *unwritten)) + inode_dio_wait(inode); + *dio_flags = IOMAP_DIO_FORCE_WAIT; + } + ret = file_modified(file); if (ret < 0) goto out; @@ -500,17 +531,10 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; - bool extend = false, unaligned_io = false, unwritten = false; + bool extend = false, unwritten = false; bool ilock_shared = true; + int dio_flags = 0; - /* - * We initially start with shared inode lock unless it is - * unaligned IO which needs exclusive lock anyways. - */ - if (ext4_unaligned_io(inode, from, offset)) { - unaligned_io = true; - ilock_shared = false; - } /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with @@ -543,16 +567,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) return ext4_buffered_write_iter(iocb, from); } - ret = ext4_dio_write_checks(iocb, from, - &ilock_shared, &extend, &unwritten); + ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, + &unwritten, &dio_flags); if (ret <= 0) return ret; - /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */ - if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) { - ret = -EAGAIN; - goto out; - } /* * Make sure inline data cannot be created anymore since we are going * to allocate blocks for DIO. We know the inode does not have any @@ -563,19 +582,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) offset = iocb->ki_pos; count = ret; - /* - * Unaligned direct IO must be serialized among each other as zeroing - * of partial blocks of two competing unaligned IOs can result in data - * corruption. - * - * So we make sure we don't allow any unaligned IO in flight. - * For IOs where we need not wait (like unaligned non-AIO DIO), - * below inode_dio_wait() may anyway become a no-op, since we start - * with exclusive lock. - */ - if (unaligned_io) - inode_dio_wait(inode); - if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { @@ -595,8 +601,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? 
IOMAP_DIO_FORCE_WAIT : 0, - NULL, 0); + dio_flags, NULL, 0); + WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; From 26fb5290240dc31cae99b8b4dd2af7f46dfcba6b Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 15 Mar 2023 09:31:23 +0800 Subject: [PATCH 42/55] ext4: Fix reusing stale buffer heads from last failed mounting Following process makes ext4 load stale buffer heads from last failed mounting in a new mounting operation: mount_bdev ext4_fill_super | ext4_load_and_init_journal | ext4_load_journal | jbd2_journal_load | load_superblock | journal_get_superblock | set_buffer_verified(bh) // buffer head is verified | jbd2_journal_recover // failed caused by EIO | goto failed_mount3a // skip 'sb->s_root' initialization deactivate_locked_super kill_block_super generic_shutdown_super if (sb->s_root) // false, skip ext4_put_super->invalidate_bdev-> // invalidate_mapping_pages->mapping_evict_folio-> // filemap_release_folio->try_to_free_buffers, which // cannot drop buffer head. blkdev_put blkdev_put_whole if (atomic_dec_and_test(&bdev->bd_openers)) // false, systemd-udev happens to open the device. Then // blkdev_flush_mapping->kill_bdev->truncate_inode_pages-> // truncate_inode_folio->truncate_cleanup_folio-> // folio_invalidate->block_invalidate_folio-> // filemap_release_folio->try_to_free_buffers will be skipped, // dropping buffer head is missed again. Second mount: ext4_fill_super ext4_load_and_init_journal ext4_load_journal ext4_get_journal jbd2_journal_init_inode journal_init_common bh = getblk_unmovable bh = __find_get_block // Found stale bh in last failed mounting journal->j_sb_buffer = bh jbd2_journal_load load_superblock journal_get_superblock if (buffer_verified(bh)) // true, skip journal->j_format_version = 2, value is 0 jbd2_journal_recover do_one_pass next_log_block += count_tags(journal, bh) // According to journal_tag_bytes(), 'tag_bytes' calculating is // affected by jbd2_has_feature_csum3(), jbd2_has_feature_csum3() // returns false because 'j->j_format_version >= 2' is not true, // then we get wrong next_log_block. The do_one_pass may exit // early whenoccuring non JBD2_MAGIC_NUMBER in 'next_log_block'. The filesystem is corrupted here, journal is partially replayed, and new journal sequence number actually is already used by last mounting. The invalidate_bdev() can drop all buffer heads even racing with bare reading block device(eg. systemd-udev), so we can fix it by invalidating bdev in error handling path in __ext4_fill_super(). Fetch a reproducer in [Link]. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217171 Fixes: 25ed6e8a54df ("jbd2: enable journal clients to enable v2 checksumming") Cc: stable@vger.kernel.org # v3.5 Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230315013128.3911115-2-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f9b087b9b20..6a8c5c3c9126 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1128,6 +1128,12 @@ static void ext4_blkdev_remove(struct ext4_sb_info *sbi) struct block_device *bdev; bdev = sbi->s_journal_bdev; if (bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. 
+ */ + invalidate_bdev(bdev); ext4_blkdev_put(bdev); sbi->s_journal_bdev = NULL; } @@ -1328,13 +1334,7 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ sync_blockdev(sbi->s_journal_bdev); - invalidate_bdev(sbi->s_journal_bdev); ext4_blkdev_remove(sbi); } @@ -5655,6 +5655,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) brelse(sbi->s_sbh); ext4_blkdev_remove(sbi); out_fail: + invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; return err; } From 93e92cfcc1977b2b914c75333aa4b629b2fa7ca2 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 15 Mar 2023 09:31:24 +0800 Subject: [PATCH 43/55] ext4: ext4_put_super: Remove redundant checking for 'sbi->s_journal_bdev' As discussed in [1], 'sbi->s_journal_bdev != sb->s_bdev' will always become true if sbi->s_journal_bdev exists. Filesystem block device and journal block device are both opened with 'FMODE_EXCL' mode, so these two devices can't be same one. Then we can remove the redundant checking 'sbi->s_journal_bdev != sb->s_bdev' if 'sbi->s_journal_bdev' exists. [1] https://lore.kernel.org/lkml/f86584f6-3877-ff18-47a1-2efaa12d18b2@huawei.com/ Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230315013128.3911115-3-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6a8c5c3c9126..fcc8dad6b661 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1333,7 +1333,7 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) { + if (sbi->s_journal_bdev) { sync_blockdev(sbi->s_journal_bdev); ext4_blkdev_remove(sbi); } From 5c5bd1fef3ec913f9c597c6f61a9b903096415bf Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 15 Mar 2023 09:31:25 +0800 Subject: [PATCH 44/55] jbd2: remove unused feature macros JBD2_HAS_[IN|RO_]COMPAT_FEATURE macros are no longer used, just remove them. 
Signed-off-by: Zhang Yi Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230315013128.3911115-4-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- include/linux/jbd2.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f619bae1dcc5..a91cf9c7a94b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -274,17 +274,6 @@ typedef struct journal_superblock_s /* 0x0400 */ } journal_superblock_t; -/* Use the jbd2_{has,set,clear}_feature_* helpers; these will be removed */ -#define JBD2_HAS_COMPAT_FEATURE(j,mask) \ - ((j)->j_format_version >= 2 && \ - ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask)))) -#define JBD2_HAS_RO_COMPAT_FEATURE(j,mask) \ - ((j)->j_format_version >= 2 && \ - ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask)))) -#define JBD2_HAS_INCOMPAT_FEATURE(j,mask) \ - ((j)->j_format_version >= 2 && \ - ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) - #define JBD2_FEATURE_COMPAT_CHECKSUM 0x00000001 #define JBD2_FEATURE_INCOMPAT_REVOKE 0x00000001 From 5cf036d4f1489d7ba04b948e415f662521902c30 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 15 Mar 2023 09:31:26 +0800 Subject: [PATCH 45/55] jbd2: switch to check format version in superblock directly We should only check and set extended features if the journal format version is 2. Currently we check the in-memory copy 'journal->j_format_version', which relies on the parameter initialization sequence; switching to the h_blocktype stored in the superblock is clearer. Signed-off-by: Zhang Yi Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230315013128.3911115-5-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 16 +++++++--------- include/linux/jbd2.h | 17 ++++++++++++++--- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ae419152ff6..8d5fe6738cc4 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2062,10 +2062,12 @@ int jbd2_journal_load(journal_t *journal) return err; sb = journal->j_superblock; - /* If this is a V2 superblock, then we have to check the - * features flags on it. */ - if (journal->j_format_version >= 2) { + /* + * If this is a V2 superblock, then we have to check the + * features flags on it. + */ + if (jbd2_format_support_feature(journal)) { if ((sb->s_feature_ro_compat & ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & @@ -2227,7 +2229,7 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (journal->j_format_version == 0 && journal_get_superblock(journal) != 0) return 0; - if (journal->j_format_version == 1) + if (!jbd2_format_support_feature(journal)) return 0; sb = journal->j_superblock; @@ -2257,11 +2259,7 @@ int jbd2_journal_check_available_features(journal_t *journal, unsigned long comp if (!compat && !ro && !incompat) return 1; - /* We can support any known requested features iff the - * superblock is in version 2. Otherwise we fail to support any - * extended sb features.
*/ - - if (journal->j_format_version != 2) + if (!jbd2_format_support_feature(journal)) return 0; if ((compat & JBD2_KNOWN_COMPAT_FEATURES) == compat && diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a91cf9c7a94b..1ffcea5c024e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1313,11 +1313,22 @@ struct journal_s rwsem_release(&j->j_trans_commit_map, _THIS_IP_); \ } while (0) +/* + * We can support any known requested features iff the + * superblock is not in version 1. Otherwise we fail to support any + * extended sb features. + */ +static inline bool jbd2_format_support_feature(journal_t *j) +{ + return j->j_superblock->s_header.h_blocktype != + cpu_to_be32(JBD2_SUPERBLOCK_V1); +} + /* journal feature predicate functions */ #define JBD2_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ - return ((j)->j_format_version >= 2 && \ + return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_compat & \ cpu_to_be32(JBD2_FEATURE_COMPAT_##flagname)) != 0); \ } \ @@ -1335,7 +1346,7 @@ static inline void jbd2_clear_feature_##name(journal_t *j) \ #define JBD2_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ - return ((j)->j_format_version >= 2 && \ + return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_ro_compat & \ cpu_to_be32(JBD2_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ @@ -1353,7 +1364,7 @@ static inline void jbd2_clear_feature_##name(journal_t *j) \ #define JBD2_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool jbd2_has_feature_##name(journal_t *j) \ { \ - return ((j)->j_format_version >= 2 && \ + return (jbd2_format_support_feature(j) && \ ((j)->j_superblock->s_feature_incompat & \ cpu_to_be32(JBD2_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ From 3e5cf02cfa3fa9cc9f568930624faed6d3a53ff4 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 15 Mar 2023 09:31:27 +0800 Subject: [PATCH 46/55] jbd2: factor out journal initialization from journal_get_superblock() Currently journal_get_superblock() couples journal superblock checking with partial journal initialization; factor the initialization part out of it to make things clearer.
Signed-off-by: Zhang Yi Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230315013128.3911115-6-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8d5fe6738cc4..ee678f9e40c4 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1925,21 +1925,13 @@ static int journal_get_superblock(journal_t *journal) goto out; } - switch(be32_to_cpu(sb->s_header.h_blocktype)) { - case JBD2_SUPERBLOCK_V1: - journal->j_format_version = 1; - break; - case JBD2_SUPERBLOCK_V2: - journal->j_format_version = 2; - break; - default: + if (be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V1 && + be32_to_cpu(sb->s_header.h_blocktype) != JBD2_SUPERBLOCK_V2) { printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n"); goto out; } - if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) - journal->j_total_len = be32_to_cpu(sb->s_maxlen); - else if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { + if (be32_to_cpu(sb->s_maxlen) > journal->j_total_len) { printk(KERN_WARNING "JBD2: journal file too short\n"); goto out; } @@ -1982,25 +1974,14 @@ static int journal_get_superblock(journal_t *journal) journal->j_chksum_driver = NULL; goto out; } - } - - if (jbd2_journal_has_csum_v2or3(journal)) { /* Check superblock checksum */ if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; goto out; } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, - sizeof(sb->s_uuid)); } - - journal->j_revoke_records_per_block = - journal_revoke_records_per_block(journal); set_buffer_verified(bh); - return 0; out: @@ -2025,12 +2006,30 @@ static int load_superblock(journal_t *journal) sb = journal->j_superblock; + switch (be32_to_cpu(sb->s_header.h_blocktype)) { + case JBD2_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JBD2_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + } + journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); journal->j_last = be32_to_cpu(sb->s_maxlen); + if (be32_to_cpu(sb->s_maxlen) < journal->j_total_len) + journal->j_total_len = be32_to_cpu(sb->s_maxlen); + /* Precompute checksum seed for all metadata */ + if (jbd2_journal_has_csum_v2or3(journal)) + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + journal->j_revoke_records_per_block = + journal_revoke_records_per_block(journal); + if (jbd2_has_feature_fast_commit(journal)) { journal->j_fc_last = be32_to_cpu(sb->s_maxlen); num_fc_blocks = jbd2_journal_get_num_fc_blks(sb); @@ -2226,8 +2225,7 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; /* Load journal superblock if it is not loaded yet. */ - if (journal->j_format_version == 0 && - journal_get_superblock(journal) != 0) + if (journal_get_superblock(journal)) return 0; if (!jbd2_format_support_feature(journal)) return 0; From 04c2e98179658d223665661f12c5043224e8f8d3 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 15 Mar 2023 09:31:28 +0800 Subject: [PATCH 47/55] jbd2: remove j_format_version journal->j_format_version is no longer used, remove it. 
Signed-off-by: Zhang Yi Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230315013128.3911115-7-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 9 --------- include/linux/jbd2.h | 5 ----- 2 files changed, 14 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ee678f9e40c4..837a9a85e585 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2006,15 +2006,6 @@ static int load_superblock(journal_t *journal) sb = journal->j_superblock; - switch (be32_to_cpu(sb->s_header.h_blocktype)) { - case JBD2_SUPERBLOCK_V1: - journal->j_format_version = 1; - break; - case JBD2_SUPERBLOCK_V2: - journal->j_format_version = 2; - break; - } - journal->j_tail_sequence = be32_to_cpu(sb->s_sequence); journal->j_tail = be32_to_cpu(sb->s_start); journal->j_first = be32_to_cpu(sb->s_first); journal->j_errno = be32_to_cpu(sb->s_errno); journal->j_last = be32_to_cpu(sb->s_maxlen); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 1ffcea5c024e..6990fc891612 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -792,11 +792,6 @@ struct journal_s */ journal_superblock_t *j_superblock; - /** - * @j_format_version: Version of the superblock format. - */ - int j_format_version; - /** * @j_state_lock: Protect the various scalars in the journal. */ From c7fc60555864c0e67f5e5754a9053986f8fb8491 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 22 Mar 2023 09:33:51 +0800 Subject: [PATCH 48/55] jbd2: continue to record log between each mount For a newly mounted file system, the journal committing thread always records new transactions from the start of the journal area, no matter whether the journal was clean or has just been recovered. So the logdump code in debugfs cannot dump continuous logs between mounts, which is a disadvantage when analysing a corrupted file system image and locating file system inconsistency bugs. If we get a corrupted file system on a running product and want to find out what has happened, one effective way, besides looking at the system log, is to backtrack the journal log. But we may not always run e2fsck before each mount, and the default fsck -a mode also cannot always catch all inconsistencies, so some inconsistencies could be left over into the next mount until we detect them. By then, transactions in the journal may be discontinuous and some relatively new transactions may already have been overwritten, which makes analysis hard. If we could record transactions continuously across mounts, we could acquire more useful info from the journal. Like this: |Previous mount checkpointed/recovered logs|Current mount logs | |{------}{---}{--------} ... {------}| ... |{======}{========}...000000| The journal area is of course limited and cannot record everything; the problematic transaction may still be overwritten even if we do this, but it is still useful for fuzz tests and short-running products. This patch saves the head blocknr in the superblock after flushing the journal or unmounting the file system, so that the next mount can continue to record new transactions behind it. This change is backward compatible because an old kernel does not care about the head blocknr of the journal. It is also fine to mount a clean old image without a valid head blocknr: we fall back to setting it to s_first just like before. Finally, when mounting an unclean file system, we can also get the journal head easily after scanning/replaying the journal, and recording will continue after the recovered transactions.
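As an illustration of the head-selection policy described above, here is a minimal standalone sketch (plain userspace C with illustrative names only; this is not the jbd2 implementation, which lives in the journal_reset() hunk below):

#include <stdio.h>

/*
 * Pick the block where a newly mounted journal starts recording.
 * Continue behind the saved head only when cycled recording is enabled
 * and the saved head lies inside the journal area [first, last).
 */
static unsigned long pick_log_start(unsigned long saved_head,
                                    unsigned long first, unsigned long last,
                                    int cycle_record)
{
        if (cycle_record && saved_head >= first && saved_head < last)
                return saved_head;
        return first; /* fall back to the start of the journal area */
}

int main(void)
{
        printf("%lu\n", pick_log_start(120, 2, 1024, 1)); /* 120: continue behind the saved head */
        printf("%lu\n", pick_log_start(0, 2, 1024, 1));   /* 2: no valid head, start at s_first */
        return 0;
}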
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230322013353.1843306-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 18 ++++++++++++++++-- fs/jbd2/recovery.c | 22 +++++++++++++++++----- include/linux/jbd2.h | 9 +++++++-- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 837a9a85e585..b5e57735ab3f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1559,8 +1559,21 @@ static int journal_reset(journal_t *journal) journal->j_first = first; journal->j_last = last; - journal->j_head = journal->j_first; - journal->j_tail = journal->j_first; + if (journal->j_head != 0 && journal->j_flags & JBD2_CYCLE_RECORD) { + /* + * Disable the cycled recording mode if the journal head block + * number is not correct. + */ + if (journal->j_head < first || journal->j_head >= last) { + printk(KERN_WARNING "JBD2: Incorrect Journal head block %lu, " + "disable journal_cycle_record\n", + journal->j_head); + journal->j_head = journal->j_first; + } + } else { + journal->j_head = journal->j_first; + } + journal->j_tail = journal->j_head; journal->j_free = journal->j_last - journal->j_first; journal->j_tail_sequence = journal->j_transaction_sequence; @@ -1732,6 +1745,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, blk_opf_t write_flags) sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); + sb->s_head = cpu_to_be32(journal->j_head); if (jbd2_has_feature_fast_commit(journal)) { /* * When journal is clean, no need to commit fast commit flag and diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 8286a9ec122f..0184931d47f7 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -29,6 +29,7 @@ struct recovery_info { tid_t start_transaction; tid_t end_transaction; + unsigned long head_block; int nr_replays; int nr_revokes; @@ -301,11 +302,11 @@ int jbd2_journal_recover(journal_t *journal) * is always zero if, and only if, the journal was cleanly * unmounted. */ - if (!sb->s_start) { - jbd2_debug(1, "No recovery required, last transaction %d\n", - be32_to_cpu(sb->s_sequence)); + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", + be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; + journal->j_head = be32_to_cpu(sb->s_head); return 0; } @@ -324,6 +325,9 @@ int jbd2_journal_recover(journal_t *journal) /* Restart the log at the next transaction ID, thus invalidating * any existing commit records in the log. */ journal->j_transaction_sequence = ++info.end_transaction; + journal->j_head = info.head_block; + jbd2_debug(1, "JBD2: last transaction %d, head block %lu\n", + journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); err2 = sync_blockdev(journal->j_fs_dev); @@ -364,6 +368,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) if (err) { printk(KERN_ERR "JBD2: error %d scanning journal\n", err); ++journal->j_transaction_sequence; + journal->j_head = journal->j_first; } else { #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - @@ -373,6 +378,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) dropped, (dropped == 1) ? 
"" : "s"); #endif journal->j_transaction_sequence = ++info.end_transaction; + journal->j_head = info.head_block; } journal->j_tail = 0; @@ -462,7 +468,7 @@ static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { unsigned int first_commit_ID, next_commit_ID; - unsigned long next_log_block; + unsigned long next_log_block, head_block; int err, success = 0; journal_superblock_t * sb; journal_header_t * tmp; @@ -485,6 +491,7 @@ static int do_one_pass(journal_t *journal, sb = journal->j_superblock; next_commit_ID = be32_to_cpu(sb->s_sequence); next_log_block = be32_to_cpu(sb->s_start); + head_block = next_log_block; first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) @@ -809,6 +816,7 @@ static int do_one_pass(journal_t *journal, if (commit_time < last_trans_commit_time) goto ignore_crc_mismatch; info->end_transaction = next_commit_ID; + info->head_block = head_block; if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = @@ -817,8 +825,10 @@ static int do_one_pass(journal_t *journal, break; } } - if (pass == PASS_SCAN) + if (pass == PASS_SCAN) { last_trans_commit_time = commit_time; + head_block = next_log_block; + } brelse(bh); next_commit_ID++; continue; @@ -868,6 +878,8 @@ static int do_one_pass(journal_t *journal, if (pass == PASS_SCAN) { if (!info->end_transaction) info->end_transaction = next_commit_ID; + if (!info->head_block) + info->head_block = head_block; } else { /* It's really bad news if different passes end up at * different places (but possible due to IO errors). */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 6990fc891612..d860499e15e4 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -265,8 +265,10 @@ typedef struct journal_superblock_s __u8 s_padding2[3]; /* 0x0054 */ __be32 s_num_fc_blks; /* Number of fast commit blocks */ -/* 0x0058 */ - __u32 s_padding[41]; + __be32 s_head; /* blocknr of head of log, only uptodate + * while the filesystem is clean */ +/* 0x005C */ + __u32 s_padding[40]; __be32 s_checksum; /* crc32c(superblock) */ /* 0x0100 */ @@ -1395,6 +1397,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file * data write error in ordered * mode */ +#define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on + * clean and empty filesystem + * logging area */ #define JBD2_FAST_COMMIT_ONGOING 0x100 /* Fast commit is ongoing */ #define JBD2_FULL_COMMIT_ONGOING 0x200 /* Full commit is ongoing */ #define JBD2_JOURNAL_FLUSH_DISCARD 0x0001 From 7294505824254da9cb5ee6cb12d783c9eeea030e Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 22 Mar 2023 09:33:52 +0800 Subject: [PATCH 49/55] ext4: add journal cycled recording support Always enable 'JBD2_CYCLE_RECORD' journal option on ext4, letting the jbd2 continue to record new journal transactions from the recovered journal head or the checkpointed transactions in the previous mount. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230322013353.1843306-3-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fcc8dad6b661..433550e93561 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5738,6 +5738,11 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; + /* + * Always enable journal cycle record option, letting the journal + * records log transactions continuously between each mount. + */ + journal->j_flags |= JBD2_CYCLE_RECORD; write_unlock(&journal->j_state_lock); } From bd5a594b5b1cfa14646bc29e6c7745c961025055 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 22 Mar 2023 09:33:53 +0800 Subject: [PATCH 50/55] ext4: update doc about journal superblock description Update the description in doc about the s_head parameter in the journal superblock. Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20230322013353.1843306-4-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- Documentation/filesystems/ext4/journal.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/ext4/journal.rst b/Documentation/filesystems/ext4/journal.rst index a6bef5293a60..6e8fb2d4b46f 100644 --- a/Documentation/filesystems/ext4/journal.rst +++ b/Documentation/filesystems/ext4/journal.rst @@ -260,8 +260,13 @@ which is 1024 bytes long: - s_num_fc_blocks - Number of fast commit blocks in the journal. * - 0x58 + - __be32 + - s_head + - Block number of the head (first unused block) of the journal, only + up-to-date when the journal is empty. + * - 0x5C - __u32 - - s_padding[42] + - s_padding[40] - * - 0xFC - __be32 From d13f99632748462c32fc95d729f5e754bab06064 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 27 Mar 2023 22:16:29 +0800 Subject: [PATCH 51/55] ext4: turn quotas off if mount failed after enabling quotas Yi found during a review of the patch "ext4: don't BUG on inconsistent journal feature" that when ext4_mark_recovery_complete() returns an error value, the error handling path does not turn off the enabled quotas, which triggers the following kmemleak: ================================================================ unreferenced object 0xffff8cf68678e7c0 (size 64): comm "mount", pid 746, jiffies 4294871231 (age 11.540s) hex dump (first 32 bytes): 00 90 ef 82 f6 8c ff ff 00 00 00 00 41 01 00 00 ............A... c7 00 00 00 bd 00 00 00 0a 00 00 00 48 00 00 00 ............H... 
backtrace: [<00000000c561ef24>] __kmem_cache_alloc_node+0x4d4/0x880 [<00000000d4e621d7>] kmalloc_trace+0x39/0x140 [<00000000837eee74>] v2_read_file_info+0x18a/0x3a0 [<0000000088f6c877>] dquot_load_quota_sb+0x2ed/0x770 [<00000000340a4782>] dquot_load_quota_inode+0xc6/0x1c0 [<0000000089a18bd5>] ext4_enable_quotas+0x17e/0x3a0 [ext4] [<000000003a0268fa>] __ext4_fill_super+0x3448/0x3910 [ext4] [<00000000b0f2a8a8>] ext4_fill_super+0x13d/0x340 [ext4] [<000000004a9489c4>] get_tree_bdev+0x1dc/0x370 [<000000006e723bf1>] ext4_get_tree+0x1d/0x30 [ext4] [<00000000c7cb663d>] vfs_get_tree+0x31/0x160 [<00000000320e1bed>] do_new_mount+0x1d5/0x480 [<00000000c074654c>] path_mount+0x22e/0xbe0 [<0000000003e97a8e>] do_mount+0x95/0xc0 [<000000002f3d3736>] __x64_sys_mount+0xc4/0x160 [<0000000027d2140c>] do_syscall_64+0x3f/0x90 ================================================================ To solve this problem, we add a "failed_mount10" tag, and call ext4_quota_off_umount() in this tag to release the enabled qoutas. Fixes: 11215630aada ("ext4: don't BUG on inconsistent journal feature") Cc: stable@kernel.org Signed-off-by: Zhang Yi Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230327141630.156875-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 433550e93561..1cc60a7ae4aa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5577,7 +5577,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount9; + goto failed_mount10; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) @@ -5596,7 +5596,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) return 0; -failed_mount9: +failed_mount10: + ext4_quota_off_umount(sb); +failed_mount9: __maybe_unused ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); From f3c1c42e0c40656e73f12ab5dcb1110f83ef8e65 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 27 Mar 2023 22:16:30 +0800 Subject: [PATCH 52/55] ext4: refactoring to use the unified helper ext4_quotas_off() Rename ext4_quota_off_umount() to ext4_quotas_off(), and add type parameter to replace open code in ext4_enable_quotas(). Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230327141630.156875-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1cc60a7ae4aa..b3819e70093e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1165,12 +1165,12 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) #ifdef CONFIG_QUOTA static int ext4_quota_off(struct super_block *sb, int type); -static inline void ext4_quota_off_umount(struct super_block *sb) +static inline void ext4_quotas_off(struct super_block *sb, int type) { - int type; + BUG_ON(type > EXT4_MAXQUOTAS); /* Use our quota_off function to clear inode flags etc. 
*/ - for (type = 0; type < EXT4_MAXQUOTAS; type++) + for (type--; type >= 0; type--) ext4_quota_off(sb, type); } @@ -1186,7 +1186,7 @@ static inline char *get_qf_name(struct super_block *sb, lockdep_is_held(&sb->s_umount)); } #else -static inline void ext4_quota_off_umount(struct super_block *sb) +static inline void ext4_quotas_off(struct super_block *sb, int type) { } #endif @@ -1286,7 +1286,7 @@ static void ext4_put_super(struct super_block *sb) &sb->s_uuid); ext4_unregister_li_request(sb); - ext4_quota_off_umount(sb); + ext4_quotas_off(sb, EXT4_MAXQUOTAS); flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); @@ -5597,7 +5597,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) return 0; failed_mount10: - ext4_quota_off_umount(sb); + ext4_quotas_off(sb, EXT4_MAXQUOTAS); failed_mount9: __maybe_unused ext4_release_orphan_info(sb); failed_mount8: @@ -7044,20 +7044,8 @@ int ext4_enable_quotas(struct super_block *sb) "(type=%d, err=%d, ino=%lu). " "Please run e2fsck to fix.", type, err, qf_inums[type]); - for (type--; type >= 0; type--) { - struct inode *inode; - - inode = sb_dqopt(sb)->files[type]; - if (inode) - inode = igrab(inode); - dquot_quota_off(sb, type); - if (inode) { - lockdep_set_quota_inode(inode, - I_DATA_SEM_NORMAL); - iput(inode); - } - } + ext4_quotas_off(sb, type); return err; } } From c4d13222afd8a64bf11bc7ec68645496ee8b54b9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 6 Jun 2023 15:32:03 +0800 Subject: [PATCH 53/55] ext4: fix to check return value of freeze_bdev() in ext4_shutdown() freeze_bdev() can fail for a number of reasons; check its return value before proceeding with the shutdown. Fixes: 783d94854499 ("ext4: add EXT4_IOC_GOINGDOWN ioctl") Cc: stable@kernel.org Signed-off-by: Chao Yu Link: https://lore.kernel.org/r/20230606073203.1310389-1-chao@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f9a430152063..55be1b8a6360 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -797,6 +797,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u32 flags; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -815,7 +816,9 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg) switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: - freeze_bdev(sb->s_bdev); + ret = freeze_bdev(sb->s_bdev); + if (ret) + return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); thaw_bdev(sb->s_bdev); break; From 31464ab01fff910cb88376384e2b6824f7bf713f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 16 Jun 2023 09:55:47 +0800 Subject: [PATCH 54/55] jbd2: skip reading super block if it has been verified We hit the NULL pointer dereference below while running the generic/475 I/O failure pressure test. BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] PREEMPT SMP PTI CPU: 1 PID: 15600 Comm: fsstress Not tainted 6.4.0-rc5-xfstests-00055-gd3ab1bca26b4 #190 RIP: 0010:jbd2_journal_set_features+0x13d/0x430 ... Call Trace: ? __die+0x23/0x60 ? page_fault_oops+0xa4/0x170 ? exc_page_fault+0x67/0x170 ? asm_exc_page_fault+0x26/0x30 ?
jbd2_journal_set_features+0x13d/0x430 jbd2_journal_revoke+0x47/0x1e0 __ext4_forget+0xc3/0x1b0 ext4_free_blocks+0x214/0x2f0 ext4_free_branches+0xeb/0x270 ext4_ind_truncate+0x2bf/0x320 ext4_truncate+0x1e4/0x490 ext4_handle_inode_extension+0x1bd/0x2a0 ? iomap_dio_complete+0xaf/0x1d0 The root cause is that the journal superblock failed to be written out due to I/O fault injection; its uptodate bit was cleared by end_buffer_write_sync() and had not yet been reset in jbd2_write_superblock(). This raced with journal_get_superblock()->bh_read(); unfortunately, the read I/O also failed, so the error handling in journal_fail_superblock() unexpectedly cleared journal->j_sb_buffer, finally leading to the NULL pointer dereference above. If the journal superblock has already been read and verified, there is no need to call bh_read() to read it again, even if it failed to be written out. So the fix is simply to move the buffer_verified(bh) check in front of bh_read(). Also remove a stale comment left in jbd2_journal_check_used_features(). Fixes: 51bacdba23d8 ("jbd2: factor out journal initialization from journal_get_superblock()") Reported-by: Theodore Ts'o Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230616015547.3155195-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b5e57735ab3f..559242df0f9a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1919,6 +1919,9 @@ static int journal_get_superblock(journal_t *journal) bh = journal->j_sb_buffer; J_ASSERT(bh != NULL); + if (buffer_verified(bh)) + return 0; + err = bh_read(bh, 0); if (err < 0) { printk(KERN_ERR @@ -1926,9 +1929,6 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (buffer_verified(bh)) - return 0; - sb = journal->j_superblock; err = -EINVAL; @@ -2229,7 +2229,6 @@ int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat, if (!compat && !ro && !incompat) return 1; - /* Load journal superblock if it is not loaded yet. */ if (journal_get_superblock(journal)) return 0; if (!jbd2_format_support_feature(journal)) From 2ef6c32a914b85217b44a0a2418e830e520b085e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 23 Jun 2023 10:18:51 -0400 Subject: [PATCH 55/55] ext4: avoid updating the superblock on a r/o mount if not needed A user noticed that the mtime of a file backing a loopback device was getting bumped when the loopback device is mounted read-only. Note: This doesn't show up when doing a loopback mount of a file directly, via "mount -o ro /tmp/foo.img /mnt", since the loop device is set read-only when mount automatically creates the loop device. However, this is noticeable for a LUKS loop device like this: % cryptsetup luksOpen /tmp/foo.img test % mount -o ro /dev/loop0 /mnt ; umount /mnt or, if LUKS is not in use, if the user manually creates the loop device like this: % losetup /dev/loop0 /tmp/foo.img % mount -o ro /dev/loop0 /mnt ; umount /mnt The modified mtime causes rsync to do a rolling checksum scan of the file on the local and remote side, incrementally increasing the time to rsync the not-modified-but-touched image file.
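The fix (in the hunk below) follows a compare-before-write pattern: the saved error information and s_state are written back into the superblock only if they actually differ, and the superblock is committed only when something changed and the device is writable. A minimal standalone sketch of that pattern (plain userspace C with hypothetical names; this is not the ext4 code):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Restore a saved region into the on-disk copy only if it differs,
 * and report whether anything changed so the caller can skip the
 * write-out entirely (e.g. on a read-only mount).
 */
static bool restore_if_different(char *dst, const char *saved, size_t len)
{
        if (memcmp(dst, saved, len) == 0)
                return false; /* nothing changed, no write needed */
        memcpy(dst, saved, len);
        return true; /* caller must write the block back */
}

int main(void)
{
        char on_disk[8] = "AAAA", saved[8] = "AAAA";
        printf("changed=%d\n", restore_if_different(on_disk, saved, 4)); /* 0 */
        memcpy(saved, "BBBB", 4);
        printf("changed=%d\n", restore_if_different(on_disk, saved, 4)); /* 1 */
        return 0;
}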
Fixes: eee00237fa5e ("ext4: commit super block if fs record error when journal record without error") Cc: stable@kernel.org Link: https://lore.kernel.org/r/ZIauBR7YiV3rVAHL@glitch Reported-by: Sean Greenslade Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b3819e70093e..c638b0db3b2b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5997,19 +5997,27 @@ static int ext4_load_journal(struct super_block *sb, err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + __le16 orig_state; + bool changed = false; if (save) memcpy(save, ((char *) es) + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); - if (save) + if (save && memcmp(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN)) { memcpy(((char *) es) + EXT4_S_ERR_START, save, EXT4_S_ERR_LEN); + changed = true; + } kfree(save); + orig_state = es->s_state; es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS); + if (orig_state != es->s_state) + changed = true; /* Write out restored error information to the superblock */ - if (!bdev_read_only(sb->s_bdev)) { + if (changed && !really_read_only) { int err2; err2 = ext4_commit_super(sb); err = err ? : err2;