From 07bcbdf020c9fd3c14bec51c50225a2a02707b94 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 23 Nov 2023 09:48:09 -0800 Subject: [PATCH 001/123] xfs: don't leak recovered attri intent items If recovery finds an xattr log intent item calling for the removal of an attribute and the file doesn't even have an attr fork, we know that the removal is trivially complete. However, we can't just exit the recovery function without doing something about the recovered log intent item -- it's still on the AIL, and not logging an attrd item means it stays there forever. This has likely not been seen in practice because few people use LARP and the runtime code won't log the attri for a no-attrfork removexattr operation. But let's fix this anyway. Also we shouldn't really be testing the attr fork presence until we've taken the ILOCK, though this doesn't matter much in recovery, which is single threaded. Fixes: fdaf1bb3cafc ("xfs: ATTR_REPLACE algorithm with LARP enabled needs rework") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 36fe2abb16e6..11e88a76a33c 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -329,6 +329,13 @@ xfs_xattri_finish_update( goto out; } + /* If an attr removal is trivially complete, we're done. */ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + error = xfs_attr_set_iter(attr); if (!error && attr->xattri_dela_state != XFS_DAS_DONE) error = -EAGAIN; @@ -608,8 +615,6 @@ xfs_attri_item_recover( attr->xattri_dela_state = xfs_attr_init_add_state(args); break; case XFS_ATTRI_OP_FLAGS_REMOVE: - if (!xfs_inode_hasattr(args->dp)) - goto out; attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; default: From 03f7767c9f6120ac933378fdec3bfd78bf07bc11 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 10:23:23 -0800 Subject: [PATCH 002/123] xfs: use xfs_defer_pending objects to recover intent items One thing I never quite got around to doing is porting the log intent item recovery code to reconstruct the deferred pending work state. As a result, each intent item open codes xfs_defer_finish_one in its recovery method, because that's what the EFI code did before xfs_defer.c even existed. This is a gross thing to have left unfixed -- if an EFI cannot proceed due to busy extents, we end up creating separate new EFIs for each unfinished work item, which is a change in behavior from what runtime would have done. Worse yet, Long Li pointed out that there's a UAF in the recovery code. The ->commit_pass2 function adds the intent item to the AIL and drops the refcount. The one remaining refcount is now owned by the recovery mechanism (aka the log intent items in the AIL) with the intent of giving the refcount to the intent done item in the ->iop_recover function. However, if something fails later in recovery, xlog_recover_finish will walk the recovered intent items in the AIL and release them. If the CIL hasn't been pushed before that point (which is possible since we don't force the log until later) then the intent done release will try to free its associated intent, which has already been freed. This patch starts to address this mess by having the ->commit_pass2 functions recreate the xfs_defer_pending state. The next few patches will fix the recovery functions. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 105 ++++++++++++++++++++++-------- fs/xfs/libxfs/xfs_defer.h | 5 ++ fs/xfs/libxfs/xfs_log_recover.h | 3 + fs/xfs/xfs_attr_item.c | 10 +-- fs/xfs/xfs_bmap_item.c | 9 +-- fs/xfs/xfs_extfree_item.c | 9 +-- fs/xfs/xfs_log.c | 1 + fs/xfs/xfs_log_priv.h | 1 + fs/xfs/xfs_log_recover.c | 111 ++++++++++++++++---------------- fs/xfs/xfs_refcount_item.c | 9 +-- fs/xfs/xfs_rmap_item.c | 9 +-- 11 files changed, 157 insertions(+), 115 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index f71679ce23b9..363da37a8e7f 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -245,23 +245,53 @@ xfs_defer_create_intents( return ret; } -STATIC void +static inline void xfs_defer_pending_abort( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + trace_xfs_defer_pending_abort(mp, dfp); + + if (dfp->dfp_intent && !dfp->dfp_done) { + ops->abort_intent(dfp->dfp_intent); + dfp->dfp_intent = NULL; + } +} + +static inline void +xfs_defer_pending_cancel_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct list_head *pwi; + struct list_head *n; + + trace_xfs_defer_cancel_list(mp, dfp); + + list_del(&dfp->dfp_list); + list_for_each_safe(pwi, n, &dfp->dfp_work) { + list_del(pwi); + dfp->dfp_count--; + trace_xfs_defer_cancel_item(mp, dfp, pwi); + ops->cancel_item(pwi); + } + ASSERT(dfp->dfp_count == 0); + kmem_cache_free(xfs_defer_pending_cache, dfp); +} + +STATIC void +xfs_defer_pending_abort_list( struct xfs_mount *mp, struct list_head *dop_list) { struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(mp, dfp); - if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); - dfp->dfp_intent = NULL; - } - } + list_for_each_entry(dfp, dop_list, dfp_list) + xfs_defer_pending_abort(mp, dfp); } /* Abort all the intents that were committed. */ @@ -271,7 +301,7 @@ xfs_defer_trans_abort( struct list_head *dop_pending) { trace_xfs_defer_trans_abort(tp, _RET_IP_); - xfs_defer_pending_abort(tp->t_mountp, dop_pending); + xfs_defer_pending_abort_list(tp->t_mountp, dop_pending); } /* @@ -389,27 +419,13 @@ xfs_defer_cancel_list( { struct xfs_defer_pending *dfp; struct xfs_defer_pending *pli; - struct list_head *pwi; - struct list_head *n; - const struct xfs_defer_op_type *ops; /* * Free the pending items. Caller should already have arranged * for the intent items to be released. */ - list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_cancel_list(mp, dfp); - list_del(&dfp->dfp_list); - list_for_each_safe(pwi, n, &dfp->dfp_work) { - list_del(pwi); - dfp->dfp_count--; - trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); - } - ASSERT(dfp->dfp_count == 0); - kmem_cache_free(xfs_defer_pending_cache, dfp); - } + list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) + xfs_defer_pending_cancel_work(mp, dfp); } /* @@ -665,6 +681,39 @@ xfs_defer_add( dfp->dfp_count++; } +/* + * Create a pending deferred work item to replay the recovered intent item + * and add it to the list. 
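+ *
+ * The new pending item starts with an empty work list: at this point it
+ * records only the recovered intent item and its type, so that recovery
+ * can find the intent later to finish or cancel it.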
+ */ +void +xfs_defer_start_recovery( + struct xfs_log_item *lip, + enum xfs_defer_ops_type dfp_type, + struct list_head *r_dfops) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_type = dfp_type; + dfp->dfp_intent = lip; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, r_dfops); +} + +/* + * Cancel a deferred work item created to recover a log intent item. @dfp + * will be freed after this function returns. + */ +void +xfs_defer_cancel_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp) +{ + xfs_defer_pending_abort(mp, dfp); + xfs_defer_pending_cancel_work(mp, dfp); +} + /* * Move deferred ops from one transaction to another and reset the source to * initial state. This is primarily used to carry state forward across @@ -769,7 +818,7 @@ xfs_defer_ops_capture_abort( { unsigned short i; - xfs_defer_pending_abort(mp, &dfc->dfc_dfops); + xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 8788ad5f6a73..5dce938ba3d5 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -125,6 +125,11 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); +void xfs_defer_start_recovery(struct xfs_log_item *lip, + enum xfs_defer_ops_type dfp_type, struct list_head *r_dfops); +void xfs_defer_cancel_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp); + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index a5100a11faf9..271a4ce7375c 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -153,4 +153,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, + xfs_lsn_t lsn, unsigned int dfp_type); + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 11e88a76a33c..a32716b8cbbd 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -772,14 +772,8 @@ xlog_recover_attri_commit_pass2( attrip = xfs_attri_init(mp, nv); memcpy(&attrip->attri_format, attri_formatp, len); - /* - * The ATTRI has two references. One for the ATTRD and one for ATTRI to - * ensure it makes it into the AIL. Insert the ATTRI into the AIL - * directly and drop the ATTRI reference. Note that - * xfs_trans_ail_update() drops the AIL lock. - */ - xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn); - xfs_attri_release(attrip); + xlog_recover_intent_item(log, &attrip->attri_item, lsn, + XFS_DEFER_OPS_TYPE_ATTR); xfs_attri_log_nameval_put(nv); return 0; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e736a0844c89..6cbae4fdf43f 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -681,12 +681,9 @@ xlog_recover_bui_commit_pass2( buip = xfs_bui_init(mp); xfs_bui_copy_format(&buip->bui_format, bui_formatp); atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn); - xfs_bui_release(buip); + + xlog_recover_intent_item(log, &buip->bui_item, lsn, + XFS_DEFER_OPS_TYPE_BMAP); return 0; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3fa8789820ad..cf0ddeb70580 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -820,12 +820,9 @@ xlog_recover_efi_commit_pass2( return error; } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn); - xfs_efi_release(efip); + + xlog_recover_intent_item(log, &efip->efi_item, lsn, + XFS_DEFER_OPS_TYPE_FREE); return 0; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ee206facf0dc..a1650fc81382 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1542,6 +1542,7 @@ xlog_alloc_log( log->l_covered_state = XLOG_STATE_COVER_IDLE; set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); + INIT_LIST_HEAD(&log->r_dfops); log->l_prev_block = -1; /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index fa3ad1d7b31c..e30c06ec20e3 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -407,6 +407,7 @@ struct xlog { long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; + struct list_head r_dfops; /* recovered log intent items */ int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a1e18b24971a..b9d2152a2bad 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks( */ void xlog_recover_release_intent( - struct xlog *log, - unsigned short intent_type, - uint64_t intent_id) + struct xlog *log, + unsigned short intent_type, + uint64_t intent_id) { - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp = log->l_ailp; + struct xfs_defer_pending *dfp, *n; + + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; - spin_lock(&ailp->ail_lock); - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { if (lip->li_type != intent_type) continue; if (!lip->li_ops->iop_match(lip, intent_id)) continue; - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - break; - } + ASSERT(xlog_item_is_intent(lip)); - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + xfs_defer_cancel_recovery(log->l_mp, dfp); + } } int @@ -1939,6 +1933,29 @@ xlog_buf_readahead( xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops); } +/* + * Create a deferred work structure for resuming and tracking the progress of a + * log intent item that was found during recovery. + */ +void +xlog_recover_intent_item( + struct xlog *log, + struct xfs_log_item *lip, + xfs_lsn_t lsn, + unsigned int dfp_type) +{ + ASSERT(xlog_item_is_intent(lip)); + + xfs_defer_start_recovery(lip, dfp_type, &log->r_dfops); + + /* + * Insert the intent into the AIL directly and drop one reference so + * that finishing or canceling the work will drop the other. 
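+	 * The remaining reference is tracked by the xfs_defer_pending
+	 * object created above; it is released when the work is finished
+	 * or when recovery cancels the intent.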
+ */ + xfs_trans_ail_insert(log->l_ailp, lip, lsn); + lip->li_ops->iop_unpin(lip, 0); +} + STATIC int xlog_recover_items_pass2( struct xlog *log, @@ -2533,29 +2550,22 @@ xlog_abort_defer_ops( */ STATIC int xlog_recover_process_intents( - struct xlog *log) + struct xlog *log) { LIST_HEAD(capture_list); - struct xfs_ail_cursor cur; - struct xfs_log_item *lip; - struct xfs_ail *ailp; - int error = 0; + struct xfs_defer_pending *dfp, *n; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t last_lsn; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - lip != NULL; - lip = xfs_trans_ail_cursor_next(ailp, &cur)) { - const struct xfs_item_ops *ops; - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + struct xfs_log_item *lip = dfp->dfp_intent; + const struct xfs_item_ops *ops = lip->li_ops; + + ASSERT(xlog_item_is_intent(lip)); /* * We should never see a redo item with a LSN higher than @@ -2573,19 +2583,22 @@ xlog_recover_process_intents( * The recovery function can free the log item, so we must not * access lip after it returns. */ - spin_unlock(&ailp->ail_lock); - ops = lip->li_ops; error = ops->iop_recover(lip, &capture_list); - spin_lock(&ailp->ail_lock); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, ops->iop_recover); break; } - } - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); + /* + * XXX: @lip could have been freed, so detach the log item from + * the pending item before freeing the pending item. This does + * not fix the existing UAF bug that occurs if ->iop_recover + * fails after creating the intent done item. + */ + dfp->dfp_intent = NULL; + xfs_defer_cancel_recovery(log->l_mp, dfp); + } if (error) goto err; @@ -2606,27 +2619,15 @@ xlog_recover_process_intents( */ STATIC void xlog_recover_cancel_intents( - struct xlog *log) + struct xlog *log) { - struct xfs_log_item *lip; - struct xfs_ail_cursor cur; - struct xfs_ail *ailp; + struct xfs_defer_pending *dfp, *n; - ailp = log->l_ailp; - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); - while (lip != NULL) { - if (!xlog_item_is_intent(lip)) - break; + list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); - spin_unlock(&ailp->ail_lock); - lip->li_ops->iop_release(lip); - spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_next(ailp, &cur); + xfs_defer_cancel_recovery(log->l_mp, dfp); } - - xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->ail_lock); } /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2d4444d61e98..b88cb2e98227 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -696,12 +696,9 @@ xlog_recover_cui_commit_pass2( cuip = xfs_cui_init(mp, cui_formatp->cui_nextents); xfs_cui_copy_format(&cuip->cui_format, cui_formatp); atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. 
- */ - xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn); - xfs_cui_release(cuip); + + xlog_recover_intent_item(log, &cuip->cui_item, lsn, + XFS_DEFER_OPS_TYPE_REFCOUNT); return 0; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 0e0e747028da..c30d4a4a14b2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -702,12 +702,9 @@ xlog_recover_rui_commit_pass2( ruip = xfs_rui_init(mp, rui_formatp->rui_nextents); xfs_rui_copy_format(&ruip->rui_format, rui_formatp); atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - /* - * Insert the intent into the AIL directly and drop one reference so - * that finishing or canceling the work will drop the other. - */ - xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn); - xfs_rui_release(ruip); + + xlog_recover_intent_item(log, &ruip->rui_item, lsn, + XFS_DEFER_OPS_TYPE_RMAP); return 0; } From a050acdfa8003a44eae4558fddafc7afb1aef458 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 10:38:10 -0800 Subject: [PATCH 003/123] xfs: pass the xfs_defer_pending object to iop_recover Now that log intent item recovery recreates the xfs_defer_pending state, we should pass that into the ->iop_recover routines so that the intent item can finish the recreation work. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 3 ++- fs/xfs/xfs_bmap_item.c | 3 ++- fs/xfs/xfs_extfree_item.c | 3 ++- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_refcount_item.c | 3 ++- fs/xfs/xfs_rmap_item.c | 3 ++- fs/xfs/xfs_trans.h | 4 +++- 7 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index a32716b8cbbd..6119a7a480a0 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -545,9 +545,10 @@ xfs_attri_validate( */ STATIC int xfs_attri_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; struct xfs_mount *mp = lip->li_log->l_mp; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 6cbae4fdf43f..3ef55de370b5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -486,11 +486,12 @@ xfs_bui_validate( */ STATIC int xfs_bui_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index cf0ddeb70580..a8245c5ffe49 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -657,10 +657,11 @@ xfs_efi_validate_ext( */ STATIC int xfs_efi_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b9d2152a2bad..ff768217f2c7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2583,7 +2583,7 @@ xlog_recover_process_intents( * The recovery function can free the log item, so we must not * access lip after it returns. 
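+		 * The xfs_defer_pending object is cleaned up below once
+		 * the recovery function returns.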
*/ - error = ops->iop_recover(lip, &capture_list); + error = ops->iop_recover(dfp, &capture_list); if (error) { trace_xlog_intent_recovery_failed(log->l_mp, error, ops->iop_recover); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index b88cb2e98227..3456201aa3e6 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -474,10 +474,11 @@ xfs_cui_validate_phys( */ STATIC int xfs_cui_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_cud_log_item *cudp; struct xfs_trans *tp; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index c30d4a4a14b2..dfd5a3e4b1fb 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -504,10 +504,11 @@ xfs_rui_validate_map( */ STATIC int xfs_rui_item_recover( - struct xfs_log_item *lip, + struct xfs_defer_pending *dfp, struct list_head *capture_list) { struct xfs_trans_res resv; + struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_rud_log_item *rudp; struct xfs_trans *tp; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6e3646d524ce..4e38357237c3 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -66,6 +66,8 @@ struct xfs_log_item { { (1u << XFS_LI_DIRTY), "DIRTY" }, \ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } +struct xfs_defer_pending; + struct xfs_item_ops { unsigned flags; void (*iop_size)(struct xfs_log_item *, int *, int *); @@ -78,7 +80,7 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_log_item *lip, + int (*iop_recover)(struct xfs_defer_pending *dfp, struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, From deb4cd8ba87f17b12c72b3827820d9c703e9fd95 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 10:47:10 -0800 Subject: [PATCH 004/123] xfs: transfer recovered intent item ownership in ->iop_recover Now that we pass the xfs_defer_pending object into the intent item recovery functions, we know exactly when ownership of the sole refcount passes from the recovery context to the intent done item. At that point, we need to null out dfp_intent so that the recovery mechanism won't release it. This should fix the UAF problem reported by Long Li. Note that we still want to recreate the full deferred work state. That will be addressed in the next patches. Fixes: 2e76f188fd90 ("xfs: cancel intents immediately if process_intents fails") Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_log_recover.h | 2 ++ fs/xfs/xfs_attr_item.c | 1 + fs/xfs/xfs_bmap_item.c | 2 ++ fs/xfs/xfs_extfree_item.c | 2 ++ fs/xfs/xfs_log_recover.c | 19 ++++++++++++------- fs/xfs/xfs_refcount_item.c | 1 + fs/xfs/xfs_rmap_item.c | 2 ++ 7 files changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 271a4ce7375c..13583df9f239 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -155,5 +155,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, unsigned int dfp_type); +void xlog_recover_transfer_intent(struct xfs_trans *tp, + struct xfs_defer_pending *dfp); #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 6119a7a480a0..82775e9537df 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -632,6 +632,7 @@ xfs_attri_item_recover( args->trans = tp; done_item = xfs_trans_get_attrd(tp, attrip); + xlog_recover_transfer_intent(tp, dfp); xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 3ef55de370b5..b6d63b8bdad5 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -524,6 +524,8 @@ xfs_bui_item_recover( goto err_rele; budp = xfs_trans_get_bud(tp, buip); + xlog_recover_transfer_intent(tp, dfp); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a8245c5ffe49..c9908fb33765 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -689,7 +689,9 @@ xfs_efi_item_recover( error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; + efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); + xlog_recover_transfer_intent(tp, dfp); for (i = 0; i < efip->efi_format.efi_nextents; i++) { struct xfs_extent_free_item fake = { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index ff768217f2c7..cc14cd1c2282 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2590,13 +2590,6 @@ xlog_recover_process_intents( break; } - /* - * XXX: @lip could have been freed, so detach the log item from - * the pending item before freeing the pending item. This does - * not fix the existing UAF bug that occurs if ->iop_recover - * fails after creating the intent done item. - */ - dfp->dfp_intent = NULL; xfs_defer_cancel_recovery(log->l_mp, dfp); } if (error) @@ -2630,6 +2623,18 @@ xlog_recover_cancel_intents( } } +/* + * Transfer ownership of the recovered log intent item to the recovery + * transaction. + */ +void +xlog_recover_transfer_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + dfp->dfp_intent = NULL; +} + /* * This routine performs a transaction to null out a bad inode pointer * in an agi unlinked inode hash bucket. 
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 3456201aa3e6..f1b259223802 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -523,6 +523,7 @@ xfs_cui_item_recover( return error; cudp = xfs_trans_get_cud(tp, cuip); + xlog_recover_transfer_intent(tp, dfp); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { struct xfs_refcount_intent fake = { }; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index dfd5a3e4b1fb..5e8a02d2b045 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -537,7 +537,9 @@ xfs_rui_item_recover( XFS_TRANS_RESERVE, &tp); if (error) return error; + rudp = xfs_trans_get_rud(tp, ruip); + xlog_recover_transfer_intent(tp, dfp); for (i = 0; i < ruip->rui_format.rui_nextents; i++) { struct xfs_rmap_intent fake = { }; From e70fb328d5277297ea2d9169a3a046de6412d777 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 11:13:03 -0800 Subject: [PATCH 005/123] xfs: recreate work items when recovering intent items Recreate work items for each xfs_defer_pending object when we are recovering intent items. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 3 +- fs/xfs/libxfs/xfs_defer.h | 9 +++ fs/xfs/xfs_attr_item.c | 90 ++++++++++++++++------------- fs/xfs/xfs_bmap_item.c | 55 +++++++++++------- fs/xfs/xfs_extfree_item.c | 49 +++++++++------- fs/xfs/xfs_refcount_item.c | 60 ++++++++++---------- fs/xfs/xfs_rmap_item.c | 112 ++++++++++++++++++++----------------- 7 files changed, 215 insertions(+), 163 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 363da37a8e7f..8fb523e4f669 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -676,9 +676,8 @@ xfs_defer_add( list_add_tail(&dfp->dfp_list, &tp->t_dfops); } - list_add_tail(li, &dfp->dfp_work); + xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); - dfp->dfp_count++; } /* diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 5dce938ba3d5..bef5823f61fb 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -130,6 +130,15 @@ void xfs_defer_start_recovery(struct xfs_log_item *lip, void xfs_defer_cancel_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp); +static inline void +xfs_defer_add_item( + struct xfs_defer_pending *dfp, + struct list_head *work) +{ + list_add_tail(work, &dfp->dfp_work); + dfp->dfp_count++; +} + int __init xfs_defer_init_item_caches(void); void xfs_defer_destroy_item_caches(void); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 82775e9537df..c4441eacf51c 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -539,42 +539,17 @@ xfs_attri_validate( return xfs_verify_ino(mp, attrp->alfi_ino); } -/* - * Process an attr intent item that was recovered from the log. We need to - * delete the attr that it describes. 
- */ -STATIC int -xfs_attri_item_recover( +static inline struct xfs_attr_intent * +xfs_attri_recover_work( + struct xfs_mount *mp, struct xfs_defer_pending *dfp, - struct list_head *capture_list) + struct xfs_attri_log_format *attrp, + struct xfs_inode *ip, + struct xfs_attri_log_nameval *nv) { - struct xfs_log_item *lip = dfp->dfp_intent; - struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); struct xfs_attr_intent *attr; - struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_inode *ip; struct xfs_da_args *args; - struct xfs_trans *tp; - struct xfs_trans_res resv; - struct xfs_attri_log_format *attrp; - struct xfs_attri_log_nameval *nv = attrip->attri_nameval; - int error; - int total; int local; - struct xfs_attrd_log_item *done_item = NULL; - - /* - * First check the validity of the attr described by the ATTRI. If any - * are bad, then assume that all are bad and just toss the ATTRI. - */ - attrp = &attrip->attri_format; - if (!xfs_attri_validate(mp, attrp) || - !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) - return -EFSCORRUPTED; - - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); - if (error) - return error; attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); @@ -618,19 +593,58 @@ xfs_attri_item_recover( case XFS_ATTRI_OP_FLAGS_REMOVE: attr->xattri_dela_state = xfs_attr_init_remove_state(args); break; - default: - ASSERT(0); - error = -EFSCORRUPTED; - goto out; } + xfs_defer_add_item(dfp, &attr->xattri_list); + return attr; +} + +/* + * Process an attr intent item that was recovered from the log. We need to + * delete the attr that it describes. + */ +STATIC int +xfs_attri_item_recover( + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + struct xfs_log_item *lip = dfp->dfp_intent; + struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip); + struct xfs_attr_intent *attr; + struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_inode *ip; + struct xfs_da_args *args; + struct xfs_trans *tp; + struct xfs_trans_res resv; + struct xfs_attri_log_format *attrp; + struct xfs_attri_log_nameval *nv = attrip->attri_nameval; + int error; + int total; + struct xfs_attrd_log_item *done_item = NULL; + + /* + * First check the validity of the attr described by the ATTRI. If any + * are bad, then assume that all are bad and just toss the ATTRI. 
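+	 * Do the checks before allocating any recovery work items so that
+	 * a corrupt ATTRI leaves nothing behind to clean up.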
+ */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || + !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) + return -EFSCORRUPTED; + + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); + if (error) + return error; + + attr = xfs_attri_recover_work(mp, dfp, attrp, ip, nv); + args = attr->xattri_da_args; + xfs_init_attr_trans(args, &resv, &total); resv = xlog_recover_resv(&resv); error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) - goto out; - + return error; args->trans = tp; + done_item = xfs_trans_get_attrd(tp, attrip); xlog_recover_transfer_intent(tp, dfp); @@ -661,8 +675,6 @@ xfs_attri_item_recover( out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); -out: - xfs_attr_free_item(attr); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index b6d63b8bdad5..b65999bf0ea3 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -480,6 +480,28 @@ xfs_bui_validate( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline struct xfs_bmap_intent * +xfs_bui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_map_extent *map) +{ + struct xfs_bmap_intent *bi; + + bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + bi->bi_bmap.br_startblock = map->me_startblock; + bi->bi_bmap.br_startoff = map->me_startoff; + bi->bi_bmap.br_blockcount = map->me_len; + bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + xfs_defer_add_item(dfp, &bi->bi_list); + return bi; +} + /* * Process a bmap update intent item that was recovered from the log. * We need to update some inode's bmbt. @@ -489,7 +511,6 @@ xfs_bui_item_recover( struct xfs_defer_pending *dfp, struct list_head *capture_list) { - struct xfs_bmap_intent fake = { }; struct xfs_trans_res resv; struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_bui_log_item *buip = BUI_ITEM(lip); @@ -498,6 +519,7 @@ xfs_bui_item_recover( struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; struct xfs_bud_log_item *budp; + struct xfs_bmap_intent *fake; int iext_delta; int error = 0; @@ -508,9 +530,7 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + fake = xfs_bui_recover_work(mp, dfp, map); error = xlog_recover_iget(mp, map->me_owner, &ip); if (error) @@ -529,36 +549,31 @@ xfs_bui_item_recover( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake.bi_type == XFS_BMAP_MAP) + if (fake->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, fake->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake.bi_owner = ip; - fake.bi_bmap.br_startblock = map->me_startblock; - fake.bi_bmap.br_startoff = map->me_startoff; - fake.bi_bmap.br_blockcount = map->me_len; - fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + fake->bi_owner = ip; - xfs_bmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, &fake); + xfs_bmap_update_get_group(mp, fake); + error = xfs_trans_log_finish_bmap_update(tp, budp, fake); if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map, - sizeof(*map)); - xfs_bmap_update_put_group(&fake); + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &buip->bui_format, sizeof(buip->bui_format)); + xfs_bmap_update_put_group(fake); if (error) goto err_cancel; - if (fake.bi_bmap.br_blockcount > 0) { - ASSERT(fake.bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap); + if (fake->bi_bmap.br_blockcount > 0) { + ASSERT(fake->bi_type == XFS_BMAP_UNMAP); + xfs_bmap_unmap_extent(tp, ip, &fake->bi_bmap); } /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c9908fb33765..41108a0b60c9 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -651,6 +651,24 @@ xfs_efi_validate_ext( return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len); } +static inline void +xfs_efi_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_extent *extp) +{ + struct xfs_extent_free_item *xefi; + + xefi = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + xefi->xefi_startblock = extp->ext_start; + xefi->xefi_blockcount = extp->ext_len; + xefi->xefi_agresv = XFS_AG_RESV_NONE; + xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + + xfs_defer_add_item(dfp, &xefi->xefi_list); +} + /* * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. @@ -666,6 +684,7 @@ xfs_efi_item_recover( struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; + struct xfs_extent_free_item *fake; int i; int error = 0; bool requeue_only = false; @@ -683,6 +702,8 @@ xfs_efi_item_recover( sizeof(efip->efi_format)); return -EFSCORRUPTED; } + + xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -693,22 +714,11 @@ xfs_efi_item_recover( efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); xlog_recover_transfer_intent(tp, dfp); - for (i = 0; i < efip->efi_format.efi_nextents; i++) { - struct xfs_extent_free_item fake = { - .xefi_owner = XFS_RMAP_OWN_UNKNOWN, - .xefi_agresv = XFS_AG_RESV_NONE, - }; - struct xfs_extent *extp; - - extp = &efip->efi_format.efi_extents[i]; - - fake.xefi_startblock = extp->ext_start; - fake.xefi_blockcount = extp->ext_len; - + list_for_each_entry(fake, &dfp->dfp_work, xefi_list) { if (!requeue_only) { - xfs_extent_free_get_group(mp, &fake); - error = xfs_trans_free_extent(tp, efdp, &fake); - xfs_extent_free_put_group(&fake); + xfs_extent_free_get_group(mp, fake); + error = xfs_trans_free_extent(tp, efdp, fake); + xfs_extent_free_put_group(fake); } /* @@ -717,10 +727,10 @@ xfs_efi_item_recover( * run again later with a new transaction context. 
*/ if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake.xefi_startblock, - fake.xefi_blockcount, + error = xfs_free_extent_later(tp, fake->xefi_startblock, + fake->xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER, - fake.xefi_agresv); + fake->xefi_agresv); if (!error) { requeue_only = true; continue; @@ -729,7 +739,8 @@ xfs_efi_item_recover( if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - extp, sizeof(*extp)); + &efip->efi_format, + sizeof(efip->efi_format)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index f1b259223802..4ffc34e6f0a0 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -468,6 +468,23 @@ xfs_cui_validate_phys( return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len); } +static inline void +xfs_cui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct xfs_phys_extent *pmap) +{ + struct xfs_refcount_intent *ri; + + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_NOFS | __GFP_NOFAIL); + ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; + ri->ri_startblock = pmap->pe_startblock; + ri->ri_blockcount = pmap->pe_len; + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process a refcount update intent item that was recovered from the log. * We need to update the refcountbt. @@ -484,7 +501,7 @@ xfs_cui_item_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - unsigned int refc_type; + struct xfs_refcount_intent *fake; bool requeue_only = false; int i; int error = 0; @@ -502,6 +519,8 @@ xfs_cui_item_recover( sizeof(cuip->cui_format)); return -EFSCORRUPTED; } + + xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]); } /* @@ -525,35 +544,12 @@ xfs_cui_item_recover( cudp = xfs_trans_get_cud(tp, cuip); xlog_recover_transfer_intent(tp, dfp); - for (i = 0; i < cuip->cui_format.cui_nextents; i++) { - struct xfs_refcount_intent fake = { }; - struct xfs_phys_extent *pmap; - - pmap = &cuip->cui_format.cui_extents[i]; - refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; - switch (refc_type) { - case XFS_REFCOUNT_INCREASE: - case XFS_REFCOUNT_DECREASE: - case XFS_REFCOUNT_ALLOC_COW: - case XFS_REFCOUNT_FREE_COW: - fake.ri_type = refc_type; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - sizeof(cuip->cui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_startblock = pmap->pe_startblock; - fake.ri_blockcount = pmap->pe_len; - + list_for_each_entry(fake, &dfp->dfp_work, ri_list) { if (!requeue_only) { - xfs_refcount_update_get_group(mp, &fake); + xfs_refcount_update_get_group(mp, fake); error = xfs_trans_log_finish_refcount_update(tp, cudp, - &fake, &rcur); - xfs_refcount_update_put_group(&fake); + fake, &rcur); + xfs_refcount_update_put_group(fake); } if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -563,13 +559,13 @@ xfs_cui_item_recover( goto abort_error; /* Requeue what we didn't finish. 
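+		 * A nonzero ri_blockcount means the update was only
+		 * partially completed and the remainder must be
+		 * deferred again.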
*/ - if (fake.ri_blockcount > 0) { + if (fake->ri_blockcount > 0) { struct xfs_bmbt_irec irec = { - .br_startblock = fake.ri_startblock, - .br_blockcount = fake.ri_blockcount, + .br_startblock = fake->ri_startblock, + .br_blockcount = fake->ri_blockcount, }; - switch (fake.ri_type) { + switch (fake->ri_type) { case XFS_REFCOUNT_INCREASE: xfs_refcount_increase_extent(tp, &irec); break; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 5e8a02d2b045..9fb3ae4bfd59 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -498,6 +498,58 @@ xfs_rui_validate_map( return xfs_verify_fsbext(mp, map->me_startblock, map->me_len); } +static inline void +xfs_rui_recover_work( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + const struct xfs_map_extent *map) +{ + struct xfs_rmap_intent *ri; + + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); + + switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { + case XFS_RMAP_EXTENT_MAP: + ri->ri_type = XFS_RMAP_MAP; + break; + case XFS_RMAP_EXTENT_MAP_SHARED: + ri->ri_type = XFS_RMAP_MAP_SHARED; + break; + case XFS_RMAP_EXTENT_UNMAP: + ri->ri_type = XFS_RMAP_UNMAP; + break; + case XFS_RMAP_EXTENT_UNMAP_SHARED: + ri->ri_type = XFS_RMAP_UNMAP_SHARED; + break; + case XFS_RMAP_EXTENT_CONVERT: + ri->ri_type = XFS_RMAP_CONVERT; + break; + case XFS_RMAP_EXTENT_CONVERT_SHARED: + ri->ri_type = XFS_RMAP_CONVERT_SHARED; + break; + case XFS_RMAP_EXTENT_ALLOC: + ri->ri_type = XFS_RMAP_ALLOC; + break; + case XFS_RMAP_EXTENT_FREE: + ri->ri_type = XFS_RMAP_FREE; + break; + default: + ASSERT(0); + return; + } + + ri->ri_owner = map->me_owner; + ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + ri->ri_bmap.br_startblock = map->me_startblock; + ri->ri_bmap.br_startoff = map->me_startoff; + ri->ri_bmap.br_blockcount = map->me_len; + ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + + xfs_defer_add_item(dfp, &ri->ri_list); +} + /* * Process an rmap update intent item that was recovered from the log. * We need to update the rmapbt. 
@@ -514,6 +566,7 @@ xfs_rui_item_recover( struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; + struct xfs_rmap_intent *fake; int i; int error = 0; @@ -530,6 +583,8 @@ xfs_rui_item_recover( sizeof(ruip->rui_format)); return -EFSCORRUPTED; } + + xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]); } resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -541,60 +596,15 @@ xfs_rui_item_recover( rudp = xfs_trans_get_rud(tp, ruip); xlog_recover_transfer_intent(tp, dfp); - for (i = 0; i < ruip->rui_format.rui_nextents; i++) { - struct xfs_rmap_intent fake = { }; - struct xfs_map_extent *map; - - map = &ruip->rui_format.rui_extents[i]; - switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) { - case XFS_RMAP_EXTENT_MAP: - fake.ri_type = XFS_RMAP_MAP; - break; - case XFS_RMAP_EXTENT_MAP_SHARED: - fake.ri_type = XFS_RMAP_MAP_SHARED; - break; - case XFS_RMAP_EXTENT_UNMAP: - fake.ri_type = XFS_RMAP_UNMAP; - break; - case XFS_RMAP_EXTENT_UNMAP_SHARED: - fake.ri_type = XFS_RMAP_UNMAP_SHARED; - break; - case XFS_RMAP_EXTENT_CONVERT: - fake.ri_type = XFS_RMAP_CONVERT; - break; - case XFS_RMAP_EXTENT_CONVERT_SHARED: - fake.ri_type = XFS_RMAP_CONVERT_SHARED; - break; - case XFS_RMAP_EXTENT_ALLOC: - fake.ri_type = XFS_RMAP_ALLOC; - break; - case XFS_RMAP_EXTENT_FREE: - fake.ri_type = XFS_RMAP_FREE; - break; - default: - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - error = -EFSCORRUPTED; - goto abort_error; - } - - fake.ri_owner = map->me_owner; - fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - fake.ri_bmap.br_startblock = map->me_startblock; - fake.ri_bmap.br_startoff = map->me_startoff; - fake.ri_bmap.br_blockcount = map->me_len; - fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - - xfs_rmap_update_get_group(mp, &fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake, + list_for_each_entry(fake, &dfp->dfp_work, ri_list) { + xfs_rmap_update_get_group(mp, fake); + error = xfs_trans_log_finish_rmap_update(tp, rudp, fake, &rcur); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - map, sizeof(*map)); - xfs_rmap_update_put_group(&fake); + &ruip->rui_format, + sizeof(ruip->rui_format)); + xfs_rmap_update_put_group(fake); if (error) goto abort_error; From a51489e140d302c7afae763eacf882a23513f7e4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 09:57:42 -0800 Subject: [PATCH 006/123] xfs: dump the recovered xattri log item if corruption happens If xfs_attri_item_recover receives a corruption error when it tries to finish a recovered log intent item, it should dump the log item for debugging, just like all the other log intent items. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index c4441eacf51c..c7c308d2f897 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -666,6 +666,10 @@ xfs_attri_item_recover( xfs_irele(ip); return 0; } + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &attrip->attri_format, + sizeof(attrip->attri_format)); if (error) { xfs_trans_cancel(tp); goto out_unlock; From 172538beba82e7b65d3d7c84cb558f287381cd7a Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Thu, 30 Nov 2023 10:27:10 -0800 Subject: [PATCH 007/123] xfs: don't set XFS_TRANS_HAS_INTENT_DONE when there's no ATTRD log item XFS_TRANS_HAS_INTENT_DONE is a flag to the CIL that we've added a log intent done item to the transaction. This enables an optimization wherein we avoid writing out log intent and log intent done items if they would have ended up in the same checkpoint. This reduces writes to the ondisk log and speeds up recovery as a result. However, callers can use the defer ops machinery to modify xattrs without using the log items. In this situation, there won't be an intent done item, so we do not need to set the flag. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index bd23c9594a0d..d19a385f9289 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -347,13 +347,15 @@ xfs_xattri_finish_update( * 1.) releases the ATTRI and frees the ATTRD * 2.) shuts down the filesystem */ - args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; + args->trans->t_flags |= XFS_TRANS_DIRTY; /* * attr intent/done items are null when logged attributes are disabled */ - if (attrdp) + if (attrdp) { + args->trans->t_flags |= XFS_TRANS_HAS_INTENT_DONE; set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); + } return error; } From e5f1a5146ec35f3ed5d7f5ac7807a10c0062b6b8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 11:25:45 -0800 Subject: [PATCH 008/123] xfs: use xfs_defer_finish_one to finish recovered work items Get rid of the open-coded calls to xfs_defer_finish_one. This also means that the recovery transaction takes care of cleaning up the dfp, and we have solved (I hope) all the ownership issues in recovery. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 2 +- fs/xfs/libxfs/xfs_defer.h | 1 + fs/xfs/libxfs/xfs_log_recover.h | 2 +- fs/xfs/xfs_attr_item.c | 20 +---------- fs/xfs/xfs_bmap_item.c | 24 ++++--------- fs/xfs/xfs_extfree_item.c | 45 +++++------------------- fs/xfs/xfs_log_recover.c | 22 +++++++----- fs/xfs/xfs_refcount_item.c | 61 +++++---------------------------- fs/xfs/xfs_rmap_item.c | 29 +++++----------- 9 files changed, 49 insertions(+), 157 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 8fb523e4f669..eb262ea06122 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -484,7 +484,7 @@ xfs_defer_relog( * Log an intent-done item for the first pending intent, and finish the work * items. 
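+ * This is now also called from log recovery to finish a single recovered
+ * work item, which is why it is no longer static.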
*/ -static int +int xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index bef5823f61fb..c1a648e99174 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -41,6 +41,7 @@ void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, struct list_head *h); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); +int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_cancel(struct xfs_trans *); void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 13583df9f239..52162a17fc5e 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -155,7 +155,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, unsigned int dfp_type); -void xlog_recover_transfer_intent(struct xfs_trans *tp, +int xlog_recover_finish_intent(struct xfs_trans *tp, struct xfs_defer_pending *dfp); #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index c7c308d2f897..eaf8a877c2cc 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -620,7 +620,6 @@ xfs_attri_item_recover( struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; int total; - struct xfs_attrd_log_item *done_item = NULL; /* * First check the validity of the attr described by the ATTRI. If any @@ -645,27 +644,10 @@ xfs_attri_item_recover( return error; args->trans = tp; - done_item = xfs_trans_get_attrd(tp, attrip); - xlog_recover_transfer_intent(tp, dfp); - xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - error = xfs_xattri_finish_update(attr, done_item); - if (error == -EAGAIN) { - /* - * There's more work to do, so add the intent item to this - * transaction so that we can continue it later. - */ - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list); - error = xfs_defer_ops_capture_and_commit(tp, capture_list); - if (error) - goto out_unlock; - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - return 0; - } + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &attrip->attri_format, diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index b65999bf0ea3..89f2d9e89607 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -497,6 +497,7 @@ xfs_bui_recover_work( bi->bi_bmap.br_blockcount = map->me_len; bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? 
XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_bmap_update_get_group(mp, bi); xfs_defer_add_item(dfp, &bi->bi_list); return bi; @@ -518,8 +519,7 @@ xfs_bui_item_recover( struct xfs_inode *ip = NULL; struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_map_extent *map; - struct xfs_bud_log_item *budp; - struct xfs_bmap_intent *fake; + struct xfs_bmap_intent *work; int iext_delta; int error = 0; @@ -530,7 +530,7 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - fake = xfs_bui_recover_work(mp, dfp, map); + work = xfs_bui_recover_work(mp, dfp, map); error = xlog_recover_iget(mp, map->me_owner, &ip); if (error) @@ -543,39 +543,29 @@ xfs_bui_item_recover( if (error) goto err_rele; - budp = xfs_trans_get_bud(tp, buip); - xlog_recover_transfer_intent(tp, dfp); - xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - if (fake->bi_type == XFS_BMAP_MAP) + if (work->bi_type == XFS_BMAP_MAP) iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT; else iext_delta = XFS_IEXT_PUNCH_HOLE_CNT; - error = xfs_iext_count_may_overflow(ip, fake->bi_whichfork, iext_delta); + error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, iext_delta); if (error) goto err_cancel; - fake->bi_owner = ip; + work->bi_owner = ip; - xfs_bmap_update_get_group(mp, fake); - error = xfs_trans_log_finish_bmap_update(tp, budp, fake); + error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, &buip->bui_format, sizeof(buip->bui_format)); - xfs_bmap_update_put_group(fake); if (error) goto err_cancel; - if (fake->bi_bmap.br_blockcount > 0) { - ASSERT(fake->bi_type == XFS_BMAP_UNMAP); - xfs_bmap_unmap_extent(tp, ip, &fake->bi_bmap); - } - /* * Commit transaction, which frees the transaction and saves the inode * for later replay activities. diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 41108a0b60c9..6a434ade486c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -665,6 +665,7 @@ xfs_efi_recover_work( xefi->xefi_blockcount = extp->ext_len; xefi->xefi_agresv = XFS_AG_RESV_NONE; xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN; + xfs_extent_free_get_group(mp, xefi); xfs_defer_add_item(dfp, &xefi->xefi_list); } @@ -682,12 +683,9 @@ xfs_efi_item_recover( struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_efd_log_item *efdp; struct xfs_trans *tp; - struct xfs_extent_free_item *fake; int i; int error = 0; - bool requeue_only = false; /* * First check the validity of the extents described by the @@ -711,40 +709,13 @@ xfs_efi_item_recover( if (error) return error; - efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); - xlog_recover_transfer_intent(tp, dfp); - - list_for_each_entry(fake, &dfp->dfp_work, xefi_list) { - if (!requeue_only) { - xfs_extent_free_get_group(mp, fake); - error = xfs_trans_free_extent(tp, efdp, fake); - xfs_extent_free_put_group(fake); - } - - /* - * If we can't free the extent without potentially deadlocking, - * requeue the rest of the extents to a new so that they get - * run again later with a new transaction context. 
- */ - if (error == -EAGAIN || requeue_only) { - error = xfs_free_extent_later(tp, fake->xefi_startblock, - fake->xefi_blockcount, - &XFS_RMAP_OINFO_ANY_OWNER, - fake->xefi_agresv); - if (!error) { - requeue_only = true; - continue; - } - } - - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &efip->efi_format, - sizeof(efip->efi_format)); - if (error) - goto abort_error; - - } + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &efip->efi_format, + sizeof(efip->efi_format)); + if (error) + goto abort_error; return xfs_defer_ops_capture_and_commit(tp, capture_list); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index cc14cd1c2282..6fab490959d4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2581,7 +2581,8 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. + * access lip after it returns. It must dispose of @dfp if it + * returns 0. */ error = ops->iop_recover(dfp, &capture_list); if (error) { @@ -2589,8 +2590,6 @@ xlog_recover_process_intents( ops->iop_recover); break; } - - xfs_defer_cancel_recovery(log->l_mp, dfp); } if (error) goto err; @@ -2624,15 +2623,22 @@ xlog_recover_cancel_intents( } /* - * Transfer ownership of the recovered log intent item to the recovery - * transaction. + * Transfer ownership of the recovered pending work to the recovery transaction + * and try to finish the work. If there is more work to be done, the dfp will + * remain attached to the transaction. If not, the dfp is freed. */ -void -xlog_recover_transfer_intent( +int +xlog_recover_finish_intent( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - dfp->dfp_intent = NULL; + int error; + + list_move(&dfp->dfp_list, &tp->t_dfops); + error = xfs_defer_finish_one(tp, dfp); + if (error == -EAGAIN) + return 0; + return error; } /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 4ffc34e6f0a0..f561ca73c784 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -481,6 +481,7 @@ xfs_cui_recover_work( ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK; ri->ri_startblock = pmap->pe_startblock; ri->ri_blockcount = pmap->pe_len; + xfs_refcount_update_get_group(mp, ri); xfs_defer_add_item(dfp, &ri->ri_list); } @@ -497,12 +498,8 @@ xfs_cui_item_recover( struct xfs_trans_res resv; struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); - struct xfs_cud_log_item *cudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_refcount_intent *fake; - bool requeue_only = false; int i; int error = 0; @@ -541,59 +538,17 @@ xfs_cui_item_recover( if (error) return error; - cudp = xfs_trans_get_cud(tp, cuip); - xlog_recover_transfer_intent(tp, dfp); + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &cuip->cui_format, + sizeof(cuip->cui_format)); + if (error) + goto abort_error; - list_for_each_entry(fake, &dfp->dfp_work, ri_list) { - if (!requeue_only) { - xfs_refcount_update_get_group(mp, fake); - error = xfs_trans_log_finish_refcount_update(tp, cudp, - fake, &rcur); - xfs_refcount_update_put_group(fake); - } - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &cuip->cui_format, - 
sizeof(cuip->cui_format)); - if (error) - goto abort_error; - - /* Requeue what we didn't finish. */ - if (fake->ri_blockcount > 0) { - struct xfs_bmbt_irec irec = { - .br_startblock = fake->ri_startblock, - .br_blockcount = fake->ri_blockcount, - }; - - switch (fake->ri_type) { - case XFS_REFCOUNT_INCREASE: - xfs_refcount_increase_extent(tp, &irec); - break; - case XFS_REFCOUNT_DECREASE: - xfs_refcount_decrease_extent(tp, &irec); - break; - case XFS_REFCOUNT_ALLOC_COW: - xfs_refcount_alloc_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - case XFS_REFCOUNT_FREE_COW: - xfs_refcount_free_cow_extent(tp, - irec.br_startblock, - irec.br_blockcount); - break; - default: - ASSERT(0); - } - requeue_only = true; - } - } - - xfs_refcount_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_refcount_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 9fb3ae4bfd59..23e736179894 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -546,6 +546,7 @@ xfs_rui_recover_work( ri->ri_bmap.br_blockcount = map->me_len; ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + xfs_rmap_update_get_group(mp, ri); xfs_defer_add_item(dfp, &ri->ri_list); } @@ -562,11 +563,8 @@ xfs_rui_item_recover( struct xfs_trans_res resv; struct xfs_log_item *lip = dfp->dfp_intent; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); - struct xfs_rud_log_item *rudp; struct xfs_trans *tp; - struct xfs_btree_cur *rcur = NULL; struct xfs_mount *mp = lip->li_log->l_mp; - struct xfs_rmap_intent *fake; int i; int error = 0; @@ -593,28 +591,17 @@ xfs_rui_item_recover( if (error) return error; - rudp = xfs_trans_get_rud(tp, ruip); - xlog_recover_transfer_intent(tp, dfp); + error = xlog_recover_finish_intent(tp, dfp); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + &ruip->rui_format, + sizeof(ruip->rui_format)); + if (error) + goto abort_error; - list_for_each_entry(fake, &dfp->dfp_work, ri_list) { - xfs_rmap_update_get_group(mp, fake); - error = xfs_trans_log_finish_rmap_update(tp, rudp, fake, - &rcur); - if (error == -EFSCORRUPTED) - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, - &ruip->rui_format, - sizeof(ruip->rui_format)); - xfs_rmap_update_put_group(fake); - if (error) - goto abort_error; - - } - - xfs_rmap_finish_one_cleanup(tp, rcur, error); return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: - xfs_rmap_finish_one_cleanup(tp, rcur, error); xfs_trans_cancel(tp); return error; } From 3dd75c8db1c1675a26d3e228bab349c1fc065867 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 10:45:28 -0800 Subject: [PATCH 009/123] xfs: hoist intent done flag setting to ->finish_item callsite Each log intent item's ->finish_item call chain inevitably includes some code to set the dirty flag of the transaction. If there's an associated log intent done item, it also sets the item's dirty flag and the transaction's INTENT_DONE flag. This is repeated throughout the codebase. Reduce the LOC by moving all that to xfs_defer_finish_one. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 28 +++++++++++++++++++++++++++- fs/xfs/xfs_attr_item.c | 30 ++++-------------------------- fs/xfs/xfs_bmap_item.c | 16 +--------------- fs/xfs/xfs_extfree_item.c | 20 -------------------- fs/xfs/xfs_refcount_item.c | 16 +--------------- fs/xfs/xfs_rmap_item.c | 16 +--------------- 6 files changed, 34 insertions(+), 92 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index dd565e4e3daf..6214abedf394 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -191,6 +191,32 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, }; +/* Create a log intent done item for a log intent item. */ +static inline void +xfs_defer_create_done( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + struct xfs_log_item *lip; + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the log intent item and frees the log done item + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + lip = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + if (!lip) + return; + + tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + set_bit(XFS_LI_DIRTY, &lip->li_flags); + dfp->dfp_done = lip; +} + /* * Ensure there's a log intent item associated with this deferred work item if * the operation must be restarted on crash. Returns 1 if there's a log item; @@ -496,7 +522,7 @@ xfs_defer_finish_one( trace_xfs_defer_pending_finish(tp->t_mountp, dfp); - dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + xfs_defer_create_done(tp, dfp); list_for_each_safe(li, n, &dfp->dfp_work) { list_del(li); dfp->dfp_count--; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index d19a385f9289..e7acbb736bee 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -324,39 +324,17 @@ xfs_xattri_finish_update( struct xfs_da_args *args = attr->xattri_da_args; int error; - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { - error = -EIO; - goto out; - } + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) + return -EIO; /* If an attr removal is trivially complete, we're done. */ if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && - !xfs_inode_hasattr(args->dp)) { - error = 0; - goto out; - } + !xfs_inode_hasattr(args->dp)) + return 0; error = xfs_attr_set_iter(attr); if (!error && attr->xattri_dela_state != XFS_DAS_DONE) error = -EAGAIN; -out: - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the ATTRI and frees the ATTRD - * 2.) shuts down the filesystem - */ - args->trans->t_flags |= XFS_TRANS_DIRTY; - - /* - * attr intent/done items are null when logged attributes are disabled - */ - if (attrdp) { - args->trans->t_flags |= XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - } - return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index bd8f6fe22b40..913315cb5123 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -249,21 +249,7 @@ xfs_trans_log_finish_bmap_update( struct xfs_bud_log_item *budp, struct xfs_bmap_intent *bi) { - int error; - - error = xfs_bmap_finish_one(tp, bi); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) 
releases the BUI and frees the BUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - - return error; + return xfs_bmap_finish_one(tp, bi); } /* Sort bmap intents by inode. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 49e96ffd64e0..e8e02f816cbe 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -396,16 +396,6 @@ xfs_trans_free_extent( xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - /* * If we need a new transaction to make progress, the caller will log a * new EFI with the current contents. It will also log an EFD to cancel @@ -601,16 +591,6 @@ xfs_agfl_free_finish_item( error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno, agbno, agbp, &oinfo); - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the EFI and frees the EFD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); - next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); extp = &(efdp->efd_format.efd_extents[next_extent]); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 48f1a38b272e..2628b1e3969c 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -256,21 +256,7 @@ xfs_trans_log_finish_refcount_update( struct xfs_refcount_intent *ri, struct xfs_btree_cur **pcur) { - int error; - - error = xfs_refcount_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the CUI and frees the CUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - - return error; + return xfs_refcount_finish_one(tp, ri, pcur); } /* Sort refcount intents by AG. */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 23684bc2ab85..8f216a13a7f2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -297,21 +297,7 @@ xfs_trans_log_finish_rmap_update( struct xfs_rmap_intent *ri, struct xfs_btree_cur **pcur) { - int error; - - error = xfs_rmap_finish_one(tp, ri, pcur); - - /* - * Mark the transaction dirty, even on error. This ensures the - * transaction is aborted, which: - * - * 1.) releases the RUI and frees the RUD - * 2.) shuts down the filesystem - */ - tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE; - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - - return error; + return xfs_rmap_finish_one(tp, ri, pcur); } /* Sort rmap intents by AG. */ From db7ccc0bac2add5a41b66578e376b49328fc99d0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 22 Nov 2023 13:39:25 -0800 Subject: [PATCH 010/123] xfs: move ->iop_recover to xfs_defer_op_type Finish off the series by moving the intent item recovery function pointer to the xfs_defer_op_type struct, since this is really a deferred work function now. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 17 +++++++++++++ fs/xfs/libxfs/xfs_defer.h | 4 +++ fs/xfs/libxfs/xfs_log_recover.h | 2 ++ fs/xfs/xfs_attr_item.c | 21 +++++++++------- fs/xfs/xfs_bmap_item.c | 39 ++++++++++++++++-------------- fs/xfs/xfs_extfree_item.c | 43 +++++++++++++++++---------------- fs/xfs/xfs_log_recover.c | 19 ++++++--------- fs/xfs/xfs_refcount_item.c | 24 +++++++++--------- fs/xfs/xfs_rmap_item.c | 24 +++++++++--------- fs/xfs/xfs_trans.h | 4 --- 10 files changed, 109 insertions(+), 88 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index eb262ea06122..dd565e4e3daf 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -713,6 +713,23 @@ xfs_defer_cancel_recovery( xfs_defer_pending_cancel_work(mp, dfp); } +/* Replay the deferred work item created from a recovered log intent item. */ +int +xfs_defer_finish_recovery( + struct xfs_mount *mp, + struct xfs_defer_pending *dfp, + struct list_head *capture_list) +{ + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + int error; + + error = ops->recover_work(dfp, capture_list); + if (error) + trace_xlog_intent_recovery_failed(mp, error, + ops->recover_work); + return error; +} + /* * Move deferred ops from one transaction to another and reset the source to * initial state. This is primarily used to carry state forward across diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index c1a648e99174..ef86a7f9b059 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -57,6 +57,8 @@ struct xfs_defer_op_type { void (*finish_cleanup)(struct xfs_trans *tp, struct xfs_btree_cur *state, int error); void (*cancel_item)(struct list_head *item); + int (*recover_work)(struct xfs_defer_pending *dfp, + struct list_head *capture_list); unsigned int max_items; }; @@ -130,6 +132,8 @@ void xfs_defer_start_recovery(struct xfs_log_item *lip, enum xfs_defer_ops_type dfp_type, struct list_head *r_dfops); void xfs_defer_cancel_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp); +int xfs_defer_finish_recovery(struct xfs_mount *mp, + struct xfs_defer_pending *dfp, struct list_head *capture_list); static inline void xfs_defer_add_item( diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 52162a17fc5e..c8e5d912895b 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -153,6 +153,8 @@ xlog_recover_resv(const struct xfs_trans_res *r) return ret; } +struct xfs_defer_pending; + void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, unsigned int dfp_type); int xlog_recover_finish_intent(struct xfs_trans *tp, diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index eaf8a877c2cc..bd23c9594a0d 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -544,12 +544,17 @@ xfs_attri_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, struct xfs_attri_log_format *attrp, - struct xfs_inode *ip, + struct xfs_inode **ipp, struct xfs_attri_log_nameval *nv) { struct xfs_attr_intent *attr; struct xfs_da_args *args; int local; + int error; + + error = xlog_recover_iget(mp, attrp->alfi_ino, ipp); + if (error) + return ERR_PTR(error); attr = kmem_zalloc(sizeof(struct xfs_attr_intent) + sizeof(struct xfs_da_args), KM_NOFS); @@ -567,7 +572,7 @@ xfs_attri_recover_work( attr->xattri_nameval = xfs_attri_log_nameval_get(nv); ASSERT(attr->xattri_nameval); - args->dp = ip; + args->dp = *ipp; args->geo = 
mp->m_attr_geo; args->whichfork = XFS_ATTR_FORK; args->name = nv->name.i_addr; @@ -604,7 +609,7 @@ xfs_attri_recover_work( * delete the attr that it describes. */ STATIC int -xfs_attri_item_recover( +xfs_attr_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -630,11 +635,9 @@ xfs_attri_item_recover( !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) return -EFSCORRUPTED; - error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); - if (error) - return error; - - attr = xfs_attri_recover_work(mp, dfp, attrp, ip, nv); + attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv); + if (IS_ERR(attr)) + return PTR_ERR(attr); args = attr->xattri_da_args; xfs_init_attr_trans(args, &resv, &total); @@ -820,6 +823,7 @@ const struct xfs_defer_op_type xfs_attr_defer_type = { .create_done = xfs_attr_create_done, .finish_item = xfs_attr_finish_item, .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, }; /* @@ -856,7 +860,6 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_format = xfs_attri_item_format, .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, - .iop_recover = xfs_attri_item_recover, .iop_match = xfs_attri_item_match, .iop_relog = xfs_attri_item_relog, }; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 89f2d9e89607..bd8f6fe22b40 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -437,15 +437,6 @@ xfs_bmap_update_cancel_item( kmem_cache_free(xfs_bmap_intent_cache, bi); } -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, -}; - /* Is this recovered BUI ok? */ static inline bool xfs_bui_validate( @@ -484,9 +475,15 @@ static inline struct xfs_bmap_intent * xfs_bui_recover_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp, + struct xfs_inode **ipp, struct xfs_map_extent *map) { struct xfs_bmap_intent *bi; + int error; + + error = xlog_recover_iget(mp, map->me_owner, ipp); + if (error) + return ERR_PTR(error); bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? @@ -497,6 +494,7 @@ xfs_bui_recover_work( bi->bi_bmap.br_blockcount = map->me_len; bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + bi->bi_owner = *ipp; xfs_bmap_update_get_group(mp, bi); xfs_defer_add_item(dfp, &bi->bi_list); @@ -508,7 +506,7 @@ xfs_bui_recover_work( * We need to update some inode's bmbt. */ STATIC int -xfs_bui_item_recover( +xfs_bmap_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -530,11 +528,9 @@ xfs_bui_item_recover( } map = &buip->bui_format.bui_extents[0]; - work = xfs_bui_recover_work(mp, dfp, map); - - error = xlog_recover_iget(mp, map->me_owner, &ip); - if (error) - return error; + work = xfs_bui_recover_work(mp, dfp, &ip, map); + if (IS_ERR(work)) + return PTR_ERR(work); /* Allocate transaction and do the work. 
*/ resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); @@ -557,8 +553,6 @@ xfs_bui_item_recover( if (error) goto err_cancel; - work->bi_owner = ip; - error = xlog_recover_finish_intent(tp, dfp); if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, @@ -587,6 +581,16 @@ xfs_bui_item_recover( return error; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, +}; + STATIC bool xfs_bui_item_match( struct xfs_log_item *lip, @@ -627,7 +631,6 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_format = xfs_bui_item_format, .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, - .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, .iop_relog = xfs_bui_item_relog, }; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6a434ade486c..49e96ffd64e0 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -567,15 +567,6 @@ xfs_extent_free_cancel_item( kmem_cache_free(xfs_extfree_item_cache, xefi); } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* * AGFL blocks are accounted differently in the reserve pools and are not * inserted into the busy extent list. @@ -632,16 +623,6 @@ xfs_agfl_free_finish_item( return error; } -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, -}; - /* Is this recovered EFI ok? */ static inline bool xfs_efi_validate_ext( @@ -675,7 +656,7 @@ xfs_efi_recover_work( * the log. We need to free the extents that it describes. 
*/ STATIC int -xfs_efi_item_recover( +xfs_extent_free_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -724,6 +705,27 @@ xfs_efi_item_recover( return error; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, +}; + STATIC bool xfs_efi_item_match( struct xfs_log_item *lip, @@ -766,7 +768,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_format = xfs_efi_item_format, .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, - .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, .iop_relog = xfs_efi_item_relog, }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 6fab490959d4..c18692af2c65 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2562,17 +2562,14 @@ xlog_recover_process_intents( #endif list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) { - struct xfs_log_item *lip = dfp->dfp_intent; - const struct xfs_item_ops *ops = lip->li_ops; - - ASSERT(xlog_item_is_intent(lip)); + ASSERT(xlog_item_is_intent(dfp->dfp_intent)); /* * We should never see a redo item with a LSN higher than * the last transaction we found in the log at the start * of recovery. */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); + ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0); /* * NOTE: If your intent processing routine can create more @@ -2581,15 +2578,13 @@ xlog_recover_process_intents( * replayed in the wrong order! * * The recovery function can free the log item, so we must not - * access lip after it returns. It must dispose of @dfp if it - * returns 0. + * access dfp->dfp_intent after it returns. It must dispose of + * @dfp if it returns 0. */ - error = ops->iop_recover(dfp, &capture_list); - if (error) { - trace_xlog_intent_recovery_failed(log->l_mp, error, - ops->iop_recover); + error = xfs_defer_finish_recovery(log->l_mp, dfp, + &capture_list); + if (error) break; - } } if (error) goto err; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index f561ca73c784..48f1a38b272e 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -433,16 +433,6 @@ xfs_refcount_update_cancel_item( kmem_cache_free(xfs_refcount_intent_cache, ri); } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, -}; - /* Is this recovered CUI ok? 
*/ static inline bool xfs_cui_validate_phys( @@ -491,7 +481,7 @@ xfs_cui_recover_work( * We need to update the refcountbt. */ STATIC int -xfs_cui_item_recover( +xfs_refcount_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -553,6 +543,17 @@ xfs_cui_item_recover( return error; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, +}; + STATIC bool xfs_cui_item_match( struct xfs_log_item *lip, @@ -593,7 +594,6 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_format = xfs_cui_item_format, .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, - .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, .iop_relog = xfs_cui_item_relog, }; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 23e736179894..23684bc2ab85 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -452,16 +452,6 @@ xfs_rmap_update_cancel_item( kmem_cache_free(xfs_rmap_intent_cache, ri); } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, -}; - /* Is this recovered RUI ok? */ static inline bool xfs_rui_validate_map( @@ -556,7 +546,7 @@ xfs_rui_recover_work( * We need to update the rmapbt. 
*/ STATIC int -xfs_rui_item_recover( +xfs_rmap_recover_work( struct xfs_defer_pending *dfp, struct list_head *capture_list) { @@ -606,6 +596,17 @@ xfs_rui_item_recover( return error; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, +}; + STATIC bool xfs_rui_item_match( struct xfs_log_item *lip, @@ -646,7 +647,6 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_format = xfs_rui_item_format, .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, - .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, .iop_relog = xfs_rui_item_relog, }; diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 4e38357237c3..5fb018ad9fc0 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -66,8 +66,6 @@ struct xfs_log_item { { (1u << XFS_LI_DIRTY), "DIRTY" }, \ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" } -struct xfs_defer_pending; - struct xfs_item_ops { unsigned flags; void (*iop_size)(struct xfs_log_item *, int *, int *); @@ -80,8 +78,6 @@ struct xfs_item_ops { xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); - int (*iop_recover)(struct xfs_defer_pending *dfp, - struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, struct xfs_trans *tp); From e6e5299fcbf0b18cad45cd58f99787549c790857 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 10:23:19 -0800 Subject: [PATCH 011/123] xfs: collapse the ->finish_item helpers Each log item's ->finish_item function sets up a small amount of state and calls another function to do the work. Collapse that other function into ->finish_item to reduce the call stack height. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 60 +++++++++---------------- fs/xfs/xfs_bmap_item.c | 18 +------- fs/xfs/xfs_extfree_item.c | 90 ++++++++++++++------------------------ fs/xfs/xfs_refcount_item.c | 18 +------- fs/xfs/xfs_rmap_item.c | 18 +------- 5 files changed, 58 insertions(+), 146 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e7acbb736bee..96438cd38633 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -310,34 +310,6 @@ xfs_attrd_item_intent( return &ATTRD_ITEM(lip)->attrd_attrip->attri_item; } -/* - * Performs one step of an attribute update intent and marks the attrd item - * dirty.. An attr operation may be a set or a remove. Note that the - * transaction is marked dirty regardless of whether the operation succeeds or - * fails to support the ATTRI/ATTRD lifecycle rules. - */ -STATIC int -xfs_xattri_finish_update( - struct xfs_attr_intent *attr, - struct xfs_attrd_log_item *attrdp) -{ - struct xfs_da_args *args = attr->xattri_da_args; - int error; - - if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) - return -EIO; - - /* If an attr removal is trivially complete, we're done. 
*/ - if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && - !xfs_inode_hasattr(args->dp)) - return 0; - - error = xfs_attr_set_iter(attr); - if (!error && attr->xattri_dela_state != XFS_DAS_DONE) - error = -EAGAIN; - return error; -} - /* Log an attr to the intent item. */ STATIC void xfs_attr_log_item( @@ -434,23 +406,33 @@ xfs_attr_finish_item( struct xfs_btree_cur **state) { struct xfs_attr_intent *attr; - struct xfs_attrd_log_item *done_item = NULL; + struct xfs_da_args *args; int error; attr = container_of(item, struct xfs_attr_intent, xattri_list); - if (done) - done_item = ATTRD_ITEM(done); + args = attr->xattri_da_args; - /* - * Always reset trans after EAGAIN cycle - * since the transaction is new - */ - attr->xattri_da_args->trans = tp; + /* Reset trans after EAGAIN cycle since the transaction is new */ + args->trans = tp; - error = xfs_xattri_finish_update(attr, done_item); - if (error != -EAGAIN) - xfs_attr_free_item(attr); + if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) { + error = -EIO; + goto out; + } + /* If an attr removal is trivially complete, we're done. */ + if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE && + !xfs_inode_hasattr(args->dp)) { + error = 0; + goto out; + } + + error = xfs_attr_set_iter(attr); + if (!error && attr->xattri_dela_state != XFS_DAS_DONE) + return -EAGAIN; + +out: + xfs_attr_free_item(attr); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 913315cb5123..79d19b5b0e5e 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -238,20 +238,6 @@ xfs_trans_get_bud( return budp; } -/* - * Finish an bmap update and log it to the BUD. Note that the - * transaction is marked dirty regardless of whether the bmap update - * succeeds or fails to support the BUI/BUD lifecycle rules. - */ -static int -xfs_trans_log_finish_bmap_update( - struct xfs_trans *tp, - struct xfs_bud_log_item *budp, - struct xfs_bmap_intent *bi) -{ - return xfs_bmap_finish_one(tp, bi); -} - /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -378,7 +364,7 @@ xfs_bmap_update_put_group( xfs_perag_intent_put(bi->bi_pag); } -/* Process a deferred rmap update. */ +/* Process a deferred bmap update. */ STATIC int xfs_bmap_update_finish_item( struct xfs_trans *tp, @@ -391,7 +377,7 @@ xfs_bmap_update_finish_item( bi = container_of(item, struct xfs_bmap_intent, bi_list); - error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi); + error = xfs_bmap_finish_one(tp, bi); if (!error && bi->bi_bmap.br_blockcount > 0) { ASSERT(bi->bi_type == XFS_BMAP_UNMAP); return -EAGAIN; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e8e02f816cbe..581a70acd1ac 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -364,59 +364,6 @@ xfs_efd_from_efi( efdp->efd_next_extent = efip->efi_format.efi_nextents; } -/* - * Free an extent and log it to the EFD. Note that the transaction is marked - * dirty regardless of whether the extent free succeeds or fails to support the - * EFI/EFD lifecycle rules. 
- */ -static int -xfs_trans_free_extent( - struct xfs_trans *tp, - struct xfs_efd_log_item *efdp, - struct xfs_extent_free_item *xefi) -{ - struct xfs_owner_info oinfo = { }; - struct xfs_mount *mp = tp->t_mountp; - struct xfs_extent *extp; - uint next_extent; - xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, - xefi->xefi_startblock); - int error; - - oinfo.oi_owner = xefi->xefi_owner; - if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) - oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; - if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) - oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; - - trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, - agbno, xefi->xefi_blockcount); - - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); - - /* - * If we need a new transaction to make progress, the caller will log a - * new EFI with the current contents. It will also log an EFD to cancel - * the existing EFI, and so we need to copy all the unprocessed extents - * in this EFI to the EFD so this works correctly. - */ - if (error == -EAGAIN) { - xfs_efd_from_efi(efdp); - return error; - } - - next_extent = efdp->efd_next_extent; - ASSERT(next_extent < efdp->efd_format.efd_nextents); - extp = &(efdp->efd_format.efd_extents[next_extent]); - extp->ext_start = xefi->xefi_startblock; - extp->ext_len = xefi->xefi_blockcount; - efdp->efd_next_extent++; - - return error; -} - /* Sort bmap items by AG. */ static int xfs_extent_free_diff_items( @@ -517,19 +464,48 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_extent_free_item *xefi; + struct xfs_efd_log_item *efdp = EFD_ITEM(done); + struct xfs_mount *mp = tp->t_mountp; + struct xfs_extent *extp; + uint next_extent; + xfs_agblock_t agbno; int error; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); + agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); - error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); + oinfo.oi_owner = xefi->xefi_owner; + if (xefi->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0, + agbno, xefi->xefi_blockcount); /* - * Don't free the XEFI if we need a new transaction to complete - * processing of it. + * If we need a new transaction to make progress, the caller will log a + * new EFI with the current contents. It will also log an EFD to cancel + * the existing EFI, and so we need to copy all the unprocessed extents + * in this EFI to the EFD so this works correctly. 
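+	 * The -EAGAIN branch below uses xfs_efd_from_efi() to perform
+	 * exactly that copy before handing the error back to the caller.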
*/ - if (error == -EAGAIN) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (error == -EAGAIN) { + xfs_efd_from_efi(efdp); return error; + } + + /* Add the work we finished to the EFD, even though nobody uses that */ + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = xefi->xefi_startblock; + extp->ext_len = xefi->xefi_blockcount; + efdp->efd_next_extent++; xfs_extent_free_put_group(xefi); kmem_cache_free(xfs_extfree_item_cache, xefi); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 2628b1e3969c..7273f538db2e 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -244,21 +244,6 @@ xfs_trans_get_cud( return cudp; } -/* - * Finish an refcount update and log it to the CUD. Note that the - * transaction is marked dirty regardless of whether the refcount - * update succeeds or fails to support the CUI/CUD lifecycle rules. - */ -static int -xfs_trans_log_finish_refcount_update( - struct xfs_trans *tp, - struct xfs_cud_log_item *cudp, - struct xfs_refcount_intent *ri, - struct xfs_btree_cur **pcur) -{ - return xfs_refcount_finish_one(tp, ri, pcur); -} - /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -383,10 +368,9 @@ xfs_refcount_update_finish_item( int error; ri = container_of(item, struct xfs_refcount_intent, ri_list); - error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri, - state); /* Did we run out of reservation? Requeue what we didn't finish. */ + error = xfs_refcount_finish_one(tp, ri, state); if (!error && ri->ri_blockcount > 0) { ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE || ri->ri_type == XFS_REFCOUNT_DECREASE); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 8f216a13a7f2..d54fd925b746 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -285,21 +285,6 @@ xfs_trans_set_rmap_flags( } } -/* - * Finish an rmap update and log it to the RUD. Note that the transaction is - * marked dirty regardless of whether the rmap update succeeds or fails to - * support the RUI/RUD lifecycle rules. - */ -static int -xfs_trans_log_finish_rmap_update( - struct xfs_trans *tp, - struct xfs_rud_log_item *rudp, - struct xfs_rmap_intent *ri, - struct xfs_btree_cur **pcur) -{ - return xfs_rmap_finish_one(tp, ri, pcur); -} - /* Sort rmap intents by AG. */ static int xfs_rmap_update_diff_items( @@ -409,8 +394,7 @@ xfs_rmap_update_finish_item( ri = container_of(item, struct xfs_rmap_intent, ri_list); - error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri, - state); + error = xfs_rmap_finish_one(tp, ri, state); xfs_rmap_update_put_group(ri); kmem_cache_free(xfs_rmap_intent_cache, ri); From f3fd7f6fce1cc9b8eb59705b27f823330207b7c9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 10:58:37 -0800 Subject: [PATCH 012/123] xfs: hoist ->create_intent boilerplate to its callsite Hoist the dirty flag setting code out of each ->create_intent implementation up to the callsite to reduce boilerplate further. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 2 ++ fs/xfs/xfs_attr_item.c | 3 --- fs/xfs/xfs_bmap_item.c | 3 --- fs/xfs/xfs_extfree_item.c | 3 --- fs/xfs/xfs_refcount_item.c | 3 --- fs/xfs/xfs_rmap_item.c | 3 --- 6 files changed, 2 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 6214abedf394..2871c773a122 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -240,6 +240,8 @@ xfs_defer_create_intent( if (IS_ERR(lip)) return PTR_ERR(lip); + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 96438cd38633..fc199256fc8e 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -319,9 +319,6 @@ xfs_attr_log_item( { struct xfs_attri_log_format *attrp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags); - /* * At this point the xfs_attr_intent has been constructed, and we've * created the log intent. Fill in the attri log item and log format diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 79d19b5b0e5e..24cf70154a54 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -286,9 +286,6 @@ xfs_bmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 581a70acd1ac..d07cdc3eb809 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -390,9 +390,6 @@ xfs_extent_free_log_item( uint next_extent; struct xfs_extent *extp; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 7273f538db2e..f604b7e3b77e 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -289,9 +289,6 @@ xfs_refcount_update_log_item( uint next_extent; struct xfs_phys_extent *pmap; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index d54fd925b746..05841548691d 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -311,9 +311,6 @@ xfs_rmap_update_log_item( uint next_extent; struct xfs_map_extent *map; - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); - /* * atomic_inc_return gives us the value after the increment; * we want to use it as an array index so we need to subtract 1 from From bd3a88f6b71c7509566b44b7021581191cc11ae3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 11:44:56 -0800 Subject: [PATCH 013/123] xfs: use xfs_defer_create_done for the relogging operation Now that we have a helper to handle creating a log intent done item and updating all the necessary state flags, use it to reduce boilerplate in the ->iop_relog implementations. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 6 +++++- fs/xfs/xfs_attr_item.c | 6 +----- fs/xfs/xfs_bmap_item.c | 6 +----- fs/xfs/xfs_extfree_item.c | 6 ++---- fs/xfs/xfs_refcount_item.c | 6 +----- fs/xfs/xfs_rmap_item.c | 6 +----- fs/xfs/xfs_trans.h | 4 +++- 7 files changed, 14 insertions(+), 26 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 2871c773a122..63b9960a96e1 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -500,7 +500,11 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + + xfs_defer_create_done(*tpp, dfp); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, + dfp->dfp_done, *tpp); + dfp->dfp_done = NULL; } if ((*tpp)->t_flags & XFS_TRANS_DIRTY) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index fc199256fc8e..e9813fa64461 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -630,9 +630,9 @@ xfs_attr_recover_work( static struct xfs_log_item * xfs_attri_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_attrd_log_item *attrdp; struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; struct xfs_attri_log_format *new_attrp; @@ -641,10 +641,6 @@ xfs_attri_item_relog( old_attrip = ATTRI_ITEM(intent); old_attrp = &old_attrip->attri_format; - tp->t_flags |= XFS_TRANS_DIRTY; - attrdp = xfs_trans_get_attrd(tp, old_attrip); - set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags); - /* * Create a new log item that shares the same name/value buffer as the * old log item. diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 24cf70154a54..ba385c06de5d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -572,9 +572,9 @@ xfs_bui_item_match( static struct xfs_log_item * xfs_bui_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_bud_log_item *budp; struct xfs_bui_log_item *buip; struct xfs_map_extent *map; unsigned int count; @@ -582,10 +582,6 @@ xfs_bui_item_relog( count = BUI_ITEM(intent)->bui_format.bui_nextents; map = BUI_ITEM(intent)->bui_format.bui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); - buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d07cdc3eb809..807398479187 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -691,9 +691,10 @@ xfs_efi_item_match( static struct xfs_log_item * xfs_efi_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_efd_log_item *efdp; + struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; struct xfs_extent *extp; unsigned int count; @@ -701,11 +702,8 @@ xfs_efi_item_relog( count = EFI_ITEM(intent)->efi_format.efi_nextents; extp = EFI_ITEM(intent)->efi_format.efi_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); efdp->efd_next_extent = count; memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); - set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); efip = xfs_efi_init(tp->t_mountp, count); 
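	/*
	 * Copy all the unfinished extents from the old intent into the
	 * replacement EFI so the relogged intent describes exactly the
	 * work that still remains.
	 */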
memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index f604b7e3b77e..142839a8e7b1 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -533,9 +533,9 @@ xfs_cui_item_match( static struct xfs_log_item * xfs_cui_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_cud_log_item *cudp; struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; unsigned int count; @@ -543,10 +543,6 @@ xfs_cui_item_relog( count = CUI_ITEM(intent)->cui_format.cui_nextents; pmap = CUI_ITEM(intent)->cui_format.cui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); - cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 05841548691d..e2730b3e0d96 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -586,9 +586,9 @@ xfs_rui_item_match( static struct xfs_log_item * xfs_rui_item_relog( struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp) { - struct xfs_rud_log_item *rudp; struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; unsigned int count; @@ -596,10 +596,6 @@ xfs_rui_item_relog( count = RUI_ITEM(intent)->rui_format.rui_nextents; map = RUI_ITEM(intent)->rui_format.rui_extents; - tp->t_flags |= XFS_TRANS_DIRTY; - rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); - set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); - ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 5fb018ad9fc0..25646e2b12f4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -80,6 +80,7 @@ struct xfs_item_ops { void (*iop_release)(struct xfs_log_item *); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, + struct xfs_log_item *done_item, struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -248,9 +249,10 @@ extern struct kmem_cache *xfs_trans_cache; static inline struct xfs_log_item * xfs_trans_item_relog( struct xfs_log_item *lip, + struct xfs_log_item *done_lip, struct xfs_trans *tp) { - return lip->li_ops->iop_relog(lip, tp); + return lip->li_ops->iop_relog(lip, done_lip, tp); } struct xfs_dquot; From 3e0958be2156d90ef908a1a547b4e27a3ec38da9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 11:47:45 -0800 Subject: [PATCH 014/123] xfs: clean out XFS_LI_DIRTY setting boilerplate from ->iop_relog Hoist this dirty flag setting to the ->iop_relog callsite to reduce boilerplate. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 9 +++++++-- fs/xfs/xfs_attr_item.c | 1 - fs/xfs/xfs_bmap_item.c | 2 +- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 63b9960a96e1..aa19ede91a57 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -474,6 +474,8 @@ xfs_defer_relog( ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); list_for_each_entry(dfp, dfops, dfp_list) { + struct xfs_log_item *lip; + /* * If the log intent item for this deferred op is not a part of * the current log checkpoint, relog the intent item to keep @@ -502,9 +504,12 @@ xfs_defer_relog( XFS_STATS_INC((*tpp)->t_mountp, defer_relog); xfs_defer_create_done(*tpp, dfp); - dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, - dfp->dfp_done, *tpp); + lip = xfs_trans_item_relog(dfp->dfp_intent, dfp->dfp_done, + *tpp); + if (lip) + set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_done = NULL; + dfp->dfp_intent = lip; } if ((*tpp)->t_flags & XFS_TRANS_DIRTY) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index e9813fa64461..5d86a4b8b457 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -655,7 +655,6 @@ xfs_attri_item_relog( new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; xfs_trans_add_item(tp, &new_attrip->attri_item); - set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags); return &new_attrip->attri_item; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ba385c06de5d..ef72061d7cec 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -586,7 +586,7 @@ xfs_bui_item_relog( memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); xfs_trans_add_item(tp, &buip->bui_item); - set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 807398479187..e2e86f2edb3c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -709,7 +709,7 @@ xfs_efi_item_relog( memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); xfs_trans_add_item(tp, &efip->efi_item); - set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 142839a8e7b1..01d16e795068 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -547,7 +547,7 @@ xfs_cui_item_relog( memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); xfs_trans_add_item(tp, &cuip->cui_item); - set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index e2730b3e0d96..96b2dc832d62 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -600,7 +600,7 @@ xfs_rui_item_relog( memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); xfs_trans_add_item(tp, &ruip->rui_item); - set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; } From b28852a5bd08654634e4e32eb072fba14c5fae26 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 12:06:08 -0800 Subject: [PATCH 015/123] xfs: hoist xfs_trans_add_item calls to defer ops functions Remove even more repeated boilerplate. 
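With this change, attaching a log item to the transaction happens in
exactly one layer. A simplified sketch of the resulting flow in
xfs_defer_create_intent (illustrative only, not the verbatim kernel
code):

	lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
	if (!lip)
		return 0;
	if (IS_ERR(lip))
		return PTR_ERR(lip);

	tp->t_flags |= XFS_TRANS_DIRTY;
	xfs_trans_add_item(tp, lip);	/* hoisted from each ->create_intent */
	set_bit(XFS_LI_DIRTY, &lip->li_flags);
	dfp->dfp_intent = lip;
	return 1;

The per-type ->create_intent implementations are left with nothing to do
but allocate and format their log items.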
Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 7 ++++++- fs/xfs/xfs_attr_item.c | 4 ---- fs/xfs/xfs_bmap_item.c | 3 --- fs/xfs/xfs_extfree_item.c | 3 --- fs/xfs/xfs_refcount_item.c | 3 --- fs/xfs/xfs_rmap_item.c | 3 --- 6 files changed, 6 insertions(+), 17 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index aa19ede91a57..95f15a4b2126 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -26,6 +26,7 @@ #include "xfs_da_format.h" #include "xfs_da_btree.h" #include "xfs_attr.h" +#include "xfs_trans_priv.h" static struct kmem_cache *xfs_defer_pending_cache; @@ -213,6 +214,7 @@ xfs_defer_create_done( return; tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE; + xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_done = lip; } @@ -241,6 +243,7 @@ xfs_defer_create_intent( return PTR_ERR(lip); tp->t_flags |= XFS_TRANS_DIRTY; + xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); dfp->dfp_intent = lip; return 1; @@ -506,8 +509,10 @@ xfs_defer_relog( xfs_defer_create_done(*tpp, dfp); lip = xfs_trans_item_relog(dfp->dfp_intent, dfp->dfp_done, *tpp); - if (lip) + if (lip) { + xfs_trans_add_item(*tpp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); + } dfp->dfp_done = NULL; dfp->dfp_intent = lip; } diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 5d86a4b8b457..c815811d937a 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -375,7 +375,6 @@ xfs_attr_create_intent( } attrip = xfs_attri_init(mp, attr->xattri_nameval); - xfs_trans_add_item(tp, &attrip->attri_item); xfs_attr_log_item(tp, attrip, attr); return &attrip->attri_item; @@ -654,8 +653,6 @@ xfs_attri_item_relog( new_attrp->alfi_name_len = old_attrp->alfi_name_len; new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter; - xfs_trans_add_item(tp, &new_attrip->attri_item); - return &new_attrip->attri_item; } @@ -753,7 +750,6 @@ xfs_trans_get_attrd(struct xfs_trans *tp, attrdp->attrd_attrip = attrip; attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - xfs_trans_add_item(tp, &attrdp->attrd_item); return attrdp; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ef72061d7cec..0be7a1224a81 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -234,7 +234,6 @@ xfs_trans_get_bud( budp->bud_buip = buip; budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - xfs_trans_add_item(tp, &budp->bud_item); return budp; } @@ -315,7 +314,6 @@ xfs_bmap_update_create_intent( ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); - xfs_trans_add_item(tp, &buip->bui_item); if (sort) list_sort(mp, items, xfs_bmap_update_diff_items); list_for_each_entry(bi, items, bi_list) @@ -585,7 +583,6 @@ xfs_bui_item_relog( buip = xfs_bui_init(tp->t_mountp); memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map)); atomic_set(&buip->bui_next_extent, count); - xfs_trans_add_item(tp, &buip->bui_item); return &buip->bui_item; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e2e86f2edb3c..44bbf620e0cf 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -332,7 +332,6 @@ xfs_trans_get_efd( efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - xfs_trans_add_item(tp, &efdp->efd_item); return efdp; } @@ -415,7 +414,6 @@ xfs_extent_free_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &efip->efi_item); if (sort) list_sort(mp, items, xfs_extent_free_diff_items); list_for_each_entry(xefi, 
items, xefi_list) @@ -708,7 +706,6 @@ xfs_efi_item_relog( efip = xfs_efi_init(tp->t_mountp, count); memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); atomic_set(&efip->efi_next_extent, count); - xfs_trans_add_item(tp, &efip->efi_item); return &efip->efi_item; } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 01d16e795068..a66bb6aa2e5d 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -240,7 +240,6 @@ xfs_trans_get_cud( cudp->cud_cuip = cuip; cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - xfs_trans_add_item(tp, &cudp->cud_item); return cudp; } @@ -315,7 +314,6 @@ xfs_refcount_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &cuip->cui_item); if (sort) list_sort(mp, items, xfs_refcount_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -546,7 +544,6 @@ xfs_cui_item_relog( cuip = xfs_cui_init(tp->t_mountp, count); memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap)); atomic_set(&cuip->cui_next_extent, count); - xfs_trans_add_item(tp, &cuip->cui_item); return &cuip->cui_item; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 96b2dc832d62..d668eb4d099e 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -238,7 +238,6 @@ xfs_trans_get_rud( rudp->rud_ruip = ruip; rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - xfs_trans_add_item(tp, &rudp->rud_item); return rudp; } @@ -340,7 +339,6 @@ xfs_rmap_update_create_intent( ASSERT(count > 0); - xfs_trans_add_item(tp, &ruip->rui_item); if (sort) list_sort(mp, items, xfs_rmap_update_diff_items); list_for_each_entry(ri, items, ri_list) @@ -599,7 +597,6 @@ xfs_rui_item_relog( ruip = xfs_rui_init(tp->t_mountp, count); memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map)); atomic_set(&ruip->rui_next_extent, count); - xfs_trans_add_item(tp, &ruip->rui_item); return &ruip->rui_item; } From 8a9aa763e17c5490d3526cbf4c9484d76ecbbe39 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 12:17:37 -0800 Subject: [PATCH 016/123] xfs: collapse the ->create_done functions Move the meat of the ->create_done function helpers into ->create_done to reduce the amount of boilerplate. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_attr_item.c | 37 ++++++++++---------------- fs/xfs/xfs_bmap_item.c | 29 ++++++++------------- fs/xfs/xfs_extfree_item.c | 53 ++++++++++++++------------------------ fs/xfs/xfs_refcount_item.c | 27 +++++++------------ fs/xfs/xfs_rmap_item.c | 27 +++++++------------ 5 files changed, 64 insertions(+), 109 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index c815811d937a..27553388da99 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache; static const struct xfs_item_ops xfs_attri_item_ops; static const struct xfs_item_ops xfs_attrd_item_ops; -static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip); static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip) { @@ -732,16 +730,20 @@ xlog_recover_attri_commit_pass2( return 0; } -/* - * This routine is called to allocate an "attr free done" log item. - */ -static struct xfs_attrd_log_item * -xfs_trans_get_attrd(struct xfs_trans *tp, - struct xfs_attri_log_item *attrip) +/* Get an ATTRD so we can process all the attrs. 
*/ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) { - struct xfs_attrd_log_item *attrdp; + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; - ASSERT(tp != NULL); + if (!intent) + return NULL; + + attrip = ATTRI_ITEM(intent); attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); @@ -750,20 +752,7 @@ xfs_trans_get_attrd(struct xfs_trans *tp, attrdp->attrd_attrip = attrip; attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - return attrdp; -} - -/* Get an ATTRD so we can process all the attrs. */ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - if (!intent) - return NULL; - - return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item; + return &attrdp->attrd_item; } const struct xfs_defer_op_type xfs_attr_defer_type = { diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 0be7a1224a81..f3421e615e1c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -221,22 +221,6 @@ static const struct xfs_item_ops xfs_bud_item_ops = { .iop_intent = xfs_bud_item_intent, }; -static struct xfs_bud_log_item * -xfs_trans_get_bud( - struct xfs_trans *tp, - struct xfs_bui_log_item *buip) -{ - struct xfs_bud_log_item *budp; - - budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, - &xfs_bud_item_ops); - budp->bud_buip = buip; - budp->bud_format.bud_bui_id = buip->bui_format.bui_id; - - return budp; -} - /* Sort bmap intents by inode. */ static int xfs_bmap_update_diff_items( @@ -321,14 +305,23 @@ xfs_bmap_update_create_intent( return &buip->bui_item; } -/* Get an BUD so we can process all the deferred rmap updates. */ +/* Get an BUD so we can process all the deferred bmap updates. */ static struct xfs_log_item * xfs_bmap_update_create_done( struct xfs_trans *tp, struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item; + struct xfs_bui_log_item *buip = BUI_ITEM(intent); + struct xfs_bud_log_item *budp; + + budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, + &xfs_bud_item_ops); + budp->bud_buip = buip; + budp->bud_format.bud_bui_id = buip->bui_format.bui_id; + + return &budp->bud_item; } /* Take a passive ref to the AG containing the space we're mapping. */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 44bbf620e0cf..518569c64e9c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -303,38 +303,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = { .iop_intent = xfs_efd_item_intent, }; -/* - * Allocate an "extent free done" log item that will hold nextents worth of - * extents. The caller must use all nextents extents, because we are not - * flexible about this at all. 
- */ -static struct xfs_efd_log_item * -xfs_trans_get_efd( - struct xfs_trans *tp, - struct xfs_efi_log_item *efip, - unsigned int nextents) -{ - struct xfs_efd_log_item *efdp; - - ASSERT(nextents > 0); - - if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { - efdp = kzalloc(xfs_efd_log_item_sizeof(nextents), - GFP_KERNEL | __GFP_NOFAIL); - } else { - efdp = kmem_cache_zalloc(xfs_efd_cache, - GFP_KERNEL | __GFP_NOFAIL); - } - - xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, - &xfs_efd_item_ops); - efdp->efd_efip = efip; - efdp->efd_format.efd_nextents = nextents; - efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - - return efdp; -} - /* * Fill the EFD with all extents from the EFI when we need to roll the * transaction and continue with a new EFI. @@ -428,7 +396,26 @@ xfs_extent_free_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item; + struct xfs_efi_log_item *efip = EFI_ITEM(intent); + struct xfs_efd_log_item *efdp; + + ASSERT(count > 0); + + if (count > XFS_EFD_MAX_FAST_EXTENTS) { + efdp = kzalloc(xfs_efd_log_item_sizeof(count), + GFP_KERNEL | __GFP_NOFAIL); + } else { + efdp = kmem_cache_zalloc(xfs_efd_cache, + GFP_KERNEL | __GFP_NOFAIL); + } + + xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD, + &xfs_efd_item_ops); + efdp->efd_efip = efip; + efdp->efd_format.efd_nextents = count; + efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; + + return &efdp->efd_item; } /* Take a passive ref to the AG containing the space we're freeing. */ diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index a66bb6aa2e5d..d218a9ed4d82 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -227,22 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = { .iop_intent = xfs_cud_item_intent, }; -static struct xfs_cud_log_item * -xfs_trans_get_cud( - struct xfs_trans *tp, - struct xfs_cui_log_item *cuip) -{ - struct xfs_cud_log_item *cudp; - - cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, - &xfs_cud_item_ops); - cudp->cud_cuip = cuip; - cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; - - return cudp; -} - /* Sort refcount intents by AG. */ static int xfs_refcount_update_diff_items( @@ -328,7 +312,16 @@ xfs_refcount_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item; + struct xfs_cui_log_item *cuip = CUI_ITEM(intent); + struct xfs_cud_log_item *cudp; + + cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, + &xfs_cud_item_ops); + cudp->cud_cuip = cuip; + cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id; + + return &cudp->cud_item; } /* Take a passive ref to the AG containing the space we're refcounting. 
*/ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index d668eb4d099e..96e0c2b0d059 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -225,22 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = { .iop_intent = xfs_rud_item_intent, }; -static struct xfs_rud_log_item * -xfs_trans_get_rud( - struct xfs_trans *tp, - struct xfs_rui_log_item *ruip) -{ - struct xfs_rud_log_item *rudp; - - rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); - xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, - &xfs_rud_item_ops); - rudp->rud_ruip = ruip; - rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; - - return rudp; -} - /* Set the map extent flags for this reverse mapping. */ static void xfs_trans_set_rmap_flags( @@ -353,7 +337,16 @@ xfs_rmap_update_create_done( struct xfs_log_item *intent, unsigned int count) { - return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item; + struct xfs_rui_log_item *ruip = RUI_ITEM(intent); + struct xfs_rud_log_item *rudp; + + rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); + xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, + &xfs_rud_item_ops); + rudp->rud_ruip = ruip; + rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id; + + return &rudp->rud_item; } /* Take a passive ref to the AG containing the space we're rmapping. */ From a6a38f309afc4a7ede01242b603f36c433997780 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Dec 2023 09:17:40 -0800 Subject: [PATCH 017/123] xfs: make rextslog computation consistent with mkfs There's a weird discrepancy in xfsprogs dating back to the creation of the Linux port -- if there are zero rt extents, mkfs will set sb_rextents and sb_rextslog both to zero: sbp->sb_rextslog = (uint8_t)(rtextents ? libxfs_highbit32((unsigned int)rtextents) : 0); However, that's not the check that xfs_repair uses for nonzero rtblocks: if (sb->sb_rextslog != libxfs_highbit32((unsigned int)sb->sb_rextents)) The difference here is that xfs_highbit32 returns -1 if its argument is zero. Unfortunately, this means that in the weird corner case of a realtime volume shorter than 1 rt extent, xfs_repair will immediately flag a freshly formatted filesystem as corrupt. Because mkfs has been writing ondisk artifacts like this for decades, we have to accept that as "correct". TBH, zero rextslog for zero rtextents makes more sense to me anyway. Regrettably, the superblock verifier checks created in commit copied xfs_repair even though mkfs has been writing out such filesystems for ages. Fix the superblock verifier to accept what mkfs spits out; the userspace version of this patch will have to fix xfs_repair as well. Note that the new helper leaves the zeroday bug where the upper 32 bits of sb_rextents is ripped off and fed to highbit32. This leads to a seriously undersized rt summary file, which immediately breaks mkfs: $ hugedisk.sh foo /dev/sdc $(( 0x100000080 * 4096))B $ /sbin/mkfs.xfs -f /dev/sda -m rmapbt=0,reflink=0 -r rtdev=/dev/mapper/foo meta-data=/dev/sda isize=512 agcount=4, agsize=1298176 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=0 = reflink=0 bigtime=1 inobtcount=1 nrext64=1 data = bsize=4096 blocks=5192704, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0, ftype=1 log =internal log bsize=4096 blocks=16384, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =/dev/mapper/foo extsz=4096 blocks=4294967424, rtextents=4294967424 Discarding blocks...Done. 
mkfs.xfs: Error initializing the realtime space [117 - Structure needs cleaning] The next patch will drop support for rt volumes with fewer than 1 or more than 2^32-1 rt extents, since they've clearly been broken forever. Fixes: f8e566c0f5e1f ("xfs: validate the realtime geometry in xfs_validate_sb_common") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.c | 12 ++++++++++++ fs/xfs/libxfs/xfs_rtbitmap.h | 3 +++ fs/xfs/libxfs/xfs_sb.c | 3 ++- fs/xfs/xfs_rtalloc.c | 4 ++-- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index c269d704314d..1c9fed76a356 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1130,6 +1130,18 @@ xfs_rtbitmap_blockcount( return howmany_64(rtextents, NBBY * mp->m_sb.sb_blocksize); } +/* + * Compute the maximum level number of the realtime summary file, as defined by + * mkfs. The use of highbit32 on a 64-bit quantity is a historic artifact that + * prohibits correct use of rt volumes with more than 2^32 extents. + */ +uint8_t +xfs_compute_rextslog( + xfs_rtbxlen_t rtextents) +{ + return rtextents ? xfs_highbit32(rtextents) : 0; +} + /* * Compute the number of rtbitmap words needed to populate every block of a * bitmap that is large enough to track the given number of rt extents. diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index c0637057d69c..6e5bae324cc3 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -351,6 +351,8 @@ xfs_rtfree_extent( int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, xfs_filblks_t rtlen); +uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -369,6 +371,7 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtsummary_read_buf(a,b) (-ENOSYS) # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +# define xfs_compute_rextslog(rtx) (0) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 1f74d0cd1618..df12bf82ed18 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -25,6 +25,7 @@ #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" +#include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. 
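 *
 * To illustrate the rextslog fix below (worked example, not part of the
 * commit): on a rt volume shorter than one rt extent, sb_rextents is 0 and
 * mkfs writes sb_rextslog as 0, but xfs_highbit32(0) returns -1, so the
 * old check
 *
 *	sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents)
 *
 * compared 0 with -1 and flagged a freshly formatted filesystem as
 * corrupt.  xfs_compute_rextslog returns 0 for 0 rtextents, matching what
 * mkfs writes.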
@@ -509,7 +510,7 @@ xfs_validate_sb_common( NBBY * sbp->sb_blocksize); if (sbp->sb_rextents != rexts || - sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) || + sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, "realtime geometry sanity check failed"); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 88c48de5c9c8..7c5a50163d2d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -964,7 +964,7 @@ xfs_growfs_rt( nrextents = nrblocks; do_div(nrextents, in->extsize); nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); - nrextslog = xfs_highbit32(nrextents); + nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks); nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); @@ -1031,7 +1031,7 @@ xfs_growfs_rt( nsbp->sb_rblocks = min(nrblocks, nrblocks_step); nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks); ASSERT(nsbp->sb_rextents != 0); - nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); + nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nsbp->sb_rbmblocks); From cf8f0e6c1429be7652869059ea44696b72d5b726 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Sun, 3 Dec 2023 09:19:44 -0800 Subject: [PATCH 018/123] xfs: fix 32-bit truncation in xfs_compute_rextslog It's quite reasonable that some customer somewhere will want to configure a realtime volume with more than 2^32 extents. If they try to do this, the highbit32() call will truncate the upper bits of the xfs_rtbxlen_t and produce the wrong value for rextslog. This in turn causes the rsumlevels to be wrong, which results in a realtime summary file that is the wrong length. Fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 1c9fed76a356..30a2844f62e3 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1132,14 +1132,16 @@ xfs_rtbitmap_blockcount( /* * Compute the maximum level number of the realtime summary file, as defined by - * mkfs. The use of highbit32 on a 64-bit quantity is a historic artifact that - * prohibits correct use of rt volumes with more than 2^32 extents. + * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct + * use of rt volumes with more than 2^32 extents. */ uint8_t xfs_compute_rextslog( xfs_rtbxlen_t rtextents) { - return rtextents ? xfs_highbit32(rtextents) : 0; + if (!rtextents) + return 0; + return xfs_highbit64(rtextents); } /* From 94da54d582e6866a9613514b59f326456e88446d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 4 Dec 2023 12:13:26 -0800 Subject: [PATCH 019/123] xfs: document what LARP means Christoph requested a blurb somewhere explaining exactly what LARP means. I don't know of a good place other than the source code (debug knobs aren't covered in Documentation/), so here it is. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index a3c6b1548723..871f16a4a5d8 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -229,6 +229,15 @@ pwork_threads_show( } XFS_SYSFS_ATTR_RW(pwork_threads); +/* + * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob + * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on + * V5 filesystems. As a result, the intermediate progress of all setxattr and + * removexattr operations are tracked via the log and can be restarted during + * recovery. This is useful for testing xattr recovery prior to merging of the + * parent pointer feature which requires it to maintain consistency, and may be + * enabled for userspace xattrs in the future. + */ static ssize_t larp_store( struct kobject *kobject, From a49c708f9a445457f6a5905732081871234f61c6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Nov 2023 12:31:30 -0800 Subject: [PATCH 020/123] xfs: move ->iop_relog to struct xfs_defer_op_type The only log items that need relogging are the ones created for deferred work operations, and the only part of the code base that relogs log items is the deferred work machinery. Move the function pointers. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 31 +++++++++++------- fs/xfs/libxfs/xfs_defer.h | 3 ++ fs/xfs/xfs_attr_item.c | 8 ++--- fs/xfs/xfs_bmap_item.c | 44 ++++++++++++------------- fs/xfs/xfs_extfree_item.c | 67 +++++++++++++++++++------------------- fs/xfs/xfs_refcount_item.c | 46 +++++++++++++------------- fs/xfs/xfs_rmap_item.c | 46 +++++++++++++------------- fs/xfs/xfs_trans.h | 12 ------- 8 files changed, 129 insertions(+), 128 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 95f15a4b2126..54a6be06e6cd 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -459,6 +459,25 @@ xfs_defer_cancel_list( xfs_defer_pending_cancel_work(mp, dfp); } +static inline void +xfs_defer_relog_intent( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + struct xfs_log_item *lip; + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + + xfs_defer_create_done(tp, dfp); + + lip = ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + if (lip) { + xfs_trans_add_item(tp, lip); + set_bit(XFS_LI_DIRTY, &lip->li_flags); + } + dfp->dfp_done = NULL; + dfp->dfp_intent = lip; +} + /* * Prevent a log intent item from pinning the tail of the log by logging a * done item to release the intent item; and then log a new intent item. 
@@ -477,8 +496,6 @@ xfs_defer_relog( ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); list_for_each_entry(dfp, dfops, dfp_list) { - struct xfs_log_item *lip; - /* * If the log intent item for this deferred op is not a part of * the current log checkpoint, relog the intent item to keep @@ -506,15 +523,7 @@ xfs_defer_relog( trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); XFS_STATS_INC((*tpp)->t_mountp, defer_relog); - xfs_defer_create_done(*tpp, dfp); - lip = xfs_trans_item_relog(dfp->dfp_intent, dfp->dfp_done, - *tpp); - if (lip) { - xfs_trans_add_item(*tpp, lip); - set_bit(XFS_LI_DIRTY, &lip->li_flags); - } - dfp->dfp_done = NULL; - dfp->dfp_intent = lip; + xfs_defer_relog_intent(*tpp, dfp); } if ((*tpp)->t_flags & XFS_TRANS_DIRTY) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index ef86a7f9b059..78d6dcd1af2c 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,9 @@ struct xfs_defer_op_type { void (*cancel_item)(struct list_head *item); int (*recover_work)(struct xfs_defer_pending *dfp, struct list_head *capture_list); + struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, + struct xfs_log_item *intent, + struct xfs_log_item *done_item); unsigned int max_items; }; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 27553388da99..988d395a48ad 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -625,10 +625,10 @@ xfs_attr_recover_work( /* Re-log an intent item to push the log tail forward. */ static struct xfs_log_item * -xfs_attri_item_relog( +xfs_attr_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_attri_log_item *old_attrip; struct xfs_attri_log_item *new_attrip; @@ -763,6 +763,7 @@ const struct xfs_defer_op_type xfs_attr_defer_type = { .finish_item = xfs_attr_finish_item, .cancel_item = xfs_attr_cancel_item, .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, }; /* @@ -800,7 +801,6 @@ static const struct xfs_item_ops xfs_attri_item_ops = { .iop_unpin = xfs_attri_item_unpin, .iop_release = xfs_attri_item_release, .iop_match = xfs_attri_item_match, - .iop_relog = xfs_attri_item_relog, }; const struct xlog_recover_item_ops xlog_attri_item_ops = { diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index f3421e615e1c..bc48d733634a 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -541,30 +541,12 @@ xfs_bmap_recover_work( return error; } -const struct xfs_defer_op_type xfs_bmap_update_defer_type = { - .max_items = XFS_BUI_MAX_FAST_EXTENTS, - .create_intent = xfs_bmap_update_create_intent, - .abort_intent = xfs_bmap_update_abort_intent, - .create_done = xfs_bmap_update_create_done, - .finish_item = xfs_bmap_update_finish_item, - .cancel_item = xfs_bmap_update_cancel_item, - .recover_work = xfs_bmap_recover_work, -}; - -STATIC bool -xfs_bui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return BUI_ITEM(lip)->bui_format.bui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
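 *
 * By the time this is called, xfs_defer_relog_intent has already logged a
 * done item for the old intent, so each ->relog_intent implementation only
 * copies its payload into a freshly allocated intent item and returns it;
 * the caller attaches the new item to the transaction and stores it in
 * dfp_intent.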
*/ static struct xfs_log_item * -xfs_bui_item_relog( +xfs_bmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_bui_log_item *buip; struct xfs_map_extent *map; @@ -580,6 +562,25 @@ xfs_bui_item_relog( return &buip->bui_item; } +const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .max_items = XFS_BUI_MAX_FAST_EXTENTS, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, + .recover_work = xfs_bmap_recover_work, + .relog_intent = xfs_bmap_relog_intent, +}; + +STATIC bool +xfs_bui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return BUI_ITEM(lip)->bui_format.bui_id == intent_id; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_bui_item_size, @@ -587,7 +588,6 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_unpin = xfs_bui_item_unpin, .iop_release = xfs_bui_item_release, .iop_match = xfs_bui_item_match, - .iop_relog = xfs_bui_item_relog, }; static inline void diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 518569c64e9c..3ca23ab8d92a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -643,41 +643,12 @@ xfs_extent_free_recover_work( return error; } -const struct xfs_defer_op_type xfs_extent_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_extent_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, - .recover_work = xfs_extent_free_recover_work, -}; - -/* sub-type with special handling for AGFL deferred frees */ -const struct xfs_defer_op_type xfs_agfl_free_defer_type = { - .max_items = XFS_EFI_MAX_FAST_EXTENTS, - .create_intent = xfs_extent_free_create_intent, - .abort_intent = xfs_extent_free_abort_intent, - .create_done = xfs_extent_free_create_done, - .finish_item = xfs_agfl_free_finish_item, - .cancel_item = xfs_extent_free_cancel_item, - .recover_work = xfs_extent_free_recover_work, -}; - -STATIC bool -xfs_efi_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return EFI_ITEM(lip)->efi_format.efi_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_efi_item_relog( +xfs_extent_free_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_efd_log_item *efdp = EFD_ITEM(done_item); struct xfs_efi_log_item *efip; @@ -697,6 +668,37 @@ xfs_efi_item_relog( return &efip->efi_item; } +const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +/* sub-type with special handling for AGFL deferred frees */ +const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, + .recover_work = xfs_extent_free_recover_work, + .relog_intent = xfs_extent_free_relog_intent, +}; + +STATIC bool +xfs_efi_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return EFI_ITEM(lip)->efi_format.efi_id == intent_id; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_efi_item_size, @@ -704,7 +706,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_unpin = xfs_efi_item_unpin, .iop_release = xfs_efi_item_release, .iop_match = xfs_efi_item_match, - .iop_relog = xfs_efi_item_relog, }; /* diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index d218a9ed4d82..9974be81cb2b 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -501,31 +501,12 @@ xfs_refcount_recover_work( return error; } -const struct xfs_defer_op_type xfs_refcount_update_defer_type = { - .max_items = XFS_CUI_MAX_FAST_EXTENTS, - .create_intent = xfs_refcount_update_create_intent, - .abort_intent = xfs_refcount_update_abort_intent, - .create_done = xfs_refcount_update_create_done, - .finish_item = xfs_refcount_update_finish_item, - .finish_cleanup = xfs_refcount_finish_one_cleanup, - .cancel_item = xfs_refcount_update_cancel_item, - .recover_work = xfs_refcount_recover_work, -}; - -STATIC bool -xfs_cui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return CUI_ITEM(lip)->cui_format.cui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_cui_item_relog( +xfs_refcount_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_cui_log_item *cuip; struct xfs_phys_extent *pmap; @@ -541,6 +522,26 @@ xfs_cui_item_relog( return &cuip->cui_item; } +const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .max_items = XFS_CUI_MAX_FAST_EXTENTS, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_finish_one_cleanup, + .cancel_item = xfs_refcount_update_cancel_item, + .recover_work = xfs_refcount_recover_work, + .relog_intent = xfs_refcount_relog_intent, +}; + +STATIC bool +xfs_cui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return CUI_ITEM(lip)->cui_format.cui_id == intent_id; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_cui_item_size, @@ -548,7 +549,6 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_unpin = xfs_cui_item_unpin, .iop_release = xfs_cui_item_release, .iop_match = xfs_cui_item_match, - .iop_relog = xfs_cui_item_relog, }; static inline void diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 96e0c2b0d059..488c4a2a80a3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -554,31 +554,12 @@ xfs_rmap_recover_work( return error; } -const struct xfs_defer_op_type xfs_rmap_update_defer_type = { - .max_items = XFS_RUI_MAX_FAST_EXTENTS, - .create_intent = xfs_rmap_update_create_intent, - .abort_intent = xfs_rmap_update_abort_intent, - .create_done = xfs_rmap_update_create_done, - .finish_item = xfs_rmap_update_finish_item, - .finish_cleanup = xfs_rmap_finish_one_cleanup, - .cancel_item = xfs_rmap_update_cancel_item, - .recover_work = xfs_rmap_recover_work, -}; - -STATIC bool -xfs_rui_item_match( - struct xfs_log_item *lip, - uint64_t intent_id) -{ - return RUI_ITEM(lip)->rui_format.rui_id == intent_id; -} - /* Relog an intent item to push the log tail forward. 
*/ static struct xfs_log_item * -xfs_rui_item_relog( +xfs_rmap_relog_intent( + struct xfs_trans *tp, struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp) + struct xfs_log_item *done_item) { struct xfs_rui_log_item *ruip; struct xfs_map_extent *map; @@ -594,6 +575,26 @@ xfs_rui_item_relog( return &ruip->rui_item; } +const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .max_items = XFS_RUI_MAX_FAST_EXTENTS, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_finish_one_cleanup, + .cancel_item = xfs_rmap_update_cancel_item, + .recover_work = xfs_rmap_recover_work, + .relog_intent = xfs_rmap_relog_intent, +}; + +STATIC bool +xfs_rui_item_match( + struct xfs_log_item *lip, + uint64_t intent_id) +{ + return RUI_ITEM(lip)->rui_format.rui_id == intent_id; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .flags = XFS_ITEM_INTENT, .iop_size = xfs_rui_item_size, @@ -601,7 +602,6 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_unpin = xfs_rui_item_unpin, .iop_release = xfs_rui_item_release, .iop_match = xfs_rui_item_match, - .iop_relog = xfs_rui_item_relog, }; static inline void diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 25646e2b12f4..2cb1e143fc49 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -79,9 +79,6 @@ struct xfs_item_ops { uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_release)(struct xfs_log_item *); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); - struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, - struct xfs_log_item *done_item, - struct xfs_trans *tp); struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done); }; @@ -246,15 +243,6 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern struct kmem_cache *xfs_trans_cache; -static inline struct xfs_log_item * -xfs_trans_item_relog( - struct xfs_log_item *lip, - struct xfs_log_item *done_lip, - struct xfs_trans *tp) -{ - return lip->li_ops->iop_relog(lip, done_lip, tp); -} - struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, From e14293803f4e84eb23a417b462b56251033b5a66 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 1 Dec 2023 09:24:18 -0800 Subject: [PATCH 021/123] xfs: don't allow overly small or large realtime volumes Don't allow realtime volumes that are less than one rt extent long. This has been broken across 4 LTS kernels with nobody noticing, so let's just disable it. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_rtbitmap.h | 13 +++++++++++++ fs/xfs/libxfs/xfs_sb.c | 3 ++- fs/xfs/xfs_rtalloc.c | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 6e5bae324cc3..1c84b52de3d4 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -353,6 +353,18 @@ int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); +/* Do we support an rt volume having this number of rtextents? 
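+ *
+ * For example (illustrative numbers): a 64k rt device with a 1M rt extent
+ * size computes nrextents == 0 in xfs_growfs_rt, which this helper now
+ * rejects; the superblock verifier applies the same check at mount time.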
*/ +static inline bool +xfs_validate_rtextents( + xfs_rtbxlen_t rtextents) +{ + /* No runt rt volumes */ + if (rtextents == 0) + return false; + + return true; +} + xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents); unsigned long long xfs_rtbitmap_wordcount(struct xfs_mount *mp, @@ -372,6 +384,7 @@ unsigned long long xfs_rtsummary_wordcount(struct xfs_mount *mp, # define xfs_rtbuf_cache_relse(a) (0) # define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) # define xfs_compute_rextslog(rtx) (0) +# define xfs_validate_rtextents(rtx) (false) static inline xfs_filblks_t xfs_rtbitmap_blockcount(struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index df12bf82ed18..4a9e8588f4c9 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -509,7 +509,8 @@ xfs_validate_sb_common( rbmblocks = howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); - if (sbp->sb_rextents != rexts || + if (!xfs_validate_rtextents(rexts) || + sbp->sb_rextents != rexts || sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 7c5a50163d2d..8feb58c6241c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -963,6 +963,8 @@ xfs_growfs_rt( */ nrextents = nrblocks; do_div(nrextents, in->extsize); + if (!xfs_validate_rtextents(nrextents)) + return -EINVAL; nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents); nrextslog = xfs_compute_rextslog(nrextents); nrsumlevels = nrextslog + 1; From 9c07bca793b4ff9f0b7871e2a928a1b28b8fa4e3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 4 Dec 2023 12:21:06 -0800 Subject: [PATCH 022/123] xfs: elide ->create_done calls for unlogged deferred work Extended attribute updates use the deferred work machinery to manage state across a chain of smaller transactions. All previous deferred work users have employed log intent items and log done items to manage restarting of interrupted operations, which means that ->create_intent sets dfp_intent to a log intent item and ->create_done uses that item to create a log intent done item. However, xattrs have used the INCOMPLETE flag to deal with the lack of recovery support for an interrupted transaction chain. Log items are optional if the xattr update caller didn't set XFS_DA_OP_LOGGED to require a restartable sequence. In other words, ->create_intent can return NULL to say that there's no log intent item. If that's the case, no log intent done item should be created. Clean up xfs_defer_create_done not to do this, so that the ->create_done functions don't have to check for non-null dfp_intent themselves. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 4 ++++ fs/xfs/xfs_attr_item.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 54a6be06e6cd..06e890b44c52 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -201,6 +201,10 @@ xfs_defer_create_done( const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; + /* If there is no log intent item, there can be no log done item. */ + if (!dfp->dfp_intent) + return; + /* * Mark the transaction dirty, even on error. 
This ensures the * transaction is aborted, which: diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 988d395a48ad..39f2c5a46179 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -740,9 +740,6 @@ xfs_attr_create_done( struct xfs_attri_log_item *attrip; struct xfs_attrd_log_item *attrdp; - if (!intent) - return NULL; - attrip = ATTRI_ITEM(intent); attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); From 3f113c2739b1b068854c7ffed635c2bd790d1492 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:54 -0800 Subject: [PATCH 023/123] xfs: make xchk_iget safer in the presence of corrupt inode btrees When scrub is trying to iget an inode, ensure that it won't end up deadlocked on a cycle in the inode btree by using an empty transaction to store all the buffers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/common.c | 6 ++++-- fs/xfs/scrub/common.h | 25 +++++++++++++++++++++++++ fs/xfs/scrub/inode.c | 4 ++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index de24532fe083..23944fcc1a6c 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -733,6 +733,8 @@ xchk_iget( xfs_ino_t inum, struct xfs_inode **ipp) { + ASSERT(sc->tp != NULL); + return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); } @@ -882,8 +884,8 @@ xchk_iget_for_scrubbing( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. */ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_inode(sc, ip); if (error == -ENOENT) diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index cabdc0e16838..c83cf9e5b55f 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -151,12 +151,37 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags); void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp); +/* + * Grab the inode at @inum. The caller must have created a scrub transaction + * so that we can confirm the inumber by walking the inobt and not deadlock on + * a loop in the inobt. + */ int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp); int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_buf **agi_bpp, struct xfs_inode **ipp); void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip); int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip); +/* + * Safe version of (untrusted) xchk_iget that uses an empty transaction to + * avoid deadlocking on loops in the inobt. This should only be used in a + * scrub or repair setup routine, and only prior to grabbing a transaction. + */ +static inline int +xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp) +{ + int error; + + ASSERT(sc->tp == NULL); + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + error = xchk_iget(sc, inum, ipp); + xchk_trans_cancel(sc); + return error; +} + /* * Don't bother cross-referencing if we already found corruption or cross * referencing discrepancies. diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 889f556bc98f..b7a93380a1ab 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -95,8 +95,8 @@ xchk_setup_inode( if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino)) return -ENOENT; - /* Try a regular untrusted iget. 
*/ - error = xchk_iget(sc, sc->sm->sm_ino, &ip); + /* Try a safe untrusted iget. */ + error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip); if (!error) return xchk_install_handle_iscrub(sc, ip); if (error == -ENOENT) From 6b126139401a2284402d7c38fe3168d5a26da41d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:55 -0800 Subject: [PATCH 024/123] xfs: don't append work items to logged xfs_defer_pending objects When someone tries to add a deferred work item to xfs_defer_add, it will try to attach the work item to the most recently added xfs_defer_pending object attached to the transaction. However, it doesn't check if the pending object has a log intent item attached to it. This is incorrect behavior because we cannot add more work to an object that has already been committed to the ondisk log. Therefore, change the behavior not to append to pending items with a non null dfp_intent. In practice this has not been an issue because the only way xfs_defer_add gets called after log intent items have been committed is from the defer ops ->finish_item functions themselves, and the @dop_pending isolation in xfs_defer_finish_noroll protects the pending items that have already been logged. However, the next patch will add the ability to pause a deferred extent free object during online btree rebuilding, and any new extfree work items need to have their own pending event. While we're at it, hoist the predicate to its own static inline function for readability. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 61 ++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 06e890b44c52..7f9b1c5f2588 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -695,6 +695,51 @@ xfs_defer_cancel( xfs_defer_cancel_list(mp, &tp->t_dfops); } +/* + * Return the last pending work item attached to this transaction if it matches + * the deferred op type. + */ +static inline struct xfs_defer_pending * +xfs_defer_find_last( + struct xfs_trans *tp, + enum xfs_defer_ops_type type, + const struct xfs_defer_op_type *ops) +{ + struct xfs_defer_pending *dfp = NULL; + + /* No dfops at all? */ + if (list_empty(&tp->t_dfops)) + return NULL; + + dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending, + dfp_list); + + /* Wrong type? */ + if (dfp->dfp_type != type) + return NULL; + return dfp; +} + +/* + * Decide if we can add a deferred work item to the last dfops item attached + * to the transaction. + */ +static inline bool +xfs_defer_can_append( + struct xfs_defer_pending *dfp, + const struct xfs_defer_op_type *ops) +{ + /* Already logged? */ + if (dfp->dfp_intent) + return false; + + /* Already full? */ + if (ops->max_items && dfp->dfp_count >= ops->max_items) + return false; + + return true; +} + /* Add an item for later deferred processing. */ void xfs_defer_add( @@ -708,19 +753,9 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - /* - * Add the item to a pending item at the end of the intake list. - * If the last pending item has the same type, reuse it. Else, - * create a new pending item at the end of the intake list. 
- */ - if (!list_empty(&tp->t_dfops)) { - dfp = list_last_entry(&tp->t_dfops, - struct xfs_defer_pending, dfp_list); - if (dfp->dfp_type != type || - (ops->max_items && dfp->dfp_count >= ops->max_items)) - dfp = NULL; - } - if (!dfp) { + dfp = xfs_defer_find_last(tp, type, ops); + if (!dfp || !xfs_defer_can_append(dfp, ops)) { + /* Create a new pending item at the end of the intake list. */ dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); dfp->dfp_type = type; From 4dffb2cbb4839fd6f9bbac0b3fd06cc9015cbb9b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:56 -0800 Subject: [PATCH 025/123] xfs: allow pausing of pending deferred work items Traditionally, all pending deferred work attached to a transaction is finished when one of the xfs_defer_finish* functions is called. However, online repair wants to be able to allocate space for a new data structure, format a new metadata structure into the allocated space, and commit that into the filesystem. As a hedge against system crashes during repairs, we also want to log some EFI items for the allocated space speculatively, and cancel them if we elect to commit the new data structure. Therefore, introduce the idea of pausing a pending deferred work item. Log intent items are still created for paused items and relogged as necessary. However, paused items are pushed onto a side list before we start calling ->finish_item, and the whole list is reattach to the transaction afterwards. New work items are never attached to paused pending items. Modify xfs_defer_cancel to clean up pending deferred work items holding a log intent item but not a log intent done item, since that is now possible. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 97 ++++++++++++++++++++++++++++++++++----- fs/xfs/libxfs/xfs_defer.h | 17 ++++++- fs/xfs/xfs_trace.h | 13 +++++- 3 files changed, 111 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 7f9b1c5f2588..c4480dec29ec 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -487,7 +487,7 @@ xfs_defer_relog_intent( * done item to release the intent item; and then log a new intent item. * The caller should provide a fresh transaction and roll it after we're done. */ -static int +static void xfs_defer_relog( struct xfs_trans **tpp, struct list_head *dfops) @@ -529,10 +529,6 @@ xfs_defer_relog( xfs_defer_relog_intent(*tpp, dfp); } - - if ((*tpp)->t_flags & XFS_TRANS_DIRTY) - return xfs_defer_trans_roll(tpp); - return 0; } /* @@ -588,6 +584,24 @@ xfs_defer_finish_one( return error; } +/* Move all paused deferred work from @tp to @paused_list. */ +static void +xfs_defer_isolate_paused( + struct xfs_trans *tp, + struct list_head *paused_list) +{ + struct xfs_defer_pending *dfp; + struct xfs_defer_pending *pli; + + list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) { + if (!(dfp->dfp_flags & XFS_DEFER_PAUSED)) + continue; + + list_move_tail(&dfp->dfp_list, paused_list); + trace_xfs_defer_isolate_paused(tp->t_mountp, dfp); + } +} + /* * Finish all the pending work. 
This involves logging intent items for * any work items that wandered in since the last transaction roll (if @@ -603,6 +617,7 @@ xfs_defer_finish_noroll( struct xfs_defer_pending *dfp = NULL; int error = 0; LIST_HEAD(dop_pending); + LIST_HEAD(dop_paused); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -621,6 +636,8 @@ xfs_defer_finish_noroll( */ int has_intents = xfs_defer_create_intents(*tp); + xfs_defer_isolate_paused(*tp, &dop_paused); + list_splice_init(&(*tp)->t_dfops, &dop_pending); if (has_intents < 0) { @@ -633,22 +650,33 @@ xfs_defer_finish_noroll( goto out_shutdown; /* Relog intent items to keep the log moving. */ - error = xfs_defer_relog(tp, &dop_pending); - if (error) - goto out_shutdown; + xfs_defer_relog(tp, &dop_pending); + xfs_defer_relog(tp, &dop_paused); + + if ((*tp)->t_flags & XFS_TRANS_DIRTY) { + error = xfs_defer_trans_roll(tp); + if (error) + goto out_shutdown; + } } - dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, - dfp_list); + dfp = list_first_entry_or_null(&dop_pending, + struct xfs_defer_pending, dfp_list); + if (!dfp) + break; error = xfs_defer_finish_one(*tp, dfp); if (error && error != -EAGAIN) goto out_shutdown; } + /* Requeue the paused items in the outgoing transaction. */ + list_splice_tail_init(&dop_paused, &(*tp)->t_dfops); + trace_xfs_defer_finish_done(*tp, _RET_IP_); return 0; out_shutdown: + list_splice_tail_init(&dop_paused, &dop_pending); xfs_defer_trans_abort(*tp, &dop_pending); xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); trace_xfs_defer_finish_error(*tp, error); @@ -661,6 +689,9 @@ int xfs_defer_finish( struct xfs_trans **tp) { +#ifdef DEBUG + struct xfs_defer_pending *dfp; +#endif int error; /* @@ -680,7 +711,10 @@ xfs_defer_finish( } /* Reset LOWMODE now that we've finished all the dfops. */ - ASSERT(list_empty(&(*tp)->t_dfops)); +#ifdef DEBUG + list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list) + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); +#endif (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -692,6 +726,7 @@ xfs_defer_cancel( struct xfs_mount *mp = tp->t_mountp; trace_xfs_defer_cancel(tp, _RET_IP_); + xfs_defer_trans_abort(tp, &tp->t_dfops); xfs_defer_cancel_list(mp, &tp->t_dfops); } @@ -733,6 +768,10 @@ xfs_defer_can_append( if (dfp->dfp_intent) return false; + /* Paused items cannot absorb more work */ + if (dfp->dfp_flags & XFS_DEFER_PAUSED) + return NULL; + /* Already full? */ if (ops->max_items && dfp->dfp_count >= ops->max_items) return false; @@ -741,7 +780,7 @@ xfs_defer_can_append( } /* Add an item for later deferred processing. */ -void +struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, enum xfs_defer_ops_type type, @@ -768,6 +807,7 @@ xfs_defer_add( xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); + return dfp; } /* @@ -1093,3 +1133,36 @@ xfs_defer_destroy_item_caches(void) xfs_rmap_intent_destroy_cache(); xfs_defer_destroy_cache(); } + +/* + * Mark a deferred work item so that it will be requeued indefinitely without + * being finished. Caller must ensure there are no data dependencies on this + * work item in the meantime. + */ +void +xfs_defer_item_pause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED)); + + dfp->dfp_flags |= XFS_DEFER_PAUSED; + + trace_xfs_defer_item_pause(tp->t_mountp, dfp); +} + +/* + * Release a paused deferred work item so that it will be finished during the + * next transaction roll. 
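+ *
+ * A minimal usage sketch (illustrative; the autoreap helpers added later
+ * in this series are the intended consumer):
+ *
+ *	dfp = xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+ *	xfs_defer_item_pause(tp, dfp);
+ *	... commit the new structure; the intent item is logged and
+ *	    relogged, but never finished, while paused ...
+ *	xfs_defer_item_unpause(tp, dfp);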
+ */ +void +xfs_defer_item_unpause( + struct xfs_trans *tp, + struct xfs_defer_pending *dfp) +{ + ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED); + + dfp->dfp_flags &= ~XFS_DEFER_PAUSED; + + trace_xfs_defer_item_unpause(tp->t_mountp, dfp); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 78d6dcd1af2c..b0284154f4e0 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -34,11 +34,24 @@ struct xfs_defer_pending { struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ unsigned int dfp_count; /* # extent items */ + unsigned int dfp_flags; enum xfs_defer_ops_type dfp_type; }; -void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type, - struct list_head *h); +/* + * Create a log intent item for this deferred item, but don't actually finish + * the work. Caller must clear this before the final transaction commit. + */ +#define XFS_DEFER_PAUSED (1U << 0) + +#define XFS_DEFER_PENDING_STRINGS \ + { XFS_DEFER_PAUSED, "paused" } + +void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); +void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); + +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, + enum xfs_defer_ops_type type, struct list_head *h); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3926cf7f2a6e..514095b6ba2b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2551,6 +2551,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __field(dev_t, dev) __field(int, type) __field(void *, intent) + __field(unsigned int, flags) __field(char, committed) __field(int, nr) ), @@ -2558,13 +2559,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->dev = mp ? 
mp->m_super->s_dev : 0; __entry->type = dfp->dfp_type; __entry->intent = dfp->dfp_intent; + __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", + TP_printk("dev %d:%d optype %d intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) @@ -2675,6 +2678,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause); +DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); @@ -2692,6 +2698,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, __field(void *, intent) __field(void *, item) __field(char, committed) + __field(unsigned int, flags) __field(int, nr) ), TP_fast_assign( @@ -2700,13 +2707,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; + __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d", + TP_printk("dev %d:%d optype %d intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, __entry->item, + __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, __entry->nr) ) From 4c88fef3af4a51c2cdba6a28237e98da4873e8dc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:57 -0800 Subject: [PATCH 026/123] xfs: remove __xfs_free_extent_later xfs_free_extent_later is a trivial helper, so remove it to reduce the amount of thinking required to understand the deferred freeing interface. This will make it easier to introduce automatic reaping of speculative allocations in the next patch. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_ag.c | 2 +- fs/xfs/libxfs/xfs_alloc.c | 2 +- fs/xfs/libxfs/xfs_alloc.h | 14 +------------- fs/xfs/libxfs/xfs_bmap.c | 4 ++-- fs/xfs/libxfs/xfs_bmap_btree.c | 2 +- fs/xfs/libxfs/xfs_ialloc.c | 5 +++-- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 +- fs/xfs/libxfs/xfs_refcount.c | 6 +++--- fs/xfs/libxfs/xfs_refcount_btree.c | 2 +- fs/xfs/scrub/reap.c | 2 +- fs/xfs/xfs_reflink.c | 2 +- 11 files changed, 16 insertions(+), 27 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index f9f4d694640d..f62ff125a50a 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -984,7 +984,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, + err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, XFS_AG_RESV_NONE, true); if (err2) goto resv_err; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 100ab5931b31..c35224ad9428 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2523,7 +2523,7 @@ xfs_defer_agfl_block( * The list is maintained sorted (by block number). 
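 *
 * With the trivial wrapper removed, callers now pass skip_discard
 * explicitly, e.g. (from the xfs_bmap.c conversion below):
 *
 *	error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
 *			XFS_AG_RESV_NONE, false);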
*/ int -__xfs_free_extent_later( +xfs_free_extent_later( struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6bb8d295c321..6b95d1d8a853 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } -int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, +int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type, bool skip_discard); @@ -256,18 +256,6 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ -static inline int -xfs_free_extent_later( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - enum xfs_ag_resv_type type) -{ - return __xfs_free_extent_later(tp, bno, len, oinfo, type, false); -} - - extern struct kmem_cache *xfs_extfree_item_cache; int __init xfs_extfree_intent_init_cache(void); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index be62acffad6c..68be1dd4f0f2 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; @@ -5218,7 +5218,7 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - error = __xfs_free_extent_later(tp, del->br_startblock, + error = xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, XFS_AG_RESV_NONE, ((bflags & XFS_BMAPI_NODISCARD) || diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index bf3f1b36fdd2..8360256cff16 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -272,7 +272,7 @@ xfs_bmbt_free_block( xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b83e54c70906..d61d03e5b853 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1854,7 +1854,7 @@ xfs_difree_inode_chunk( return xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); } /* holemask is only 16-bits (fits in an unsigned long) */ @@ -1900,7 +1900,8 @@ xfs_difree_inode_chunk( ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); error = xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, - &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE); + &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE, + false); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 9258f01c0015..42a5e1f227a0 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -161,7 +161,7 @@ __xfs_inobt_free_block( xfs_inobt_mod_blockcount(cur, -1); fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_INOBT, resv); + &XFS_RMAP_OINFO_INOBT, resv, false); } STATIC int 
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 646b3fa362ad..3702b4a07110 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1153,7 +1153,7 @@ xfs_refcount_adjust_extents( tmp.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, tmp.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1215,7 +1215,7 @@ xfs_refcount_adjust_extents( ext.rc_startblock); error = xfs_free_extent_later(cur->bc_tp, fsbno, ext.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_error; } @@ -1985,7 +1985,7 @@ xfs_refcount_recover_cow_leftovers( /* Free the block. */ error = xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) goto out_trans; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 5c3987d8dc24..3fa795e2488d 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -112,7 +112,7 @@ xfs_refcountbt_free_block( be32_add_cpu(&agf->agf_refcount_blocks, -1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); return xfs_free_extent_later(cur->bc_tp, fsbno, 1, - &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false); } STATIC int diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 86a62420e02c..78c9f2085db4 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -410,7 +410,7 @@ xreap_agextent_iter( * Use deferred frees to get rid of the old btree blocks to try to * minimize the window in which we could crash and lose the old blocks. */ - error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, + error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo, rs->resv, true); if (error) return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e5b62dc28466..d5ca8bcae65b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks( error = xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL, - XFS_AG_RESV_NONE); + XFS_AG_RESV_NONE, false); if (error) break; From e3042be36c343207b7af249a09f50b4e37e9fda4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:57 -0800 Subject: [PATCH 027/123] xfs: automatic freeing of freshly allocated unwritten space As mentioned in the previous commit, online repair wants to allocate space to write out a new metadata structure, and it also wants to hedge against system crashes during repairs by logging (and later cancelling) EFIs to free the space if we crash before committing the new data structure. Therefore, create a trio of functions to schedule automatic reaping of freshly allocated unwritten space. xfs_alloc_schedule_autoreap creates a paused EFI representing the space we just allocated. Once the allocations are made and the autoreaps scheduled, we can start writing to disk. If the writes succeed, xfs_alloc_cancel_autoreap marks the EFI work items as stale and unpauses the pending deferred work item. Assuming that's done in the same transaction that commits the new structure into the filesystem, we guarantee that either the new object is fully visible, or that all the space gets reclaimed. If the writes succeed but only part of an extent was used, repair must call the same _cancel_autoreap function to kill the first EFI and then log a new EFI to free the unused space. 
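As a rough sketch, a caller is expected to drive the trio like this (error
handling is trimmed, and tp, hint, and used_len are illustrative placeholders
rather than anything this patch defines):

	struct xfs_alloc_autoreap	autoreap;
	struct xfs_alloc_arg		args = {
		.tp	= tp,	/* fsbno/len are set by the allocator */
	};
	int				error;

	/* Allocate space; log a paused EFI covering all of it. */
	error = xfs_alloc_vextent_near_bno(&args, hint);
	if (!error)
		error = xfs_alloc_schedule_autoreap(&args, true, &autoreap);

	/* ...write the new structure into the reserved blocks... */

	if (used_len > 0) {
		/* Keep what we used; the paused EFI must not free it. */
		xfs_alloc_cancel_autoreap(tp, &autoreap);
		if (used_len < args.len)
			error = xfs_free_extent_later(tp,
					args.fsbno + used_len,
					args.len - used_len, &args.oinfo,
					args.resv, true);
	} else {
		/* Used nothing; unpause the EFI and free the whole extent. */
		xfs_alloc_commit_autoreap(tp, &autoreap);
	}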
The first EFI is already committed, so it cannot be changed. For full extents
that aren't used, xfs_alloc_commit_autoreap will unpause the EFI, which
results in the space being freed during the next _defer_finish cycle.

Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
---
 fs/xfs/libxfs/xfs_alloc.c | 104 ++++++++++++++++++++++++++++++++++--
 fs/xfs/libxfs/xfs_alloc.h |  12 +++++
 fs/xfs/xfs_extfree_item.c |   9 ++--
 3 files changed, 117 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index c35224ad9428..4940f9377f21 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2522,14 +2522,15 @@ xfs_defer_agfl_block(
  * Add the extent to the list of extents to be free at transaction end.
  * The list is maintained sorted (by block number).
  */
-int
-xfs_free_extent_later(
+static int
+xfs_defer_extent_free(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			bno,
 	xfs_filblks_t			len,
 	const struct xfs_owner_info	*oinfo,
 	enum xfs_ag_resv_type		type,
-	bool				skip_discard)
+	bool				skip_discard,
+	struct xfs_defer_pending	**dfpp)
 {
 	struct xfs_extent_free_item	*xefi;
 	struct xfs_mount		*mp = tp->t_mountp;
@@ -2577,10 +2578,105 @@ xfs_free_extent_later(
 			XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
 	xfs_extent_free_get_group(mp, xefi);
 
-	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+	*dfpp = xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
 	return 0;
 }
 
+int
+xfs_free_extent_later(
+	struct xfs_trans		*tp,
+	xfs_fsblock_t			bno,
+	xfs_filblks_t			len,
+	const struct xfs_owner_info	*oinfo,
+	enum xfs_ag_resv_type		type,
+	bool				skip_discard)
+{
+	struct xfs_defer_pending	*dontcare = NULL;
+
+	return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
+			&dontcare);
+}
+
+/*
+ * Set up automatic freeing of unwritten space in the filesystem.
+ *
+ * This function attaches a paused deferred extent free item to the
+ * transaction.  Pausing means that the EFI will be logged in the next
+ * transaction commit, but the pending EFI will not be finished until the
+ * pending item is unpaused.
+ *
+ * If the system goes down after the EFI has been persisted to the log but
+ * before the pending item is unpaused, log recovery will find the EFI, fail to
+ * find the EFD, and free the space.
+ *
+ * If the pending item is unpaused, the next transaction commit will log an EFD
+ * without freeing the space.
+ *
+ * Caller must ensure that the tp, fsbno, len, oinfo, and resv fields of the
+ * @args structure are set to the relevant values.
+ */
+int
+xfs_alloc_schedule_autoreap(
+	const struct xfs_alloc_arg	*args,
+	bool				skip_discard,
+	struct xfs_alloc_autoreap	*aarp)
+{
+	int				error;
+
+	error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
+			&args->oinfo, args->resv, skip_discard, &aarp->dfp);
+	if (error)
+		return error;
+
+	xfs_defer_item_pause(args->tp, aarp->dfp);
+	return 0;
+}
+
+/*
+ * Cancel automatic freeing of unwritten space in the filesystem.
+ *
+ * Earlier, we created a paused deferred extent free item and attached it to
+ * this transaction so that we could automatically roll back a new space
+ * allocation if the system went down.  Now we want to cancel the paused work
+ * item by marking the EFI stale so we don't actually free the space, unpausing
+ * the pending item and logging an EFD.
+ *
+ * The caller generally should have already mapped the space into the ondisk
+ * filesystem.  If the reserved space was partially used, the caller must call
+ * xfs_free_extent_later to create a new EFI to free the unused space.
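+ * (A second EFI is needed because the first one has already been committed
+ * to the log and cannot be shrunk after the fact.)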
+ */ +void +xfs_alloc_cancel_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + struct xfs_defer_pending *dfp = aarp->dfp; + struct xfs_extent_free_item *xefi; + + if (!dfp) + return; + + list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) + xefi->xefi_flags |= XFS_EFI_CANCELLED; + + xfs_defer_item_unpause(tp, dfp); +} + +/* + * Commit automatic freeing of unwritten space in the filesystem. + * + * This unpauses an earlier _schedule_autoreap and commits to freeing the + * allocated space. Call this if none of the reserved space was used. + */ +void +xfs_alloc_commit_autoreap( + struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp) +{ + if (aarp->dfp) + xfs_defer_item_unpause(tp, aarp->dfp); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6b95d1d8a853..851cafbd6449 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -255,6 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp, #define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ #define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ #define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ +#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */ + +struct xfs_alloc_autoreap { + struct xfs_defer_pending *dfp; +}; + +int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args, + bool skip_discard, struct xfs_alloc_autoreap *aarp); +void xfs_alloc_cancel_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); +void xfs_alloc_commit_autoreap(struct xfs_trans *tp, + struct xfs_alloc_autoreap *aarp); extern struct kmem_cache *xfs_extfree_item_cache; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3ca23ab8d92a..3e3469504271 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -453,7 +453,7 @@ xfs_extent_free_finish_item( struct xfs_extent *extp; uint next_extent; xfs_agblock_t agbno; - int error; + int error = 0; xefi = container_of(item, struct xfs_extent_free_item, xefi_list); agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock); @@ -473,9 +473,10 @@ xfs_extent_free_finish_item( * the existing EFI, and so we need to copy all the unprocessed extents * in this EFI to the EFD so this works correctly. */ - error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, - xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, - xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) + error = __xfs_free_extent(tp, xefi->xefi_pag, agbno, + xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv, + xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); if (error == -EAGAIN) { xfs_efd_from_efi(efdp); return error; From 4c8ecd1cfdd01fb727121035014d9f654a30bdf2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:58 -0800 Subject: [PATCH 028/123] xfs: remove unused fields from struct xbtree_ifakeroot Remove these unused fields since nobody uses them. They should have been removed years ago in a different cleanup series from Christoph Hellwig. Fixes: daf83964a3681 ("xfs: move the per-fork nextents fields into struct xfs_ifork") Fixes: f7e67b20ecbbc ("xfs: move the fork format fields into struct xfs_ifork") Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree_staging.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0d2976050ae..5f638f711246 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -37,12 +37,6 @@ struct xbtree_ifakeroot { /* Number of bytes available for this fork in the inode. */ unsigned int if_fork_size; - - /* Fork format. */ - unsigned int if_format; - - /* Number of records. */ - unsigned int if_extents; }; /* Cursor interactions with fake roots for inode-rooted btrees. */ From be408417630427984a1fddd069f30b245793234c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:40:59 -0800 Subject: [PATCH 029/123] xfs: implement block reservation accounting for btrees we're staging Create a new xrep_newbt structure to encapsulate a fake root for creating a staged btree cursor as well as to track all the blocks that we need to reserve in order to build that btree. As for the particular choice of lowspace thresholds and btree block slack factors -- at this point one could say that the thresholds in online repair come from bulkload_estimate_ag_slack in xfs_repair[1]. But that's not the entire story, since the offline btree rebuilding code in xfs_repair was merged as a retroport of the online btree code in this patchset! Before xfs_btree_staging.[ch] came along, xfs_repair determined the slack factor (aka the number of slots to leave unfilled in each new btree block) via open-coded logic in repair/phase5.c[2]. At that point the slack factors were arbitrary quantities per btree. The rmapbt automatically left 10 slots free; everything else left zero. That had a noticeable effect on performance straight after mounting because adding records to /any/ btree would result in splits. A few years ago when this patch was first written, Dave and I decided that repair should generate btree blocks that were 75% full unless space was tight, in which case it should try to fill the blocks to nearly full. We defined tight as ~10% free to avoid repair failures but settled on 3/32 (~9%) to avoid div64. IOWs, we mostly pulled the thresholds out of thin air. We've been QAing with those geometry numbers ever since. ;) Link: https://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git/tree/repair/bulkload.c?h=v6.5.0#n114 Link: https://git.kernel.org/pub/scm/fs/xfs/xfsprogs-dev.git/tree/repair/phase5.c?h=v4.19.0#n1349 Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner --- fs/xfs/Makefile | 1 + fs/xfs/scrub/newbt.c | 495 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/newbt.h | 62 ++++++ fs/xfs/scrub/trace.h | 37 ++++ 4 files changed, 595 insertions(+) create mode 100644 fs/xfs/scrub/newbt.c create mode 100644 fs/xfs/scrub/newbt.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7762c01a85cf..1537d66e5ab0 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -181,6 +181,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + newbt.o \ reap.o \ repair.o \ ) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c new file mode 100644 index 000000000000..5d1d75d2b1ad --- /dev/null +++ b/fs/xfs/scrub/newbt.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" +#include "xfs_defer.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/newbt.h" + +/* + * Estimate proper slack values for a btree that's being reloaded. + * + * Under most circumstances, we'll take whatever default loading value the + * btree bulk loading code calculates for us. However, there are some + * exceptions to this rule: + * + * (1) If this is a per-AG btree and the AG has less than 10% space free. + * (2) If this is an inode btree and the FS has less than 10% space free. + + * In either case, format the new btree blocks almost completely full to + * minimize space usage. + */ +static void +xrep_newbt_estimate_slack( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_btree_bload *bload = &xnr->bload; + uint64_t free; + uint64_t sz; + + /* Let the btree code compute the default slack values. */ + bload->leaf_slack = -1; + bload->node_slack = -1; + + if (sc->ops->type == ST_PERAG) { + free = sc->sa.pag->pagf_freeblks; + sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno); + } else { + free = percpu_counter_sum(&sc->mp->m_fdblocks); + sz = sc->mp->m_sb.sb_dblocks; + } + + /* No further changes if there's more than 10% free space left. */ + if (free >= div_u64(sz, 10)) + return; + + /* + * We're low on space; load the btrees as tightly as possible. Leave + * a couple of open slots in each btree block so that we don't end up + * splitting the btrees like crazy after a mount. + */ + if (bload->leaf_slack < 0) + bload->leaf_slack = 2; + if (bload->node_slack < 0) + bload->node_slack = 2; +} + +/* Initialize accounting resources for staging a new AG btree. */ +void +xrep_newbt_init_ag( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, + xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv) +{ + memset(xnr, 0, sizeof(struct xrep_newbt)); + xnr->sc = sc; + xnr->oinfo = *oinfo; /* structure copy */ + xnr->alloc_hint = alloc_hint; + xnr->resv = resv; + INIT_LIST_HEAD(&xnr->resv_list); + xrep_newbt_estimate_slack(xnr); +} + +/* Initialize accounting resources for staging a new inode fork btree. */ +int +xrep_newbt_init_inode( + struct xrep_newbt *xnr, + struct xfs_scrub *sc, + int whichfork, + const struct xfs_owner_info *oinfo) +{ + struct xfs_ifork *ifp; + + ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS); + if (!ifp) + return -ENOMEM; + + xrep_newbt_init_ag(xnr, sc, oinfo, + XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino), + XFS_AG_RESV_NONE); + xnr->ifake.if_fork = ifp; + xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork); + return 0; +} + +/* + * Initialize accounting resources for staging a new btree. Callers are + * expected to add their own reservations (and clean them up) manually. + */ +void +xrep_newbt_init_bare( + struct xrep_newbt *xnr, + struct xfs_scrub *sc) +{ + xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK, + XFS_AG_RESV_NONE); +} + +/* + * Designate specific blocks to be used to build our new btree. @pag must be + * a passive reference. 
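+ * The reservation is appended to xnr->resv_list; the bulk loader later
+ * consumes blocks from the head of that list in allocation order.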
+ */ +STATIC int +xrep_newbt_add_blocks( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + const struct xfs_alloc_arg *args) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xrep_newbt_resv *resv; + + resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); + if (!resv) + return -ENOMEM; + + INIT_LIST_HEAD(&resv->list); + resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); + resv->len = args->len; + resv->used = 0; + resv->pag = xfs_perag_hold(pag); + + list_add_tail(&resv->list, &xnr->resv_list); + return 0; +} + +/* Don't let our allocation hint take us beyond this AG */ +static inline void +xrep_newbt_validate_ag_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint); + + if (agno == sc->sa.pag->pag_agno && + xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for a new per-AG btree. */ +STATIC int +xrep_newbt_alloc_ag_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + ASSERT(sc->sa.pag != NULL); + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + xfs_agnumber_t agno; + + xrep_newbt_validate_ag_alloc_hint(xnr); + + error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_ag_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + if (agno != sc->sa.pag->pag_agno) { + ASSERT(agno == sc->sa.pag->pag_agno); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Don't let our allocation hint take us beyond EOFS */ +static inline void +xrep_newbt_validate_file_alloc_hint( + struct xrep_newbt *xnr) +{ + struct xfs_scrub *sc = xnr->sc; + + if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint)) + return; + + xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1); +} + +/* Allocate disk space for our new file-based btree. 
*/ +STATIC int +xrep_newbt_alloc_file_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + struct xfs_scrub *sc = xnr->sc; + struct xfs_mount *mp = sc->mp; + int error = 0; + + while (nr_blocks > 0) { + struct xfs_alloc_arg args = { + .tp = sc->tp, + .mp = mp, + .oinfo = xnr->oinfo, + .minlen = 1, + .maxlen = nr_blocks, + .prod = 1, + .resv = xnr->resv, + }; + struct xfs_perag *pag; + xfs_agnumber_t agno; + + xrep_newbt_validate_file_alloc_hint(xnr); + + error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + + agno = XFS_FSB_TO_AGNO(mp, args.fsbno); + + trace_xrep_newbt_alloc_file_blocks(mp, agno, + XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len, + xnr->oinfo.oi_owner); + + pag = xfs_perag_get(mp, agno); + if (!pag) { + ASSERT(0); + return -EFSCORRUPTED; + } + + error = xrep_newbt_add_blocks(xnr, pag, &args); + xfs_perag_put(pag); + if (error) + return error; + + nr_blocks -= args.len; + xnr->alloc_hint = args.fsbno + args.len; + + error = xrep_defer_finish(sc); + if (error) + return error; + } + + return 0; +} + +/* Allocate disk space for our new btree. */ +int +xrep_newbt_alloc_blocks( + struct xrep_newbt *xnr, + uint64_t nr_blocks) +{ + if (xnr->sc->ip) + return xrep_newbt_alloc_file_blocks(xnr, nr_blocks); + return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks); +} + +/* + * Free the unused part of a space extent that was reserved for a new ondisk + * structure. Returns the number of EFIs logged or a negative errno. + */ +STATIC int +xrep_newbt_free_extent( + struct xrep_newbt *xnr, + struct xrep_newbt_resv *resv, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + xfs_agblock_t free_agbno = resv->agbno; + xfs_extlen_t free_aglen = resv->len; + xfs_fsblock_t fsbno; + int error; + + if (!btree_committed || resv->used == 0) { + /* + * If we're not committing a new btree or we didn't use the + * space reservation, free the entire space extent. + */ + goto free; + } + + /* + * We used space and committed the btree. Remove the written blocks + * from the reservation and possibly log a new EFI to free any unused + * reservation space. + */ + free_agbno += resv->used; + free_aglen -= resv->used; + + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, xnr->oinfo.oi_owner); + + ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + +free: + /* + * Use EFIs to free the reservations. This reduces the chance + * that we leak blocks if the system goes down. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno); + error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo, + xnr->resv, true); + if (error) + return error; + + return 1; +} + +/* Free all the accounting info and disk space we reserved for a new btree. */ +STATIC int +xrep_newbt_free( + struct xrep_newbt *xnr, + bool btree_committed) +{ + struct xfs_scrub *sc = xnr->sc; + struct xrep_newbt_resv *resv, *n; + unsigned int freed = 0; + int error = 0; + + /* + * If the filesystem already went down, we can't free the blocks. Skip + * ahead to freeing the incore metadata because we can't fix anything. 
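+	 * Any EFIs that already reached the log will free the on-disk
+	 * space during the next log recovery instead.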
+ */ + if (xfs_is_shutdown(sc->mp)) + goto junkit; + + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + int ret; + + ret = xrep_newbt_free_extent(xnr, resv, btree_committed); + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + if (ret < 0) { + error = ret; + goto junkit; + } + + freed += ret; + if (freed >= XREP_MAX_ITRUNCATE_EFIS) { + error = xrep_defer_finish(sc); + if (error) + goto junkit; + freed = 0; + } + } + + if (freed) + error = xrep_defer_finish(sc); + +junkit: + /* + * If we still have reservations attached to @newbt, cleanup must have + * failed and the filesystem is about to go down. Clean up the incore + * reservations. + */ + list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + list_del(&resv->list); + xfs_perag_put(resv->pag); + kfree(resv); + } + + if (sc->ip) { + kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork); + xnr->ifake.if_fork = NULL; + } + + return error; +} + +/* + * Free all the accounting info and unused disk space allocations after + * committing a new btree. + */ +int +xrep_newbt_commit( + struct xrep_newbt *xnr) +{ + return xrep_newbt_free(xnr, true); +} + +/* + * Free all the accounting info and all of the disk space we reserved for a new + * btree that we're not going to commit. We want to try to roll things back + * cleanly for things like ENOSPC midway through allocation. + */ +void +xrep_newbt_cancel( + struct xrep_newbt *xnr) +{ + xrep_newbt_free(xnr, false); +} + +/* Feed one of the reserved btree blocks to the bulk loader. */ +int +xrep_newbt_claim_block( + struct xfs_btree_cur *cur, + struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr) +{ + struct xrep_newbt_resv *resv; + struct xfs_mount *mp = cur->bc_mp; + xfs_agblock_t agbno; + + /* + * The first item in the list should always have a free block unless + * we're completely out. + */ + resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list); + if (resv->used == resv->len) + return -ENOSPC; + + /* + * Peel off a block from the start of the reservation. We allocate + * blocks in order to place blocks on disk in increasing record or key + * order. The block reservations tend to end up on the list in + * decreasing order, which hopefully results in leaf blocks ending up + * together. + */ + agbno = resv->agbno + resv->used; + resv->used++; + + /* If we used all the blocks in this reservation, move it to the end. */ + if (resv->used == resv->len) + list_move_tail(&resv->list, &xnr->resv_list); + + trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1, + xnr->oinfo.oi_owner); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno, + agbno)); + else + ptr->s = cpu_to_be32(agbno); + return 0; +} diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h new file mode 100644 index 000000000000..ca53271f3a4c --- /dev/null +++ b/fs/xfs/scrub/newbt.h @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_NEWBT_H__ +#define __XFS_SCRUB_NEWBT_H__ + +struct xrep_newbt_resv { + /* Link to list of extents that we've reserved. */ + struct list_head list; + + struct xfs_perag *pag; + + /* AG block of the extent we reserved. */ + xfs_agblock_t agbno; + + /* Length of the reservation. */ + xfs_extlen_t len; + + /* How much of this reservation has been used. */ + xfs_extlen_t used; +}; + +struct xrep_newbt { + struct xfs_scrub *sc; + + /* List of extents that we've reserved. 
*/ + struct list_head resv_list; + + /* Fake root for new btree. */ + union { + struct xbtree_afakeroot afake; + struct xbtree_ifakeroot ifake; + }; + + /* rmap owner of these blocks */ + struct xfs_owner_info oinfo; + + /* btree geometry for the bulk loader */ + struct xfs_btree_bload bload; + + /* Allocation hint */ + xfs_fsblock_t alloc_hint; + + /* per-ag reservation type */ + enum xfs_ag_resv_type resv; +}; + +void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc); +void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc, + const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, + enum xfs_ag_resv_type resv); +int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc, + int whichfork, const struct xfs_owner_info *oinfo); +int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks); +void xrep_newbt_cancel(struct xrep_newbt *xnr); +int xrep_newbt_commit(struct xrep_newbt *xnr); +int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr, + union xfs_btree_ptr *ptr); + +#endif /* __XFS_SCRUB_NEWBT_H__ */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4a8bc6f3c8f2..aa7683075319 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1332,6 +1332,43 @@ TRACE_EVENT(xrep_ialloc_insert, __entry->freemask) ) +DECLARE_EVENT_CLASS(xrep_newbt_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + int64_t owner), + TP_ARGS(mp, agno, agbno, len, owner), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int64_t, owner) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner) +); +#define DEFINE_NEWBT_EXTENT_EVENT(name) \ +DEFINE_EVENT(xrep_newbt_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + int64_t owner), \ + TP_ARGS(mp, agno, agbno, len, owner)) +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); +DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ From 6bb9ea8ecd2c58a66324cb799838e7d49d78a877 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:41:00 -0800 Subject: [PATCH 030/123] xfs: log EFIs for all btree blocks being used to stage a btree We need to log EFIs for every extent that we allocate for the purpose of staging a new btree so that if we fail then the blocks will be freed during log recovery. Use the autoreaping mechanism provided by the previous patch to attach paused freeing work to the scrub transaction. We can then mark the EFIs stale if we decide to commit the new btree, or we can unpause the EFIs if we decide to abort the repair. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/newbt.c | 34 ++++++++++++++++++++++++++-------- fs/xfs/scrub/newbt.h | 3 +++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 5d1d75d2b1ad..992cf34a13e7 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -139,6 +139,7 @@ xrep_newbt_add_blocks( { struct xfs_mount *mp = xnr->sc->mp; struct xrep_newbt_resv *resv; + int error; resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS); if (!resv) @@ -150,8 +151,18 @@ xrep_newbt_add_blocks( resv->used = 0; resv->pag = xfs_perag_hold(pag); + ASSERT(xnr->oinfo.oi_offset == 0); + + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + list_add_tail(&resv->list, &xnr->resv_list); return 0; +out_pag: + xfs_perag_put(resv->pag); + kfree(resv); + return error; } /* Don't let our allocation hint take us beyond this AG */ @@ -330,16 +341,21 @@ xrep_newbt_free_extent( if (!btree_committed || resv->used == 0) { /* * If we're not committing a new btree or we didn't use the - * space reservation, free the entire space extent. + * space reservation, let the existing EFI free the entire + * space extent. */ - goto free; + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, + free_agbno, free_aglen, xnr->oinfo.oi_owner); + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); + return 1; } /* - * We used space and committed the btree. Remove the written blocks - * from the reservation and possibly log a new EFI to free any unused - * reservation space. + * We used space and committed the btree. Cancel the autoreap, remove + * the written blocks from the reservation, and possibly log a new EFI + * to free any unused reservation space. */ + xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap); free_agbno += resv->used; free_aglen -= resv->used; @@ -351,7 +367,6 @@ xrep_newbt_free_extent( ASSERT(xnr->resv != XFS_AG_RESV_AGFL); -free: /* * Use EFIs to free the reservations. This reduces the chance * that we leak blocks if the system goes down. @@ -411,9 +426,10 @@ xrep_newbt_free( /* * If we still have reservations attached to @newbt, cleanup must have * failed and the filesystem is about to go down. Clean up the incore - * reservations. + * reservations and try to commit to freeing the space we used. */ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) { + xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap); list_del(&resv->list); xfs_perag_put(resv->pag); kfree(resv); @@ -491,5 +507,7 @@ xrep_newbt_claim_block( agbno)); else ptr->s = cpu_to_be32(agbno); - return 0; + + /* Relog all the EFIs. */ + return xrep_defer_finish(xnr->sc); } diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h index ca53271f3a4c..d2baffa17b1a 100644 --- a/fs/xfs/scrub/newbt.h +++ b/fs/xfs/scrub/newbt.h @@ -12,6 +12,9 @@ struct xrep_newbt_resv { struct xfs_perag *pag; + /* Auto-freeing this reservation if we don't commit. */ + struct xfs_alloc_autoreap autoreap; + /* AG block of the extent we reserved. */ xfs_agblock_t agbno; From 3f3cec031099c37513727efc978a12b6346e326d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 6 Dec 2023 18:41:00 -0800 Subject: [PATCH 031/123] xfs: force small EFIs for reaping btree extents Introduce the concept of a defer ops barrier to separate consecutively queued pending work items of the same type. With a barrier in place, the two work items will be tracked separately, and receive separate log intent items. 
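For example (a sketch, with fsbno1/fsbno2/len1/len2 as placeholders), two
frees queued around a barrier end up in two separate EFIs instead of being
combined into one:

	error = xfs_free_extent_later(tp, fsbno1, len1, NULL,
			XFS_AG_RESV_NONE, true);
	if (error)
		return error;

	xfs_defer_add_barrier(tp);

	/* This item can no longer be appended to the EFI above. */
	error = xfs_free_extent_later(tp, fsbno2, len2, NULL,
			XFS_AG_RESV_NONE, true);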
The goal here is to prevent reaping of old metadata blocks from creating unnecessarily huge EFIs that could then run the risk of overflowing the scrub transaction. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_defer.c | 107 ++++++++++++++++++++++++++++++++++---- fs/xfs/libxfs/xfs_defer.h | 3 ++ fs/xfs/scrub/reap.c | 5 ++ 3 files changed, 104 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c4480dec29ec..ecc2f7ec6991 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -182,6 +182,58 @@ static struct kmem_cache *xfs_defer_pending_cache; * Note that the continuation requested between t2 and t3 is likely to * reoccur. */ +STATIC struct xfs_log_item * +xfs_defer_barrier_create_intent( + struct xfs_trans *tp, + struct list_head *items, + unsigned int count, + bool sort) +{ + return NULL; +} + +STATIC void +xfs_defer_barrier_abort_intent( + struct xfs_log_item *intent) +{ + /* empty */ +} + +STATIC struct xfs_log_item * +xfs_defer_barrier_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + return NULL; +} + +STATIC int +xfs_defer_barrier_finish_item( + struct xfs_trans *tp, + struct xfs_log_item *done, + struct list_head *item, + struct xfs_btree_cur **state) +{ + ASSERT(0); + return -EFSCORRUPTED; +} + +STATIC void +xfs_defer_barrier_cancel_item( + struct list_head *item) +{ + ASSERT(0); +} + +static const struct xfs_defer_op_type xfs_barrier_defer_type = { + .max_items = 1, + .create_intent = xfs_defer_barrier_create_intent, + .abort_intent = xfs_defer_barrier_abort_intent, + .create_done = xfs_defer_barrier_create_done, + .finish_item = xfs_defer_barrier_finish_item, + .cancel_item = xfs_defer_barrier_cancel_item, +}; static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, @@ -190,6 +242,7 @@ static const struct xfs_defer_op_type *defer_op_types[] = { [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, + [XFS_DEFER_OPS_TYPE_BARRIER] = &xfs_barrier_defer_type, }; /* Create a log intent done item for a log intent item. */ @@ -779,6 +832,23 @@ xfs_defer_can_append( return true; } +/* Create a new pending item at the end of the transaction list. */ +static inline struct xfs_defer_pending * +xfs_defer_alloc( + struct xfs_trans *tp, + enum xfs_defer_ops_type type) +{ + struct xfs_defer_pending *dfp; + + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); + dfp->dfp_type = type; + INIT_LIST_HEAD(&dfp->dfp_work); + list_add_tail(&dfp->dfp_list, &tp->t_dfops); + + return dfp; +} + /* Add an item for later deferred processing. */ struct xfs_defer_pending * xfs_defer_add( @@ -793,23 +863,38 @@ xfs_defer_add( BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); dfp = xfs_defer_find_last(tp, type, ops); - if (!dfp || !xfs_defer_can_append(dfp, ops)) { - /* Create a new pending item at the end of the intake list. 
*/
-		dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
-				GFP_NOFS | __GFP_NOFAIL);
-		dfp->dfp_type = type;
-		dfp->dfp_intent = NULL;
-		dfp->dfp_done = NULL;
-		dfp->dfp_count = 0;
-		INIT_LIST_HEAD(&dfp->dfp_work);
-		list_add_tail(&dfp->dfp_list, &tp->t_dfops);
-	}
+	if (!dfp || !xfs_defer_can_append(dfp, ops))
+		dfp = xfs_defer_alloc(tp, type);
 
 	xfs_defer_add_item(dfp, li);
 	trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
 	return dfp;
 }
 
+/*
+ * Add a defer ops barrier to force two otherwise adjacent deferred work items
+ * to be tracked separately and have separate log items.
+ */
+void
+xfs_defer_add_barrier(
+	struct xfs_trans		*tp)
+{
+	struct xfs_defer_pending	*dfp;
+	const enum xfs_defer_ops_type	type = XFS_DEFER_OPS_TYPE_BARRIER;
+	const struct xfs_defer_op_type	*ops = defer_op_types[type];
+
+	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+	/* If the last defer op added was a barrier, we're done. */
+	dfp = xfs_defer_find_last(tp, type, ops);
+	if (dfp)
+		return;
+
+	dfp = xfs_defer_alloc(tp, type);
+
+	trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
+}
+
 /*
  * Create a pending deferred work item to replay the recovered intent item
  * and add it to the list.

diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index b0284154f4e0..5b1990ef3e5d 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -20,6 +20,7 @@ enum xfs_defer_ops_type {
 	XFS_DEFER_OPS_TYPE_FREE,
 	XFS_DEFER_OPS_TYPE_AGFL_FREE,
 	XFS_DEFER_OPS_TYPE_ATTR,
+	XFS_DEFER_OPS_TYPE_BARRIER,
 	XFS_DEFER_OPS_TYPE_MAX,
 };
 
@@ -163,4 +164,6 @@ xfs_defer_add_item(
 int __init xfs_defer_init_item_caches(void);
 void xfs_defer_destroy_item_caches(void);
 
+void xfs_defer_add_barrier(struct xfs_trans *tp);
+
 #endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 78c9f2085db4..ee26fcb500b7 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -31,6 +31,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_attr.h"
 #include "xfs_attr_remote.h"
+#include "xfs_defer.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -409,6 +410,8 @@ xreap_agextent_iter(
 	/*
 	 * Use deferred frees to get rid of the old btree blocks to try to
 	 * minimize the window in which we could crash and lose the old blocks.
+	 * Add a defer ops barrier every other extent to avoid stressing the
+	 * system with large EFIs.
 	 */
 	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
 			rs->resv, true);
@@ -416,6 +419,8 @@
 		return error;
 
 	rs->deferred++;
+	if (rs->deferred % 2 == 0)
+		xfs_defer_add_barrier(sc->tp);
 	return 0;
 }

From fa422b353d212373fb2b2857a5ea5a6fa4876f9c Mon Sep 17 00:00:00 2001
From: Shiyang Ruan
Date: Mon, 23 Oct 2023 15:20:46 +0800
Subject: [PATCH 032/123] mm, pmem, xfs: Introduce MF_MEM_PRE_REMOVE for unbind

Now, if we suddenly remove a PMEM device (by calling unbind) which contains
FSDAX while programs are still accessing data on this device, e.g.:
```
$FSSTRESS_PROG -d $SCRATCH_MNT -n 99999 -p 4 &
# $FSX_PROG -N 1000000 -o 8192 -l 500000 $SCRATCH_MNT/t001 &
echo "pfn1.1" > /sys/bus/nd/drivers/nd_pmem/unbind
```
the system can end up in an unacceptable state:
1. the device is gone but the mount point still exists, and umount will
   fail with "target is busy"
2. programs will hang and cannot be killed
3. the kernel may crash with a NULL pointer dereference

To fix this, we introduce a MF_MEM_PRE_REMOVE flag to let the filesystem
know that we are going to remove the whole device, and make sure all
related processes are notified so that they can exit gracefully.
This patch is inspired by Dan's "mm, dax, pmem: Introduce dev_pagemap_failure()"[1]. With the help of dax_holder and ->notify_failure() mechanism, the pmem driver is able to ask filesystem on it to unmap all files in use, and notify processes who are using those files. Call trace: trigger unbind -> unbind_store() -> ... (skip) -> devres_release_all() -> kill_dax() -> dax_holder_notify_failure(dax_dev, 0, U64_MAX, MF_MEM_PRE_REMOVE) -> xfs_dax_notify_failure() `-> freeze_super() // freeze (kernel call) `-> do xfs rmap ` -> mf_dax_kill_procs() ` -> collect_procs_fsdax() // all associated processes ` -> unmap_and_kill() ` -> invalidate_inode_pages2_range() // drop file's cache `-> thaw_super() // thaw (both kernel & user call) Introduce MF_MEM_PRE_REMOVE to let filesystem know this is a remove event. Use the exclusive freeze/thaw[2] to lock the filesystem to prevent new dax mapping from being created. Do not shutdown filesystem directly if configuration is not supported, or if failure range includes metadata area. Make sure all files and processes(not only the current progress) are handled correctly. Also drop the cache of associated files before pmem is removed. [1]: https://lore.kernel.org/linux-mm/161604050314.1463742.14151665140035795571.stgit@dwillia2-desk3.amr.corp.intel.com/ [2]: https://lore.kernel.org/linux-xfs/169116275623.3187159.16862410128731457358.stg-ugh@frogsfrogsfrogs/ Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Dan Williams Signed-off-by: Chandan Babu R --- drivers/dax/super.c | 3 +- fs/xfs/xfs_notify_failure.c | 108 ++++++++++++++++++++++++++++++++++-- include/linux/mm.h | 1 + mm/memory-failure.c | 21 +++++-- 4 files changed, 122 insertions(+), 11 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 0da9232ea175..f4b635526345 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -326,7 +326,8 @@ void kill_dax(struct dax_device *dax_dev) return; if (dax_dev->holder_data != NULL) - dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + dax_holder_notify_failure(dax_dev, 0, U64_MAX, + MF_MEM_PRE_REMOVE); clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index a7daa522e00f..fa50e5308292 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -22,6 +22,7 @@ #include #include +#include struct xfs_failure_info { xfs_agblock_t startblock; @@ -73,10 +74,16 @@ xfs_dax_failure_fn( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip; struct xfs_failure_info *notify = data; + struct address_space *mapping; + pgoff_t pgoff; + unsigned long pgcnt; int error = 0; if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) { + /* Continue the query because this isn't a failure. */ + if (notify->mf_flags & MF_MEM_PRE_REMOVE) + return 0; notify->want_shutdown = true; return 0; } @@ -92,14 +99,60 @@ xfs_dax_failure_fn( return 0; } - error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, - xfs_failure_pgoff(mp, rec, notify), - xfs_failure_pgcnt(mp, rec, notify), - notify->mf_flags); + mapping = VFS_I(ip)->i_mapping; + pgoff = xfs_failure_pgoff(mp, rec, notify); + pgcnt = xfs_failure_pgcnt(mp, rec, notify); + + /* Continue the rmap query if the inode isn't a dax file. */ + if (dax_mapping(mapping)) + error = mf_dax_kill_procs(mapping, pgoff, pgcnt, + notify->mf_flags); + + /* Invalidate the cache in dax pages. 
*/
+	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+		invalidate_inode_pages2_range(mapping, pgoff,
+				pgoff + pgcnt - 1);
+
 	xfs_irele(ip);
 	return error;
 }

+static int
+xfs_dax_notify_failure_freeze(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+	int			error;
+
+	error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+	if (error)
+		xfs_emerg(mp, "already frozen by kernel, err=%d", error);
+
+	return error;
+}
+
+static void
+xfs_dax_notify_failure_thaw(
+	struct xfs_mount	*mp,
+	bool			kernel_frozen)
+{
+	struct super_block	*sb = mp->m_super;
+	int			error;
+
+	if (kernel_frozen) {
+		error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+		if (error)
+			xfs_emerg(mp, "still frozen after notify failure, err=%d",
+				error);
+	}
+
+	/*
+	 * Also thaw the userspace freeze anyway because the device is about
+	 * to be removed immediately.
+	 */
+	thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+}
+
 static int
 xfs_dax_notify_ddev_failure(
@@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure(
 	struct xfs_btree_cur	*cur = NULL;
 	struct xfs_buf		*agf_bp = NULL;
 	int			error = 0;
+	bool			kernel_frozen = false;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, daddr);
 	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(mp, fsbno);
 	xfs_fsblock_t		end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
 	xfs_agnumber_t		end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
 
+	if (mf_flags & MF_MEM_PRE_REMOVE) {
+		xfs_info(mp, "Device is about to be removed!");
+		/*
+		 * Freeze fs to prevent new mappings from being created.
+		 * - Keep going on if others already hold the kernel frozen.
+		 * - Keep going on if other errors too because this device is
+		 *   starting to fail.
+		 * - If the kernel freeze is held successfully here, thaw it
+		 *   here as well at the end.
+		 */
+		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
+	}
+
 	error = xfs_trans_alloc_empty(mp, &tp);
 	if (error)
-		return error;
+		goto out;
 
 	for (; agno <= end_agno; agno++) {
 		struct xfs_rmap_irec	ri_low = { };
@@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure(
 	}
 
 	xfs_trans_cancel(tp);
-	if (error || notify.want_shutdown) {
+
+	/*
+	 * Shut down the fs with a force umount in the pre-remove case, which
+	 * won't fail, so errors can be ignored.  Otherwise, shut down the
+	 * filesystem with the CORRUPT flag if an error occurred or
+	 * notify.want_shutdown was set during RMAP querying.
+	 */
+	if (mf_flags & MF_MEM_PRE_REMOVE)
+		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+	else if (error || notify.want_shutdown) {
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		if (!error)
 			error = -EFSCORRUPTED;
 	}
+
+out:
+	/* Thaw the fs if it has been frozen before. */
+	if (mf_flags & MF_MEM_PRE_REMOVE)
+		xfs_dax_notify_failure_thaw(mp, kernel_frozen);
+
 	return error;
 }

@@ -197,6 +279,14 @@ xfs_dax_notify_failure(
 	if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
 	    mp->m_logdev_targp != mp->m_ddev_targp) {
+		/*
+		 * In the pre-remove case the failure notification is attempting
+		 * to trigger a force unmount.  The expectation is that the
+		 * device is still present, but its removal is in progress and
+		 * cannot be cancelled, so proceed with accessing the log
+		 * device.
+		 */
+		if (mf_flags & MF_MEM_PRE_REMOVE)
+			return 0;
 		xfs_err(mp, "ondisk log corrupt, shutting down fs!");
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
 		return -EFSCORRUPTED;
 	}
@@ -210,6 +300,12 @@ xfs_dax_notify_failure(
 	ddev_start = mp->m_ddev_targp->bt_dax_part_off;
 	ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
 
+	/* Notify failure on the whole device.
*/ + if (offset == 0 && len == U64_MAX) { + offset = ddev_start; + len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev); + } + /* Ignore the range out of filesystem area */ if (offset + len - 1 < ddev_start) return -ENXIO; diff --git a/include/linux/mm.h b/include/linux/mm.h index 418d26608ece..caf13e94260e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3904,6 +3904,7 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, + MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 660c21859118..cff3bda60691 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -679,7 +679,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, */ static void collect_procs_fsdax(struct page *page, struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill) + struct list_head *to_kill, bool pre_remove) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -687,8 +687,15 @@ static void collect_procs_fsdax(struct page *page, i_mmap_lock_read(mapping); rcu_read_lock(); for_each_process(tsk) { - struct task_struct *t = task_early_kill(tsk, true); + struct task_struct *t = tsk; + /* + * Search for all tasks while MF_MEM_PRE_REMOVE is set, because + * the current may not be the one accessing the fsdax page. + * Otherwise, search for the current task. + */ + if (!pre_remove) + t = task_early_kill(tsk, true); if (!t) continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1795,6 +1802,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, dax_entry_t cookie; struct page *page; size_t end = index + count; + bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE; mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; @@ -1806,9 +1814,14 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, if (!page) goto unlock; - SetPageHWPoison(page); + if (!pre_remove) + SetPageHWPoison(page); - collect_procs_fsdax(page, mapping, index, &to_kill); + /* + * The pre_remove case is revoking access, the memory is still + * good and could theoretically be put back into service. + */ + collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: From 011f129fee4bd064a3db30ca1a0139548a619482 Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 29 Nov 2023 19:39:47 +0700 Subject: [PATCH 033/123] Documentation: xfs: consolidate XFS docs into its own subdirectory XFS docs are currently in upper-level Documentation/filesystems. Although these are currently 4 docs, they are already outstanding as a group and can be moved to its own subdirectory. Consolidate them into Documentation/filesystems/xfs/. Signed-off-by: Bagas Sanjaya Reviewed-by: Bill O'Donnell Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- Documentation/filesystems/index.rst | 5 +---- Documentation/filesystems/xfs/index.rst | 14 ++++++++++++++ .../{ => xfs}/xfs-delayed-logging-design.rst | 0 .../{ => xfs}/xfs-maintainer-entry-profile.rst | 0 .../{ => xfs}/xfs-online-fsck-design.rst | 2 +- .../{ => xfs}/xfs-self-describing-metadata.rst | 0 .../maintainer/maintainer-entry-profile.rst | 2 +- MAINTAINERS | 4 ++-- 8 files changed, 19 insertions(+), 8 deletions(-) create mode 100644 Documentation/filesystems/xfs/index.rst rename Documentation/filesystems/{ => xfs}/xfs-delayed-logging-design.rst (100%) rename Documentation/filesystems/{ => xfs}/xfs-maintainer-entry-profile.rst (100%) rename Documentation/filesystems/{ => xfs}/xfs-online-fsck-design.rst (99%) rename Documentation/filesystems/{ => xfs}/xfs-self-describing-metadata.rst (100%) diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index 09cade7eaefc..e18bc5ae3b35 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -121,8 +121,5 @@ Documentation for filesystem implementations. udf virtiofs vfat - xfs-delayed-logging-design - xfs-maintainer-entry-profile - xfs-self-describing-metadata - xfs-online-fsck-design + xfs/index zonefs diff --git a/Documentation/filesystems/xfs/index.rst b/Documentation/filesystems/xfs/index.rst new file mode 100644 index 000000000000..ab66c57a5d18 --- /dev/null +++ b/Documentation/filesystems/xfs/index.rst @@ -0,0 +1,14 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================ +XFS Filesystem Documentation +============================ + +.. toctree:: + :maxdepth: 2 + :numbered: + + xfs-delayed-logging-design + xfs-maintainer-entry-profile + xfs-self-describing-metadata + xfs-online-fsck-design diff --git a/Documentation/filesystems/xfs-delayed-logging-design.rst b/Documentation/filesystems/xfs/xfs-delayed-logging-design.rst similarity index 100% rename from Documentation/filesystems/xfs-delayed-logging-design.rst rename to Documentation/filesystems/xfs/xfs-delayed-logging-design.rst diff --git a/Documentation/filesystems/xfs-maintainer-entry-profile.rst b/Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst similarity index 100% rename from Documentation/filesystems/xfs-maintainer-entry-profile.rst rename to Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst diff --git a/Documentation/filesystems/xfs-online-fsck-design.rst b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst similarity index 99% rename from Documentation/filesystems/xfs-online-fsck-design.rst rename to Documentation/filesystems/xfs/xfs-online-fsck-design.rst index a0678101a7d0..352516feef6f 100644 --- a/Documentation/filesystems/xfs-online-fsck-design.rst +++ b/Documentation/filesystems/xfs/xfs-online-fsck-design.rst @@ -962,7 +962,7 @@ disk, but these buffer verifiers cannot provide any consistency checking between metadata structures. 
For more information, please see the documentation for -Documentation/filesystems/xfs-self-describing-metadata.rst +Documentation/filesystems/xfs/xfs-self-describing-metadata.rst Reverse Mapping --------------- diff --git a/Documentation/filesystems/xfs-self-describing-metadata.rst b/Documentation/filesystems/xfs/xfs-self-describing-metadata.rst similarity index 100% rename from Documentation/filesystems/xfs-self-describing-metadata.rst rename to Documentation/filesystems/xfs/xfs-self-describing-metadata.rst diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index 7ad4bfc2cc03..18cee1edaecb 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -105,4 +105,4 @@ to do something different in the near future. ../driver-api/media/maintainer-entry-profile ../driver-api/vfio-pci-device-specific-driver-acceptance ../nvme/feature-and-quirk-policy - ../filesystems/xfs-maintainer-entry-profile + ../filesystems/xfs/xfs-maintainer-entry-profile diff --git a/MAINTAINERS b/MAINTAINERS index 788be9ab5b73..76896b511c6a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23894,10 +23894,10 @@ S: Supported W: http://xfs.org/ C: irc://irc.oftc.net/xfs T: git git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git -P: Documentation/filesystems/xfs-maintainer-entry-profile.rst +P: Documentation/filesystems/xfs/xfs-maintainer-entry-profile.rst F: Documentation/ABI/testing/sysfs-fs-xfs F: Documentation/admin-guide/xfs.rst -F: Documentation/filesystems/xfs-* +F: Documentation/filesystems/xfs/* F: fs/xfs/ F: include/uapi/linux/dqblk_xfs.h F: include/uapi/linux/fsmap.h From 64f08b152a3bc171f4651271bfe973ad3d085ab6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 18:40:54 +0100 Subject: [PATCH 034/123] xfs: clean up the XFS_IOC_{GS}ET_RESBLKS handler The XFS_IOC_GET_RESBLKS and XFS_IOC_SET_RESBLKS already share a fair amount of code, and will share even more soon. Move the logic for both of them out of the main xfs_file_ioctl function into a xfs_ioctl_getset_resblocks helper to share the code and prepare for additional changes. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ioctl.c | 87 +++++++++++++++++++++++----------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 6c3919687ea6..7edc1d892f0c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1872,6 +1872,46 @@ xfs_fs_eofblocks_from_user( return 0; } +static int +xfs_ioctl_getset_resblocks( + struct file *filp, + unsigned int cmd, + void __user *arg) +{ + struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; + struct xfs_fsop_resblks fsop = { }; + int error; + uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (cmd == XFS_IOC_SET_RESBLKS) { + if (xfs_is_readonly(mp)) + return -EROFS; + + if (copy_from_user(&fsop, arg, sizeof(fsop))) + return -EFAULT; + + error = mnt_want_write_file(filp); + if (error) + return error; + in = fsop.resblks; + error = xfs_reserve_blocks(mp, &in, &fsop); + mnt_drop_write_file(filp); + if (error) + return error; + } else { + error = xfs_reserve_blocks(mp, NULL, &fsop); + if (error) + return error; + } + + if (copy_to_user(arg, &fsop, sizeof(fsop))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2018,50 +2058,9 @@ xfs_file_ioctl( return 0; } - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (xfs_is_readonly(mp)) - return -EROFS; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -EFAULT; - - error = mnt_want_write_file(filp); - if (error) - return error; - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - mnt_drop_write_file(filp); - if (error) - return error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -EFAULT; - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - - return 0; - } + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + return xfs_ioctl_getset_resblocks(filp, cmd, arg); case XFS_IOC_FSGROWFSDATA: { struct xfs_growfs_data in; From c2c2620de7577db66a859b934715e98e4501e4f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 18:40:55 +0100 Subject: [PATCH 035/123] xfs: clean up the XFS_IOC_FSCOUNTS handler Split XFS_IOC_FSCOUNTS out of the main xfs_file_ioctl function, and merge the xfs_fs_counts helper into the ioctl handler. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 16 ---------------- fs/xfs/xfs_fsops.h | 1 - fs/xfs/xfs_ioctl.c | 29 ++++++++++++++++++++--------- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7cb75cb6b8e9..01681783e2c3 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -343,22 +343,6 @@ xfs_growfs_log( return error; } -/* - * exported through ioctl XFS_IOC_FSCOUNTS - */ - -void -xfs_fs_counts( - xfs_mount_t *mp, - xfs_fsop_counts_t *cnt) -{ - cnt->allocino = percpu_counter_read_positive(&mp->m_icount); - cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); - cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - cnt->freertx = percpu_counter_read_positive(&mp->m_frextents); -} - /* * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS * diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 2cffe51a31e8..45f0cb6e8059 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,6 @@ extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 7edc1d892f0c..8244210f6786 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1912,6 +1912,24 @@ xfs_ioctl_getset_resblocks( return 0; } +static int +xfs_ioctl_fs_counts( + struct xfs_mount *mp, + struct xfs_fsop_counts __user *uarg) +{ + struct xfs_fsop_counts out = { + .allocino = percpu_counter_read_positive(&mp->m_icount), + .freeino = percpu_counter_read_positive(&mp->m_ifree), + .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + xfs_fdblocks_unavailable(mp), + .freertx = percpu_counter_read_positive(&mp->m_frextents), + }; + + if (copy_to_user(uarg, &out, sizeof(out))) + return -EFAULT; + return 0; +} + /* * These long-unused ioctls were removed from the official ioctl API in 5.17, * but retain these definitions so that we can log warnings about them. @@ -2048,15 +2066,8 @@ xfs_file_ioctl( return error; } - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; - - xfs_fs_counts(mp, &out); - - if (copy_to_user(arg, &out, sizeof(out))) - return -EFAULT; - return 0; - } + case XFS_IOC_FSCOUNTS: + return xfs_ioctl_fs_counts(mp, arg); case XFS_IOC_SET_RESBLKS: case XFS_IOC_GET_RESBLKS: From 646ddf0c4df5181a7057ecccd29e535baaf034b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 18:40:56 +0100 Subject: [PATCH 036/123] xfs: clean up the xfs_reserve_blocks interface xfs_reserve_blocks has a very odd interface that can only be explained by it directly deriving from the IRIX fcntl handler back in the day. Split reporting out the reserved blocks out of xfs_reserve_blocks into the only caller that cares. This means that the value reported from XFS_IOC_SET_RESBLKS isn't atomically sampled in the same critical section as when it was set anymore, but as the values could change right after setting them anyway that does not matter. It does provide atomic sampling of both values for XFS_IOC_GET_RESBLKS now, though. Also pass a normal scalar integer value for the requested value instead of the pointless pointer. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 34 +++------------------------------- fs/xfs/xfs_fsops.h | 3 +-- fs/xfs/xfs_ioctl.c | 13 ++++++------- fs/xfs/xfs_mount.c | 8 ++------ fs/xfs/xfs_super.c | 6 ++---- 5 files changed, 14 insertions(+), 50 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 01681783e2c3..4f5da19142f2 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -344,43 +344,20 @@ xfs_growfs_log( } /* - * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS - * - * xfs_reserve_blocks is called to set m_resblks - * in the in-core mount table. The number of unused reserved blocks - * is kept in m_resblks_avail. - * * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number - * reserved are returned in outval - * - * A null inval pointer indicates that only the current reserved blocks - * available should be returned no settings are changed. + * reserved are returned in outval. */ - int xfs_reserve_blocks( - xfs_mount_t *mp, - uint64_t *inval, - xfs_fsop_resblks_t *outval) + struct xfs_mount *mp, + uint64_t request) { int64_t lcounter, delta; int64_t fdblks_delta = 0; - uint64_t request; int64_t free; int error = 0; - /* If inval is null, report current values and return */ - if (inval == (uint64_t *)NULL) { - if (!outval) - return -EINVAL; - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - return 0; - } - - request = *inval; - /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -450,11 +427,6 @@ xfs_reserve_blocks( spin_lock(&mp->m_sb_lock); } out: - if (outval) { - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - } - spin_unlock(&mp->m_sb_lock); return error; } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 45f0cb6e8059..7536f8a92746 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,8 +8,7 @@ extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval, - xfs_fsop_resblks_t *outval); +int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags); extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 8244210f6786..f02b6e558af5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1881,7 +1881,6 @@ xfs_ioctl_getset_resblocks( struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount; struct xfs_fsop_resblks fsop = { }; int error; - uint64_t in; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1896,17 +1895,17 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - in = fsop.resblks; - error = xfs_reserve_blocks(mp, &in, &fsop); + error = xfs_reserve_blocks(mp, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; - } else { - error = xfs_reserve_blocks(mp, NULL, &fsop); - if (error) - return error; } + spin_lock(&mp->m_sb_lock); + fsop.resblks = mp->m_resblks; + fsop.resblks_avail = mp->m_resblks_avail; + spin_unlock(&mp->m_sb_lock); + if (copy_to_user(arg, &fsop, sizeof(fsop))) return -EFAULT; return 0; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index aed5be5508fe..aabb25dc3efa 100644 --- a/fs/xfs/xfs_mount.c +++ 
b/fs/xfs/xfs_mount.c
@@ -637,7 +637,6 @@ xfs_mountfs(
 	struct xfs_sb		*sbp = &(mp->m_sb);
 	struct xfs_inode	*rip;
 	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
-	uint64_t		resblks;
 	uint			quotamount = 0;
 	uint			quotaflags = 0;
 	int			error = 0;
@@ -974,8 +973,7 @@ xfs_mountfs(
 	 * we were already there on the last unmount. Warn if this occurs.
 	 */
 	if (!xfs_is_readonly(mp)) {
-		resblks = xfs_default_resblks(mp);
-		error = xfs_reserve_blocks(mp, &resblks, NULL);
+		error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
 		if (error)
 			xfs_warn(mp,
 "Unable to allocate reserve blocks. Continuing without reserve pool.");
@@ -1053,7 +1051,6 @@ void
 xfs_unmountfs(
 	struct xfs_mount	*mp)
 {
-	uint64_t		resblks;
 	int			error;
 
 	/*
@@ -1090,8 +1087,7 @@ xfs_unmountfs(
 	 * we only every apply deltas to the superblock and hence the incore
 	 * value does not matter....
 	 */
-	resblks = 0;
-	error = xfs_reserve_blocks(mp, &resblks, NULL);
+	error = xfs_reserve_blocks(mp, 0);
 	if (error)
 		xfs_warn(mp, "Unable to free reserved block pool. "
 				"Freespace may not be correct on next mount.");
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 764304595e8b..d0009430a627 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -906,10 +906,8 @@ xfs_fs_statfs(
 STATIC void
 xfs_save_resvblks(struct xfs_mount *mp)
 {
-	uint64_t resblks = 0;
-
 	mp->m_resblks_save = mp->m_resblks;
-	xfs_reserve_blocks(mp, &resblks, NULL);
+	xfs_reserve_blocks(mp, 0);
 }
 
 STATIC void
@@ -923,7 +921,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
 	} else
 		resblks = xfs_default_resblks(mp);
 
-	xfs_reserve_blocks(mp, &resblks, NULL);
+	xfs_reserve_blocks(mp, resblks);
 }
 
 /*

From 08e54ca42d6a0d88709a1be38eb95843142b5101 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Mon, 4 Dec 2023 18:40:57 +0100
Subject: [PATCH 037/123] xfs: clean up xfs_fsops.h

Use struct types instead of typedefs so that the header can be included
without pulling in the headers that define the typedefs, and remove the
pointless externs.

Signed-off-by: Christoph Hellwig
Reviewed-by: "Darrick J. Wong"
Signed-off-by: Chandan Babu R
---
 fs/xfs/xfs_fsops.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 7536f8a92746..44457b0a0593 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -6,12 +6,12 @@
 #ifndef __XFS_FSOPS_H__
 #define __XFS_FSOPS_H__
 
-extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
-extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
+int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
 int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
-extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
+int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
 
-extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
-extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
 
 #endif	/* __XFS_FSOPS_H__ */

From e6af9c98cbf0164a619d95572136bfb54d482dd6 Mon Sep 17 00:00:00 2001
From: Jiachen Zhang
Date: Tue, 5 Dec 2023 13:58:58 +0800
Subject: [PATCH 038/123] xfs: ensure logflagsp is initialized in
 xfs_bmap_del_extent_real

In the case of returning -ENOSPC, ensure logflagsp is initialized to 0.
Otherwise the caller __xfs_bunmapi would log the uninitialized, illegal
tmp_logflags value, which might cause unpredictable errors during log
recovery. Also, remove the flags variable and set *logflagsp directly,
so that the code is more robust in the long run.
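[Editor's note: a minimal user-space sketch of the out-parameter convention
this fix relies on. The names compute_flags and flags_out are invented for
illustration and are not part of the kernel code; the point is simply that
assigning the out-parameter once on entry makes every early return safe.]

#include <errno.h>
#include <stdio.h>

/*
 * Sketch only: initialize the out-parameter before the first early
 * return, so no exit path can hand the caller stack garbage.
 */
static int compute_flags(int input, int *flags_out)
{
	*flags_out = 0;			/* defined on every return path */

	if (input < 0)
		return -EINVAL;		/* early error: *flags_out is still 0 */

	*flags_out |= 0x1;		/* accumulate results directly */
	return 0;
}

int main(void)
{
	int flags;

	if (compute_flags(-1, &flags) != 0)
		printf("error path, flags=%d (defined, not garbage)\n", flags);
	return 0;
}

Converting the goto done unwinding below into direct returns is safe
precisely because, with this pattern, the out-parameter can no longer be
returned uninitialized.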
Fixes: 1b24b633aafe ("xfs: move some more code into xfs_bmap_del_extent_real")
Signed-off-by: Jiachen Zhang
Reviewed-by: Christoph Hellwig
Reviewed-by: "Darrick J. Wong"
Signed-off-by: Chandan Babu R
---
 fs/xfs/libxfs/xfs_bmap.c | 73 +++++++++++++++++-----------------------
 1 file changed, 31 insertions(+), 42 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 68be1dd4f0f2..ca6614f4eac5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5010,7 +5010,6 @@ xfs_bmap_del_extent_real(
 	xfs_fileoff_t		del_endoff;	/* first offset past del */
 	int			do_fx;	/* free extent at end of routine */
 	int			error;	/* error return value */
-	int			flags = 0;/* inode logging flags */
 	struct xfs_bmbt_irec	got;	/* current extent entry */
 	xfs_fileoff_t		got_endoff;	/* first offset past got */
 	int			i;	/* temp state */
@@ -5023,6 +5022,8 @@ xfs_bmap_del_extent_real(
 	uint32_t		state = xfs_bmap_fork_to_state(whichfork);
 	struct xfs_bmbt_irec	old;
 
+	*logflagsp = 0;
+
 	mp = ip->i_mount;
 	XFS_STATS_INC(mp, xs_del_exlist);
 
@@ -5035,7 +5036,6 @@ xfs_bmap_del_extent_real(
 	ASSERT(got_endoff >= del_endoff);
 	ASSERT(!isnullstartblock(got.br_startblock));
 	qfield = 0;
-	error = 0;
 
 	/*
 	 * If it's the case where the directory code is running with no block
@@ -5051,13 +5051,13 @@ xfs_bmap_del_extent_real(
 	    del->br_startoff > got.br_startoff && del_endoff < got_endoff)
 		return -ENOSPC;
 
-	flags = XFS_ILOG_CORE;
+	*logflagsp = XFS_ILOG_CORE;
 	if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
 		if (!(bflags & XFS_BMAPI_REMAP)) {
 			error = xfs_rtfree_blocks(tp, del->br_startblock,
 					del->br_blockcount);
 			if (error)
-				goto done;
+				return error;
 		}
 
 		do_fx = 0;
@@ -5072,11 +5072,9 @@ xfs_bmap_del_extent_real(
 	if (cur) {
 		error = xfs_bmbt_lookup_eq(cur, &got, &i);
 		if (error)
-			goto done;
-		if (XFS_IS_CORRUPT(mp, i != 1)) {
-			error = -EFSCORRUPTED;
-			goto done;
-		}
+			return error;
+		if (XFS_IS_CORRUPT(mp, i != 1))
+			return -EFSCORRUPTED;
 	}
 
 	if (got.br_startoff == del->br_startoff)
@@ -5093,17 +5091,15 @@ xfs_bmap_del_extent_real(
 		xfs_iext_prev(ifp, icur);
 		ifp->if_nextents--;
 
-		flags |= XFS_ILOG_CORE;
+		*logflagsp |= XFS_ILOG_CORE;
 		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
+			*logflagsp |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		if ((error = xfs_btree_delete(cur, &i)))
-			goto done;
-		if (XFS_IS_CORRUPT(mp, i != 1)) {
-			error = -EFSCORRUPTED;
-			goto done;
-		}
+			return error;
+		if (XFS_IS_CORRUPT(mp, i != 1))
+			return -EFSCORRUPTED;
 		break;
 	case BMAP_LEFT_FILLING:
 		/*
@@ -5114,12 +5110,12 @@ xfs_bmap_del_extent_real(
 		got.br_blockcount -= del->br_blockcount;
 		xfs_iext_update_extent(ip, state, icur, &got);
 		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
+			*logflagsp |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		error = xfs_bmbt_update(cur, &got);
 		if (error)
-			goto done;
+			return error;
 		break;
 	case BMAP_RIGHT_FILLING:
 		/*
@@ -5128,12 +5124,12 @@ xfs_bmap_del_extent_real(
 		got.br_blockcount -= del->br_blockcount;
 		xfs_iext_update_extent(ip, state, icur, &got);
 		if (!cur) {
-			flags |= xfs_ilog_fext(whichfork);
+			*logflagsp |= xfs_ilog_fext(whichfork);
 			break;
 		}
 		error = xfs_bmbt_update(cur, &got);
 		if (error)
-			goto done;
+			return error;
 		break;
 	case 0:
 		/*
@@ -5150,18 +5146,18 @@ xfs_bmap_del_extent_real(
 		new.br_state = got.br_state;
 		new.br_startblock = del_endblock;
 
-		flags |= XFS_ILOG_CORE;
+		*logflagsp |= XFS_ILOG_CORE;
 		if (cur) {
 			error = xfs_bmbt_update(cur, &got);
 			if (error)
-				goto done;
+				return error;
 			error = xfs_btree_increment(cur, 0, &i);
 			if (error)
-				goto done;
+				return error;
 			cur->bc_rec.b = new;
 			error = xfs_btree_insert(cur, &i);
 			if (error && error != -ENOSPC)
-				goto done;
+				return error;
 			/*
 			 * If get no-space back from btree insert, it tried a
 			 * split, and we have a zero block reservation. Fix up
@@ -5174,33 +5170,28 @@ xfs_bmap_del_extent_real(
 				 */
 				error = xfs_bmbt_lookup_eq(cur, &got, &i);
 				if (error)
-					goto done;
-				if (XFS_IS_CORRUPT(mp, i != 1)) {
-					error = -EFSCORRUPTED;
-					goto done;
-				}
+					return error;
+				if (XFS_IS_CORRUPT(mp, i != 1))
+					return -EFSCORRUPTED;
 				/*
 				 * Update the btree record back
 				 * to the original value.
 				 */
 				error = xfs_bmbt_update(cur, &old);
 				if (error)
-					goto done;
+					return error;
 				/*
 				 * Reset the extent record back
 				 * to the original value.
 				 */
 				xfs_iext_update_extent(ip, state, icur, &old);
-				flags = 0;
-				error = -ENOSPC;
-				goto done;
-			}
-			if (XFS_IS_CORRUPT(mp, i != 1)) {
-				error = -EFSCORRUPTED;
-				goto done;
+				*logflagsp = 0;
+				return -ENOSPC;
 			}
+			if (XFS_IS_CORRUPT(mp, i != 1))
+				return -EFSCORRUPTED;
 		} else
-			flags |= xfs_ilog_fext(whichfork);
+			*logflagsp |= xfs_ilog_fext(whichfork);
 
 		ifp->if_nextents++;
 		xfs_iext_next(ifp, icur);
@@ -5224,7 +5215,7 @@ xfs_bmap_del_extent_real(
 			    ((bflags & XFS_BMAPI_NODISCARD) ||
 			    del->br_state == XFS_EXT_UNWRITTEN));
 		if (error)
-			goto done;
+			return error;
 	}
 }
@@ -5239,9 +5230,7 @@ xfs_bmap_del_extent_real(
 	if (qfield && !(bflags & XFS_BMAPI_REMAP))
 		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
 
-done:
-	*logflagsp = flags;
-	return error;
+	return 0;
 }
 
 /*

From 5759aa4f956034b289b0ae2c99daddfc775442e1 Mon Sep 17 00:00:00 2001
From: Zhang Tianci
Date: Tue, 5 Dec 2023 13:58:59 +0800
Subject: [PATCH 039/123] xfs: update dir3 leaf block metadata after swap

xfs_da3_swap_lastblock() copies the content of the last block into the
dead block, but does not update the metadata in it. Some block types
record their own location, and that metadata must be updated too: a
dir3 leafn block, for example, stamps in its blkno, which we must
change to the dead block's blkno. Otherwise, before the xfs_buf is
written to disk, verify_write() will fail because blk_hdr->blkno !=
xfs_buf->b_bn, and xfs will be shut down. We will get this warning:

XFS (dm-0): Metadata corruption detected at xfs_dir3_leaf_verify+0xa8/0xe0 [xfs], xfs_dir3_leafn block 0x178
XFS (dm-0): Unmount and run xfs_repair
XFS (dm-0): First 128 bytes of corrupted metadata buffer:
00000000e80f1917: 00 80 00 0b 00 80 00 07 3d ff 00 00 00 00 00 00 ........=.......
000000009604c005: 00 00 00 00 00 00 01 a0 00 00 00 00 00 00 00 00 ................
000000006b6fb2bf: e4 44 e3 97 b5 64 44 41 8b 84 60 0e 50 43 d9 bf .D...dDA..`.PC..
00000000678978a2: 00 00 00 00 00 00 00 83 01 73 00 93 00 00 00 00 .........s......
00000000b28b247c: 99 29 1d 38 00 00 00 00 99 29 1d 40 00 00 00 00 .).8.....).@....
000000002b2a662c: 99 29 1d 48 00 00 00 00 99 49 11 00 00 00 00 00 .).H.....I......
00000000ea2ffbb8: 99 49 11 08 00 00 45 25 99 49 11 10 00 00 48 fe .I....E%.I....H.
0000000069e86440: 99 49 11 18 00 00 4c 6b 99 49 11 20 00 00 4d 97 .I....Lk.I. ..M.
XFS (dm-0): xfs_do_force_shutdown(0x8) called from line 1423 of file fs/xfs/xfs_buf.c. Return address = 00000000c0ff63c1
XFS (dm-0): Corruption of in-memory data detected.
Shutting down filesystem XFS (dm-0): Please umount the filesystem and rectify the problem(s) >From the log above, we know xfs_buf->b_no is 0x178, but the block's hdr record its blkno is 0x1a0. Fixes: 24df33b45ecf ("xfs: add CRC checking to dir2 leaf blocks") Signed-off-by: Zhang Tianci Suggested-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_da_btree.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e576560b46e9..282c7cf032f4 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2316,10 +2316,17 @@ xfs_da3_swap_lastblock( return error; /* * Copy the last block into the dead buffer and log it. + * On CRC-enabled file systems, also update the stamped in blkno. */ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + if (xfs_has_crc(mp)) { + struct xfs_da3_blkinfo *da3 = dead_buf->b_addr; + + da3->blkno = cpu_to_be64(xfs_buf_daddr(dead_buf)); + } xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; + /* * Get values from the moved block. */ From fd45ddb9dd606b3eaddf26e13f64340636955986 Mon Sep 17 00:00:00 2001 From: Zhang Tianci Date: Tue, 5 Dec 2023 13:59:00 +0800 Subject: [PATCH 040/123] xfs: extract xfs_da_buf_copy() helper function This patch does not modify logic. xfs_da_buf_copy() will copy one block from src xfs_buf to dst xfs_buf, and update the block metadata in dst directly. Signed-off-by: Zhang Tianci Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr_leaf.c | 12 ++---- fs/xfs/libxfs/xfs_da_btree.c | 74 ++++++++++++++--------------------- fs/xfs/libxfs/xfs_da_btree.h | 2 + 3 files changed, 36 insertions(+), 52 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2580ae47209a..654e17e6610d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1244,14 +1244,10 @@ xfs_attr3_leaf_to_node( if (error) goto out; - /* copy leaf to new buffer, update identifiers */ - xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); - bp2->b_ops = bp1->b_ops; - memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); - } + /* + * Copy leaf to new buffer and log it. + */ + xfs_da_buf_copy(bp2, bp1, args->geo->blksize); xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 282c7cf032f4..5457188bb4de 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -421,6 +421,25 @@ xfs_da3_node_read_mapped( return xfs_da3_node_set_type(tp, *bpp); } +/* + * Copy src directory/attr leaf/node buffer to the dst. + * For v5 file systems make sure the right blkno is stamped in. + */ +void +xfs_da_buf_copy( + struct xfs_buf *dst, + struct xfs_buf *src, + size_t size) +{ + struct xfs_da3_blkinfo *da3 = dst->b_addr; + + memcpy(dst->b_addr, src->b_addr, size); + dst->b_ops = src->b_ops; + xfs_trans_buf_copy_type(dst, src); + if (xfs_has_crc(dst->b_mount)) + da3->blkno = cpu_to_be64(xfs_buf_daddr(dst)); +} + /*======================================================================== * Routines used for growing the Btree. 
*========================================================================*/ @@ -690,12 +709,6 @@ xfs_da3_root_split( btree = icnodehdr.btree; size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); level = icnodehdr.level; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { struct xfs_dir3_icleaf_hdr leafhdr; @@ -707,31 +720,17 @@ xfs_da3_root_split( size = (int)((char *)&leafhdr.ents[leafhdr.count] - (char *)leaf); level = 0; - - /* - * we are about to copy oldroot to bp, so set up the type - * of bp while we know exactly what it will be. - */ - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } /* - * we can copy most of the information in the node from one block to - * another, but for CRC enabled headers we have to make sure that the - * block specific identifiers are kept intact. We update the buffer - * directly for this. + * Copy old root to new buffer and log it. */ - memcpy(node, oldroot, size); - if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || - oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { - struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - - node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); - } + xfs_da_buf_copy(bp, blk1->bp, size); xfs_trans_log_buf(tp, bp, 0, size - 1); - bp->b_ops = blk1->bp->b_ops; - xfs_trans_buf_copy_type(bp, blk1->bp); + /* + * Update blk1 to point to new buffer. + */ blk1->bp = bp; blk1->blkno = blkno; @@ -1220,21 +1219,14 @@ xfs_da3_root_join( xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); /* - * This could be copying a leaf back into the root block in the case of - * there only being a single leaf block left in the tree. Hence we have - * to update the b_ops pointer as well to match the buffer type change - * that could occur. For dir3 blocks we also need to update the block - * number in the buffer header. + * Copy child to root buffer and log it. */ - memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); - root_blk->bp->b_ops = bp->b_ops; - xfs_trans_buf_copy_type(root_blk->bp, bp); - if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { - struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); - } + xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize); xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); + /* + * Now we can drop the child buffer. + */ error = xfs_da_shrink_inode(args, child, bp); return error; } @@ -2316,14 +2308,8 @@ xfs_da3_swap_lastblock( return error; /* * Copy the last block into the dead buffer and log it. - * On CRC-enabled file systems, also update the stamped in blkno. 
*/ - memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); - if (xfs_has_crc(mp)) { - struct xfs_da3_blkinfo *da3 = dead_buf->b_addr; - - da3->blkno = cpu_to_be64(xfs_buf_daddr(dead_buf)); - } + xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize); xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); dead_info = dead_buf->b_addr; diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ffa3df5b2893..706baf36e175 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, struct xfs_buf *dead_buf); +void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src, + size_t size); uint xfs_da_hashname(const uint8_t *name_string, int name_length); enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, From c12c50393c1f6f7d7e45c7f55da9c013c0cc0522 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 21:07:18 +0100 Subject: [PATCH 041/123] xfs: use static_assert to check struct sizes and offsets Use the compiler-provided static_assert built-in from C11 instead of the kernel-specific BUILD_BUG_ON_MSG for the structure size and offset checks in xfs_ondisk. This not only gives slightly nicer error messages in case things go south, but can also be trivially used as-is in userspace. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino Signed-off-by: Chandan Babu R --- fs/xfs/xfs_ondisk.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 21a7e350b4c5..d9c988c5ad69 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -7,16 +7,16 @@ #define __XFS_ONDISK_H #define XFS_CHECK_STRUCT_SIZE(structname, size) \ - BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ - #structname ") is wrong, expected " #size) + static_assert(sizeof(structname) == (size), \ + "XFS: sizeof(" #structname ") is wrong, expected " #size) #define XFS_CHECK_OFFSET(structname, member, off) \ - BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \ + static_assert(offsetof(structname, member) == (off), \ "XFS: offsetof(" #structname ", " #member ") is wrong, " \ "expected " #off) #define XFS_CHECK_VALUE(value, expected) \ - BUILD_BUG_ON_MSG((value) != (expected), \ + static_assert((value) == (expected), \ "XFS: value of " #value " is wrong, expected " #expected) static inline void __init From 18793e050504288345eb455a471677b57117bcc6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 Dec 2023 21:07:19 +0100 Subject: [PATCH 042/123] xfs: move xfs_ondisk.h to libxfs/ Move xfs_ondisk.h to libxfs so that we can do the struct sanity checks in userspace libxfs as well. This should allow us to retire the somewhat fragile xfs/122 test on xfstests. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Reviewed-by: Carlos Maiolino Signed-off-by: Chandan Babu R --- fs/xfs/{ => libxfs}/xfs_ondisk.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename fs/xfs/{ => libxfs}/xfs_ondisk.h (100%) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h similarity index 100% rename from fs/xfs/xfs_ondisk.h rename to fs/xfs/libxfs/xfs_ondisk.h From 578bd4ce7100ae34f98c6b0147fe75cfa0dadbac Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Mon, 11 Dec 2023 10:41:51 -0800 Subject: [PATCH 043/123] xfs: recompute growfsrtfree transaction reservation while growing rt volume While playing with growfs to create a 20TB realtime section on a filesystem that didn't previously have an rt section, I noticed that growfs would occasionally shut down the log due to a transaction reservation overflow. xfs_calc_growrtfree_reservation uses the current size of the realtime summary file (m_rsumsize) to compute the transaction reservation for a growrtfree transaction. The reservations are computed at mount time, which means that m_rsumsize is zero when growfs starts "freeing" the new realtime extents into the rt volume. As a result, the transaction is undersized and fails. Fix this by recomputing the transaction reservations every time we change m_rsumsize. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_rtalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 8feb58c6241c..0c9893b9f2a9 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1038,6 +1038,9 @@ xfs_growfs_rt( nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nsbp->sb_rbmblocks); nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks); + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(nmp, &nmp->m_resv); + /* * Start a transaction, get the log reservation. */ @@ -1124,6 +1127,8 @@ xfs_growfs_rt( */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; + /* recompute growfsrt reservation from new rsumsize */ + xfs_trans_resv_calc(mp, &mp->m_resv); error = xfs_trans_commit(tp); if (error) From c00eebd09e95757c9c1d08f0a6bbc32c543daf90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:29 +0100 Subject: [PATCH 044/123] xfs: consolidate the xfs_attr_defer_* helpers Consolidate the xfs_attr_defer_* helpers into a single xfs_attr_defer_add one that picks the right dela_state based on the passed in operation. Also move to a single trace point as the actual operation is visible through the flags in the delta_state passed to the trace point. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 90 ++++++++++------------------------------ fs/xfs/xfs_trace.h | 2 - 2 files changed, 21 insertions(+), 71 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e28d93d232de..4fed0c87a968 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -880,11 +880,10 @@ xfs_attr_lookup( return error; } -static int -xfs_attr_intent_init( +static void +xfs_attr_defer_add( struct xfs_da_args *args, - unsigned int op_flags, /* op flag (set or remove) */ - struct xfs_attr_intent **attr) /* new xfs_attr_intent */ + unsigned int op_flags) { struct xfs_attr_intent *new; @@ -893,66 +892,22 @@ xfs_attr_intent_init( new->xattri_op_flags = op_flags; new->xattri_da_args = args; - *attr = new; - return 0; -} + switch (op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + new->xattri_dela_state = xfs_attr_init_add_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REPLACE: + new->xattri_dela_state = xfs_attr_init_replace_state(args); + break; + case XFS_ATTRI_OP_FLAGS_REMOVE: + new->xattri_dela_state = xfs_attr_init_remove_state(args); + break; + default: + ASSERT(0); + } -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_add( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_add_state(args); xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Sets an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_replace( - struct xfs_da_args *args) -{ - struct xfs_attr_intent *new; - int error = 0; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_replace_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp); - - return 0; -} - -/* Removes an attribute for an inode as a deferred operation */ -static int -xfs_attr_defer_remove( - struct xfs_da_args *args) -{ - - struct xfs_attr_intent *new; - int error; - - error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new); - if (error) - return error; - - new->xattri_dela_state = xfs_attr_init_remove_state(args); - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); - trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp); - - return 0; } /* @@ -1038,16 +993,16 @@ xfs_attr_set( error = xfs_attr_lookup(args); switch (error) { case -EEXIST: - /* if no value, we are performing a remove operation */ if (!args->value) { - error = xfs_attr_defer_remove(args); + /* if no value, we are performing a remove operation */ + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE); break; } + /* Pure create fails if the attr already exists */ if (args->attr_flags & XATTR_CREATE) goto out_trans_cancel; - - error = xfs_attr_defer_replace(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE); break; case -ENOATTR: /* Can't remove what isn't there. */ @@ -1057,14 +1012,11 @@ xfs_attr_set( /* Pure replace fails if no existing attr to replace. 
*/ if (args->attr_flags & XATTR_REPLACE) goto out_trans_cancel; - - error = xfs_attr_defer_add(args); + xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET); break; default: goto out_trans_cancel; } - if (error) - goto out_trans_cancel; /* * If this is a synchronous mount, make sure that the diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 514095b6ba2b..516529c151ae 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -4408,8 +4408,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc); DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace); -DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove); TRACE_EVENT(xfs_force_shutdown, From 2e8f7b6f4a15ea92cb2186ad300ae4191d0edcef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:30 +0100 Subject: [PATCH 045/123] xfs: move xfs_attr_defer_type up in xfs_attr_item.c We'll reference it directly in xlog_recover_attri_commit_pass2, so move it up a bit. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_attr_item.c | 66 +++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 39f2c5a46179..4e0eaa2640e0 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -654,6 +654,39 @@ xfs_attr_relog_intent( return &new_attrip->attri_item; } +/* Get an ATTRD so we can process all the attrs. */ +static struct xfs_log_item * +xfs_attr_create_done( + struct xfs_trans *tp, + struct xfs_log_item *intent, + unsigned int count) +{ + struct xfs_attri_log_item *attrip; + struct xfs_attrd_log_item *attrdp; + + attrip = ATTRI_ITEM(intent); + + attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); + + xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, + &xfs_attrd_item_ops); + attrdp->attrd_attrip = attrip; + attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; + + return &attrdp->attrd_item; +} + +const struct xfs_defer_op_type xfs_attr_defer_type = { + .max_items = 1, + .create_intent = xfs_attr_create_intent, + .abort_intent = xfs_attr_abort_intent, + .create_done = xfs_attr_create_done, + .finish_item = xfs_attr_finish_item, + .cancel_item = xfs_attr_cancel_item, + .recover_work = xfs_attr_recover_work, + .relog_intent = xfs_attr_relog_intent, +}; + STATIC int xlog_recover_attri_commit_pass2( struct xlog *log, @@ -730,39 +763,6 @@ xlog_recover_attri_commit_pass2( return 0; } -/* Get an ATTRD so we can process all the attrs. 
*/ -static struct xfs_log_item * -xfs_attr_create_done( - struct xfs_trans *tp, - struct xfs_log_item *intent, - unsigned int count) -{ - struct xfs_attri_log_item *attrip; - struct xfs_attrd_log_item *attrdp; - - attrip = ATTRI_ITEM(intent); - - attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL); - - xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD, - &xfs_attrd_item_ops); - attrdp->attrd_attrip = attrip; - attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id; - - return &attrdp->attrd_item; -} - -const struct xfs_defer_op_type xfs_attr_defer_type = { - .max_items = 1, - .create_intent = xfs_attr_create_intent, - .abort_intent = xfs_attr_abort_intent, - .create_done = xfs_attr_create_done, - .finish_item = xfs_attr_finish_item, - .cancel_item = xfs_attr_cancel_item, - .recover_work = xfs_attr_recover_work, - .relog_intent = xfs_attr_relog_intent, -}; - /* * This routine is called when an ATTRD format structure is found in a committed * transaction in the log. Its purpose is to cancel the corresponding ATTRI if From 7f2f7531e0d455f1abb9f48fbbe17c37e8742590 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:31 +0100 Subject: [PATCH 046/123] xfs: store an ops pointer in struct xfs_defer_pending The dfp_type field in struct xfs_defer_pending is only used to either look up the operations associated with the pending word or in trace points. Replace it with a direct pointer to the operations vector, and store a pretty name in the vector for tracing. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 43 +++++++++++++++----------------------- fs/xfs/libxfs/xfs_defer.h | 5 +++-- fs/xfs/xfs_attr_item.c | 1 + fs/xfs/xfs_bmap_item.c | 1 + fs/xfs/xfs_extfree_item.c | 2 ++ fs/xfs/xfs_refcount_item.c | 1 + fs/xfs/xfs_rmap_item.c | 1 + fs/xfs/xfs_trace.h | 16 +++++++------- 8 files changed, 34 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index ecc2f7ec6991..e70881ae5cc5 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -251,7 +251,6 @@ xfs_defer_create_done( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; /* If there is no log intent item, there can be no log done item. */ @@ -266,7 +265,7 @@ xfs_defer_create_done( * 2.) 
shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - lip = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); + lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); if (!lip) return; @@ -287,13 +286,13 @@ xfs_defer_create_intent( struct xfs_defer_pending *dfp, bool sort) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct xfs_log_item *lip; if (dfp->dfp_intent) return 1; - lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort); + lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, + sort); if (!lip) return 0; if (IS_ERR(lip)) @@ -338,12 +337,10 @@ xfs_defer_pending_abort( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(mp, dfp); if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); + dfp->dfp_ops->abort_intent(dfp->dfp_intent); dfp->dfp_intent = NULL; } } @@ -353,7 +350,6 @@ xfs_defer_pending_cancel_work( struct xfs_mount *mp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; struct list_head *pwi; struct list_head *n; @@ -364,7 +360,7 @@ xfs_defer_pending_cancel_work( list_del(pwi); dfp->dfp_count--; trace_xfs_defer_cancel_item(mp, dfp, pwi); - ops->cancel_item(pwi); + dfp->dfp_ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); kmem_cache_free(xfs_defer_pending_cache, dfp); @@ -522,11 +518,10 @@ xfs_defer_relog_intent( struct xfs_defer_pending *dfp) { struct xfs_log_item *lip; - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; xfs_defer_create_done(tp, dfp); - lip = ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); + lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done); if (lip) { xfs_trans_add_item(tp, lip); set_bit(XFS_LI_DIRTY, &lip->li_flags); @@ -593,7 +588,7 @@ xfs_defer_finish_one( struct xfs_trans *tp, struct xfs_defer_pending *dfp) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + const struct xfs_defer_op_type *ops = dfp->dfp_ops; struct xfs_btree_cur *state = NULL; struct list_head *li, *n; int error; @@ -790,7 +785,6 @@ xfs_defer_cancel( static inline struct xfs_defer_pending * xfs_defer_find_last( struct xfs_trans *tp, - enum xfs_defer_ops_type type, const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; @@ -803,7 +797,7 @@ xfs_defer_find_last( dfp_list); /* Wrong type? 
*/ - if (dfp->dfp_type != type) + if (dfp->dfp_ops != ops) return NULL; return dfp; } @@ -836,13 +830,13 @@ xfs_defer_can_append( static inline struct xfs_defer_pending * xfs_defer_alloc( struct xfs_trans *tp, - enum xfs_defer_ops_type type) + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = type; + dfp->dfp_ops = ops; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, &tp->t_dfops); @@ -862,9 +856,9 @@ xfs_defer_add( ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); - dfp = xfs_defer_find_last(tp, type, ops); + dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) - dfp = xfs_defer_alloc(tp, type); + dfp = xfs_defer_alloc(tp, ops); xfs_defer_add_item(dfp, li); trace_xfs_defer_add_item(tp->t_mountp, dfp, li); @@ -880,17 +874,15 @@ xfs_defer_add_barrier( struct xfs_trans *tp) { struct xfs_defer_pending *dfp; - const enum xfs_defer_ops_type type = XFS_DEFER_OPS_TYPE_BARRIER; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); /* If the last defer op added was a barrier, we're done. */ - dfp = xfs_defer_find_last(tp, type, ops); + dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type); if (dfp) return; - xfs_defer_alloc(tp, type); + xfs_defer_alloc(tp, &xfs_barrier_defer_type); trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL); } @@ -909,7 +901,7 @@ xfs_defer_start_recovery( dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_type = dfp_type; + dfp->dfp_ops = defer_op_types[dfp_type]; dfp->dfp_intent = lip; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, r_dfops); @@ -935,13 +927,12 @@ xfs_defer_finish_recovery( struct xfs_defer_pending *dfp, struct list_head *capture_list) { - const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; int error; - error = ops->recover_work(dfp, capture_list); + error = dfp->dfp_ops->recover_work(dfp, capture_list); if (error) trace_xlog_intent_recovery_failed(mp, error, - ops->recover_work); + dfp->dfp_ops->recover_work); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 5b1990ef3e5d..957a06278e88 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -34,9 +34,9 @@ struct xfs_defer_pending { struct list_head dfp_work; /* work items */ struct xfs_log_item *dfp_intent; /* log intent item */ struct xfs_log_item *dfp_done; /* log done item */ + const struct xfs_defer_op_type *dfp_ops; unsigned int dfp_count; /* # extent items */ unsigned int dfp_flags; - enum xfs_defer_ops_type dfp_type; }; /* @@ -61,6 +61,8 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp); /* Description of a deferred type. 
*/ struct xfs_defer_op_type { + const char *name; + unsigned int max_items; struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, struct list_head *items, unsigned int count, bool sort); void (*abort_intent)(struct xfs_log_item *intent); @@ -76,7 +78,6 @@ struct xfs_defer_op_type { struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp, struct xfs_log_item *intent, struct xfs_log_item *done_item); - unsigned int max_items; }; extern const struct xfs_defer_op_type xfs_bmap_update_defer_type; diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 4e0eaa2640e0..beae2de82450 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -677,6 +677,7 @@ xfs_attr_create_done( } const struct xfs_defer_op_type xfs_attr_defer_type = { + .name = "attr", .max_items = 1, .create_intent = xfs_attr_create_intent, .abort_intent = xfs_attr_abort_intent, diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index bc48d733634a..f43abf0b6486 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -563,6 +563,7 @@ xfs_bmap_relog_intent( } const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .name = "bmap", .max_items = XFS_BUI_MAX_FAST_EXTENTS, .create_intent = xfs_bmap_update_create_intent, .abort_intent = xfs_bmap_update_abort_intent, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3e3469504271..e67907a379c8 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -670,6 +670,7 @@ xfs_extent_free_relog_intent( } const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .name = "extent_free", .max_items = XFS_EFI_MAX_FAST_EXTENTS, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, @@ -682,6 +683,7 @@ const struct xfs_defer_op_type xfs_extent_free_defer_type = { /* sub-type with special handling for AGFL deferred frees */ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .name = "agfl_free", .max_items = XFS_EFI_MAX_FAST_EXTENTS, .create_intent = xfs_extent_free_create_intent, .abort_intent = xfs_extent_free_abort_intent, diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 9974be81cb2b..b08839550f34 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -523,6 +523,7 @@ xfs_refcount_relog_intent( } const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .name = "refcount", .max_items = XFS_CUI_MAX_FAST_EXTENTS, .create_intent = xfs_refcount_update_create_intent, .abort_intent = xfs_refcount_update_abort_intent, diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 488c4a2a80a3..65b432eb5d02 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -576,6 +576,7 @@ xfs_rmap_relog_intent( } const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .name = "rmap", .max_items = XFS_RUI_MAX_FAST_EXTENTS, .create_intent = xfs_rmap_update_create_intent, .abort_intent = xfs_rmap_update_abort_intent, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 516529c151ae..0efcdb79d10e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2549,7 +2549,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, TP_ARGS(mp, dfp), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(unsigned int, flags) __field(char, committed) @@ -2557,15 +2557,15 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->flags = dfp->dfp_flags; __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p flags %s committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), __entry->committed, @@ -2694,7 +2694,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, TP_ARGS(mp, dfp, item), TP_STRUCT__entry( __field(dev_t, dev) - __field(int, type) + __string(name, dfp->dfp_ops->name) __field(void *, intent) __field(void *, item) __field(char, committed) @@ -2703,16 +2703,16 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class, ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; - __entry->type = dfp->dfp_type; + __assign_str(name, dfp->dfp_ops->name); __entry->intent = dfp->dfp_intent; __entry->item = item; __entry->committed = dfp->dfp_done != NULL; __entry->flags = dfp->dfp_flags; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p item %p flags %s committed %d nr %d", + TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->type, + __get_str(name), __entry->intent, __entry->item, __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS), From dc22af64368291a86fb6b7eb2adab21c815836b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Dec 2023 06:16:32 +0100 Subject: [PATCH 047/123] xfs: pass the defer ops instead of type to xfs_defer_start_recovery xfs_defer_start_recovery is only called from xlog_recover_intent_item, and the callers of that all have the actual xfs_defer_ops_type operation vector at hand. Pass that directly instead of looking it up from the defer_op_types table. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_defer.c | 6 +++--- fs/xfs/libxfs/xfs_defer.h | 2 +- fs/xfs/libxfs/xfs_log_recover.h | 3 ++- fs/xfs/xfs_attr_item.c | 2 +- fs/xfs/xfs_bmap_item.c | 2 +- fs/xfs/xfs_extfree_item.c | 2 +- fs/xfs/xfs_log_recover.c | 4 ++-- fs/xfs/xfs_refcount_item.c | 2 +- fs/xfs/xfs_rmap_item.c | 2 +- 9 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index e70881ae5cc5..dd964bf825eb 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -894,14 +894,14 @@ xfs_defer_add_barrier( void xfs_defer_start_recovery( struct xfs_log_item *lip, - enum xfs_defer_ops_type dfp_type, - struct list_head *r_dfops) + struct list_head *r_dfops, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp; dfp = kmem_cache_zalloc(xfs_defer_pending_cache, GFP_NOFS | __GFP_NOFAIL); - dfp->dfp_ops = defer_op_types[dfp_type]; + dfp->dfp_ops = ops; dfp->dfp_intent = lip; INIT_LIST_HEAD(&dfp->dfp_work); list_add_tail(&dfp->dfp_list, r_dfops); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 957a06278e88..60de91b66392 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -147,7 +147,7 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp, void xfs_defer_resources_rele(struct xfs_defer_resources *dres); void xfs_defer_start_recovery(struct xfs_log_item *lip, - enum xfs_defer_ops_type dfp_type, struct list_head *r_dfops); + struct list_head *r_dfops, const struct xfs_defer_op_type *ops); void xfs_defer_cancel_recovery(struct xfs_mount *mp, struct xfs_defer_pending *dfp); int xfs_defer_finish_recovery(struct xfs_mount *mp, diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index c8e5d912895b..9fe7a9564bca 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -11,6 +11,7 @@ * define how recovery should work for that type of log item. */ struct xlog_recover_item; +struct xfs_defer_op_type; /* Sorting hat for log items as they're read in. 
*/ enum xlog_recover_reorder { @@ -156,7 +157,7 @@ xlog_recover_resv(const struct xfs_trans_res *r) struct xfs_defer_pending; void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip, - xfs_lsn_t lsn, unsigned int dfp_type); + xfs_lsn_t lsn, const struct xfs_defer_op_type *ops); int xlog_recover_finish_intent(struct xfs_trans *tp, struct xfs_defer_pending *dfp); diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index beae2de82450..9e02111bd890 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -759,7 +759,7 @@ xlog_recover_attri_commit_pass2( memcpy(&attrip->attri_format, attri_formatp, len); xlog_recover_intent_item(log, &attrip->attri_item, lsn, - XFS_DEFER_OPS_TYPE_ATTR); + &xfs_attr_defer_type); xfs_attri_log_nameval_put(nv); return 0; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index f43abf0b6486..52fb8a148b7d 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -650,7 +650,7 @@ xlog_recover_bui_commit_pass2( atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); xlog_recover_intent_item(log, &buip->bui_item, lsn, - XFS_DEFER_OPS_TYPE_BMAP); + &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index e67907a379c8..1d1185fca6a5 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -747,7 +747,7 @@ xlog_recover_efi_commit_pass2( atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); xlog_recover_intent_item(log, &efip->efi_item, lsn, - XFS_DEFER_OPS_TYPE_FREE); + &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c18692af2c65..1251c81e55f9 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1942,11 +1942,11 @@ xlog_recover_intent_item( struct xlog *log, struct xfs_log_item *lip, xfs_lsn_t lsn, - unsigned int dfp_type) + const struct xfs_defer_op_type *ops) { ASSERT(xlog_item_is_intent(lip)); - xfs_defer_start_recovery(lip, dfp_type, &log->r_dfops); + xfs_defer_start_recovery(lip, &log->r_dfops, ops); /* * Insert the intent into the AIL directly and drop one reference so diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index b08839550f34..20ad8086da60 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -605,7 +605,7 @@ xlog_recover_cui_commit_pass2( atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); xlog_recover_intent_item(log, &cuip->cui_item, lsn, - XFS_DEFER_OPS_TYPE_REFCOUNT); + &xfs_refcount_update_defer_type); return 0; } diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 65b432eb5d02..79ad0087aeca 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -658,7 +658,7 @@ xlog_recover_rui_commit_pass2( atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); xlog_recover_intent_item(log, &ruip->rui_item, lsn, - XFS_DEFER_OPS_TYPE_RMAP); + &xfs_rmap_update_defer_type); return 0; } From 603ce8ab12094a2d9483c79a7541335e258a5328 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 Dec 2023 10:06:33 +0100 Subject: [PATCH 048/123] xfs: pass the defer ops directly to xfs_defer_add Pass a pointer to the xfs_defer_op_type structure to xfs_defer_add and remove the indirection through the xfs_defer_ops_type enum and a global table of all possible operations. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_alloc.c | 4 ++-- fs/xfs/libxfs/xfs_attr.c | 2 +- fs/xfs/libxfs/xfs_bmap.c | 2 +- fs/xfs/libxfs/xfs_defer.c | 16 ++-------------- fs/xfs/libxfs/xfs_defer.h | 18 ++---------------- fs/xfs/libxfs/xfs_refcount.c | 2 +- fs/xfs/libxfs/xfs_rmap.c | 2 +- 7 files changed, 10 insertions(+), 36 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4940f9377f21..60c2c18e8e54 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2514,7 +2514,7 @@ xfs_defer_agfl_block( trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_extent_free_get_group(mp, xefi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); + xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); return 0; } @@ -2578,7 +2578,7 @@ xfs_defer_extent_free( XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); xfs_extent_free_get_group(mp, xefi); - *dfpp = xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); + *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); return 0; } diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 4fed0c87a968..fa49c795f407 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -906,7 +906,7 @@ xfs_attr_defer_add( ASSERT(0); } - xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list); + xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type); trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ca6614f4eac5..e308d2f44a3c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6091,7 +6091,7 @@ __xfs_bmap_add( bi->bi_bmap = *bmap; xfs_bmap_update_get_group(tp->t_mountp, bi); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list); + xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type); return 0; } diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index dd964bf825eb..ca7f0ac04896 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -235,16 +235,6 @@ static const struct xfs_defer_op_type xfs_barrier_defer_type = { .cancel_item = xfs_defer_barrier_cancel_item, }; -static const struct xfs_defer_op_type *defer_op_types[] = { - [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type, - [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type, - [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type, - [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, - [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type, - [XFS_DEFER_OPS_TYPE_BARRIER] = &xfs_barrier_defer_type, -}; - /* Create a log intent done item for a log intent item. 
*/ static inline void xfs_defer_create_done( @@ -847,14 +837,12 @@ xfs_defer_alloc( struct xfs_defer_pending * xfs_defer_add( struct xfs_trans *tp, - enum xfs_defer_ops_type type, - struct list_head *li) + struct list_head *li, + const struct xfs_defer_op_type *ops) { struct xfs_defer_pending *dfp = NULL; - const struct xfs_defer_op_type *ops = defer_op_types[type]; ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX); dfp = xfs_defer_find_last(tp, ops); if (!dfp || !xfs_defer_can_append(dfp, ops)) diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 60de91b66392..18a9fb92dde8 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -10,20 +10,6 @@ struct xfs_btree_cur; struct xfs_defer_op_type; struct xfs_defer_capture; -/* - * Header for deferred operation list. - */ -enum xfs_defer_ops_type { - XFS_DEFER_OPS_TYPE_BMAP, - XFS_DEFER_OPS_TYPE_REFCOUNT, - XFS_DEFER_OPS_TYPE_RMAP, - XFS_DEFER_OPS_TYPE_FREE, - XFS_DEFER_OPS_TYPE_AGFL_FREE, - XFS_DEFER_OPS_TYPE_ATTR, - XFS_DEFER_OPS_TYPE_BARRIER, - XFS_DEFER_OPS_TYPE_MAX, -}; - /* * Save a log intent item and a list of extents, so that we can replay * whatever action had to happen to the extent list and file the log done @@ -51,8 +37,8 @@ struct xfs_defer_pending { void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp); -struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, - enum xfs_defer_ops_type type, struct list_head *h); +struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h, + const struct xfs_defer_op_type *ops); int xfs_defer_finish_noroll(struct xfs_trans **tp); int xfs_defer_finish(struct xfs_trans **tp); int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3702b4a07110..5b039cd022e0 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1458,7 +1458,7 @@ __xfs_refcount_add( ri->ri_blockcount = blockcount; xfs_refcount_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type); } /* diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fbb0b2637463..76bf7f48cb5a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2567,7 +2567,7 @@ __xfs_rmap_add( ri->ri_bmap = *bmap; xfs_rmap_update_get_group(tp->t_mountp, ri); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list); + xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type); } /* Map an extent into a file. */ From 84712492e6dab803bf595fb8494d11098b74a652 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 14 Dec 2023 13:28:08 -0600 Subject: [PATCH 049/123] xfs: short circuit xfs_growfs_data_private() if delta is zero Although xfs_growfs_data() doesn't call xfs_growfs_data_private() if in->newblocks == mp->m_sb.sb_dblocks, xfs_growfs_data_private() further massages the new block count so that we don't, for example, try to create a too-small new AG. This may lead to a delta of "0" in xfs_growfs_data_private(), so we end up in the shrink case and emit the EXPERIMENTAL warning even if we're not changing anything at all. Fix this by returning straightaway if the block delta is zero.
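To make the failure mode concrete, here is a simplified sketch of the sequence (illustrative only; the helper names are hypothetical and do not appear in xfs_growfs_data_private()):

	/* Hypothetical sketch, not the kernel code: */
	if (new_last_ag_would_be_runt(mp, nb))
		nb = rounddown_to_full_ags(mp, nb);	/* can land back on the old size */
	delta = nb - mp->m_sb.sb_dblocks;		/* now zero */
	if (delta == 0)
		return 0;				/* the fix: neither grow nor shrink */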
(nb: in older kernels, the result of entering the shrink case with delta == 0 may actually let an -ENOSPC escape to userspace, which is confusing for users.) Fixes: fb2fc1720185 ("xfs: support shrinking unused space in the last AG") Signed-off-by: Eric Sandeen Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_fsops.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 4f5da19142f2..5e7255e6ad3e 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -134,6 +134,10 @@ xfs_growfs_data_private( if (delta < 0 && nagcount < 2) return -EINVAL; + /* No work to do */ + if (delta == 0) + return 0; + oagcount = mp->m_sb.sb_agcount; /* allocate the new per-ag structures */ if (nagcount > oagcount) { From c0e37f07d2bd3c1ee3fb5a650da7d8673557ed16 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Dec 2023 13:38:45 -0800 Subject: [PATCH 050/123] xfs: fix an off-by-one error in xreap_agextent_binval Overall, this function tries to find and invalidate all buffers for a given extent of space on the data device. The inner for loop in this function tries to find all xfs_bufs for a given daddr. The lengths of all possible cached buffers range from 1 fsblock to the largest needed to contain a 64k xattr value (~17fsb). The scan is capped to avoid looking at any buffer extending past the given extent. Unfortunately, the loop continuation test is wrong -- max_fsbs is the largest size we want to scan, not one past that. Put another way, this loop is actually 1-indexed, not 0-indexed. Therefore, the continuation test should use <=, not <. As a result, online repairs of btree blocks fail to stale any buffers for btrees that are being torn down, which causes later assertions in the buffer cache when another thread creates a different-sized buffer. This happens in xfs/709 when allocating an inode cluster buffer: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3346128 at fs/xfs/xfs_message.c:104 assfail+0x3a/0x40 [xfs] CPU: 0 PID: 3346128 Comm: fsstress Not tainted 6.7.0-rc4-djwx #rc4 RIP: 0010:assfail+0x3a/0x40 [xfs] Call Trace: _xfs_buf_obj_cmp+0x4a/0x50 xfs_buf_get_map+0x191/0xba0 xfs_trans_get_buf_map+0x136/0x280 xfs_ialloc_inode_init+0x186/0x340 xfs_ialloc_ag_alloc+0x254/0x720 xfs_dialloc+0x21f/0x870 xfs_create_tmpfile+0x1a9/0x2f0 xfs_rename+0x369/0xfd0 xfs_vn_rename+0xfa/0x170 vfs_rename+0x5fb/0xc30 do_renameat2+0x52d/0x6e0 __x64_sys_renameat2+0x4b/0x60 do_syscall_64+0x3b/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e A later refactoring patch in the online repair series fixed this by accident, which is why I didn't notice this until I started testing only the patches that are likely to end up in 6.8. Fixes: 1c7ce115e521 ("xfs: reap large AG metadata extents when possible") Signed-off-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/scrub/reap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index ee26fcb500b7..300f49e8e14a 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -248,7 +248,7 @@ xreap_agextent_binval( max_fsbs = min_t(xfs_agblock_t, agbno_next - bno, xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX)); - for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) { + for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) { struct xfs_buf *bp = NULL; xfs_daddr_t daddr; int error; From 0573676fdde7ce3829ee6a42a8e5a56355234712 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 15 Dec 2023 08:40:35 +1100 Subject: [PATCH 051/123] xfs: initialise di_crc in xfs_log_dinode Alexander Potapenko reported that KMSAN was issuing these warnings: kmalloc-ed xlog buffer of size 512 : ffff88802fc26200 kmalloc-ed xlog buffer of size 368 : ffff88802fc24a00 kmalloc-ed xlog buffer of size 648 : ffff88802b631000 kmalloc-ed xlog buffer of size 648 : ffff88802b632800 kmalloc-ed xlog buffer of size 648 : ffff88802b631c00 xlog_write_iovec: copying 12 bytes from ffff888017ddbbd8 to ffff88802c300400 xlog_write_iovec: copying 28 bytes from ffff888017ddbbe4 to ffff88802c30040c xlog_write_iovec: copying 68 bytes from ffff88802fc26274 to ffff88802c300428 xlog_write_iovec: copying 188 bytes from ffff88802fc262bc to ffff88802c30046c ===================================================== BUG: KMSAN: uninit-value in xlog_write_iovec fs/xfs/xfs_log.c:2227 BUG: KMSAN: uninit-value in xlog_write_full fs/xfs/xfs_log.c:2263 BUG: KMSAN: uninit-value in xlog_write+0x1fac/0x2600 fs/xfs/xfs_log.c:2532 xlog_write_iovec fs/xfs/xfs_log.c:2227 xlog_write_full fs/xfs/xfs_log.c:2263 xlog_write+0x1fac/0x2600 fs/xfs/xfs_log.c:2532 xlog_cil_write_chain fs/xfs/xfs_log_cil.c:918 xlog_cil_push_work+0x30f2/0x44e0 fs/xfs/xfs_log_cil.c:1263 process_one_work kernel/workqueue.c:2630 process_scheduled_works+0x1188/0x1e30 kernel/workqueue.c:2703 worker_thread+0xee5/0x14f0 kernel/workqueue.c:2784 kthread+0x391/0x500 kernel/kthread.c:388 ret_from_fork+0x66/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:242 Uninit was created at: slab_post_alloc_hook+0x101/0xac0 mm/slab.h:768 slab_alloc_node mm/slub.c:3482 __kmem_cache_alloc_node+0x612/0xae0 mm/slub.c:3521 __do_kmalloc_node mm/slab_common.c:1006 __kmalloc+0x11a/0x410 mm/slab_common.c:1020 kmalloc ./include/linux/slab.h:604 xlog_kvmalloc fs/xfs/xfs_log_priv.h:704 xlog_cil_alloc_shadow_bufs fs/xfs/xfs_log_cil.c:343 xlog_cil_commit+0x487/0x4dc0 fs/xfs/xfs_log_cil.c:1574 __xfs_trans_commit+0x8df/0x1930 fs/xfs/xfs_trans.c:1017 xfs_trans_commit+0x30/0x40 fs/xfs/xfs_trans.c:1061 xfs_create+0x15af/0x2150 fs/xfs/xfs_inode.c:1076 xfs_generic_create+0x4cd/0x1550 fs/xfs/xfs_iops.c:199 xfs_vn_create+0x4a/0x60 fs/xfs/xfs_iops.c:275 lookup_open fs/namei.c:3477 open_last_lookups fs/namei.c:3546 path_openat+0x29ac/0x6180 fs/namei.c:3776 do_filp_open+0x24d/0x680 fs/namei.c:3809 do_sys_openat2+0x1bc/0x330 fs/open.c:1440 do_sys_open fs/open.c:1455 __do_sys_openat fs/open.c:1471 __se_sys_openat fs/open.c:1466 __x64_sys_openat+0x253/0x330 fs/open.c:1466 do_syscall_x64 arch/x86/entry/common.c:51 do_syscall_64+0x4f/0x140 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b arch/x86/entry/entry_64.S:120 Bytes 112-115 of 188 are uninitialized Memory access of size 188 starts at ffff88802fc262bc This is caused by the struct xfs_log_dinode not having the di_crc 
field initialised. Log recovery never uses this field (it is only present these days for on-disk format compatibility reasons) and so its value is never checked, so nothing in XFS has caught this. Further, none of the uninitialised memory access warning tools have caught this (despite catching other uninit memory accesses in the struct xfs_log_dinode back in 2017!) until recently. Alexander annotated the XFS code to get the dump of the actual bytes that were detected as uninitialised, and from that report it took me about 30s to realise what the issue was. The issue was introduced back in 2016 and every inode that is logged fails to initialise this field. There is no actual bad behaviour caused by this issue - I find it hard to even classify it as a bug... Reported-and-tested-by: Alexander Potapenko Fixes: f8d55aa0523a ("xfs: introduce inode log format object") Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_inode_item.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index cd7803fda8b1..b35335e20342 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -557,6 +557,9 @@ xfs_inode_to_log_dinode( memset(to->di_pad2, 0, sizeof(to->di_pad2)); uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid); to->di_v3_pad = 0; + + /* dummy value for initialisation */ + to->di_crc = 0; } else { to->di_version = 2; to->di_flushiter = ip->i_flushiter; From 13ae04d8d45227c2ba51e188daf9fc13d08a1b12 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:27 -0800 Subject: [PATCH 052/123] xfs: force all buffers to be written during btree bulk load While stress-testing online repair of btrees, I noticed periodic assertion failures from the buffer cache about buffers with incorrect DELWRI_Q state. Looking further, I observed this race between the AIL trying to write out a btree block and repair zapping a btree block after the fact:

  AIL:                          Repair0:

  pin buffer X
  delwri_queue:
    set DELWRI_Q
    add to delwri list

                                stale buf X:
                                  clear DELWRI_Q
                                  does not clear b_list
                                free space X
                                commit

  delwri_submit   # oops

Worse yet, I discovered that running the same repair over and over in a tight loop can result in a second race that causes data integrity problems with the repair:

  AIL:                 Repair0:                  Repair1:

  pin buffer X
  delwri_queue:
    set DELWRI_Q
    add to delwri list

                       stale buf X:
                         clear DELWRI_Q
                         does not clear b_list
                       free space X
                       commit

                                                 find free space X
                                                 get buffer
                                                 rewrite buffer
                                                 delwri_queue:
                                                   set DELWRI_Q
                                                   already on a list, do not add
                                                 commit

                                                 BAD: committed tree root before
                                                      all blocks written

  delwri_submit   # too late now

I traced this to my own misunderstanding of how the delwri lists work, particularly with regards to the AIL's buffer list. If a buffer is logged and committed, the buffer can end up on that AIL buffer list. If btree repairs are run twice in rapid succession, it's possible that the first repair will invalidate the buffer and free it before the next time the AIL wakes up. Marking the buffer stale clears DELWRI_Q from the buffer state without removing the buffer from its delwri list. The buffer doesn't know which list it's on, so it cannot know which lock to take to protect the list for a removal. If the second repair allocates the same block, it will then recycle the buffer to start writing the new btree block. Meanwhile, if the AIL wakes up and walks the buffer list, it will ignore the buffer because it can't lock it, and go back to sleep. 
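At the code level, the window opens in the stale path, roughly like this (condensed from xfs_buf_stale(); comments paraphrased, not the verbatim source):

	bp->b_flags |= XBF_STALE;
	/* Stop delwri queue walkers from flushing this stale buffer... */
	bp->b_flags &= ~_XBF_DELWRI_Q;
	/*
	 * ...but bp->b_list is left alone: the buffer does not record
	 * which delwri list holds it, so there is no lock it could take
	 * here to do a safe list_del_init(&bp->b_list).
	 */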
When the second repair calls delwri_queue to put the buffer on the list of buffers to write before committing the new btree, it will set DELWRI_Q again, but since the buffer hasn't been removed from the AIL's buffer list, it won't add it to the bulkload buffer's list. This is incorrect, because the bulkload caller relies on delwri_submit to ensure that all the buffers have been sent to disk /before/ committing the new btree root pointer. This ordering requirement is required for data consistency. Worse, the AIL won't clear DELWRI_Q from the buffer when it does finally drop it, so the next thread to walk through the btree will trip over a debug assertion on that flag. To fix this, create a new function that waits for the buffer to be removed from any other delwri lists before adding the buffer to the caller's delwri list. By waiting for the buffer to clear both the delwri list and any potential delwri wait list, we can be sure that repair will initiate writes of all buffers and report all write errors back to userspace instead of committing the new structure. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 4 +-- fs/xfs/xfs_buf.c | 44 ++++++++++++++++++++++++++++--- fs/xfs/xfs_buf.h | 1 + 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index dd75e208b543..29e3f8ccb185 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -342,9 +342,7 @@ xfs_btree_bload_drop_buf( if (*bpp == NULL) return; - if (!xfs_buf_delwri_queue(*bpp, buffers_list)) - ASSERT(0); - + xfs_buf_delwri_queue_here(*bpp, buffers_list); xfs_buf_relse(*bpp); *bpp = NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 545c7991b9b5..ec4bd7a24d88 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -2049,6 +2049,14 @@ xfs_alloc_buftarg( return NULL; } +static inline void +xfs_buf_list_del( + struct xfs_buf *bp) +{ + list_del_init(&bp->b_list); + wake_up_var(&bp->b_list); +} + /* * Cancel a delayed write list. * @@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel( xfs_buf_lock(bp); bp->b_flags &= ~_XBF_DELWRI_Q; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); } } @@ -2119,6 +2127,34 @@ xfs_buf_delwri_queue( return true; } +/* + * Queue a buffer to this delwri list as part of a data integrity operation. + * If the buffer is on any other delwri list, we'll wait for that to clear + * so that the caller can submit the buffer for IO and wait for the result. + * Callers must ensure the buffer is not already on the list. + */ +void +xfs_buf_delwri_queue_here( + struct xfs_buf *bp, + struct list_head *buffer_list) +{ + /* + * We need this buffer to end up on the /caller's/ delwri list, not any + * old list. This can happen if the buffer is marked stale (which + * clears DELWRI_Q) after the AIL queues the buffer to its list but + * before the AIL has a chance to submit the list. + */ + while (!list_empty(&bp->b_list)) { + xfs_buf_unlock(bp); + wait_var_event(&bp->b_list, list_empty(&bp->b_list)); + xfs_buf_lock(bp); + } + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + + xfs_buf_delwri_queue(bp, buffer_list); +} + /* * Compare function is more complex than it needs to be because * the return value is only 32 bits and we are doing comparisons @@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers( * reference and remove it from the list here. 
*/ if (!(bp->b_flags & _XBF_DELWRI_Q)) { - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); xfs_buf_relse(bp); continue; } @@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers( list_move_tail(&bp->b_list, wait_list); } else { bp->b_flags |= XBF_ASYNC; - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); } __xfs_buf_submit(bp, false); } @@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit( while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); + xfs_buf_list_del(bp); /* * Wait on the locked buffer, check for errors and unlock and diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c86e16419656..b470de08a46c 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); From c1e0f8e6fb060b23b6f1b82eb4265983f7d271f8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:27 -0800 Subject: [PATCH 053/123] xfs: set XBF_DONE on newly formatted btree blocks that are ready for writing The btree bulkloading code calls xfs_buf_delwri_queue_here when it has finished formatting a new btree block and wants to queue it to be written to disk. Once the new btree root has been committed, the blocks (and hence the buffers) will be accessible to the rest of the filesystem. Mark each new buffer as DONE when adding it to the delwri list so that the next btree traversal can skip reloading the contents from disk. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 29e3f8ccb185..1c5f9ed70c3e 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -342,6 +342,12 @@ xfs_btree_bload_drop_buf( if (*bpp == NULL) return; + /* + * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent + * xfs_buf_read will not pointlessly reread the contents from the disk. + */ + (*bpp)->b_flags |= XBF_DONE; + xfs_buf_delwri_queue_here(*bpp, buffers_list); xfs_buf_relse(*bpp); *bpp = NULL; From 26de64629d8b439a03bce243f14a46f7440729f3 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:28 -0800 Subject: [PATCH 054/123] xfs: read leaf blocks when computing keys for bulkloading into node blocks When constructing a new btree, xfs_btree_bload_node needs to read the btree blocks for level N to compute the keyptrs for the blocks that will be loaded into level N+1. The level N blocks must be formatted at that point. A subsequent patch will change the btree bulkloader to write new btree blocks in 256K chunks to moderate memory consumption if the new btree is very large. As a consequence of that, it's possible that the buffers for lower level blocks might have been reclaimed by the time the node builder comes back to the block. Therefore, change xfs_btree_bload_node to read the lower level blocks to handle the reclaimed buffer case. 
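The practical difference between the two helpers, in simplified form (a contrast sketch, not the actual implementations in fs/xfs/libxfs/xfs_btree.c):

	/* get: return a buffer with no disk I/O; if the buffer was
	 * reclaimed, the contents are not valid until rewritten. */
	error = xfs_btree_get_buf_block(cur, child_ptr, &child_block, &child_bp);

	/* read: issue a verified disk read unless the buffer is already
	 * uptodate (XBF_DONE, as set by the previous patch). */
	error = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block, &child_bp);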
As a side effect, the read will increase the LRU refs, which will bias towards keeping new btree buffers in memory after the new btree commits. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree.c | 2 +- fs/xfs/libxfs/xfs_btree.h | 3 +++ fs/xfs/libxfs/xfs_btree_staging.c | 7 ++++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 6a6503ab0cd7..c100e92140be 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block( * Read in the buffer at the given ptr and return the buffer and * the block pointer within the buffer. */ -STATIC int +int xfs_btree_read_buf_block( struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4d68a58be160..e0875cec4939 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_read_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, int flags, + struct xfs_btree_block **block, struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 1c5f9ed70c3e..c8b46ac3923f 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -489,7 +489,12 @@ xfs_btree_bload_node( ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr)); - ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block, + /* + * Read the lower-level block in case the buffer for it has + * been reclaimed. LRU refs will be set on the block, which is + * desirable if the new btree commits. + */ + ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block, &child_bp); if (ret) return ret; From a20ffa7d9f863056364b11a680145a76ef15acb2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:28 -0800 Subject: [PATCH 055/123] xfs: add debug knobs to control btree bulk load slack factors Add some debug knobs so that we can control the leaf and node block slack when rebuilding btrees. For developers, it might be useful to construct btrees of various heights by crafting a filesystem with a certain number of records and then using repair+knobs to rebuild the index with a certain shape. Practically speaking, you'd only ever do that for extreme stress testing of the runtime code or the btree generator. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/newbt.c | 11 ++++++--- fs/xfs/xfs_globals.c | 12 ++++++++++ fs/xfs/xfs_sysctl.h | 2 ++ fs/xfs/xfs_sysfs.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 992cf34a13e7..46883606ad88 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -32,6 +32,7 @@ * btree bulk loading code calculates for us. However, there are some * exceptions to this rule: * + * (0) If someone turned one of the debug knobs. * (1) If this is a per-AG btree and the AG has less than 10% space free. * (2) If this is an inode btree and the FS has less than 10% space free. 
@@ -47,9 +48,13 @@ xrep_newbt_estimate_slack( uint64_t free; uint64_t sz; - /* Let the btree code compute the default slack values. */ - bload->leaf_slack = -1; - bload->node_slack = -1; + /* + * The xfs_globals values are set to -1 (i.e. take the bload defaults) + * unless someone has set them otherwise, so we just pull the values + * here. + */ + bload->leaf_slack = xfs_globals.bload_leaf_slack; + bload->node_slack = xfs_globals.bload_node_slack; if (sc->ops->type == ST_PERAG) { free = sc->sa.pag->pagf_freeblks; diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 9edc1f2bc939..f18fec0adf66 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = { .pwork_threads = -1, /* automatic thread detection */ .larp = false, /* log attribute replay */ #endif + + /* + * Leave this many record slots empty when bulk loading btrees. By + * default we load new btree leaf blocks 75% full. + */ + .bload_leaf_slack = -1, + + /* + * Leave this many key/ptr slots empty when bulk loading btrees. By + * default we load new btree node blocks 75% full. + */ + .bload_node_slack = -1, }; diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index f78ad6b10ea5..276696a07040 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -85,6 +85,8 @@ struct xfs_globals { int pwork_threads; /* parallel workqueue threads */ bool larp; /* log attribute replay */ #endif + int bload_leaf_slack; /* btree bulk load leaf slack */ + int bload_node_slack; /* btree bulk load node slack */ int log_recovery_delay; /* log recovery delay (secs) */ int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 871f16a4a5d8..17485666b672 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -262,6 +262,58 @@ larp_show( XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ +STATIC ssize_t +bload_leaf_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_leaf_slack = val; + return count; +} + +STATIC ssize_t +bload_leaf_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack); +} +XFS_SYSFS_ATTR_RW(bload_leaf_slack); + +STATIC ssize_t +bload_node_slack_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + xfs_globals.bload_node_slack = val; + return count; +} + +STATIC ssize_t +bload_node_slack_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack); +} +XFS_SYSFS_ATTR_RW(bload_node_slack); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), @@ -271,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(pwork_threads), ATTR_LIST(larp), #endif + ATTR_LIST(bload_leaf_slack), + ATTR_LIST(bload_node_slack), NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); From 6dfeb0c2ecde71d61af77f65eabbdd6ca9315161 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 15 Dec 2023 10:03:29 -0800 Subject: [PATCH 056/123] xfs: move btree bulkload record initialization to ->get_record implementations When we're performing a bulk load of a btree, move the code that actually stores the btree record in the new btree block out of the generic code and into the individual ->get_record implementations. This is preparation for being able to store multiple records with a single indirect call. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 17 +++++++---------- fs/xfs/libxfs/xfs_btree_staging.h | 15 ++++++++++----- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index c8b46ac3923f..cd409a2ee87b 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -440,22 +440,19 @@ STATIC int xfs_btree_bload_leaf( struct xfs_btree_cur *cur, unsigned int recs_this_block, - xfs_btree_bload_get_record_fn get_record, + xfs_btree_bload_get_records_fn get_records, struct xfs_btree_block *block, void *priv) { - unsigned int j; + unsigned int j = 1; int ret; /* Fill the leaf block with records. */ - for (j = 1; j <= recs_this_block; j++) { - union xfs_btree_rec *block_rec; - - ret = get_record(cur, priv); - if (ret) + while (j <= recs_this_block) { + ret = get_records(cur, j, block, recs_this_block - j + 1, priv); + if (ret < 0) return ret; - block_rec = xfs_btree_rec_addr(cur, j, block); - cur->bc_ops->init_rec_from_cur(cur, block_rec); + j += ret; } return 0; @@ -798,7 +795,7 @@ xfs_btree_bload( trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr, nr_this_block); - ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record, + ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records, block, priv); if (ret) goto out; diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index 5f638f711246..bd5b3f004823 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -47,7 +47,9 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork, const struct xfs_btree_ops *ops); /* Bulk loading of staged btrees. */ -typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv); +typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, + unsigned int idx, struct xfs_btree_block *block, + unsigned int nr_wanted, void *priv); typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, @@ -55,11 +57,14 @@ typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, struct xfs_btree_bload { /* - * This function will be called nr_records times to load records into - * the btree. The function does this by setting the cursor's bc_rec - * field in in-core format. Records must be returned in sort order. + * This function will be called to load @nr_wanted records into the + * btree. The implementation does this by setting the cursor's bc_rec + * field in in-core format and using init_rec_from_cur to set the + * records in the btree block. Records must be returned in sort order. + * The function must return the number of records loaded or the usual + * negative errno. 
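+ * As a sketch, a conforming implementation has roughly this shape
+ * (the record-source helpers here are hypothetical, for illustration
+ * only):
+ *
+ *	loaded = 0;
+ *	while (loaded < nr_wanted && have_more_records(priv)) {
+ *		next_record(priv, &cur->bc_rec);
+ *		rec = xfs_btree_rec_addr(cur, idx + loaded, block);
+ *		cur->bc_ops->init_rec_from_cur(cur, rec);
+ *		loaded++;
+ *	}
+ *	return loaded;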
*/ - xfs_btree_bload_get_record_fn get_record; + xfs_btree_bload_get_records_fn get_records; /* * This function will be called nr_blocks times to obtain a pointer From e069d549705e49841247acf9b3176744e27d5425 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:29 -0800 Subject: [PATCH 057/123] xfs: constrain dirty buffers while formatting a staged btree Constrain the number of dirty buffers that are locked by the btree staging code at any given time by establishing a threshold at which we put them all on the delwri queue and push them to disk. This limits memory consumption while writing out new btrees. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_btree_staging.c | 50 ++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_btree_staging.h | 10 +++++++ fs/xfs/scrub/newbt.c | 1 + 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index cd409a2ee87b..0c978a31e284 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -333,24 +333,41 @@ xfs_btree_commit_ifakeroot( /* * Put a btree block that we're loading onto the ordered list and release it. * The btree blocks will be written to disk when bulk loading is finished. + * If we reach the dirty buffer threshold, flush them to disk before + * continuing. */ -static void +static int xfs_btree_bload_drop_buf( - struct list_head *buffers_list, - struct xfs_buf **bpp) + struct xfs_btree_bload *bbl, + struct list_head *buffers_list, + struct xfs_buf **bpp) { - if (*bpp == NULL) - return; + struct xfs_buf *bp = *bpp; + int error; + + if (!bp) + return 0; /* * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent * xfs_buf_read will not pointlessly reread the contents from the disk. */ - (*bpp)->b_flags |= XBF_DONE; + bp->b_flags |= XBF_DONE; - xfs_buf_delwri_queue_here(*bpp, buffers_list); - xfs_buf_relse(*bpp); + xfs_buf_delwri_queue_here(bp, buffers_list); + xfs_buf_relse(bp); *bpp = NULL; + bbl->nr_dirty++; + + if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty) + return 0; + + error = xfs_buf_delwri_submit(buffers_list); + if (error) + return error; + + bbl->nr_dirty = 0; + return 0; } /* @@ -422,7 +439,10 @@ xfs_btree_bload_prep_block( */ if (*blockp) xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB); - xfs_btree_bload_drop_buf(buffers_list, bpp); + + ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp); + if (ret) + return ret; /* Initialize the new btree block. */ xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block); @@ -770,6 +790,7 @@ xfs_btree_bload( cur->bc_nlevels = bbl->btree_height; xfs_btree_set_ptr_null(cur, &child_ptr); xfs_btree_set_ptr_null(cur, &ptr); + bbl->nr_dirty = 0; xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &blocks, &blocks_with_extra); @@ -808,7 +829,10 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; /* Populate the internal btree nodes. 
*/ for (level = 1; level < cur->bc_nlevels; level++) { @@ -850,7 +874,11 @@ xfs_btree_bload( xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1); } total_blocks += blocks; - xfs_btree_bload_drop_buf(&buffers_list, &bp); + + ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp); + if (ret) + goto out; + xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1); } diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index bd5b3f004823..f0a5007284ef 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -112,6 +112,16 @@ struct xfs_btree_bload { * height of the new btree. */ unsigned int btree_height; + + /* + * Flush the new btree block buffer list to disk after this many blocks + * have been formatted. Zero prohibits writing any buffers until all + * blocks have been formatted. + */ + uint16_t max_dirty; + + /* Number of dirty buffers. */ + uint16_t nr_dirty; }; int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur, diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 46883606ad88..81919eeabcdb 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -94,6 +94,7 @@ xrep_newbt_init_ag( xnr->alloc_hint = alloc_hint; xnr->resv = resv; INIT_LIST_HEAD(&xnr->resv_list); + xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */ xrep_newbt_estimate_slack(xnr); } From 6ece924b95226235059ed2ffc2c0f44a124c5910 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:30 -0800 Subject: [PATCH 058/123] xfs: create separate structures and code for u32 bitmaps Create a version of the xbitmap that handles 32-bit integer intervals and adapt the xfs_agblock_t bitmap to use it. This reduces the size of the interval tree nodes from 48 to 36 bytes and enables us to use a more efficient slab (:0000040 instead of :0000048) which allows us to pack more nodes into a single slab page (102 vs 85). As a side effect, the users of these bitmaps no longer have to convert between u32 and u64 quantities just to use the bitmap; and the hairy overflow checking code in xagb_bitmap_test goes away. Later in this patchset we're going to add bitmaps for xfs_agino_t, xfs_rgblock_t, and xfs_dablk_t, so the increase in code size (5622 vs. 9959 bytes) seems worth it. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader_repair.c | 9 +- fs/xfs/scrub/bitmap.c | 518 +++++++++++++++++++++++++-------- fs/xfs/scrub/bitmap.h | 92 +++--- fs/xfs/scrub/reap.c | 5 +- 4 files changed, 458 insertions(+), 166 deletions(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 876a2f41b063..4000bdc8b500 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -494,12 +494,11 @@ xrep_agfl_walk_rmap( /* Strike out the blocks that are cross-linked according to the rmapbt. */ STATIC int xrep_agfl_check_extent( - uint64_t start, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xrep_agfl *ra = priv; - xfs_agblock_t agbno = start; xfs_agblock_t last_agbno = agbno + len - 1; int error; @@ -647,8 +646,8 @@ struct xrep_agfl_fill { /* Fill the AGFL with whatever blocks are in this extent. 
*/ static int xrep_agfl_fill( - uint64_t start, - uint64_t len, + uint32_t start, + uint32_t len, void *priv) { struct xrep_agfl_fill *af = priv; diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index e0c89a9a0ca0..503b79010002 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -16,7 +16,9 @@ #include -struct xbitmap_node { +/* u64 bitmap */ + +struct xbitmap64_node { struct rb_node bn_rbnode; /* First set bit of this interval and subtree. */ @@ -39,72 +41,72 @@ struct xbitmap_node { * forward-declare them anyway for clarity. */ static inline void -xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root); static inline void -xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root); +xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root); -static inline struct xbitmap_node * -xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start, uint64_t last); -static inline struct xbitmap_node * -xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start, +static inline struct xbitmap64_node * +xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start, uint64_t last); -INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t, - __bn_subtree_last, START, LAST, static inline, xbitmap_tree) +INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t, + __bn_subtree_last, START, LAST, static inline, xbitmap64_tree) /* Iterate each interval of a bitmap. Do not change the bitmap. */ -#define for_each_xbitmap_extent(bn, bitmap) \ +#define for_each_xbitmap64_extent(bn, bitmap) \ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ - struct xbitmap_node, bn_rbnode); \ + struct xbitmap64_node, bn_rbnode); \ (bn) != NULL; \ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ - struct xbitmap_node, bn_rbnode)) + struct xbitmap64_node, bn_rbnode)) /* Clear a range of this bitmap. 
*/ int -xbitmap_clear( - struct xbitmap *bitmap, +xbitmap64_clear( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *bn; - struct xbitmap_node *new_bn; + struct xbitmap64_node *bn; + struct xbitmap64_node *new_bn; uint64_t last = start + len - 1; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) { + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) { if (bn->bn_start < start && bn->bn_last > last) { uint64_t old_last = bn->bn_last; /* overlaps with the entire clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); /* add an extent */ - new_bn = kmalloc(sizeof(struct xbitmap_node), + new_bn = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!new_bn) return -ENOMEM; new_bn->bn_start = last + 1; new_bn->bn_last = old_last; - xbitmap_tree_insert(new_bn, &bitmap->xb_root); + xbitmap64_tree_insert(new_bn, &bitmap->xb_root); } else if (bn->bn_start < start) { /* overlaps with the left side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_last = start - 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); } else if (bn->bn_last > last) { /* overlaps with the right side of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); bn->bn_start = last + 1; - xbitmap_tree_insert(bn, &bitmap->xb_root); + xbitmap64_tree_insert(bn, &bitmap->xb_root); break; } else { /* in the middle of the clearing range */ - xbitmap_tree_remove(bn, &bitmap->xb_root); + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } @@ -114,59 +116,59 @@ xbitmap_clear( /* Set a range of this bitmap. */ int -xbitmap_set( - struct xbitmap *bitmap, +xbitmap64_set( + struct xbitmap64 *bitmap, uint64_t start, uint64_t len) { - struct xbitmap_node *left; - struct xbitmap_node *right; + struct xbitmap64_node *left; + struct xbitmap64_node *right; uint64_t last = start + len - 1; int error; /* Is this whole range already set? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); if (left && left->bn_start <= start && left->bn_last >= last) return 0; /* Clear out everything in the range we want to set. */ - error = xbitmap_clear(bitmap, start, len); + error = xbitmap64_clear(bitmap, start, len); if (error) return error; /* Do we have a left-adjacent extent? */ - left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); ASSERT(!left || left->bn_last + 1 == start); /* Do we have a right-adjacent extent? 
*/ - right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); ASSERT(!right || right->bn_start == last + 1); if (left && right) { /* combine left and right adjacent extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); left->bn_last = right->bn_last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); kfree(right); } else if (left) { /* combine with left extent */ - xbitmap_tree_remove(left, &bitmap->xb_root); + xbitmap64_tree_remove(left, &bitmap->xb_root); left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } else if (right) { /* combine with right extent */ - xbitmap_tree_remove(right, &bitmap->xb_root); + xbitmap64_tree_remove(right, &bitmap->xb_root); right->bn_start = start; - xbitmap_tree_insert(right, &bitmap->xb_root); + xbitmap64_tree_insert(right, &bitmap->xb_root); } else { /* add an extent */ - left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS); + left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS); if (!left) return -ENOMEM; left->bn_start = start; left->bn_last = last; - xbitmap_tree_insert(left, &bitmap->xb_root); + xbitmap64_tree_insert(left, &bitmap->xb_root); } return 0; @@ -174,21 +176,21 @@ xbitmap_set( /* Free everything related to this bitmap. */ void -xbitmap_destroy( - struct xbitmap *bitmap) +xbitmap64_destroy( + struct xbitmap64 *bitmap) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; - while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { - xbitmap_tree_remove(bn, &bitmap->xb_root); + while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) { + xbitmap64_tree_remove(bn, &bitmap->xb_root); kfree(bn); } } /* Set up a per-AG block bitmap. */ void -xbitmap_init( - struct xbitmap *bitmap) +xbitmap64_init( + struct xbitmap64 *bitmap) { bitmap->xb_root = RB_ROOT_CACHED; } @@ -208,18 +210,18 @@ xbitmap_init( * This is the logical equivalent of bitmap &= ~sub. */ int -xbitmap_disunion( - struct xbitmap *bitmap, - struct xbitmap *sub) +xbitmap64_disunion( + struct xbitmap64 *bitmap, + struct xbitmap64 *sub) { - struct xbitmap_node *bn; + struct xbitmap64_node *bn; int error; - if (xbitmap_empty(bitmap) || xbitmap_empty(sub)) + if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub)) return 0; - for_each_xbitmap_extent(bn, sub) { - error = xbitmap_clear(bitmap, bn->bn_start, + for_each_xbitmap64_extent(bn, sub) { + error = xbitmap64_clear(bitmap, bn->bn_start, bn->bn_last - bn->bn_start + 1); if (error) return error; @@ -228,6 +230,345 @@ xbitmap_disunion( return 0; } +/* How many bits are set in this bitmap? */ +uint64_t +xbitmap64_hweight( + struct xbitmap64 *bitmap) +{ + struct xbitmap64_node *bn; + uint64_t ret = 0; + + for_each_xbitmap64_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. */ +int +xbitmap64_walk( + struct xbitmap64 *bitmap, + xbitmap64_walk_fn fn, + void *priv) +{ + struct xbitmap64_node *bn; + int error = 0; + + for_each_xbitmap64_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? 
*/ +bool +xbitmap64_empty( + struct xbitmap64 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap64_test( + struct xbitmap64 *bitmap, + uint64_t start, + uint64_t *len) +{ + struct xbitmap64_node *bn; + uint64_t last = start + *len - 1; + + bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* u32 bitmap */ + +struct xbitmap32_node { + struct rb_node bn_rbnode; + + /* First set bit of this interval and subtree. */ + uint32_t bn_start; + + /* Last set bit of this interval. */ + uint32_t bn_last; + + /* Last set bit of this subtree. Do not touch this. */ + uint32_t __bn_subtree_last; +}; + +/* Define our own interval tree type with uint32_t parameters. */ + +/* + * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll + * forward-declare them anyway for clarity. + */ +static inline void +xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline void +xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start, + uint32_t last); + +static inline struct xbitmap32_node * +xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start, + uint32_t last); + +INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t, + __bn_subtree_last, START, LAST, static inline, xbitmap32_tree) + +/* Iterate each interval of a bitmap. Do not change the bitmap. */ +#define for_each_xbitmap32_extent(bn, bitmap) \ + for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \ + struct xbitmap32_node, bn_rbnode); \ + (bn) != NULL; \ + (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \ + struct xbitmap32_node, bn_rbnode)) + +/* Clear a range of this bitmap. */ +int +xbitmap32_clear( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) +{ + struct xbitmap32_node *bn; + struct xbitmap32_node *new_bn; + uint32_t last = start + len - 1; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) { + if (bn->bn_start < start && bn->bn_last > last) { + uint32_t old_last = bn->bn_last; + + /* overlaps with the entire clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + + /* add an extent */ + new_bn = kmalloc(sizeof(struct xbitmap32_node), + XCHK_GFP_FLAGS); + if (!new_bn) + return -ENOMEM; + new_bn->bn_start = last + 1; + new_bn->bn_last = old_last; + xbitmap32_tree_insert(new_bn, &bitmap->xb_root); + } else if (bn->bn_start < start) { + /* overlaps with the left side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_last = start - 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + } else if (bn->bn_last > last) { + /* overlaps with the right side of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + bn->bn_start = last + 1; + xbitmap32_tree_insert(bn, &bitmap->xb_root); + break; + } else { + /* in the middle of the clearing range */ + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } + } + + return 0; +} + +/* Set a range of this bitmap. 
*/ +int +xbitmap32_set( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t len) +{ + struct xbitmap32_node *left; + struct xbitmap32_node *right; + uint32_t last = start + len - 1; + int error; + + /* Is this whole range already set? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (left && left->bn_start <= start && left->bn_last >= last) + return 0; + + /* Clear out everything in the range we want to set. */ + error = xbitmap32_clear(bitmap, start, len); + if (error) + return error; + + /* Do we have a left-adjacent extent? */ + left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1); + ASSERT(!left || left->bn_last + 1 == start); + + /* Do we have a right-adjacent extent? */ + right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1); + ASSERT(!right || right->bn_start == last + 1); + + if (left && right) { + /* combine left and right adjacent extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + xbitmap32_tree_remove(right, &bitmap->xb_root); + left->bn_last = right->bn_last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + kfree(right); + } else if (left) { + /* combine with left extent */ + xbitmap32_tree_remove(left, &bitmap->xb_root); + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } else if (right) { + /* combine with right extent */ + xbitmap32_tree_remove(right, &bitmap->xb_root); + right->bn_start = start; + xbitmap32_tree_insert(right, &bitmap->xb_root); + } else { + /* add an extent */ + left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS); + if (!left) + return -ENOMEM; + left->bn_start = start; + left->bn_last = last; + xbitmap32_tree_insert(left, &bitmap->xb_root); + } + + return 0; +} + +/* Free everything related to this bitmap. */ +void +xbitmap32_destroy( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + + while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) { + xbitmap32_tree_remove(bn, &bitmap->xb_root); + kfree(bn); + } +} + +/* Set up a per-AG block bitmap. */ +void +xbitmap32_init( + struct xbitmap32 *bitmap) +{ + bitmap->xb_root = RB_ROOT_CACHED; +} + +/* + * Remove all the blocks mentioned in @sub from the extents in @bitmap. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @bitmap; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sub. This routine subtracts all the extents + * mentioned in sub from all the extents linked in @bitmap, which leaves + * @bitmap as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @bitmap can be reaped. + * + * This is the logical equivalent of bitmap &= ~sub. + */ +int +xbitmap32_disunion( + struct xbitmap32 *bitmap, + struct xbitmap32 *sub) +{ + struct xbitmap32_node *bn; + int error; + + if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub)) + return 0; + + for_each_xbitmap32_extent(bn, sub) { + error = xbitmap32_clear(bitmap, bn->bn_start, + bn->bn_last - bn->bn_start + 1); + if (error) + return error; + } + + return 0; +} + +/* How many bits are set in this bitmap? */ +uint32_t +xbitmap32_hweight( + struct xbitmap32 *bitmap) +{ + struct xbitmap32_node *bn; + uint32_t ret = 0; + + for_each_xbitmap32_extent(bn, bitmap) + ret += bn->bn_last - bn->bn_start + 1; + + return ret; +} + +/* Call a function for every run of set bits in this bitmap. 
*/ +int +xbitmap32_walk( + struct xbitmap32 *bitmap, + xbitmap32_walk_fn fn, + void *priv) +{ + struct xbitmap32_node *bn; + int error = 0; + + for_each_xbitmap32_extent(bn, bitmap) { + error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); + if (error) + break; + } + + return error; +} + +/* Does this bitmap have no bits set at all? */ +bool +xbitmap32_empty( + struct xbitmap32 *bitmap) +{ + return bitmap->xb_root.rb_root.rb_node == NULL; +} + +/* Is the start of the range set or clear? And for how long? */ +bool +xbitmap32_test( + struct xbitmap32 *bitmap, + uint32_t start, + uint32_t *len) +{ + struct xbitmap32_node *bn; + uint32_t last = start + *len - 1; + + bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last); + if (!bn) + return false; + if (bn->bn_start <= start) { + if (bn->bn_last < last) + *len = bn->bn_last - start + 1; + return true; + } + *len = bn->bn_start - start; + return false; +} + +/* xfs_agblock_t bitmap */ + /* * Record all btree blocks seen while iterating all records of a btree. * @@ -316,66 +657,3 @@ xagb_bitmap_set_btcur_path( return 0; } - -/* How many bits are set in this bitmap? */ -uint64_t -xbitmap_hweight( - struct xbitmap *bitmap) -{ - struct xbitmap_node *bn; - uint64_t ret = 0; - - for_each_xbitmap_extent(bn, bitmap) - ret += bn->bn_last - bn->bn_start + 1; - - return ret; -} - -/* Call a function for every run of set bits in this bitmap. */ -int -xbitmap_walk( - struct xbitmap *bitmap, - xbitmap_walk_fn fn, - void *priv) -{ - struct xbitmap_node *bn; - int error = 0; - - for_each_xbitmap_extent(bn, bitmap) { - error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv); - if (error) - break; - } - - return error; -} - -/* Does this bitmap have no bits set at all? */ -bool -xbitmap_empty( - struct xbitmap *bitmap) -{ - return bitmap->xb_root.rb_root.rb_node == NULL; -} - -/* Is the start of the range set or clear? And for how long? 
*/ -bool -xbitmap_test( - struct xbitmap *bitmap, - uint64_t start, - uint64_t *len) -{ - struct xbitmap_node *bn; - uint64_t last = start + *len - 1; - - bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last); - if (!bn) - return false; - if (bn->bn_start <= start) { - if (bn->bn_last < last) - *len = bn->bn_last - start + 1; - return true; - } - *len = bn->bn_start - start; - return false; -} diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h index 4fe58bad6734..231b27c09b4e 100644 --- a/fs/xfs/scrub/bitmap.h +++ b/fs/xfs/scrub/bitmap.h @@ -6,17 +6,19 @@ #ifndef __XFS_SCRUB_BITMAP_H__ #define __XFS_SCRUB_BITMAP_H__ -struct xbitmap { +/* u64 bitmap */ + +struct xbitmap64 { struct rb_root_cached xb_root; }; -void xbitmap_init(struct xbitmap *bitmap); -void xbitmap_destroy(struct xbitmap *bitmap); +void xbitmap64_init(struct xbitmap64 *bitmap); +void xbitmap64_destroy(struct xbitmap64 *bitmap); -int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len); -int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub); -uint64_t xbitmap_hweight(struct xbitmap *bitmap); +int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len); +int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub); +uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap); /* * Return codes for the bitmap iterator functions are 0 to continue iterating, @@ -25,79 +27,93 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap); * iteration, because neither bitmap iterator ever generates that error code on * its own. Callers must not modify the bitmap while walking it. */ -typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv); -int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn, +typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv); +int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn, void *priv); -bool xbitmap_empty(struct xbitmap *bitmap); -bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len); +bool xbitmap64_empty(struct xbitmap64 *bitmap); +bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len); + +/* u32 bitmap */ + +struct xbitmap32 { + struct rb_root_cached xb_root; +}; + +void xbitmap32_init(struct xbitmap32 *bitmap); +void xbitmap32_destroy(struct xbitmap32 *bitmap); + +int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len); +int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub); +uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap); + +/* + * Return codes for the bitmap iterator functions are 0 to continue iterating, + * and non-zero to stop iterating. Any non-zero value will be passed up to the + * iteration caller. The special value -ECANCELED can be used to stop + * iteration, because neither bitmap iterator ever generates that error code on + * its own. Callers must not modify the bitmap while walking it. 
+ */ +typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv); +int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn, + void *priv); + +bool xbitmap32_empty(struct xbitmap32 *bitmap); +bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len); /* Bitmaps, but for type-checked for xfs_agblock_t */ struct xagb_bitmap { - struct xbitmap agbitmap; + struct xbitmap32 agbitmap; }; static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) { - xbitmap_init(&bitmap->agbitmap); + xbitmap32_init(&bitmap->agbitmap); } static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) { - xbitmap_destroy(&bitmap->agbitmap); + xbitmap32_destroy(&bitmap->agbitmap); } static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, xfs_agblock_t start, xfs_extlen_t len) { - return xbitmap_clear(&bitmap->agbitmap, start, len); + return xbitmap32_clear(&bitmap->agbitmap, start, len); } static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, xfs_agblock_t start, xfs_extlen_t len) { - return xbitmap_set(&bitmap->agbitmap, start, len); + return xbitmap32_set(&bitmap->agbitmap, start, len); } -static inline bool -xagb_bitmap_test( - struct xagb_bitmap *bitmap, - xfs_agblock_t start, - xfs_extlen_t *len) +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) { - uint64_t biglen = *len; - bool ret; - - ret = xbitmap_test(&bitmap->agbitmap, start, &biglen); - - if (start + biglen >= UINT_MAX) { - ASSERT(0); - biglen = UINT_MAX - start; - } - - *len = biglen; - return ret; + return xbitmap32_test(&bitmap->agbitmap, start, len); } static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, struct xagb_bitmap *sub) { - return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap); + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); } static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) { - return xbitmap_hweight(&bitmap->agbitmap); + return xbitmap32_hweight(&bitmap->agbitmap); } static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) { - return xbitmap_empty(&bitmap->agbitmap); + return xbitmap32_empty(&bitmap->agbitmap); } static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, - xbitmap_walk_fn fn, void *priv) + xbitmap32_walk_fn fn, void *priv) { - return xbitmap_walk(&bitmap->agbitmap, fn, priv); + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); } int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index 300f49e8e14a..b1f112ecc062 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -430,13 +430,12 @@ xreap_agextent_iter( */ STATIC int xreap_agmeta_extent( - uint64_t fsbno, - uint64_t len, + uint32_t agbno, + uint32_t len, void *priv) { struct xreap_state *rs = priv; struct xfs_scrub *sc = rs->sc; - xfs_agblock_t agbno = fsbno; xfs_agblock_t agbno_next = agbno + len; int error = 0; From 0f08af0f9f3eb4a67fa3849c63e918bac9773da8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:30 -0800 Subject: [PATCH 059/123] xfs: move the per-AG datatype bitmaps to separate files Move struct xagb_bitmap to its own pair of C and header files per request of Christoph. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/agb_bitmap.c | 103 +++++++++++++++++++++++++++++++++ fs/xfs/scrub/agb_bitmap.h | 68 ++++++++++++++++++++++ fs/xfs/scrub/agheader_repair.c | 1 + fs/xfs/scrub/bitmap.c | 91 ----------------------------- fs/xfs/scrub/bitmap.h | 59 ------------------- fs/xfs/scrub/reap.c | 1 + fs/xfs/scrub/rmap.c | 1 + 8 files changed, 175 insertions(+), 150 deletions(-) create mode 100644 fs/xfs/scrub/agb_bitmap.c create mode 100644 fs/xfs/scrub/agb_bitmap.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 1537d66e5ab0..eb557dca9373 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) xfs-y += $(addprefix scrub/, \ trace.o \ + agb_bitmap.o \ agheader.o \ alloc.o \ attr.o \ diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c new file mode 100644 index 000000000000..573e4e062754 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "bitmap.h" +#include "scrub/agb_bitmap.h" + +/* + * Record all btree blocks seen while iterating all records of a btree. + * + * We know that the btree query_all function starts at the left edge and walks + * towards the right edge of the tree. Therefore, we know that we can walk up + * the btree cursor towards the root; if the pointer for a given level points + * to the first record/key in that block, we haven't seen this block before; + * and therefore we need to remember that we saw this block in the btree. + * + * So if our btree is: + * + * 4 + * / | \ + * 1 2 3 + * + * Pretend for this example that each leaf block has 100 btree records. For + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. + * + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. + * + * For the 101st btree record, we've moved onto leaf block 2. Now + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. + * + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. + * + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. + * + * For the 300th record we just exit, with the list being [1, 4, 2, 3]. + */ + +/* Mark a btree block to the agblock bitmap. */ +STATIC int +xagb_bitmap_visit_btblock( + struct xfs_btree_cur *cur, + int level, + void *priv) +{ + struct xagb_bitmap *bitmap = priv; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t agbno; + + xfs_btree_get_block(cur, level, &bp); + if (!bp) + return 0; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); + agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); + + return xagb_bitmap_set(bitmap, agbno, 1); +} + +/* Mark all (per-AG) btree blocks in the agblock bitmap. 
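+ * Unlike xagb_bitmap_set_btcur_path below, which records only the blocks
+ * along the cursor's current path, this visits every block in the btree
+ * via xfs_btree_visit_blocks.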
*/ +int +xagb_bitmap_set_btblocks( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, + XFS_BTREE_VISIT_ALL, bitmap); +} + +/* + * Record all the buffers pointed to by the btree cursor. Callers already + * engaged in a btree walk should call this function to capture the list of + * blocks going from the leaf towards the root. + */ +int +xagb_bitmap_set_btcur_path( + struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur) +{ + int i; + int error; + + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { + error = xagb_bitmap_visit_btblock(cur, i, bitmap); + if (error) + return error; + } + + return 0; +} diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h new file mode 100644 index 000000000000..ed08f76ff4f3 --- /dev/null +++ b/fs/xfs/scrub/agb_bitmap.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_AGB_BITMAP_H__ +#define __XFS_SCRUB_AGB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_agblock_t */ + +struct xagb_bitmap { + struct xbitmap32 agbitmap; +}; + +static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap) +{ + xbitmap32_init(&bitmap->agbitmap); +} + +static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap) +{ + xbitmap32_destroy(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_clear(&bitmap->agbitmap, start, len); +} +static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t len) +{ + return xbitmap32_set(&bitmap->agbitmap, start, len); +} + +static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap, + xfs_agblock_t start, xfs_extlen_t *len) +{ + return xbitmap32_test(&bitmap->agbitmap, start, len); +} + +static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap, + struct xagb_bitmap *sub) +{ + return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap); +} + +static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap) +{ + return xbitmap32_hweight(&bitmap->agbitmap); +} +static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap) +{ + return xbitmap32_empty(&bitmap->agbitmap); +} + +static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap, + xbitmap32_walk_fn fn, void *priv) +{ + return xbitmap32_walk(&bitmap->agbitmap, fn, priv); +} + +int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); +int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap, + struct xfs_btree_cur *cur); + +#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 4000bdc8b500..52956c0b8f79 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -26,6 +26,7 @@ #include "scrub/trace.h" #include "scrub/repair.h" #include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" #include "scrub/reap.h" /* Superblock */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 503b79010002..1449bb5262d9 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -566,94 +566,3 @@ xbitmap32_test( *len = bn->bn_start - start; return false; } - -/* xfs_agblock_t bitmap */ - -/* - * Record all btree blocks seen while iterating all records of a btree. 
- * - * We know that the btree query_all function starts at the left edge and walks - * towards the right edge of the tree. Therefore, we know that we can walk up - * the btree cursor towards the root; if the pointer for a given level points - * to the first record/key in that block, we haven't seen this block before; - * and therefore we need to remember that we saw this block in the btree. - * - * So if our btree is: - * - * 4 - * / | \ - * 1 2 3 - * - * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we - * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so - * we record block 4. The list is [1, 4]. - * - * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit - * the loop. The list remains [1, 4]. - * - * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that - * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. - * - * For the 102nd record, bc_levels[0].ptr == 2, so we continue. - * - * For the 201st record, we've moved on to leaf block 3. - * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. - * - * For the 300th record we just exit, with the list being [1, 4, 2, 3]. - */ - -/* Mark a btree block to the agblock bitmap. */ -STATIC int -xagb_bitmap_visit_btblock( - struct xfs_btree_cur *cur, - int level, - void *priv) -{ - struct xagb_bitmap *bitmap = priv; - struct xfs_buf *bp; - xfs_fsblock_t fsbno; - xfs_agblock_t agbno; - - xfs_btree_get_block(cur, level, &bp); - if (!bp) - return 0; - - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); - agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); - - return xagb_bitmap_set(bitmap, agbno, 1); -} - -/* Mark all (per-AG) btree blocks in the agblock bitmap. */ -int -xagb_bitmap_set_btblocks( - struct xagb_bitmap *bitmap, - struct xfs_btree_cur *cur) -{ - return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock, - XFS_BTREE_VISIT_ALL, bitmap); -} - -/* - * Record all the buffers pointed to by the btree cursor. Callers already - * engaged in a btree walk should call this function to capture the list of - * blocks going from the leaf towards the root. 
- */
-int
-xagb_bitmap_set_btcur_path(
-	struct xagb_bitmap	*bitmap,
-	struct xfs_btree_cur	*cur)
-{
-	int			i;
-	int			error;
-
-	for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
-		error = xagb_bitmap_visit_btblock(cur, i, bitmap);
-		if (error)
-			return error;
-	}
-
-	return 0;
-}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 231b27c09b4e..2df8911606d6 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -62,63 +62,4 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
 bool xbitmap32_empty(struct xbitmap32 *bitmap);
 bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
 
-/* Bitmaps, but for type-checked for xfs_agblock_t */
-
-struct xagb_bitmap {
-	struct xbitmap32	agbitmap;
-};
-
-static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
-{
-	xbitmap32_init(&bitmap->agbitmap);
-}
-
-static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
-{
-	xbitmap32_destroy(&bitmap->agbitmap);
-}
-
-static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
-		xfs_agblock_t start, xfs_extlen_t len)
-{
-	return xbitmap32_clear(&bitmap->agbitmap, start, len);
-}
-static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
-		xfs_agblock_t start, xfs_extlen_t len)
-{
-	return xbitmap32_set(&bitmap->agbitmap, start, len);
-}
-
-static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap,
-		xfs_agblock_t start, xfs_extlen_t *len)
-{
-	return xbitmap32_test(&bitmap->agbitmap, start, len);
-}
-
-static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
-		struct xagb_bitmap *sub)
-{
-	return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap);
-}
-
-static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
-{
-	return xbitmap32_hweight(&bitmap->agbitmap);
-}
-static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
-{
-	return xbitmap32_empty(&bitmap->agbitmap);
-}
-
-static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
-		xbitmap32_walk_fn fn, void *priv)
-{
-	return xbitmap32_walk(&bitmap->agbitmap, fn, priv);
-}
-
-int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
-		struct xfs_btree_cur *cur);
-int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
-		struct xfs_btree_cur *cur);
-
 #endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index b1f112ecc062..bfc3583132ac 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -37,6 +37,7 @@
 #include "scrub/trace.h"
 #include "scrub/repair.h"
 #include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
 #include "scrub/reap.h"
 
 /*
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index d29a26ecddd6..c99d1714f283 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -24,6 +24,7 @@
 #include "scrub/common.h"
 #include "scrub/btree.h"
 #include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
 
 /*
  * Set us up to scrub reverse mapping btrees.

From efb43b355457dab474c7eb40d6b2f3cb04c24ecf Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Fri, 15 Dec 2023 10:03:31 -0800
Subject: [PATCH 060/123] xfs: roll the scrub transaction after completing a repair

When we've finished repairing an AG header, roll the scrub transaction.
This ensures that any failures caused by defer ops failing are captured
by the xrep_done tracepoint and that any stacktraces that occur will
point to the repair code that caused it, instead of xchk_teardown.

Going forward, repair functions should commit the transaction if
they're going to return success.
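As a sketch of the resulting pattern (simplified from the AGF and AGI
hunks below; locking and error handling elided):

	/* ...update the AG header and incore per-AG state... */
	set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
	return xrep_roll_ag_trans(sc);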
Usually the space reaping functions that run after a successful atomic commit of the new metadata will take care of that for us. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/agheader_repair.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 52956c0b8f79..26bd1ff68f1b 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -73,7 +73,7 @@ xrep_superblock( /* Write this to disk. */ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); - return error; + return 0; } /* AGF */ @@ -342,7 +342,7 @@ xrep_agf_commit_new( pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGF. v5 filesystems only. */ @@ -789,6 +789,9 @@ xrep_agfl( /* Dump any AGFL overflow. */ error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG, XFS_AG_RESV_AGFL); + if (error) + goto err; + err: xagb_bitmap_destroy(&agfl_extents); return error; @@ -962,7 +965,7 @@ xrep_agi_commit_new( pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); - return 0; + return xrep_roll_ag_trans(sc); } /* Repair the AGI. */ From 8bd0bf570bd7b5cbcce3f70b760d8dcccd8df6c8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:31 -0800 Subject: [PATCH 061/123] xfs: remove trivial bnobt/inobt scrub helpers Christoph Hellwig complained about awkward code in the next two repair patches such as: sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; error = xchk_bnobt(sc); This is a little silly, so let's export the xchk_{,i}allocbt functions to the dispatch table in scrub.c directly and get rid of the helpers. Originally I had planned each btree gets its own separate entry point, but since repair doesn't work that way, it no longer makes sense to complicate the call chain that way. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/alloc.c | 34 +++++++++++++++------------------- fs/xfs/scrub/ialloc.c | 37 ++++++++++++++++++------------------- fs/xfs/scrub/scrub.c | 8 ++++---- fs/xfs/scrub/scrub.h | 6 ++---- 4 files changed, 39 insertions(+), 46 deletions(-) diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 279af72b1671..eb8ec47fc129 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -138,33 +138,29 @@ xchk_allocbt_rec( return 0; } -/* Scrub the freespace btrees for some AG. */ -STATIC int +/* Scrub one of the freespace btrees for some AG. */ +int xchk_allocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xchk_alloc ca = { }; struct xfs_btree_cur *cur; - cur = which == XFS_BTNUM_BNO ? 
sc->sa.bno_cur : sc->sa.cnt_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_BNOBT: + cur = sc->sa.bno_cur; + break; + case XFS_SCRUB_TYPE_CNTBT: + cur = sc->sa.cnt_cur; + break; + default: + ASSERT(0); + return -EIO; + } + return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca); } -int -xchk_bnobt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_BNO); -} - -int -xchk_cntbt( - struct xfs_scrub *sc) -{ - return xchk_allocbt(sc, XFS_BTNUM_CNT); -} - /* xref check that the extent is not free */ void xchk_xref_is_used_space( diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index fb7bbf47ae5d..83d9a29ce91e 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes( xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } -/* Scrub the inode btrees for some AG. */ -STATIC int +/* Scrub one of the inode btrees for some AG. */ +int xchk_iallocbt( - struct xfs_scrub *sc, - xfs_btnum_t which) + struct xfs_scrub *sc) { struct xfs_btree_cur *cur; struct xchk_iallocbt iabt = { @@ -720,9 +719,23 @@ xchk_iallocbt( .next_startino = NULLAGINO, .next_cluster_ino = NULLAGINO, }; + xfs_btnum_t which; int error; - cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur; + switch (sc->sm->sm_type) { + case XFS_SCRUB_TYPE_INOBT: + cur = sc->sa.ino_cur; + which = XFS_BTNUM_INO; + break; + case XFS_SCRUB_TYPE_FINOBT: + cur = sc->sa.fino_cur; + which = XFS_BTNUM_FINO; + break; + default: + ASSERT(0); + return -EIO; + } + error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT, &iabt); if (error) @@ -743,20 +756,6 @@ xchk_iallocbt( return error; } -int -xchk_inobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_INO); -} - -int -xchk_finobt( - struct xfs_scrub *sc) -{ - return xchk_iallocbt(sc, XFS_BTNUM_FINO); -} - /* See if an inode btree has (or doesn't have) an inode chunk record. 
*/ static inline void xchk_xref_inode_check( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 4849efcaa33a..31fabae588be 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -238,25 +238,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_bnobt, + .scrub = xchk_allocbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, - .scrub = xchk_cntbt, + .scrub = xchk_allocbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_inobt, + .scrub = xchk_iallocbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, - .scrub = xchk_finobt, + .scrub = xchk_iallocbt, .has = xfs_has_finobt, .repair = xrep_notsupported, }, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 1ef9c6b4842a..a6a1bea4d62b 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -129,10 +129,8 @@ int xchk_superblock(struct xfs_scrub *sc); int xchk_agf(struct xfs_scrub *sc); int xchk_agfl(struct xfs_scrub *sc); int xchk_agi(struct xfs_scrub *sc); -int xchk_bnobt(struct xfs_scrub *sc); -int xchk_cntbt(struct xfs_scrub *sc); -int xchk_inobt(struct xfs_scrub *sc); -int xchk_finobt(struct xfs_scrub *sc); +int xchk_allocbt(struct xfs_scrub *sc); +int xchk_iallocbt(struct xfs_scrub *sc); int xchk_rmapbt(struct xfs_scrub *sc); int xchk_refcountbt(struct xfs_scrub *sc); int xchk_inode(struct xfs_scrub *sc); From 4bdfd7d15747b170ce93a06fafccaf20544b6684 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:32 -0800 Subject: [PATCH 062/123] xfs: repair free space btrees Rebuild the free space btrees from the gaps in the rmap btree. Refer to the case study in Documentation/filesystems/xfs-online-fsck-design.rst for more details. Signed-off-by: Darrick J. 
Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_ag.h | 9 + fs/xfs/libxfs/xfs_ag_resv.c | 2 + fs/xfs/libxfs/xfs_alloc.c | 10 +- fs/xfs/libxfs/xfs_alloc.h | 2 +- fs/xfs/libxfs/xfs_alloc_btree.c | 13 +- fs/xfs/libxfs/xfs_types.h | 7 + fs/xfs/scrub/alloc.c | 18 +- fs/xfs/scrub/alloc_repair.c | 934 ++++++++++++++++++++++++++++++++ fs/xfs/scrub/common.h | 19 + fs/xfs/scrub/newbt.c | 48 +- fs/xfs/scrub/newbt.h | 3 + fs/xfs/scrub/repair.c | 72 +++ fs/xfs/scrub/repair.h | 24 + fs/xfs/scrub/scrub.c | 14 +- fs/xfs/scrub/scrub.h | 8 + fs/xfs/scrub/trace.h | 24 +- fs/xfs/scrub/xfarray.h | 22 + fs/xfs/xfs_extent_busy.c | 13 + fs/xfs/xfs_extent_busy.h | 2 + 20 files changed, 1224 insertions(+), 21 deletions(-) create mode 100644 fs/xfs/scrub/alloc_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index eb557dca9373..3af3cadc1ca1 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -182,6 +182,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ newbt.o \ reap.o \ repair.o \ diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 2e0aef87d633..f16cb7a174d4 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -80,6 +80,15 @@ struct xfs_perag { */ uint16_t pag_checked; uint16_t pag_sick; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Alternate btree heights so that online repair won't trip the write + * verifiers while rebuilding the AG btrees. + */ + uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; +#endif + spinlock_t pag_state_lock; spinlock_t pagb_lock; /* lock for pagb_tree */ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 7fd1fea95552..da1057bd0e60 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -411,6 +411,8 @@ xfs_ag_resv_free_extent( fallthrough; case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + fallthrough; + case XFS_AG_RESV_IGNORE: return; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 60c2c18e8e54..3bd0a33fee0a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec( /* Simple checks for free space records. 
*/ xfs_failaddr_t xfs_alloc_check_irec( - struct xfs_btree_cur *cur, - const struct xfs_alloc_rec_incore *irec) + struct xfs_perag *pag, + const struct xfs_alloc_rec_incore *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->ar_blockcount == 0) return __this_address; @@ -299,7 +297,7 @@ xfs_alloc_get_rec( return error; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); @@ -3944,7 +3942,7 @@ xfs_alloc_query_range_helper( xfs_failaddr_t fa; xfs_alloc_btrec_to_irec(rec, &irec); - fa = xfs_alloc_check_irec(cur, &irec); + fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_alloc_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 851cafbd6449..0b956f8b9d5a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -185,7 +185,7 @@ xfs_alloc_get_rec( union xfs_btree_rec; void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_alloc_rec_incore *irec); -xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag, const struct xfs_alloc_rec_incore *irec); int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags, diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index c65228efed4a..a7032bf0cd37 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -323,7 +323,18 @@ xfs_allocbt_verify( if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) btnum = XFS_BTNUM_CNTi; if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_levels[btnum]) + unsigned int maxlevel = pag->pagf_levels[btnum]; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online repair could be rewriting the free space btrees, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_levels[btnum]); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_alloc_maxlevels) return __this_address; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 533200c4ccc2..035bf703d719 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -208,6 +208,13 @@ enum xfs_ag_resv_type { XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, + + /* + * Don't increase fdblocks when freeing extent. This is a pony for + * the bnobt repair functions to re-free the free space without + * altering fdblocks. If you think you need this you're wrong. + */ + XFS_AG_RESV_IGNORE, }; /* Results of scanning a btree keyspace to check occupancy. */ diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index eb8ec47fc129..d1b8a4997dd2 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -9,13 +9,16 @@ #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_btree.h" #include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_ag.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" -#include "xfs_ag.h" +#include "scrub/repair.h" /* * Set us up to scrub free space btrees. 
@@ -24,10 +27,19 @@ int xchk_setup_ag_allocbt( struct xfs_scrub *sc) { + int error; + if (xchk_need_intent_drain(sc)) xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); - return xchk_setup_ag_btree(sc, false); + error = xchk_setup_ag_btree(sc, false); + if (error) + return error; + + if (xchk_could_repair(sc)) + return xrep_setup_ag_allocbt(sc); + + return 0; } /* Free space btree scrubber. */ @@ -127,7 +139,7 @@ xchk_allocbt_rec( struct xchk_alloc *ca = bs->private; xfs_alloc_btrec_to_irec(rec, &irec); - if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) { + if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..45edda096869 --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,934 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recreate the free extent records in an AG by looking + * for gaps in the physical extents recorded in the rmapbt. These records are + * staged in @free_records. Identifying the gaps is more difficult on a + * reflink filesystem because rmap records are allowed to overlap. + * + * Because the final step of building a new index is to free the space used by + * the old index, repair needs to find that space. Unfortunately, all + * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share + * the same rmapbt owner code (OWN_AG), so this is not straightforward. + * + * The scan of the reverse mapping information records the space used by OWN_AG + * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While + * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to + * record all visited rmap btree blocks and all blocks owned by the AGFL. + * + * After that is where the definitions of old_allocbt_blocks shifts. 
This + * expression identifies possible former bnobt/cntbt blocks: + * + * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks); + * + * Substituting from above definitions, that becomes: + * + * old_allocbt_blocks & ~not_allocbt_blocks + * + * The OWN_AG bitmap itself isn't needed after this point, so what we really do + * instead is: + * + * old_allocbt_blocks &= ~not_allocbt_blocks; + * + * After this point, @old_allocbt_blocks is a bitmap of alleged former + * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first + * parameter in place to avoid copying records around. + * + * Next, some of the space described by @free_records are diverted to the newbt + * reservation and used to format new btree blocks. The remaining records are + * written to the new btree indices. We reconstruct both bnobt and cntbt at + * the same time since we've already done all the work. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xagb_bitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xagb_bitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt. + */ + struct xrep_newbt new_bnobt; + + /* new cntbt information */ + struct xrep_newbt new_cntbt; + + /* Free space extents. */ + struct xfarray *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_records()'s position in the free space record array. */ + xfarray_idx_t array_cur; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_agbno, we have found unused space. + */ + xfs_agblock_t next_agbno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* Set up to repair AG free space btrees. */ +int +xrep_setup_ag_allocbt( + struct xfs_scrub *sc) +{ + unsigned int busy_gen; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. + */ + busy_gen = READ_ONCE(sc->sa.pag->pagb_gen); + if (xfs_extent_busy_list_empty(sc->sa.pag)) + return 0; + + return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0); +} + +/* Check for any obvious conflicts in the free extent. */ +STATIC int +xrep_abt_check_free_ext( + struct xfs_scrub *sc, + const struct xfs_alloc_rec_incore *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->ar_startblock, rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be shared or CoW staging. 
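+	 * A real free extent cannot appear in either refcount keyspace, so
+	 * probe both the SHARED and the COW domains.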
*/ + if (sc->sa.refc_cur) { + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_SHARED, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + error = xfs_refcount_has_records(sc->sa.refc_cur, + XFS_REFC_DOMAIN_COW, rec->ar_startblock, + rec->ar_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + } + + return 0; +} + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_agbno, + .ar_blockcount = end - ra->next_agbno, + }; + struct xfs_scrub *sc = ra->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xrep_abt_check_free_ext(ra->sc, &arec); + if (error) + return error; + + trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec); + + error = xfarray_append(ra->free_records, &arec); + if (error) + return error; + + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + error = xagb_bitmap_set(&ra->old_allocbt_blocks, + rec->rm_startblock, rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. */ + if (rec->rm_startblock > ra->next_agbno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project + * next_agbno as far out into the AG space as we currently know about. + */ + ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t agbno, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1); +} + +/* + * Compare two free space extents by block number. We want to sort in order of + * increasing block number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Re-sort the free extents by block number so that we can put the records into + * the bnobt in the correct order. Make sure the records do not overlap in + * physical space. 
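+ *
+ * For example (illustrative numbers only): records (start 64, len 16) and
+ * (start 72, len 8) overlap because 64 + 16 > 72, and the sorted walk below
+ * catches that when the second record starts before next_agbno (80).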
+ */
+STATIC int
+xrep_bnobt_sort_records(
+	struct xrep_abt		*ra)
+{
+	struct xfs_alloc_rec_incore arec;
+	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
+	xfs_agblock_t		next_agbno = 0;
+	int			error;
+
+	error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
+	if (error)
+		return error;
+
+	while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
+		if (arec.ar_startblock < next_agbno)
+			return -EFSCORRUPTED;
+
+		next_agbno = arec.ar_startblock + arec.ar_blockcount;
+	}
+
+	return error;
+}
+
+/*
+ * Compare two free space extents by length and then block number. We want
+ * to sort first in order of increasing length and then in order of increasing
+ * block number.
+ */
+static int
+xrep_cntbt_extent_cmp(
+	const void		*a,
+	const void		*b)
+{
+	const struct xfs_alloc_rec_incore *ap = a;
+	const struct xfs_alloc_rec_incore *bp = b;
+
+	if (ap->ar_blockcount > bp->ar_blockcount)
+		return 1;
+	else if (ap->ar_blockcount < bp->ar_blockcount)
+		return -1;
+	return xrep_bnobt_extent_cmp(a, b);
+}
+
+/*
+ * Sort the free extents by length so that we can put the records into the
+ * cntbt in the correct order. Don't let userspace kill us if we're resorting
+ * after allocating btree blocks.
+ */
+STATIC int
+xrep_cntbt_sort_records(
+	struct xrep_abt		*ra,
+	bool			is_resort)
+{
+	return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
+			is_resort ? 0 : XFARRAY_SORT_KILLABLE);
+}
+
+/*
+ * Iterate all reverse mappings to find (1) the gaps between rmap records (all
+ * unowned space), (2) the OWN_AG extents (which encompass the free space
+ * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
+ * blocks. The free space is (1) + (2) - (3) - (4).
+ */
+STATIC int
+xrep_abt_find_freespace(
+	struct xrep_abt		*ra)
+{
+	struct xfs_scrub	*sc = ra->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	struct xfs_buf		*agfl_bp;
+	xfs_agblock_t		agend;
+	int			error;
+
+	xagb_bitmap_init(&ra->not_allocbt_blocks);
+
+	xrep_ag_btcur_init(sc, &sc->sa);
+
+	/*
+	 * Iterate all the reverse mappings to find gaps in the physical
+	 * mappings, all the OWN_AG blocks, and all the rmapbt extents.
+	 */
+	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
+	if (error)
+		goto err;
+
+	/* Insert a record for space between the last rmap and EOAG. */
+	agend = be32_to_cpu(agf->agf_length);
+	if (ra->next_agbno < agend) {
+		error = xrep_abt_stash(ra, agend);
+		if (error)
+			goto err;
+	}
+
+	/* Collect all the AGFL blocks. */
+	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+	if (error)
+		goto err;
+
+	error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
+	if (error)
+		goto err_agfl;
+
+	/* Compute the old bnobt/cntbt blocks. */
+	error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
+			&ra->not_allocbt_blocks);
+	if (error)
+		goto err_agfl;
+
+	ra->nr_real_records = xfarray_length(ra->free_records);
+err_agfl:
+	xfs_trans_brelse(sc->tp, agfl_bp);
+err:
+	xchk_ag_btcur_free(&sc->sa);
+	xagb_bitmap_destroy(&ra->not_allocbt_blocks);
+	return error;
+}
+
+/*
+ * We're going to use the observed free space records to reserve blocks for the
+ * new free space btrees, so we play an iterative game where we try to converge
+ * on the number of blocks we need:
+ *
+ * 1. Estimate how many blocks we'll need to store the records.
+ * 2. If the first free record has more blocks than we need, we're done.
+ *    We will have to re-sort the records prior to building the cntbt.
+ * 3.
If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *needs_resort) +{ + struct xfs_scrub *sc = ra->sc; + xfarray_idx_t record_nr; + unsigned int allocated = 0; + int error = 0; + + record_nr = xfarray_length(ra->free_records) - 1; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int len; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->new_cntbt.bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->new_bnobt.bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->new_bnobt.bload.nr_blocks + + ra->new_cntbt.bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfarray_load(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + len = min_t(unsigned int, arec.ar_blockcount, desired); + + trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno, + arec.ar_startblock, len, XFS_RMAP_OWN_AG); + + error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag, + arec.ar_startblock, len); + if (error) + break; + allocated += len; + ra->nr_blocks -= len; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. The number of + * free records doesn't change, so shrink the free + * record, inform the caller that the records are no + * longer sorted by length, and exit. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfarray_store(ra->free_records, record_nr, + &arec); + if (error) + break; + + *needs_resort = true; + return 0; + } + + /* + * We're going to use up the entire record, so unset it and + * move on to the next one. This changes the number of free + * records (but doesn't break the sorting order), so we must + * go around the loop once more to re-run _bload_init. + */ + error = xfarray_unset(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +STATIC int +xrep_abt_dispose_one( + struct xrep_abt *ra, + struct xrep_newbt_resv *resv) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + xfs_agblock_t free_agbno = resv->agbno + resv->used; + xfs_extlen_t free_aglen = resv->len - resv->used; + int error; + + ASSERT(pag == resv->pag); + + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno, + resv->used, XFS_RMAP_OWN_AG); + + /* + * For each reserved btree block we didn't use, add it to the free + * space btree. We didn't touch fdblocks when we reserved them, so + * we don't touch it now. 
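+	 *
+	 * For example (illustrative numbers): a reservation of 8 blocks at
+	 * agbno 100 that used 5 of them yields free_agbno 105 and free_aglen
+	 * 3, and those three blocks go back to the bnobt with
+	 * XFS_AG_RESV_IGNORE so that fdblocks stays untouched.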
+ */ + if (free_aglen == 0) + return 0; + + trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno, + free_aglen, ra->new_bnobt.oinfo.oi_owner); + + error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen, + &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true); + if (error) + return error; + + return xrep_defer_finish(sc); +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + + if (error) + goto junkit; + + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + error = xrep_abt_dispose_one(ra, resv); + if (error) + goto junkit; + } + +junkit: + list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) { + xfs_perag_put(resv->pag); + list_del(&resv->list); + kfree(resv); + } + + xrep_newbt_cancel(&ra->new_bnobt); + xrep_newbt_cancel(&ra->new_cntbt); +} + +/* Retrieve free space data for bulk load. */ +STATIC int +xrep_abt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load_next(ra->free_records, &ra->array_cur, + arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_abt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf = sc->sa.agf_bp->b_addr; + unsigned int freesp_btreeblks = 0; + + /* + * Compute the contribution to agf_btreeblks for the new free space + * btrees. This is the computed btree size minus anything we didn't + * use. + */ + freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1; + freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1; + + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt); + freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt); + + /* + * The AGF header contains extra information related to the free space + * btrees, so we must update those fields here. + */ + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. 
If the new
+	 * tree is shorter than the old one, the allocbt write verifier will
+	 * fail and the AIL will shut down the filesystem.
+	 *
+	 * To avoid this, save the old incore btree height values as the alt
+	 * height values before re-initializing the perag info from the updated
+	 * AGF to capture all the new values.
+	 */
+	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
+	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
+
+	/* Reinitialize with the values we just logged. */
+	return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Use the collected free space information to stage new free space btrees.
+ * If this is successful we'll return with the new btree root
+ * information logged to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_abt_build_new_trees(
+	struct xrep_abt		*ra)
+{
+	struct xfs_scrub	*sc = ra->sc;
+	struct xfs_btree_cur	*bno_cur;
+	struct xfs_btree_cur	*cnt_cur;
+	struct xfs_perag	*pag = sc->sa.pag;
+	bool			needs_resort = false;
+	int			error;
+
+	/*
+	 * Sort the free extents by length so that we can set up the free space
+	 * btrees in as few extents as possible. This reduces the amount of
+	 * deferred rmap / free work we have to do at the end.
+	 */
+	error = xrep_cntbt_sort_records(ra, false);
+	if (error)
+		return error;
+
+	/*
+	 * Prepare to construct the new btree by reserving disk space for the
+	 * new btree and setting up all the accounting information we'll need
+	 * to root the new btree while it's under construction and before we
+	 * attach it to the AG header.
+	 */
+	xrep_newbt_init_bare(&ra->new_bnobt, sc);
+	xrep_newbt_init_bare(&ra->new_cntbt, sc);
+
+	ra->new_bnobt.bload.get_records = xrep_abt_get_records;
+	ra->new_cntbt.bload.get_records = xrep_abt_get_records;
+
+	ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
+	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
+
+	/* Allocate cursors for the staged btrees. */
+	bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
+			pag, XFS_BTNUM_BNO);
+	cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
+			pag, XFS_BTNUM_CNT);
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(sc, &error))
+		goto err_cur;
+
+	/* Reserve the space we'll need for the new btrees. */
+	error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
+	if (error)
+		goto err_cur;
+
+	/*
+	 * If we need to re-sort the free extents by length, do so so that we
+	 * can put the records into the cntbt in the correct order.
+	 */
+	if (needs_resort) {
+		error = xrep_cntbt_sort_records(ra, needs_resort);
+		if (error)
+			goto err_cur;
+	}
+
+	/*
+	 * Due to btree slack factors, it's possible for a new btree to be one
+	 * level taller than the old btree. Update the alternate incore btree
+	 * height so that we don't trip the verifiers when writing the new
+	 * btree blocks to disk.
+	 */
+	pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
+			ra->new_bnobt.bload.btree_height;
+	pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
+			ra->new_cntbt.bload.btree_height;
+
+	/* Load the free space by length tree. */
+	ra->array_cur = XFARRAY_CURSOR_INIT;
+	ra->longest = 0;
+	error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
+	if (error)
+		goto err_levels;
+
+	error = xrep_bnobt_sort_records(ra);
+	if (error)
+		goto err_levels;
+
+	/* Load the free space by block number tree.
*/ + ra->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra); + if (error) + goto err_levels; + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +err_levels: + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; +err_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +err_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + struct xfs_perag *pag = ra->sc->sa.pag; + int error; + + /* Free the old btree blocks if they're not in use. */ + error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); + if (error) + return error; + + /* + * Now that we've zapped all the old allocbt blocks we can turn off + * the alternate height mechanism. + */ + pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0; + pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0; + return 0; +} + +/* Repair the freespace btrees for some AG. */ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + /* + * Make sure the busy extent list is clear because we can't put extents + * on there twice. In theory we cleared this before we started, but + * let's not risk the filesystem. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) { + error = -EDEADLOCK; + goto out_ra; + } + + /* Set up enough storage to handle maximally fragmented free space. */ + descr = xchk_xfile_ag_descr(sc, "free space records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2, + sizeof(struct xfs_alloc_rec_incore), + &ra->free_records); + kfree(descr); + if (error) + goto out_ra; + + /* Collect the free space data and find the old btree blocks. */ + xagb_bitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ra->old_allocbt_blocks); + xfarray_destroy(ra->free_records); +out_ra: + kfree(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. 
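+ * This reruns the regular bnobt and cntbt scrubbers against the freshly
+ * built trees, flipping sm_type temporarily so that each tree is checked
+ * in turn.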
*/ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_allocbt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_allocbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index c83cf9e5b55f..c31be570e7d8 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -200,8 +200,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT | XFS_SCRUB_OFLAG_PREEN); } + +/* + * "Should we prepare for a repair?" + * + * Return true if the caller permits us to repair metadata and we're not + * setting up for a post-repair evaluation. + */ +static inline bool xchk_could_repair(const struct xfs_scrub *sc) +{ + return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED); +} #else # define xchk_needs_repair(sc) (false) +# define xchk_could_repair(sc) (false) #endif /* CONFIG_XFS_ONLINE_REPAIR */ int xchk_metadata_inode_forks(struct xfs_scrub *sc); @@ -213,6 +226,12 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); #define xchk_xfile_descr(sc, fmt, ...) \ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \ (sc)->mp->m_super->s_id, ##__VA_ARGS__) +#define xchk_xfile_ag_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ + ##__VA_ARGS__) + /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index 81919eeabcdb..bb6d980b4fcd 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -157,11 +157,13 @@ xrep_newbt_add_blocks( resv->used = 0; resv->pag = xfs_perag_hold(pag); - ASSERT(xnr->oinfo.oi_offset == 0); + if (args->tp) { + ASSERT(xnr->oinfo.oi_offset == 0); - error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); - if (error) - goto out_pag; + error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap); + if (error) + goto out_pag; + } list_add_tail(&resv->list, &xnr->resv_list); return 0; @@ -171,6 +173,30 @@ xrep_newbt_add_blocks( return error; } +/* + * Add an extent to the new btree reservation pool. Callers are required to + * reap this reservation manually if the repair is cancelled. @pag must be a + * passive reference. + */ +int +xrep_newbt_add_extent( + struct xrep_newbt *xnr, + struct xfs_perag *pag, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = xnr->sc->mp; + struct xfs_alloc_arg args = { + .tp = NULL, /* no autoreap */ + .oinfo = xnr->oinfo, + .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno), + .len = len, + .resv = xnr->resv, + }; + + return xrep_newbt_add_blocks(xnr, pag, &args); +} + /* Don't let our allocation hint take us beyond this AG */ static inline void xrep_newbt_validate_ag_alloc_hint( @@ -372,6 +398,7 @@ xrep_newbt_free_extent( free_aglen, xnr->oinfo.oi_owner); ASSERT(xnr->resv != XFS_AG_RESV_AGFL); + ASSERT(xnr->resv != XFS_AG_RESV_IGNORE); /* * Use EFIs to free the reservations. This reduces the chance @@ -517,3 +544,16 @@ xrep_newbt_claim_block( /* Relog all the EFIs. 
*/
 	return xrep_defer_finish(xnr->sc);
 }
+
+/* How many reserved blocks are unused? */
+unsigned int
+xrep_newbt_unused_blocks(
+	struct xrep_newbt	*xnr)
+{
+	struct xrep_newbt_resv	*resv;
+	unsigned int		unused = 0;
+
+	list_for_each_entry(resv, &xnr->resv_list, list)
+		unused += resv->len - resv->used;
+	return unused;
+}
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index d2baffa17b1a..89f8e3970b1f 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -57,9 +57,12 @@ void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
 int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
 		int whichfork, const struct xfs_owner_info *oinfo);
 int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
+int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
+		xfs_agblock_t agbno, xfs_extlen_t len);
 void xrep_newbt_cancel(struct xrep_newbt *xnr);
 int xrep_newbt_commit(struct xrep_newbt *xnr);
 int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
 		union xfs_btree_ptr *ptr);
+unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr);
 
 #endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1b8b5439f2d7..01b7e8d1a58b 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -734,3 +734,75 @@ xrep_ino_dqattach(
 
 	return error;
 }
+
+/*
+ * Initialize all the btree cursors for an AG repair except for the btree that
+ * we're rebuilding.
+ */
+void
+xrep_ag_btcur_init(
+	struct xfs_scrub	*sc,
+	struct xchk_ag		*sa)
+{
+	struct xfs_mount	*mp = sc->mp;
+
+	/* Set up a bnobt cursor for cross-referencing. */
+	if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
+	    sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
+		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+				sc->sa.pag, XFS_BTNUM_BNO);
+		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+				sc->sa.pag, XFS_BTNUM_CNT);
+	}
+
+	/* Set up an inobt cursor for cross-referencing. */
+	if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
+	    sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
+		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
+				sa->agi_bp, XFS_BTNUM_INO);
+		if (xfs_has_finobt(mp))
+			sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
+					sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
+	}
+
+	/* Set up a rmapbt cursor for cross-referencing. */
+	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
+	    xfs_has_rmapbt(mp))
+		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+				sc->sa.pag);
+
+	/* Set up a refcountbt cursor for cross-referencing. */
+	if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
+	    xfs_has_reflink(mp))
+		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+				sa->agf_bp, sc->sa.pag);
+}
+
+/*
+ * Reinitialize the in-core AG state after a repair by rereading the AGF
+ * buffer. We had better get the same AGF buffer as the one that's attached
+ * to the scrub context. 
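+ *
+ * Clearing XFS_AGSTATE_AGF_INIT below forces xfs_alloc_read_agf() to
+ * reinitialize the incore per-AG fields from the AGF contents that the
+ * repair just logged.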
+ */ +int +xrep_reinit_pagf( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agf(pag)); + + clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); + error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp); + if (error) + return error; + + if (bp != sc->sa.agf_bp) { + ASSERT(bp == sc->sa.agf_bp); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 60d2a9ae5f2e..bc3353ecae8a 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -60,6 +60,15 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +/* Repair setup functions */ +int xrep_setup_ag_allocbt(struct xfs_scrub *sc); + +void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); + +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); + /* Metadata repairers */ int xrep_probe(struct xfs_scrub *sc); @@ -67,6 +76,9 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); + +int xrep_reinit_pagf(struct xfs_scrub *sc); #else @@ -87,11 +99,23 @@ xrep_calc_ag_resblks( return 0; } +/* repair setup functions for no-repair */ +static inline int +xrep_setup_nothing( + struct xfs_scrub *sc) +{ + return 0; +} +#define xrep_setup_ag_allocbt xrep_setup_nothing + +#define xrep_revalidate_allocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 31fabae588be..ebc3b68a8ffb 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -239,13 +239,15 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, .scrub = xchk_allocbt, - .repair = xrep_notsupported, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, .scrub = xchk_allocbt, - .repair = xrep_notsupported, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, @@ -531,7 +533,10 @@ xfs_scrub_metadata( /* Scrub for errors. */ check_start = xchk_stats_now(); - error = sc->ops->scrub(sc); + if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL) + error = sc->ops->repair_eval(sc); + else + error = sc->ops->scrub(sc); run.scrub_ns += xchk_stats_elapsed_ns(check_start); if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER)) goto try_harder; @@ -542,8 +547,7 @@ xfs_scrub_metadata( xchk_update_health(sc); - if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc->flags & XREP_ALREADY_FIXED)) { + if (xchk_could_repair(sc)) { bool needs_fix = xchk_needs_repair(sc->sm); /* Userspace asked us to rebuild the structure regardless. */ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index a6a1bea4d62b..5f934a2a4cb9 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -35,6 +35,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. 
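 	 *
 	 * Returns 0 or a negative errno.  Structures without repair
 	 * support wire this up to xrep_notsupported(), which fails with
 	 * -EOPNOTSUPP.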
*/ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_mount *); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index aa7683075319..ea518712efa8 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1172,11 +1172,33 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +TRACE_EVENT(xrep_abt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_alloc_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = rec->ar_startblock; + __entry->blockcount = rec->ar_blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount) +) + TRACE_EVENT(xrep_refcount_extent_fn, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_refcount_irec *irec), diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h index 4ecac01363d9..62b9c506fdd1 100644 --- a/fs/xfs/scrub/xfarray.h +++ b/fs/xfs/scrub/xfarray.h @@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr) uint64_t xfarray_length(struct xfarray *array); int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec); +/* + * Iterate the non-null elements in a sparse xfarray. Callers should + * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it + * will be set to one more than the index of the record that was retrieved. + * Returns 1 if a record was retrieved, 0 if there weren't any more records, or + * a negative errno. + */ +static inline int +xfarray_iter( + struct xfarray *array, + xfarray_idx_t *idx, + void *rec) +{ + int ret = xfarray_load_next(array, idx, rec); + + if (ret == -ENODATA) + return 0; + if (ret == 0) + return 1; + return ret; +} + /* Declarations for xfile array sort functionality. */ typedef cmp_func_t xfarray_cmp_fn; diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 9ecfdcdc752f..2ccde32c9a9e 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp( diff = b1->bno - b2->bno; return diff; } + +/* Are there any busy extents in this AG? 
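+ *
+ * pagb_lock is dropped before returning, so the answer is only a
+ * snapshot; callers such as xrep_allocbt() treat a non-empty list as
+ * -EDEADLOCK and try the repair again later.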
*/ +bool +xfs_extent_busy_list_empty( + struct xfs_perag *pag) +{ + bool res; + + spin_lock(&pag->pagb_lock); + res = RB_EMPTY_ROOT(&pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + return res; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 0639aab336f3..470032de3139 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); + #endif /* __XFS_EXTENT_BUSY_H__ */ From dbfbf3bdf639a20da7d5fb390cd2e197d25aa418 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:32 -0800 Subject: [PATCH 063/123] xfs: repair inode btrees Use the rmapbt to find inode chunks, query the chunks to compute hole and free masks, and with that information rebuild the inobt and finobt. Refer to the case study in Documentation/filesystems/xfs-online-fsck-design.rst for more details. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_ialloc.c | 31 +- fs/xfs/libxfs/xfs_ialloc.h | 3 +- fs/xfs/scrub/common.c | 1 + fs/xfs/scrub/ialloc.c | 2 +- fs/xfs/scrub/ialloc_repair.c | 884 +++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.c | 59 +++ fs/xfs/scrub/repair.h | 17 + fs/xfs/scrub/scrub.c | 6 +- fs/xfs/scrub/scrub.h | 1 + fs/xfs/scrub/trace.h | 68 +-- 11 files changed, 1022 insertions(+), 51 deletions(-) create mode 100644 fs/xfs/scrub/ialloc_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 3af3cadc1ca1..8758abdcbb20 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -183,6 +183,7 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ + ialloc_repair.o \ newbt.o \ reap.o \ repair.o \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index d61d03e5b853..2361a22035b0 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec( irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } +/* Compute the freecount of an incore inode record. */ +uint8_t +xfs_inobt_rec_freecount( + const struct xfs_inobt_rec_incore *irec) +{ + uint64_t realfree = irec->ir_free; + + if (xfs_inobt_issparse(irec->ir_holemask)) + realfree &= xfs_inobt_irec_to_allocmask(irec); + return hweight64(realfree); +} + /* Simple checks for inode records. */ xfs_failaddr_t xfs_inobt_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec) { - uint64_t realfree; - /* Record has to be properly aligned within the AG. 
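 	 * Both the first and the last inode number of the 64-inode chunk
 	 * must pass xfs_verify_agino(), which is what the two checks below
 	 * enforce.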
*/ - if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino)) + if (!xfs_verify_agino(pag, irec->ir_startino)) return __this_address; - if (!xfs_verify_agino(cur->bc_ag.pag, + if (!xfs_verify_agino(pag, irec->ir_startino + XFS_INODES_PER_CHUNK - 1)) return __this_address; if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT || @@ -115,12 +125,7 @@ xfs_inobt_check_irec( if (irec->ir_freecount > XFS_INODES_PER_CHUNK) return __this_address; - /* if there are no holes, return the first available offset */ - if (!xfs_inobt_issparse(irec->ir_holemask)) - realfree = irec->ir_free; - else - realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec); - if (hweight64(realfree) != irec->ir_freecount) + if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount) return __this_address; return NULL; @@ -164,7 +169,7 @@ xfs_inobt_get_rec( return error; xfs_inobt_btrec_to_irec(mp, rec, irec); - fa = xfs_inobt_check_irec(cur, irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, irec); @@ -2740,7 +2745,7 @@ xfs_ialloc_count_inodes_rec( xfs_failaddr_t fa; xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec); - fa = xfs_inobt_check_irec(cur, &irec); + fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec); if (fa) return xfs_inobt_complain_bad_rec(cur, fa, &irec); diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index fe824bb04a09..f1412183bb44 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, */ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_inobt_rec_incore_t *rec, int *stat); +uint8_t xfs_inobt_rec_freecount(const struct xfs_inobt_rec_incore *irec); /* * Inode chunk initialisation routine @@ -93,7 +94,7 @@ union xfs_btree_rec; void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); -xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag, const struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 23944fcc1a6c..e0d6d8c9f640 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -604,6 +604,7 @@ xchk_ag_free( struct xchk_ag *sa) { xchk_ag_btcur_free(sa); + xrep_reset_perag_resv(sc); if (sa->agf_bp) { xfs_trans_brelse(sc->tp, sa->agf_bp); sa->agf_bp = NULL; diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 83d9a29ce91e..a720fc62262a 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -585,7 +585,7 @@ xchk_iallocbt_rec( uint16_t holemask; xfs_inobt_btrec_to_irec(mp, rec, &irec); - if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) { + if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c new file mode 100644 index 000000000000..b3f7182dd2f5 --- /dev/null +++ b/fs/xfs/scrub/ialloc_repair.c @@ -0,0 +1,884 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_health.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Btree Repair + * ================== + * + * A quick refresher of inode btrees on a v5 filesystem: + * + * - Inode records are read into memory in units of 'inode clusters'. However + * many inodes fit in a cluster buffer is the smallest number of inodes that + * can be allocated or freed. Clusters are never smaller than one fs block + * though they can span multiple blocks. The size (in fs blocks) is + * computed with xfs_icluster_size_fsb(). The fs block alignment of a + * cluster is computed with xfs_ialloc_cluster_alignment(). + * + * - Each inode btree record can describe a single 'inode chunk'. The chunk + * size is defined to be 64 inodes. If sparse inodes are enabled, every + * inobt record must be aligned to the chunk size; if not, every record must + * be aligned to the start of a cluster. It is possible to construct an XFS + * geometry where one inobt record maps to multiple inode clusters; it is + * also possible to construct a geometry where multiple inobt records map to + * different parts of one inode cluster. + * + * - If sparse inodes are not enabled, the smallest unit of allocation for + * inode records is enough to contain one inode chunk's worth of inodes. + * + * - If sparse inodes are enabled, the holemask field will be active. Each + * bit of the holemask represents 4 potential inodes; if set, the + * corresponding space does *not* contain inodes and must be left alone. + * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation + * of inode records is one inode cluster. + * + * So what's the rebuild algorithm? + * + * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT + * records. The OWN_INOBT records are the old inode btree blocks and will be + * cleared out after we've rebuilt the tree. Each possible inode cluster + * within an OWN_INODES record will be read in; for each possible inobt record + * associated with that cluster, compute the freemask calculated from the + * i_mode data in the inode chunk. For sparse inodes the holemask will be + * calculated by creating the properly aligned inobt record and punching out + * any chunk that's missing. Inode allocations and frees grab the AGI first, + * so repair protects itself from concurrent access by locking the AGI. + * + * Once we've reconstructed all the inode records, we can create new inode + * btree roots and reload the btrees. 
We rebuild both inode trees at the same
+ * time because they have the same rmap owner and it would be more complex to
+ * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
+ * blocks it owns. We have all the data we need to build both, so dump
+ * everything and start over.
+ *
+ * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
+ */
+
+struct xrep_ibt {
+	/* Record under construction. */
+	struct xfs_inobt_rec_incore	rie;
+
+	/* new inobt information */
+	struct xrep_newbt	new_inobt;
+
+	/* new finobt information */
+	struct xrep_newbt	new_finobt;
+
+	/* Old inode btree blocks we found in the rmap. */
+	struct xagb_bitmap	old_iallocbt_blocks;
+
+	/* Reconstructed inode records. */
+	struct xfarray	*inode_records;
+
+	struct xfs_scrub	*sc;
+
+	/* Number of inodes assigned disk space. */
+	unsigned int	icount;
+
+	/* Number of inodes in use. */
+	unsigned int	iused;
+
+	/* Number of finobt records needed. */
+	unsigned int	finobt_recs;
+
+	/* get_records()'s position in the inode record array. */
+	xfarray_idx_t	array_cur;
+};
+
+/*
+ * Is this inode in use? If the inode is in memory we can tell from i_mode,
+ * otherwise we have to check di_mode in the on-disk buffer. We only care
+ * that the high (i.e. non-permission) bits of _mode are zero. This should be
+ * safe because repair keeps all AG headers locked until the end, and any
+ * process trying to perform an inode allocation/free must lock the AGI.
+ *
+ * @cluster_ag_base is the inode offset of the cluster within the AG.
+ * @cluster_bp is the cluster buffer.
+ * @cluster_index is the inode offset within the inode cluster.
+ */
+STATIC int
+xrep_ibt_check_ifree(
+	struct xrep_ibt		*ri,
+	xfs_agino_t		cluster_ag_base,
+	struct xfs_buf		*cluster_bp,
+	unsigned int		cluster_index,
+	bool			*inuse)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_dinode	*dip;
+	xfs_ino_t		fsino;
+	xfs_agino_t		agino;
+	xfs_agnumber_t		agno = ri->sc->sa.pag->pag_agno;
+	unsigned int		cluster_buf_base;
+	unsigned int		offset;
+	int			error;
+
+	agino = cluster_ag_base + cluster_index;
+	fsino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+	/* Inode uncached or half assembled, read disk buffer */
+	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
+	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
+	if (offset >= BBTOB(cluster_bp->b_length))
+		return -EFSCORRUPTED;
+	dip = xfs_buf_offset(cluster_bp, offset);
+	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
+		return -EFSCORRUPTED;
+
+	if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
+		return -EFSCORRUPTED;
+
+	/* Will the in-core inode tell us if it's in use? */
+	error = xchk_inode_is_allocated(sc, agino, inuse);
+	if (!error)
+		return 0;
+
+	*inuse = dip->di_mode != 0;
+	return 0;
+}
+
+/* Stash the accumulated inobt record for rebuilding. 
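+ *
+ * The accumulated record is validated with xfs_inobt_check_irec()
+ * before being appended to the xfarray; records that still contain
+ * free inodes are also counted so we know how many records the new
+ * finobt will need.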
*/ +STATIC int +xrep_ibt_stash( + struct xrep_ibt *ri) +{ + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie); + if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL) + return -EFSCORRUPTED; + + if (ri->rie.ir_freecount > 0) + ri->finobt_recs++; + + trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie); + + error = xfarray_append(ri->inode_records, &ri->rie); + if (error) + return error; + + ri->rie.ir_startino = NULLAGINO; + return 0; +} + +/* + * Given an extent of inodes and an inode cluster buffer, calculate the + * location of the corresponding inobt record (creating it if necessary), + * then update the parts of the holemask and freemask of that record that + * correspond to the inode extent we were given. + * + * @cluster_ir_startino is the AG inode number of an inobt record that we're + * proposing to create for this inode cluster. If sparse inodes are enabled, + * we must round down to a chunk boundary to find the actual sparse record. + * @cluster_bp is the buffer of the inode cluster. + * @nr_inodes is the number of inodes to check from the cluster. + */ +STATIC int +xrep_ibt_cluster_record( + struct xrep_ibt *ri, + xfs_agino_t cluster_ir_startino, + struct xfs_buf *cluster_bp, + unsigned int nr_inodes) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + xfs_agino_t ir_startino; + unsigned int cluster_base; + unsigned int cluster_index; + int error = 0; + + ir_startino = cluster_ir_startino; + if (xfs_has_sparseinodes(mp)) + ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK); + cluster_base = cluster_ir_startino - ir_startino; + + /* + * If the accumulated inobt record doesn't map this cluster, add it to + * the list and reset it. + */ + if (ri->rie.ir_startino != NULLAGINO && + ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) { + error = xrep_ibt_stash(ri); + if (error) + return error; + } + + if (ri->rie.ir_startino == NULLAGINO) { + ri->rie.ir_startino = ir_startino; + ri->rie.ir_free = XFS_INOBT_ALL_FREE; + ri->rie.ir_holemask = 0xFFFF; + ri->rie.ir_count = 0; + } + + /* Record the whole cluster. */ + ri->icount += nr_inodes; + ri->rie.ir_count += nr_inodes; + ri->rie.ir_holemask &= ~xfs_inobt_maskn( + cluster_base / XFS_INODES_PER_HOLEMASK_BIT, + nr_inodes / XFS_INODES_PER_HOLEMASK_BIT); + + /* Which inodes within this cluster are free? */ + for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) { + bool inuse = false; + + error = xrep_ibt_check_ifree(ri, cluster_ir_startino, + cluster_bp, cluster_index, &inuse); + if (error) + return error; + if (!inuse) + continue; + ri->iused++; + ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base + + cluster_index); + } + return 0; +} + +/* + * For each inode cluster covering the physical extent recorded by the rmapbt, + * we must calculate the properly aligned startino of that cluster, then + * iterate each cluster to fill in used and filled masks appropriately. We + * then use the (startino, used, filled) information to construct the + * appropriate inode records. 
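+ *
+ * A worked example with illustrative numbers: with 64 inodes per chunk
+ * and a 16-bit holemask, each holemask bit covers 4 inodes.  A cluster
+ * holding 32 inodes at cluster_base 0 therefore clears holemask bits
+ * 0-7 and adds 32 to ir_count; each inode found to be in use then
+ * clears its bit in ir_free.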
+ */ +STATIC int +xrep_ibt_process_cluster( + struct xrep_ibt *ri, + xfs_agblock_t cluster_bno) +{ + struct xfs_imap imap; + struct xfs_buf *cluster_bp; + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t cluster_ag_base; + xfs_agino_t irec_index; + unsigned int nr_inodes; + int error; + + nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster, + XFS_INODES_PER_CHUNK); + + /* + * Grab the inode cluster buffer. This is safe to do with a broken + * inobt because imap_to_bp directly maps the buffer without touching + * either inode btree. + */ + imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno); + imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster); + imap.im_boffset = 0; + error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp); + if (error) + return error; + + /* + * Record the contents of each possible inobt record mapping this + * cluster. + */ + cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno); + for (irec_index = 0; + irec_index < igeo->inodes_per_cluster; + irec_index += XFS_INODES_PER_CHUNK) { + error = xrep_ibt_cluster_record(ri, + cluster_ag_base + irec_index, cluster_bp, + nr_inodes); + if (error) + break; + + } + + xfs_trans_brelse(sc->tp, cluster_bp); + return error; +} + +/* Check for any obvious conflicts in the inode chunk extent. */ +STATIC int +xrep_ibt_check_inode_ext( + struct xfs_scrub *sc, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agino_t agino; + enum xbtree_recpacking outcome; + int error; + + /* Inode records must be within the AG. */ + if (!xfs_verify_agbext(sc->sa.pag, agbno, len)) + return -EFSCORRUPTED; + + /* The entire record must align to the inode cluster size. */ + if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) || + !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster)) + return -EFSCORRUPTED; + + /* + * The entire record must also adhere to the inode cluster alignment + * size if sparse inodes are not enabled. + */ + if (!xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, igeo->cluster_align) || + !IS_ALIGNED(agbno + len, igeo->cluster_align))) + return -EFSCORRUPTED; + + /* + * On a sparse inode fs, this cluster could be part of a sparse chunk. + * Sparse clusters must be aligned to sparse chunk alignment. + */ + if (xfs_has_sparseinodes(mp) && + (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) || + !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align))) + return -EFSCORRUPTED; + + /* Make sure the entire range of blocks are valid AG inodes. */ + agino = XFS_AGB_TO_AGINO(mp, agbno); + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1; + if (!xfs_verify_agino(sc->sa.pag, agino)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Found a fragment of the old inode btrees; dispose of them later. */ +STATIC int +xrep_ibt_record_old_btree_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock, + rec->rm_blockcount); +} + +/* Record extents that belong to inode cluster blocks. 
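+ *
+ * A single rmap record may cover several inode clusters, so after the
+ * alignment checks we walk the extent one cluster
+ * (igeo->blocks_per_cluster) at a time.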
*/ +STATIC int +xrep_ibt_record_inode_blocks( + struct xrep_ibt *ri, + const struct xfs_rmap_irec *rec) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t cluster_base; + int error; + + error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock, + rec->rm_blockcount); + if (error) + return error; + + trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno, + rec->rm_startblock, rec->rm_blockcount, rec->rm_owner, + rec->rm_offset, rec->rm_flags); + + /* + * Record the free/hole masks for each inode cluster that could be + * mapped by this rmap record. + */ + for (cluster_base = 0; + cluster_base < rec->rm_blockcount; + cluster_base += igeo->blocks_per_cluster) { + error = xrep_ibt_process_cluster(ri, + rec->rm_startblock + cluster_base); + if (error) + return error; + } + + return 0; +} + +STATIC int +xrep_ibt_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_ibt *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + switch (rec->rm_owner) { + case XFS_RMAP_OWN_INOBT: + return xrep_ibt_record_old_btree_blocks(ri, rec); + case XFS_RMAP_OWN_INODES: + return xrep_ibt_record_inode_blocks(ri, rec); + } + return 0; +} + +/* + * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode + * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct + * the inode btrees. The caller must clean up the lists if anything goes + * wrong. + */ +STATIC int +xrep_ibt_find_inodes( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + ri->rie.ir_startino = NULLAGINO; + + /* Collect all reverse mappings for inode blocks. */ + xrep_ag_btcur_init(sc, &sc->sa); + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri); + xchk_ag_btcur_free(&sc->sa); + if (error) + return error; + + /* If we have a record ready to go, add it to the array. */ + if (ri->rie.ir_startino != NULLAGINO) + return xrep_ibt_stash(ri); + + return 0; +} + +/* Update the AGI counters. */ +STATIC int +xrep_ibt_reset_counters( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_agi *agi = sc->sa.agi_bp->b_addr; + unsigned int freecount = ri->icount - ri->iused; + + /* Trigger inode count recalculation */ + xfs_force_summary_recalc(sc->mp); + + /* + * The AGI header contains extra information related to the inode + * btrees, so we must update those fields here. + */ + agi->agi_count = cpu_to_be32(ri->icount); + agi->agi_freecount = cpu_to_be32(freecount); + xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, + XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagi(sc); +} + +/* Retrieve finobt data for bulk load. */ +STATIC int +xrep_fibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(ri->inode_records, + ri->array_cur++, irec); + } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Retrieve inobt data for bulk load. 
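+ *
+ * Unlike xrep_fibt_get_records() above, this variant feeds every
+ * collected record to the bulk loader; the finobt variant skips
+ * records whose freecount is zero.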
*/ +STATIC int +xrep_ibt_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i; + struct xrep_ibt *ri = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(ri->inode_records, ri->array_cur++, irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new inobt blocks to the bulk loader. */ +STATIC int +xrep_ibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr); +} + +/* Feed one of the new finobt blocks to the bulk loader. */ +STATIC int +xrep_fibt_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_ibt *ri = priv; + + return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr); +} + +/* Make sure the records do not overlap in inumber address space. */ +STATIC int +xrep_ibt_check_overlap( + struct xrep_ibt *ri) +{ + struct xfs_inobt_rec_incore irec; + xfarray_idx_t cur; + xfs_agino_t next_agino = 0; + int error = 0; + + foreach_xfarray_idx(ri->inode_records, cur) { + if (xchk_should_terminate(ri->sc, &error)) + return error; + + error = xfarray_load(ri->inode_records, cur, &irec); + if (error) + return error; + + if (irec.ir_startino < next_agino) + return -EFSCORRUPTED; + + next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK; + } + + return error; +} + +/* Build new inode btrees and dispose of the old one. */ +STATIC int +xrep_ibt_build_new_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_btree_cur *ino_cur; + struct xfs_btree_cur *fino_cur = NULL; + xfs_fsblock_t fsbno; + bool need_finobt; + int error; + + need_finobt = xfs_has_finobt(sc->mp); + + /* + * Create new btrees for staging all the inobt records we collected + * earlier. The records were collected in order of increasing agino, + * so we do not have to sort them. Ensure there are no overlapping + * records. + */ + error = xrep_ibt_check_overlap(ri); + if (error) + return error; + + /* + * The new inode btrees will not be rooted in the AGI until we've + * successfully rebuilt the tree. + * + * Start by setting up the inobt staging cursor. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_IBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno, + XFS_AG_RESV_NONE); + ri->new_inobt.bload.claim_block = xrep_ibt_claim_block; + ri->new_inobt.bload.get_records = xrep_ibt_get_records; + + ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake, + XFS_BTNUM_INO); + error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload, + xfarray_length(ri->inode_records)); + if (error) + goto err_inocur; + + /* Set up finobt staging cursor. 
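+	 * The staged finobt blocks normally come out of the per-AG
+	 * metadata reservation; fall back to XFS_AG_RESV_NONE if that
+	 * reservation could not be established at mount time
+	 * (m_finobt_nores).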
*/ + if (need_finobt) { + enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA; + + if (sc->mp->m_finobt_nores) + resv = XFS_AG_RESV_NONE; + + fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, + XFS_FIBT_BLOCK(sc->mp)), + xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT, + fsbno, resv); + ri->new_finobt.bload.claim_block = xrep_fibt_claim_block; + ri->new_finobt.bload.get_records = xrep_fibt_get_records; + + fino_cur = xfs_inobt_stage_cursor(sc->sa.pag, + &ri->new_finobt.afake, XFS_BTNUM_FINO); + error = xfs_btree_bload_compute_geometry(fino_cur, + &ri->new_finobt.bload, ri->finobt_recs); + if (error) + goto err_finocur; + } + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_finocur; + + /* Reserve all the space we need to build the new btrees. */ + error = xrep_newbt_alloc_blocks(&ri->new_inobt, + ri->new_inobt.bload.nr_blocks); + if (error) + goto err_finocur; + + if (need_finobt) { + error = xrep_newbt_alloc_blocks(&ri->new_finobt, + ri->new_finobt.bload.nr_blocks); + if (error) + goto err_finocur; + } + + /* Add all inobt records. */ + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri); + if (error) + goto err_finocur; + + /* Add all finobt records. */ + if (need_finobt) { + ri->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri); + if (error) + goto err_finocur; + } + + /* + * Install the new btrees in the AG header. After this point the old + * btrees are no longer accessible and the new trees are live. + */ + xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(ino_cur, 0); + + if (fino_cur) { + xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp); + xfs_btree_del_cursor(fino_cur, 0); + } + + /* Reset the AGI counters now that we've changed the inode roots. */ + error = xrep_ibt_reset_counters(ri); + if (error) + goto err_finobt; + + /* Free unused blocks and bitmap. */ + if (need_finobt) { + error = xrep_newbt_commit(&ri->new_finobt); + if (error) + goto err_inobt; + } + error = xrep_newbt_commit(&ri->new_inobt); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_finocur: + if (need_finobt) + xfs_btree_del_cursor(fino_cur, error); +err_inocur: + xfs_btree_del_cursor(ino_cur, error); +err_finobt: + if (need_finobt) + xrep_newbt_cancel(&ri->new_finobt); +err_inobt: + xrep_newbt_cancel(&ri->new_inobt); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_ibt_remove_old_trees( + struct xrep_ibt *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + /* + * Free the old inode btree blocks if they're not in use. It's ok to + * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG + * reservation because we reset the reservation before releasing the + * AGI and AGF header buffer locks. + */ + error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks, + &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE); + if (error) + return error; + + /* + * If the finobt is enabled and has a per-AG reservation, make sure we + * reinitialize the per-AG reservations. + */ + if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores) + sc->flags |= XREP_RESET_PERAG_RESV; + + return 0; +} + +/* Repair both inode btrees. 
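+ *
+ * The record array below is sized for the worst case of an AG that
+ * contains nothing but inodes: at most one inobt record per 64 inodes,
+ * i.e. last_agino / XFS_INODES_PER_CHUNK entries.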
*/ +int +xrep_iallocbt( + struct xfs_scrub *sc) +{ + struct xrep_ibt *ri; + struct xfs_mount *mp = sc->mp; + char *descr; + xfs_agino_t first_agino, last_agino; + int error = 0; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS); + if (!ri) + return -ENOMEM; + ri->sc = sc; + + /* We rebuild both inode btrees. */ + sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT; + + /* Set up enough storage to handle an AG with nothing but inodes. */ + xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino); + last_agino /= XFS_INODES_PER_CHUNK; + descr = xchk_xfile_ag_descr(sc, "inode index records"); + error = xfarray_create(descr, last_agino, + sizeof(struct xfs_inobt_rec_incore), + &ri->inode_records); + kfree(descr); + if (error) + goto out_ri; + + /* Collect the inode data and find the old btree blocks. */ + xagb_bitmap_init(&ri->old_iallocbt_blocks); + error = xrep_ibt_find_inodes(ri); + if (error) + goto out_bitmap; + + /* Rebuild the inode indexes. */ + error = xrep_ibt_build_new_trees(ri); + if (error) + goto out_bitmap; + + /* Kill the old tree. */ + error = xrep_ibt_remove_old_trees(ri); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&ri->old_iallocbt_blocks); + xfarray_destroy(ri->inode_records); +out_ri: + kfree(ri); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_iallocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT; + error = xchk_iallocbt(sc); + if (error) + goto out; + + if (xfs_has_finobt(sc->mp)) { + sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT; + error = xchk_iallocbt(sc); + } + +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 01b7e8d1a58b..a604f0cea8c1 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -806,3 +806,62 @@ xrep_reinit_pagf( return 0; } + +/* + * Reinitialize the in-core AG state after a repair by rereading the AGI + * buffer. We had better get the same AGI buffer as the one that's attached + * to the scrub context. + */ +int +xrep_reinit_pagi( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag = sc->sa.pag; + struct xfs_buf *bp; + int error; + + ASSERT(pag); + ASSERT(xfs_perag_initialised_agi(pag)); + + clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate); + error = xfs_ialloc_read_agi(pag, sc->tp, &bp); + if (error) + return error; + + if (bp != sc->sa.agi_bp) { + ASSERT(bp == sc->sa.agi_bp); + return -EFSCORRUPTED; + } + + return 0; +} + +/* Reinitialize the per-AG block reservation for the AG we just fixed. 
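+ *
+ * If the filesystem is too full to reestablish the reservation,
+ * complain to the log and carry on; the repair itself has already
+ * succeeded by this point.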
*/ +int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + int error; + + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(sc->sa.pag != NULL); + ASSERT(sc->ops->type == ST_PERAG); + ASSERT(sc->tp); + + sc->flags &= ~XREP_RESET_PERAG_RESV; + error = xfs_ag_resv_free(sc->sa.pag); + if (error) + goto out; + error = xfs_ag_resv_init(sc->sa.pag, sc->tp); + if (error == -ENOSPC) { + xfs_err(sc->mp, +"Insufficient free space to reset per-AG reservation for AG %u after repair.", + sc->sa.pag->pag_agno); + error = 0; + } + +out: + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index bc3353ecae8a..05bd55430e6e 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -59,6 +59,7 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +int xrep_reset_perag_resv(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); @@ -68,6 +69,7 @@ void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); /* Metadata revalidators */ int xrep_revalidate_allocbt(struct xfs_scrub *sc); +int xrep_revalidate_iallocbt(struct xfs_scrub *sc); /* Metadata repairers */ @@ -77,8 +79,10 @@ int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); +int xrep_iallocbt(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); +int xrep_reinit_pagi(struct xfs_scrub *sc); #else @@ -99,6 +103,17 @@ xrep_calc_ag_resblks( return 0; } +static inline int +xrep_reset_perag_resv( + struct xfs_scrub *sc) +{ + if (!(sc->flags & XREP_RESET_PERAG_RESV)) + return 0; + + ASSERT(0); + return -EOPNOTSUPP; +} + /* repair setup functions for no-repair */ static inline int xrep_setup_nothing( @@ -109,6 +124,7 @@ xrep_setup_nothing( #define xrep_setup_ag_allocbt xrep_setup_nothing #define xrep_revalidate_allocbt (NULL) +#define xrep_revalidate_iallocbt (NULL) #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported @@ -116,6 +132,7 @@ xrep_setup_nothing( #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported #define xrep_allocbt xrep_notsupported +#define xrep_iallocbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index ebc3b68a8ffb..02ddfddfbed4 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -253,14 +253,16 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, .scrub = xchk_iallocbt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, .scrub = xchk_iallocbt, .has = xfs_has_finobt, - .repair = xrep_notsupported, + .repair = xrep_iallocbt, + .repair_eval = xrep_revalidate_iallocbt, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 5f934a2a4cb9..7fc50654c4fe 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -121,6 +121,7 @@ struct xfs_scrub { #define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? 
*/ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ +#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ /* diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index ea518712efa8..c60f76231f0c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -106,6 +106,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ + { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \ { XREP_ALREADY_FIXED, "already_fixed" } DECLARE_EVENT_CLASS(xchk_class, @@ -1172,7 +1173,7 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); @@ -1199,6 +1200,38 @@ TRACE_EVENT(xrep_abt_found, __entry->blockcount) ) +TRACE_EVENT(xrep_ibt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + const struct xfs_inobt_rec_incore *rec), + TP_ARGS(mp, agno, rec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = rec->ir_startino; + __entry->holemask = rec->ir_holemask; + __entry->count = rec->ir_count; + __entry->freecount = rec->ir_freecount; + __entry->freemask = rec->ir_free; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + TRACE_EVENT(xrep_refcount_extent_fn, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_refcount_irec *irec), @@ -1321,39 +1354,6 @@ TRACE_EVENT(xrep_reset_counters, MAJOR(__entry->dev), MINOR(__entry->dev)) ) -TRACE_EVENT(xrep_ialloc_insert, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agino_t startino, uint16_t holemask, uint8_t count, - uint8_t freecount, uint64_t freemask), - TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_agnumber_t, agno) - __field(xfs_agino_t, startino) - __field(uint16_t, holemask) - __field(uint8_t, count) - __field(uint8_t, freecount) - __field(uint64_t, freemask) - ), - TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->startino = startino; - __entry->holemask = holemask; - __entry->count = count; - __entry->freecount = freecount; - __entry->freemask = freemask; - ), - TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, - __entry->startino, - __entry->holemask, - __entry->count, - __entry->freecount, - __entry->freemask) -) - DECLARE_EVENT_CLASS(xrep_newbt_extent_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len, From 
d5aa62de1efe0fb8c52acf7103808048ddd38767 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:33 -0800 Subject: [PATCH 064/123] xfs: disable online repair quota helpers when quota not enabled Don't compile the quota helper functions if quota isn't being built into the XFS module. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/repair.c | 2 ++ fs/xfs/scrub/repair.h | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index a604f0cea8c1..b4e7c4ad779f 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -673,6 +673,7 @@ xrep_find_ag_btree_roots( return error; } +#ifdef CONFIG_XFS_QUOTA /* Force a quotacheck the next time we mount. */ void xrep_force_quotacheck( @@ -734,6 +735,7 @@ xrep_ino_dqattach( return error; } +#endif /* CONFIG_XFS_QUOTA */ /* * Initialize all the btree cursors for an AG repair except for the btree that diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index cc7ea3942729..93814acc678a 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -57,8 +57,15 @@ struct xrep_find_ag_btree { int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp); + +#ifdef CONFIG_XFS_QUOTA void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type); int xrep_ino_dqattach(struct xfs_scrub *sc); +#else +# define xrep_force_quotacheck(sc, type) ((void)0) +# define xrep_ino_dqattach(sc) (0) +#endif /* CONFIG_XFS_QUOTA */ + int xrep_reset_perag_resv(struct xfs_scrub *sc); /* Repair setup functions */ @@ -87,6 +94,8 @@ int xrep_reinit_pagi(struct xfs_scrub *sc); #else +#define xrep_ino_dqattach(sc) (0) + static inline int xrep_attempt( struct xfs_scrub *sc, From 9099cd38002f8029c9a1da08e6832d1cd18e8451 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:33 -0800 Subject: [PATCH 065/123] xfs: repair refcount btrees Reconstruct the refcount data from the rmap btree. Link: https://docs.kernel.org/filesystems/xfs-online-fsck-design.html#case-study-rebuilding-the-space-reference-counts Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_ag.h | 1 + fs/xfs/libxfs/xfs_btree.c | 26 + fs/xfs/libxfs/xfs_btree.h | 2 + fs/xfs/libxfs/xfs_refcount.c | 8 +- fs/xfs/libxfs/xfs_refcount.h | 2 +- fs/xfs/libxfs/xfs_refcount_btree.c | 13 +- fs/xfs/scrub/refcount.c | 2 +- fs/xfs/scrub/refcount_repair.c | 794 +++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 + fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 22 +- 12 files changed, 856 insertions(+), 19 deletions(-) create mode 100644 fs/xfs/scrub/refcount_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 8758abdcbb20..7e1df6fdaaad 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -186,6 +186,7 @@ xfs-y += $(addprefix scrub/, \ ialloc_repair.o \ newbt.o \ reap.o \ + refcount_repair.o \ repair.o \ ) endif diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index f16cb7a174d4..67c3260ee789 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -87,6 +87,7 @@ struct xfs_perag { * verifiers while rebuilding the AG btrees. 
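 	 * The refcount analog below (pagf_repair_refcount_level) serves the
 	 * same purpose for the refcountbt rebuild added by this patch.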
*/ uint8_t pagf_repair_levels[XFS_BTNUM_AGF]; + uint8_t pagf_repair_refcount_level; #endif spinlock_t pag_state_lock; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c100e92140be..ea8d3659df20 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void) xfs_rmapbt_destroy_cur_cache(); xfs_refcountbt_destroy_cur_cache(); } + +/* Move the btree cursor before the first record. */ +int +xfs_btree_goto_left_edge( + struct xfs_btree_cur *cur) +{ + int stat = 0; + int error; + + memset(&cur->bc_rec, 0, sizeof(cur->bc_rec)); + error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat); + if (error) + return error; + if (!stat) + return 0; + + error = xfs_btree_decrement(cur, 0, &stat); + if (error) + return error; + if (stat != 0) { + ASSERT(0); + return -EFSCORRUPTED; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index e0875cec4939..d906324e25c8 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -738,4 +738,6 @@ xfs_btree_alloc_cursor( int __init xfs_btree_init_cur_caches(void); void xfs_btree_destroy_cur_caches(void); +int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 5b039cd022e0..3a9f22d94444 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec( /* Simple checks for refcount records. */ xfs_failaddr_t xfs_refcount_check_irec( - struct xfs_btree_cur *cur, + struct xfs_perag *pag, const struct xfs_refcount_irec *irec) { - struct xfs_perag *pag = cur->bc_ag.pag; - if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN) return __this_address; @@ -179,7 +177,7 @@ xfs_refcount_get_rec( return error; xfs_refcount_btrec_to_irec(rec, irec); - fa = xfs_refcount_check_irec(cur, irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec); if (fa) return xfs_refcount_complain_bad_rec(cur, fa, irec); @@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent( INIT_LIST_HEAD(&rr->rr_list); xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec); - if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL || + if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL || XFS_IS_CORRUPT(cur->bc_mp, rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) { kfree(rr); diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 783cd89ca195..5c207f1c619c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur, union xfs_btree_rec; extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); -xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur, +xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag, const struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 3fa795e2488d..0d80bd99147c 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -226,7 +226,18 @@ xfs_refcountbt_verify( level = be16_to_cpu(block->bb_level); if (pag && xfs_perag_initialised_agf(pag)) { - if (level >= pag->pagf_refcount_level) + unsigned int maxlevel = pag->pagf_refcount_level; + +#ifdef CONFIG_XFS_ONLINE_REPAIR + /* + * Online 
repair could be rewriting the refcount btree, so + * we'll validate against the larger of either tree while this + * is going on. + */ + maxlevel = max_t(unsigned int, maxlevel, + pag->pagf_repair_refcount_level); +#endif + if (level >= maxlevel) return __this_address; } else if (level >= mp->m_refc_maxlevels) return __this_address; diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 304ea1e1bfb0..bf22f245bbfa 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -441,7 +441,7 @@ xchk_refcountbt_rec( struct xchk_refcbt_records *rrc = bs->private; xfs_refcount_btrec_to_irec(rec, &irec); - if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) { + if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) { xchk_btree_set_corrupt(bs->sc, bs->cur, 0); return 0; } diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c new file mode 100644 index 000000000000..f38fccc42a20 --- /dev/null +++ b/fs/xfs/scrub/refcount_repair.c @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_error.h" +#include "xfs_ag.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/agb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Rebuilding the Reference Count Btree + * ==================================== + * + * This algorithm is "borrowed" from xfs_repair. Imagine the rmap + * entries as rectangles representing extents of physical blocks, and + * that the rectangles can be laid down to allow them to overlap each + * other; then we know that we must emit a refcnt btree entry wherever + * the amount of overlap changes, i.e. the emission stimulus is + * level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 + * cases because the bnobt tells us which blocks are free; single-use + * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt + * supports storing multiple entries covering a given block we could + * theoretically dispense with the refcntbt and simply count rmaps, but + * that's inefficient in the (hot) write path, so we'll take the cost of + * the extra tree to save time. Also there's no guarantee that rmap + * will be enabled. 
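+ *
+ * A concrete example of the level-triggered emission (illustrative
+ * numbers): given two shareable rmaps (startblock 10, length 4) and
+ * (startblock 12, length 4), the refcount is 1 over blocks [10,12),
+ * 2 over [12,14), and 1 over [14,16).  Only the doubly-referenced
+ * span is stored, as the record (startblock 12, blockcount 2,
+ * refcount 2).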
+ * + * Given an array of rmaps sorted by physical block number, a starting + * physical block (sp), a bag to hold rmaps that cover sp, and the next + * physical block where the level changes (np), we can reconstruct the + * refcount btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. This + * is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) + * and (startblock + len of each rmap in the bag). + * + * Like all the other repairers, we make a list of all the refcount + * records we need, then reinitialize the refcount btree root and + * insert all the records. + */ + +/* The only parts of the rmap that we care about for computing refcounts. */ +struct xrep_refc_rmap { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; +} __packed; + +struct xrep_refc { + /* refcount extents */ + struct xfarray *refcount_records; + + /* new refcountbt information */ + struct xrep_newbt new_btree; + + /* old refcountbt blocks */ + struct xagb_bitmap old_refcountbt_blocks; + + struct xfs_scrub *sc; + + /* get_records()'s position in the refcount record array. */ + xfarray_idx_t array_cur; + + /* # of refcountbt blocks */ + xfs_extlen_t btblocks; +}; + +/* Check for any obvious conflicts with this shared/CoW staging extent. */ +STATIC int +xrep_refc_check_ext( + struct xfs_scrub *sc, + const struct xfs_refcount_irec *rec) +{ + enum xbtree_recpacking outcome; + int error; + + if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock, + rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rc_startblock, rec->rc_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record a reference count extent. */ +STATIC int +xrep_refc_stash( + struct xrep_refc *rr, + enum xfs_refc_domain domain, + xfs_agblock_t agbno, + xfs_extlen_t len, + uint64_t refcount) +{ + struct xfs_refcount_irec irec = { + .rc_startblock = agbno, + .rc_blockcount = len, + .rc_domain = domain, + }; + struct xfs_scrub *sc = rr->sc; + int error = 0; + + if (xchk_should_terminate(sc, &error)) + return error; + + irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount); + + error = xrep_refc_check_ext(rr->sc, &irec); + if (error) + return error; + + trace_xrep_refc_found(sc->sa.pag, &irec); + + return xfarray_append(rr->refcount_records, &irec); +} + +/* Record a CoW staging extent. 
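Staging extents live in the CoW record domain and are always stashed with a refcount of 1.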
*/ +STATIC int +xrep_refc_stash_cow( + struct xrep_refc *rr, + xfs_agblock_t agbno, + xfs_extlen_t len) +{ + return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1); +} + +/* Decide if an rmap could describe a shared extent. */ +static inline bool +xrep_refc_rmap_shareable( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rmap) +{ + /* AG metadata are never sharable */ + if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return false; + + /* Metadata in files are never shareable */ + if (xfs_internal_inum(mp, rmap->rm_owner)) + return false; + + /* Metadata and unwritten file blocks are not shareable. */ + if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return false; + + return true; +} + +/* + * Walk along the reverse mapping records until we find one that could describe + * a shared extent. + */ +STATIC int +xrep_refc_walk_rmaps( + struct xrep_refc *rr, + struct xrep_refc_rmap *rrm, + bool *have_rec) +{ + struct xfs_rmap_irec rmap; + struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur; + struct xfs_mount *mp = cur->bc_mp; + int have_gt; + int error = 0; + + *have_rec = false; + + /* + * Loop through the remaining rmaps. Remember CoW staging + * extents and the refcountbt blocks from the old tree for later + * disposal. We can only share written data fork extents, so + * keep looping until we find an rmap for one. + */ + do { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfs_btree_increment(cur, 0, &have_gt); + if (error) + return error; + if (!have_gt) + return 0; + + error = xfs_rmap_get_rec(cur, &rmap, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(mp, !have_gt)) + return -EFSCORRUPTED; + + if (rmap.rm_owner == XFS_RMAP_OWN_COW) { + error = xrep_refc_stash_cow(rr, rmap.rm_startblock, + rmap.rm_blockcount); + if (error) + return error; + } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) { + /* refcountbt block, dump it when we're done. */ + rr->btblocks += rmap.rm_blockcount; + error = xagb_bitmap_set(&rr->old_refcountbt_blocks, + rmap.rm_startblock, rmap.rm_blockcount); + if (error) + return error; + } + } while (!xrep_refc_rmap_shareable(mp, &rmap)); + + rrm->startblock = rmap.rm_startblock; + rrm->blockcount = rmap.rm_blockcount; + *have_rec = true; + return 0; +} + +static inline uint32_t +xrep_refc_encode_startblock( + const struct xfs_refcount_irec *irec) +{ + uint32_t start; + + start = irec->rc_startblock & ~XFS_REFC_COWFLAG; + if (irec->rc_domain == XFS_REFC_DOMAIN_COW) + start |= XFS_REFC_COWFLAG; + + return start; +} + +/* Sort in the same order as the ondisk records. */ +static int +xrep_refc_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_refcount_irec *ap = a; + const struct xfs_refcount_irec *bp = b; + uint32_t sa, sb; + + sa = xrep_refc_encode_startblock(ap); + sb = xrep_refc_encode_startblock(bp); + + if (sa > sb) + return 1; + if (sa < sb) + return -1; + return 0; +} + +/* + * Sort the refcount extents by startblock or else the btree records will be in + * the wrong order. Make sure the records do not overlap in physical space. 
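+ * CoW staging records sort after all shared records because the sort key
+ * (see xrep_refc_encode_startblock) sets XFS_REFC_COWFLAG in the high bit
+ * of the startblock, matching the ondisk record order.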
+ */ +STATIC int +xrep_refc_sort_records( + struct xrep_refc *rr) +{ + struct xfs_refcount_irec irec; + xfarray_idx_t cur; + enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED; + xfs_agblock_t next_agbno = 0; + int error; + + error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rr->refcount_records, cur) { + if (xchk_should_terminate(rr->sc, &error)) + return error; + + error = xfarray_load(rr->refcount_records, cur, &irec); + if (error) + return error; + + if (dom == XFS_REFC_DOMAIN_SHARED && + irec.rc_domain == XFS_REFC_DOMAIN_COW) { + dom = irec.rc_domain; + next_agbno = 0; + } + + if (dom != irec.rc_domain) + return -EFSCORRUPTED; + if (irec.rc_startblock < next_agbno) + return -EFSCORRUPTED; + + next_agbno = irec.rc_startblock + irec.rc_blockcount; + } + + return error; +} + +#define RRM_NEXT(r) ((r).startblock + (r).blockcount) +/* + * Find the next block where the refcount changes, given the next rmap we + * looked at and the ones we're already tracking. + */ +static inline int +xrep_refc_next_edge( + struct xfarray *rmap_bag, + struct xrep_refc_rmap *next_rrm, + bool next_valid, + xfs_agblock_t *nbnop) +{ + struct xrep_refc_rmap rrm; + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + xfs_agblock_t nbno = NULLAGBLOCK; + int error; + + if (next_valid) + nbno = next_rrm->startblock; + + while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1) + nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm)); + + if (error) + return error; + + /* + * We should have found /something/ because either next_rrm is the next + * interesting rmap to look at after emitting this refcount extent, or + * there are other rmaps in rmap_bag contributing to the current + * sharing count. But if something is seriously wrong, bail out. + */ + if (nbno == NULLAGBLOCK) + return -EFSCORRUPTED; + + *nbnop = nbno; + return 0; +} + +/* + * Walk forward through the rmap btree to collect all rmaps starting at + * @bno in @rmap_bag. These represent the file(s) that share ownership of + * the current block. Upon return, the rmap cursor points to the last record + * satisfying the startblock constraint. + */ +static int +xrep_refc_push_rmaps_at( + struct xrep_refc *rr, + struct xfarray *rmap_bag, + xfs_agblock_t bno, + struct xrep_refc_rmap *rrm, + bool *have, + uint64_t *stack_sz) +{ + struct xfs_scrub *sc = rr->sc; + int have_gt; + int error; + + while (*have && rrm->startblock == bno) { + error = xfarray_store_anywhere(rmap_bag, rrm); + if (error) + return error; + (*stack_sz)++; + error = xrep_refc_walk_rmaps(rr, rrm, have); + if (error) + return error; + } + + error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt); + if (error) + return error; + if (XFS_IS_CORRUPT(sc->mp, !have_gt)) + return -EFSCORRUPTED; + + return 0; +} + +/* Iterate all the rmap records to generate reference count data. */ +STATIC int +xrep_refc_find_refcounts( + struct xrep_refc *rr) +{ + struct xrep_refc_rmap rrm; + struct xfs_scrub *sc = rr->sc; + struct xfarray *rmap_bag; + char *descr; + uint64_t old_stack_sz; + uint64_t stack_sz = 0; + xfs_agblock_t sbno; + xfs_agblock_t cbno; + xfs_agblock_t nbno; + bool have; + int error; + + xrep_ag_btcur_init(sc, &sc->sa); + + /* + * Set up a sparse array to store all the rmap records that we're + * tracking to generate a reference count record. If this exceeds + * MAXREFCOUNT, we clamp rc_refcount. 
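The clamping itself is done by xrep_refc_stash.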
+ */ + descr = xchk_xfile_ag_descr(sc, "rmap record bag"); + error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap), + &rmap_bag); + kfree(descr); + if (error) + goto out_cur; + + /* Start the rmapbt cursor to the left of all records. */ + error = xfs_btree_goto_left_edge(sc->sa.rmap_cur); + if (error) + goto out_bag; + + /* Process reverse mappings into refcount data. */ + while (xfs_btree_has_more_records(sc->sa.rmap_cur)) { + /* Push all rmaps with pblk == sbno onto the stack */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (!have) + break; + sbno = cbno = rrm.startblock; + error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno, + &rrm, &have, &stack_sz); + if (error) + goto out_bag; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + old_stack_sz = stack_sz; + + /* While stack isn't empty... */ + while (stack_sz) { + xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT; + + /* Pop all rmaps that end at nbno */ + while ((error = xfarray_iter(rmap_bag, &array_cur, + &rrm)) == 1) { + if (RRM_NEXT(rrm) != nbno) + continue; + error = xfarray_unset(rmap_bag, array_cur - 1); + if (error) + goto out_bag; + stack_sz--; + } + if (error) + goto out_bag; + + /* Push array items that start at nbno */ + error = xrep_refc_walk_rmaps(rr, &rrm, &have); + if (error) + goto out_bag; + if (have) { + error = xrep_refc_push_rmaps_at(rr, rmap_bag, + nbno, &rrm, &have, &stack_sz); + if (error) + goto out_bag; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (stack_sz != old_stack_sz) { + if (old_stack_sz > 1) { + error = xrep_refc_stash(rr, + XFS_REFC_DOMAIN_SHARED, + cbno, nbno - cbno, + old_stack_sz); + if (error) + goto out_bag; + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (stack_sz == 0) + break; + old_stack_sz = stack_sz; + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + error = xrep_refc_next_edge(rmap_bag, &rrm, have, + &nbno); + if (error) + goto out_bag; + + ASSERT(nbno > sbno); + } + } + + ASSERT(stack_sz == 0); +out_bag: + xfarray_destroy(rmap_bag); +out_cur: + xchk_ag_btcur_free(&sc->sa); + return error; +} +#undef RRM_NEXT + +/* Retrieve refcountbt data for bulk load. */ +STATIC int +xrep_refc_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_refcount_irec *irec = &cur->bc_rec.rc; + struct xrep_refc *rr = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + error = xfarray_load(rr->refcount_records, rr->array_cur++, + irec); + if (error) + return error; + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_refc_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_refc *rr = priv; + + return xrep_newbt_claim_block(cur, &rr->new_btree, ptr); +} + +/* Update the AGF counters. 
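We also stash the old btree height as the alternate level so that write verifiers keep accepting the old tree's blocks until they have been reaped.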
*/ +STATIC int +xrep_refc_reset_counters( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + + /* + * After we commit the new btree to disk, it is possible that the + * process to reap the old btree blocks will race with the AIL trying + * to checkpoint the old btree blocks into the filesystem. If the new + * tree is shorter than the old one, the refcountbt write verifier will + * fail and the AIL will shut down the filesystem. + * + * To avoid this, save the old incore btree height values as the alt + * height values before re-initializing the perag info from the updated + * AGF to capture all the new values. + */ + pag->pagf_repair_refcount_level = pag->pagf_refcount_level; + + /* Reinitialize with the values we just logged. */ + return xrep_reinit_pagf(sc); +} + +/* + * Use the collected refcount information to stage a new refcount btree. If + * this is successful we'll return with the new btree root information logged + * to the repair transaction but not yet committed. + */ +STATIC int +xrep_refc_build_new_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_btree_cur *refc_cur; + struct xfs_perag *pag = sc->sa.pag; + xfs_fsblock_t fsbno; + int error; + + error = xrep_refc_sort_records(rr); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp)); + xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno, + XFS_AG_RESV_METADATA); + rr->new_btree.bload.get_records = xrep_refc_get_records; + rr->new_btree.bload.claim_block = xrep_refc_claim_block; + + /* Compute how many blocks we'll need. */ + refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake, + pag); + error = xfs_btree_bload_compute_geometry(refc_cur, + &rr->new_btree.bload, + xfarray_length(rr->refcount_records)); + if (error) + goto err_cur; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + goto err_cur; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rr->new_btree, + rr->new_btree.bload.nr_blocks); + if (error) + goto err_cur; + + /* + * Due to btree slack factors, it's possible for a new btree to be one + * level taller than the old btree. Update the incore btree height so + * that we don't trip the verifiers when writing the new btree blocks + * to disk. + */ + pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height; + + /* Add all observed refcount records. */ + rr->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr); + if (error) + goto err_level; + + /* + * Install the new btree in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + */ + xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp); + xfs_btree_del_cursor(refc_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_refc_reset_counters(rr); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. 
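Any blocks that were reserved for the new btree but never used by the bulk loader are freed back to the filesystem here.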
*/ + error = xrep_newbt_commit(&rr->new_btree); + if (error) + return error; + + return xrep_roll_ag_trans(sc); + +err_level: + pag->pagf_repair_refcount_level = 0; +err_cur: + xfs_btree_del_cursor(refc_cur, error); +err_newbt: + xrep_newbt_cancel(&rr->new_btree); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_refc_remove_old_tree( + struct xrep_refc *rr) +{ + struct xfs_scrub *sc = rr->sc; + struct xfs_perag *pag = sc->sa.pag; + int error; + + /* Free the old refcountbt blocks if they're not in use. */ + error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks, + &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA); + if (error) + return error; + + /* + * Now that we've zapped all the old refcountbt blocks we can turn off + * the alternate height mechanism and reset the per-AG space + * reservations. + */ + pag->pagf_repair_refcount_level = 0; + sc->flags |= XREP_RESET_PERAG_RESV; + return 0; +} + +/* Rebuild the refcount btree. */ +int +xrep_refcountbt( + struct xfs_scrub *sc) +{ + struct xrep_refc *rr; + struct xfs_mount *mp = sc->mp; + char *descr; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_has_rmapbt(mp)) + return -EOPNOTSUPP; + + rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS); + if (!rr) + return -ENOMEM; + rr->sc = sc; + + /* Set up enough storage to handle one refcount record per block. */ + descr = xchk_xfile_ag_descr(sc, "reference count records"); + error = xfarray_create(descr, mp->m_sb.sb_agblocks, + sizeof(struct xfs_refcount_irec), + &rr->refcount_records); + kfree(descr); + if (error) + goto out_rr; + + /* Collect all reference counts. */ + xagb_bitmap_init(&rr->old_refcountbt_blocks); + error = xrep_refc_find_refcounts(rr); + if (error) + goto out_bitmap; + + /* Rebuild the refcount information. */ + error = xrep_refc_build_new_tree(rr); + if (error) + goto out_bitmap; + + /* Kill the old tree. 
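Its blocks were recorded in old_refcountbt_blocks while we walked the rmap records.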
*/ + error = xrep_refc_remove_old_tree(rr); + if (error) + goto out_bitmap; + +out_bitmap: + xagb_bitmap_destroy(&rr->old_refcountbt_blocks); + xfarray_destroy(rr->refcount_records); +out_rr: + kfree(rr); + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 05bd55430e6e..cc7ea3942729 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -80,6 +80,7 @@ int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); +int xrep_refcountbt(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -133,6 +134,7 @@ xrep_setup_nothing( #define xrep_agi xrep_notsupported #define xrep_allocbt xrep_notsupported #define xrep_iallocbt xrep_notsupported +#define xrep_refcountbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 02ddfddfbed4..6ff4dc57095f 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -276,7 +276,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, .has = xfs_has_reflink, - .repair = xrep_notsupported, + .repair = xrep_refcountbt, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index c60f76231f0c..3f7af4430951 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1232,27 +1232,29 @@ TRACE_EVENT(xrep_ibt_found, __entry->freemask) ) -TRACE_EVENT(xrep_refcount_extent_fn, - TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - struct xfs_refcount_irec *irec), - TP_ARGS(mp, agno, irec), +TRACE_EVENT(xrep_refc_found, + TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec), + TP_ARGS(pag, rec), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) + __field(enum xfs_refc_domain, domain) __field(xfs_agblock_t, startblock) __field(xfs_extlen_t, blockcount) __field(xfs_nlink_t, refcount) ), TP_fast_assign( - __entry->dev = mp->m_super->s_dev; - __entry->agno = agno; - __entry->startblock = irec->rc_startblock; - __entry->blockcount = irec->rc_blockcount; - __entry->refcount = irec->rc_refcount; + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->domain = rec->rc_domain; + __entry->startblock = rec->rc_startblock; + __entry->blockcount = rec->rc_blockcount; + __entry->refcount = rec->rc_refcount; ), - TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", + TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS), __entry->startblock, __entry->blockcount, __entry->refcount) From 259ba1d36f559653390c0e9dbdee5c4ffc28bb29 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:34 -0800 Subject: [PATCH 066/123] xfs: try to attach dquots to files before repairing them Inode resource usage is tracked in the quota metadata. Repairing a file might change the resources used by that file, which means that we need to attach dquots to the file that we're examining before accessing anything in the file protected by the ILOCK. However, there's a twist: a dquot cache miss requires the dquot to be read in from the quota file, during which we drop the ILOCK on the file being examined. 
This means that we *must* try to attach the dquots before taking the ILOCK. Therefore, dquots must be attached to files in the scrub setup function. If doing so yields corruption errors (or unknown dquot errors), we instead clear the quotachecked status, which will cause a quotacheck on next mount. A future series will make this trigger live quotacheck. While we're here, change the xrep_ino_dqattach function to use the unlocked dqattach functions so that we avoid cycling the ILOCK if the inode already has dquots attached. This makes the naming and locking requirements consistent with the rest of the filesystem. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 4 ++++ fs/xfs/scrub/common.c | 25 +++++++++++++++++++++++++ fs/xfs/scrub/common.h | 6 ++++++ fs/xfs/scrub/inode.c | 4 ++++ fs/xfs/scrub/repair.c | 13 ++++++++----- fs/xfs/scrub/rtbitmap.c | 4 ++++ fs/xfs/scrub/rtsummary.c | 4 ++++ 7 files changed, 55 insertions(+), 5 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 06d8c1996a33..f74bd2a97c7f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -78,6 +78,10 @@ xchk_setup_inode_bmap( if (error) goto out; + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index e0d6d8c9f640..bff0a374fb1b 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -819,6 +819,26 @@ xchk_iget_agi( return 0; } +#ifdef CONFIG_XFS_QUOTA +/* + * Try to attach dquots to this inode if we think we might want to repair it. + * Callers must not hold any ILOCKs. If the dquots are broken and cannot be + * attached, a quotacheck will be scheduled. + */ +int +xchk_ino_dqattach( + struct xfs_scrub *sc) +{ + ASSERT(sc->tp != NULL); + ASSERT(sc->ip != NULL); + + if (!xchk_could_repair(sc)) + return 0; + + return xrep_ino_dqattach(sc); +} +#endif + /* Install an inode that we opened by handle for scrubbing. */ int xchk_install_handle_inode( @@ -1030,6 +1050,11 @@ xchk_setup_inode_contents( error = xchk_trans_alloc(sc, resblks); if (error) goto out; + + error = xchk_ino_dqattach(sc); + if (error) + goto out; + xchk_ilock(sc, XFS_ILOCK_EXCL); out: /* scrub teardown will unlock and release the inode for us */ diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index c31be570e7d8..c69cacb0b696 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -103,9 +103,15 @@ xchk_setup_rtsummary(struct xfs_scrub *sc) } #endif #ifdef CONFIG_XFS_QUOTA +int xchk_ino_dqattach(struct xfs_scrub *sc); int xchk_setup_quota(struct xfs_scrub *sc); #else static inline int +xchk_ino_dqattach(struct xfs_scrub *sc) +{ + return 0; +} +static inline int xchk_setup_quota(struct xfs_scrub *sc) { return -ENOENT; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index b7a93380a1ab..7e97db8255c6 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -39,6 +39,10 @@ xchk_prepare_iscrub( if (error) return error; + error = xchk_ino_dqattach(sc); + if (error) + return error; + xchk_ilock(sc, XFS_ILOCK_EXCL); return 0; } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index b4e7c4ad779f..021f6ec72e87 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -700,10 +700,10 @@ xrep_force_quotacheck( * * This function ensures that the appropriate dquots are attached to an inode. 
 * We cannot allow the dquot code to allocate an on-disk dquot block here
- * because we're already in transaction context with the inode locked.  The
- * on-disk dquot should already exist anyway.  If the quota code signals
- * corruption or missing quota information, schedule quotacheck, which will
- * repair corruptions in the quota metadata.
+ * because we're already in transaction context.  The on-disk dquot should
+ * already exist anyway.  If the quota code signals corruption or missing quota
+ * information, schedule quotacheck, which will repair corruptions in the quota
+ * metadata.
 */
 int
 xrep_ino_dqattach(
@@ -711,7 +711,10 @@ xrep_ino_dqattach(
 {
 	int			error;
 
-	error = xfs_qm_dqattach_locked(sc->ip, false);
+	ASSERT(sc->tp != NULL);
+	ASSERT(sc->ip != NULL);
+
+	error = xfs_qm_dqattach(sc->ip);
 	switch (error) {
 	case -EFSBADCRC:
 	case -EFSCORRUPTED:
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 41a1d89ae8e6..d509a08d3fc3 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -32,6 +32,10 @@ xchk_setup_rtbitmap(
 	if (error)
 		return error;
 
+	error = xchk_ino_dqattach(sc);
+	if (error)
+		return error;
+
 	xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
 	return 0;
 }
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 8b15c47408d0..f94800a029f3 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -63,6 +63,10 @@ xchk_setup_rtsummary(
 	if (error)
 		return error;
 
+	error = xchk_ino_dqattach(sc);
+	if (error)
+		return error;
+
 	/*
 	 * Locking order requires us to take the rtbitmap first.  We must be
 	 * careful to unlock it ourselves when we are done with the rtbitmap

From 576d30ecb620ae3bc156dfb2a4e91143e7f3256d Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Fri, 15 Dec 2023 10:03:34 -0800
Subject: [PATCH 067/123] xfs: add missing nrext64 inode flag check to scrub

Add a missing check that the superblock nrext64 feature flag is set if
the inode flag is set.

Fixes: 9b7d16e34bbeb ("xfs: Introduce XFS_DIFLAG2_NREXT64 and associated helpers")
Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
---
 fs/xfs/scrub/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 7e97db8255c6..6c40f3e020ea 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -342,6 +342,10 @@ xchk_inode_flags2(
 	if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
 		goto bad;
 
+	/* no large extent counts without the filesystem feature */
+	if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp))
+		goto bad;
+
 	return;
 bad:
 	xchk_ino_set_corrupt(sc, ino);

From 6b5d917780219d0d8f8e2cefefcb6f50987d0fa3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Fri, 15 Dec 2023 10:03:35 -0800
Subject: [PATCH 068/123] xfs: don't cast to char * for XFS_DFORK_*PTR macros

Code in the next patch will assign the return value of the
XFS_DFORK_*PTR macros to a struct pointer.  gcc complains about casting
char* strings to struct pointers, so let's fix the macros' cast to
void* to shut up the warnings.

While we're at it, fix one of the scrub tests that uses PTR to use BOFF
instead for a simpler integer comparison, since other linters whine
about char* and void* comparisons.  Can't satisfy all these damn bots.

Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_format.h | 2 +- fs/xfs/scrub/inode.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9a88aba1589f..f16974126ff9 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1008,7 +1008,7 @@ enum xfs_dinode_fmt { * Return pointers to the data or attribute forks. */ #define XFS_DFORK_DPTR(dip) \ - ((char *)dip + xfs_dinode_size(dip->di_version)) + ((void *)dip + xfs_dinode_size(dip->di_version)) #define XFS_DFORK_APTR(dip) \ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) #define XFS_DFORK_PTR(dip,w) \ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 6c40f3e020ea..a81f070b0cd2 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -556,7 +556,7 @@ xchk_dinode( } /* di_forkoff */ - if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) + if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize) xchk_ino_set_corrupt(sc, ino); if (naextents != 0 && dip->di_forkoff == 0) xchk_ino_set_corrupt(sc, ino); From d9041681dd2f5334529a68868c9266631c384de4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:35 -0800 Subject: [PATCH 069/123] xfs: set inode sick state flags when we zap either ondisk fork In a few patches, we'll add some online repair code that tries to massage the ondisk inode record just enough to get it to pass the inode verifiers so that we can continue with more file repairs. Part of that massaging can include zapping the ondisk forks to clear errors. After that point, the bmap fork repair functions will rebuild the zapped forks. Christoph asked for stronger protections against online repair zapping a fork to get the inode to load vs. other threads trying to access the partially repaired file. Do this by adding a special "[DA]FORK_ZAPPED" inode health flag whenever repair zaps a fork, and sprinkling checks for that flag into the various file operations for things that don't like handling an unexpected zero-extents fork. In practice xfs_scrub will scrub and fix the forks almost immediately after zapping them, so the window is very small. However, if a crash or unmount should occur, we can still detect these zapped inode forks by looking for a zero-extents fork when data was expected. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_health.h | 10 ++++++++++ fs/xfs/scrub/bmap.c | 39 ++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/common.c | 2 ++ fs/xfs/scrub/dir.c | 16 +++++++++++++--- fs/xfs/scrub/health.c | 32 +++++++++++++++++++++++++++++++ fs/xfs/scrub/health.h | 2 ++ fs/xfs/scrub/symlink.c | 20 ++++++++++++++----- fs/xfs/xfs_dir2_readdir.c | 3 +++ fs/xfs/xfs_health.c | 8 ++++++-- fs/xfs/xfs_inode.c | 35 ++++++++++++++++++++++++++++++++++ fs/xfs/xfs_inode.h | 2 ++ fs/xfs/xfs_symlink.c | 3 +++ fs/xfs/xfs_xattr.c | 6 ++++++ 13 files changed, 166 insertions(+), 12 deletions(-) diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h index 99e796256c5d..6296993ff8f3 100644 --- a/fs/xfs/libxfs/xfs_health.h +++ b/fs/xfs/libxfs/xfs_health.h @@ -68,6 +68,11 @@ struct xfs_fsop_geom; #define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */ #define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */ +#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */ +#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */ +#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */ +#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */ + /* Primary evidence of health problems in a given group. */ #define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \ XFS_SICK_FS_UQUOTA | \ @@ -97,6 +102,11 @@ struct xfs_fsop_geom; XFS_SICK_INO_SYMLINK | \ XFS_SICK_INO_PARENT) +#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \ + XFS_SICK_INO_BMBTA_ZAPPED | \ + XFS_SICK_INO_DIR_ZAPPED | \ + XFS_SICK_INO_SYMLINK_ZAPPED) + /* These functions must be provided by the xfs implementation. */ void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask); diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index f74bd2a97c7f..1487aaf3d95f 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -19,9 +19,11 @@ #include "xfs_bmap_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/btree.h" +#include "scrub/health.h" #include "xfs_ag.h" /* Set us up with an inode's bmap. */ @@ -943,7 +945,20 @@ int xchk_bmap_data( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_DATA_FORK); + int error; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_DATA_FORK); + if (error) + return error; + + /* If the data fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED); + return 0; } /* Scrub an inode's attr fork. */ @@ -951,7 +966,27 @@ int xchk_bmap_attr( struct xfs_scrub *sc) { - return xchk_bmap(sc, XFS_ATTR_FORK); + int error; + + /* + * If the attr fork has been zapped, it's possible that forkoff was + * reset to zero and hence sc->ip->i_afp is NULL. We don't want the + * NULL ifp check in xchk_bmap to conclude that the attr fork is ok, + * so short circuit that logic by setting the corruption flag and + * returning immediately. + */ + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) { + xchk_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + error = xchk_bmap(sc, XFS_ATTR_FORK); + if (error) + return error; + + /* If the attr fork is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED); + return 0; } /* Scrub an inode's CoW fork. 
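There is no zapped-state check here because online repair never zaps the CoW fork.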
*/ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index bff0a374fb1b..f0207e71e5dc 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -1160,6 +1160,7 @@ xchk_metadata_inode_subtype( unsigned int scrub_type) { __u32 smtype = sc->sm->sm_type; + unsigned int sick_mask = sc->sick_mask; int error; sc->sm->sm_type = scrub_type; @@ -1177,6 +1178,7 @@ xchk_metadata_inode_subtype( break; } + sc->sick_mask = sick_mask; sc->sm->sm_type = smtype; return error; } diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 0b491784b759..b366fab699ac 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -15,10 +15,12 @@ #include "xfs_icache.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/dabtree.h" #include "scrub/readdir.h" +#include "scrub/health.h" /* Set us up to scrub directories. */ int @@ -760,6 +762,11 @@ xchk_directory( if (!S_ISDIR(VFS_I(sc->ip)->i_mode)) return -ENOENT; + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + /* Plausible size? */ if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); @@ -784,7 +791,10 @@ xchk_directory( /* Look up every name in this directory by hash. */ error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL); - if (error == -ECANCELED) - error = 0; - return error; + if (error && error != -ECANCELED) + return error; + + /* If the dir is clean, it is clearly not zapped. */ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); + return 0; } diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index 5e2b09ed6e29..df716da11226 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -117,6 +117,38 @@ xchk_health_mask_for_scrub_type( return type_to_health_flag[scrub_type].sick_mask; } +/* + * If the scrub state is clean, add @mask to the scrub sick mask to clear + * additional sick flags from the metadata object's sick state. + */ +void +xchk_mark_healthy_if_clean( + struct xfs_scrub *sc, + unsigned int mask) +{ + if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + sc->sick_mask |= mask; +} + +/* + * If we're scrubbing a piece of file metadata for the first time, does it look + * like it has been zapped? Skip the check if we just repaired the metadata + * and are revalidating it. + */ +bool +xchk_file_looks_zapped( + struct xfs_scrub *sc, + unsigned int mask) +{ + ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0); + + if (sc->flags & XREP_ALREADY_FIXED) + return false; + + return xfs_inode_has_sickness(sc->ip, mask); +} + /* * Update filesystem health assessments based on what we found and did. 
* diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h index 66a273f8585b..a731b2467399 100644 --- a/fs/xfs/scrub/health.h +++ b/fs/xfs/scrub/health.h @@ -10,5 +10,7 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); void xchk_update_health(struct xfs_scrub *sc); bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, xfs_btnum_t btnum); +void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask); +bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask); #endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 38708fb9a5d7..60643d791d4a 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -12,8 +12,10 @@ #include "xfs_log_format.h" #include "xfs_inode.h" #include "xfs_symlink.h" +#include "xfs_health.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/health.h" /* Set us up to scrub a symbolic link. */ int @@ -41,13 +43,19 @@ xchk_symlink( if (!S_ISLNK(VFS_I(ip)->i_mode)) return -ENOENT; + + if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); + return 0; + } + ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); len = ip->i_disk_size; /* Plausible size? */ if (len > XFS_SYMLINK_MAXLEN || len <= 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Inline symlink? */ @@ -55,15 +63,17 @@ xchk_symlink( if (len > xfs_inode_data_fork_size(ip) || len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); - goto out; + return 0; } /* Remote symlink; must read the contents. */ error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); -out: - return error; + + /* If a remote symlink is clean, it is clearly not zapped. 
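Clearing the sick flag here ensures that a symlink that was zapped and has since been rebuilt stops being reported as unhealthy.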
*/ + xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED); + return 0; } diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 9f3ceb461515..57f42c2af0a3 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "xfs_trans.h" #include "xfs_error.h" +#include "xfs_health.h" /* * Directory file type support functions @@ -519,6 +520,8 @@ xfs_readdir( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index 72a075bb2c10..9a57afee9338 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -222,7 +222,7 @@ xfs_inode_mark_sick( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_sick(ip, mask); spin_lock(&ip->i_flags_lock); @@ -246,7 +246,7 @@ xfs_inode_mark_healthy( struct xfs_inode *ip, unsigned int mask) { - ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY)); + ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED))); trace_xfs_inode_mark_healthy(ip, mask); spin_lock(&ip->i_flags_lock); @@ -369,6 +369,10 @@ static const struct ioctl_sick_map ino_map[] = { { XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR }, { XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK }, { XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT }, + { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD }, + { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA }, + { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR }, + { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK }, { 0, 0 }, }; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index c0f1c89786c2..ea6b277485a4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -37,6 +37,7 @@ #include "xfs_reflink.h" #include "xfs_ag.h" #include "xfs_log_priv.h" +#include "xfs_health.h" struct kmem_cache *xfs_inode_cache; @@ -661,6 +662,8 @@ xfs_lookup( if (xfs_is_shutdown(dp->i_mount)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) @@ -978,6 +981,8 @@ xfs_create( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; prid = xfs_get_initial_prid(dp); @@ -1217,6 +1222,8 @@ xfs_link( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(tdp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(sip); if (error) @@ -2506,6 +2513,8 @@ xfs_remove( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return -EIO; error = xfs_qm_dqattach(dp); if (error) @@ -3758,3 +3767,29 @@ xfs_inode_reload_unlinked( return error; } + +/* Has this inode fork been zapped by repair? 
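Callers should fail with -EIO rather than interpret a zapped (zero-extents) fork as a legitimately empty directory, symlink, or attr fork.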
*/ +bool +xfs_ifork_zapped( + const struct xfs_inode *ip, + int whichfork) +{ + unsigned int datamask = 0; + + switch (whichfork) { + case XFS_DATA_FORK: + switch (ip->i_vnode.i_mode & S_IFMT) { + case S_IFDIR: + datamask = XFS_SICK_INO_DIR_ZAPPED; + break; + case S_IFLNK: + datamask = XFS_SICK_INO_SYMLINK_ZAPPED; + break; + } + return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask); + case XFS_ATTR_FORK: + return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED; + default: + return false; + } +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3beb470f1892..97f63bacd4c2 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete( int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); int xfs_inode_reload_unlinked(struct xfs_inode *ip); +bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 85e433df6a3f..7c713727f7fd 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -23,6 +23,7 @@ #include "xfs_trans.h" #include "xfs_ialloc.h" #include "xfs_error.h" +#include "xfs_health.h" /* ----- Kernel only functions below ----- */ int @@ -108,6 +109,8 @@ xfs_readlink( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_ifork_zapped(ip, XFS_DATA_FORK)) + return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 987843f84d03..364104e1b38a 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, }; int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + error = xfs_attr_get(&args); if (error) return error; @@ -294,6 +297,9 @@ xfs_vn_listxattr( struct inode *inode = d_inode(dentry); int error; + if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK)) + return -EIO; + /* * First read the regular on-disk attributes. */ From 2d295fe65776d15c06d53dbe3064f62e036e7c46 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:36 -0800 Subject: [PATCH 070/123] xfs: repair inode records If an inode is so badly damaged that it cannot be loaded into the cache, fix the ondisk metadata and try again. If there /is/ a cached inode, fix any problems and apply any optimizations that can be solved incore. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/inode.c | 6 +- fs/xfs/scrub/inode_repair.c | 820 ++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.c | 42 ++ fs/xfs/scrub/repair.h | 20 + fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 129 ++++++ 7 files changed, 1018 insertions(+), 2 deletions(-) create mode 100644 fs/xfs/scrub/inode_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7e1df6fdaaad..561ab59b9422 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -184,6 +184,7 @@ xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ ialloc_repair.o \ + inode_repair.o \ newbt.o \ reap.o \ refcount_repair.o \ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index a81f070b0cd2..6e2fe2d6250b 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -25,6 +25,7 @@ #include "scrub/common.h" #include "scrub/btree.h" #include "scrub/trace.h" +#include "scrub/repair.h" /* Prepare the attached inode for scrubbing. 
 */
 static inline int
@@ -185,8 +186,11 @@ xchk_setup_inode(
 	 * saying the inode is allocated and the icache being unable to load
 	 * the inode until we can flag the corruption in xchk_inode.  The
 	 * scrub function has to note the corruption, since we're not really
-	 * supposed to do that from the setup function.
+	 * supposed to do that from the setup function.  Save the mapping to
+	 * make repairs to the ondisk inode buffer.
 	 */
+	if (xchk_could_repair(sc))
+		xrep_setup_inode(sc, &imap);
 	return 0;
 
 out_cancel:
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
new file mode 100644
index 000000000000..f88d282fdfa5
--- /dev/null
+++ b/fs/xfs/scrub/inode_repair.c
@@ -0,0 +1,820 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_rmap.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Inode Record Repair
+ * ===================
+ *
+ * Roughly speaking, inode problems can be classified based on whether or not
+ * they trip the dinode verifiers.  If those trip, then we won't be able to
+ * xfs_iget the inode at all.
+ *
+ * Therefore, the xrep_dinode_* functions fix anything that would trip the
+ * inode buffer verifier or the dinode verifier.  The xrep_inode_* functions
+ * fix things on live incore inodes.  The inode repair functions make decisions
+ * with security and usability implications when reviving a file:
+ *
+ * - Files with zero di_mode or a garbage di_mode are converted to a regular
+ *   file that only root can read.  This file may not actually contain user
+ *   data if the file was not previously a regular file.  Setuid and setgid
+ *   bits are cleared.
+ *
+ * - Zero-size directories can be truncated to look empty.  It is necessary to
+ *   run the bmapbtd and directory repair functions to fully rebuild the
+ *   directory.
+ *
+ * - Zero-size symbolic link targets can be truncated to '?'.  It is necessary
+ *   to run the bmapbtd and symlink repair functions to salvage the symlink.
+ *
+ * - Invalid extent size hints will be removed.
+ *
+ * - Quotacheck will be scheduled if we repaired an inode that was so badly
+ *   damaged that the ondisk inode had to be rebuilt.
+ *
+ * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
+ *   Setuid and setgid bits are cleared.
+ */
+
+/*
+ * All the information we need to repair the ondisk inode if we can't iget the
+ * incore inode.  We don't allocate this buffer unless we're going to perform
+ * a repair to the ondisk inode cluster buffer.
+ */
+struct xrep_inode {
+	/* Inode mapping that we saved from the initial lookup attempt.
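If iget fails, this mapping is the only way we have to locate the inode cluster buffer and fix the ondisk dinode.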
*/ + struct xfs_imap imap; + + struct xfs_scrub *sc; + + /* Sick state to set after zapping parts of the inode. */ + unsigned int ino_sick_mask; +}; + +/* + * Setup function for inode repair. @imap contains the ondisk inode mapping + * information so that we can correct the ondisk inode cluster buffer if + * necessary to make iget work. + */ +int +xrep_setup_inode( + struct xfs_scrub *sc, + const struct xfs_imap *imap) +{ + struct xrep_inode *ri; + + sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS); + if (!sc->buf) + return -ENOMEM; + + ri = sc->buf; + memcpy(&ri->imap, imap, sizeof(struct xfs_imap)); + ri->sc = sc; + return 0; +} + +/* + * Make sure this ondisk inode can pass the inode buffer verifier. This is + * not the same as the dinode verifier. + */ +STATIC void +xrep_dinode_buf_core( + struct xfs_scrub *sc, + struct xfs_buf *bp, + unsigned int ioffset) +{ + struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset); + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + xfs_agino_t agino; + bool crc_ok = false; + bool magic_ok = false; + bool unlinked_ok = false; + + agino = be32_to_cpu(dip->di_next_unlinked); + + if (xfs_verify_agino_or_null(bp->b_pag, agino)) + unlinked_ok = true; + + if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + xfs_dinode_good_version(mp, dip->di_version)) + magic_ok = true; + + if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + crc_ok = true; + + if (magic_ok && unlinked_ok && crc_ok) + return; + + if (!magic_ok) { + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + dip->di_version = 3; + } + if (!unlinked_ok) + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_dinode_calc_crc(mp, dip); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(tp, bp, ioffset, + ioffset + sizeof(struct xfs_dinode) - 1); +} + +/* Make sure this inode cluster buffer can pass the inode buffer verifier. */ +STATIC void +xrep_dinode_buf( + struct xfs_scrub *sc, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = sc->mp; + int i; + int ni; + + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) + xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog); +} + +/* Reinitialize things that never change in an inode. */ +STATIC void +xrep_dinode_header( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + trace_xrep_dinode_header(sc, dip); + + dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + if (!xfs_dinode_good_version(sc->mp, dip->di_version)) + dip->di_version = 3; + dip->di_ino = cpu_to_be64(sc->sm->sm_ino); + uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid); + dip->di_gen = cpu_to_be32(sc->sm->sm_gen); +} + +/* Turn di_mode into /something/ recognizable. */ +STATIC void +xrep_dinode_mode( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_mode(sc, dip); + + if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN) + return; + + /* bad mode, so we set it to a file that only root can read */ + mode = S_IFREG; + dip->di_mode = cpu_to_be16(mode); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Fix any conflicting flags that the verifiers complain about. 
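The flags must be consistent with each other, with the file mode, and with the features enabled in the superblock.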
*/ +STATIC void +xrep_dinode_flags( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_flags(sc, dip); + + /* + * For regular files on a reflink filesystem, set the REFLINK flag to + * protect shared extents. A later stage will actually check those + * extents and clear the flag if possible. + */ + if (xfs_has_reflink(mp) && S_ISREG(mode)) + flags2 |= XFS_DIFLAG2_REFLINK; + else + flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE); + if (flags & XFS_DIFLAG_REALTIME) + flags2 &= ~XFS_DIFLAG2_REFLINK; + if (!xfs_has_bigtime(mp)) + flags2 &= ~XFS_DIFLAG2_BIGTIME; + if (!xfs_has_large_extent_counts(mp)) + flags2 &= ~XFS_DIFLAG2_NREXT64; + if (flags2 & XFS_DIFLAG2_NREXT64) + dip->di_nrext64_pad = 0; + else if (dip->di_version >= 3) + dip->di_v3_pad = 0; + dip->di_flags = cpu_to_be16(flags); + dip->di_flags2 = cpu_to_be64(flags2); +} + +/* + * Blow out symlink; now it points nowhere. We don't have to worry about + * incore state because this inode is failing the verifiers. + */ +STATIC void +xrep_dinode_zap_symlink( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + char *p; + + trace_xrep_dinode_zap_symlink(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + dip->di_size = cpu_to_be64(1); + p = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + *p = '?'; + ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED; +} + +/* + * Blow out dir, make the parent point to the root. In the future repair will + * reconstruct this directory for us. Note that there's no in-core directory + * inode because the sf verifier tripped, so we don't have to worry about the + * dentry cache. + */ +STATIC void +xrep_dinode_zap_dir( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_mount *mp = sc->mp; + struct xfs_dir2_sf_hdr *sfp; + int i8count; + + trace_xrep_dinode_zap_dir(sc, dip); + + dip->di_format = XFS_DINODE_FMT_LOCAL; + i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM; + sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + sfp->count = 0; + sfp->i8count = i8count; + xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino); + dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count)); + ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED; +} + +/* Make sure we don't have a garbage file size. */ +STATIC void +xrep_dinode_size( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + uint64_t size = be64_to_cpu(dip->di_size); + uint16_t mode = be16_to_cpu(dip->di_mode); + + trace_xrep_dinode_size(sc, dip); + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + /* di_size can't be nonzero for special files */ + dip->di_size = 0; + break; + case S_IFREG: + /* Regular files can't be larger than 2^63-1 bytes. */ + dip->di_size = cpu_to_be64(size & ~(1ULL << 63)); + break; + case S_IFLNK: + /* + * Truncate ridiculously oversized symlinks. If the size is + * zero, reset it to point to the current directory. Both of + * these conditions trigger dinode verifier errors, so there + * is no in-core state to reset. + */ + if (size > XFS_SYMLINK_MAXLEN) + dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN); + else if (size == 0) + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + /* + * Directories can't have a size larger than 32G. If the size + * is zero, reset it to an empty directory. 
Both of these + * conditions trigger dinode verifier errors, so there is no + * in-core state to reset. + */ + if (size > XFS_DIR2_SPACE_SIZE) + dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE); + else if (size == 0) + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* Fix extent size hints. */ +STATIC void +xrep_dinode_extsize_hints( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_mount *mp = sc->mp; + uint64_t flags2 = be64_to_cpu(dip->di_flags2); + uint16_t flags = be16_to_cpu(dip->di_flags); + uint16_t mode = be16_to_cpu(dip->di_mode); + + xfs_failaddr_t fa; + + trace_xrep_dinode_extsize_hints(sc, dip); + + fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) { + dip->di_extsize = 0; + dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE | + XFS_DIFLAG_EXTSZINHERIT); + } + + if (dip->di_version < 3) + return; + + fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) { + dip->di_cowextsize = 0; + dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE); + } +} + +/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ +STATIC int +xrep_dinode_core( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + struct xfs_buf *bp; + struct xfs_dinode *dip; + xfs_ino_t ino = sc->sm->sm_ino; + int error; + int iget_error; + + /* Read the inode cluster buffer. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, + ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, + NULL); + if (error) + return error; + + /* Make sure we can pass the inode buffer verifier. */ + xrep_dinode_buf(sc, bp); + bp->b_ops = &xfs_inode_buf_ops; + + /* Fix everything the verifier will complain about. */ + dip = xfs_buf_offset(bp, ri->imap.im_boffset); + xrep_dinode_header(sc, dip); + xrep_dinode_mode(sc, dip); + xrep_dinode_flags(sc, dip); + xrep_dinode_size(ri, dip); + xrep_dinode_extsize_hints(sc, dip); + + /* Write out the inode. */ + trace_xrep_dinode_fixed(sc, dip); + xfs_dinode_calc_crc(sc->mp, dip); + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF); + xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset, + ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1); + + /* + * In theory, we've fixed the ondisk inode record enough that we should + * be able to load the inode into the cache. Try to iget that inode + * now while we hold the AGI and the inode cluster buffer and take the + * IOLOCK so that we can continue with repairs without anyone else + * accessing the inode. If iget fails, we still need to commit the + * changes. + */ + iget_error = xchk_iget(sc, ino, &sc->ip); + if (!iget_error) + xchk_ilock(sc, XFS_IOLOCK_EXCL); + + /* + * Commit the inode cluster buffer updates and drop the AGI buffer that + * we've been holding since scrub setup. From here on out, repairs + * deal only with the cached inode. + */ + error = xrep_trans_commit(sc); + if (error) + return error; + + if (iget_error) + return iget_error; + + error = xchk_trans_alloc(sc, 0); + if (error) + return error; + + error = xrep_ino_dqattach(sc); + if (error) + return error; + + xchk_ilock(sc, XFS_ILOCK_EXCL); + if (ri->ino_sick_mask) + xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask); + return 0; +} + +/* Fix everything xfs_dinode_verify cares about. */ +STATIC int +xrep_dinode_problems( + struct xrep_inode *ri) +{ + struct xfs_scrub *sc = ri->sc; + int error; + + error = xrep_dinode_core(ri); + if (error) + return error; + + /* We had to fix a totally busted inode, schedule quotacheck. 
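Repairs may have changed the file's block usage or ownership, so the resource usage tracked in the dquots can no longer be trusted.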
*/ + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + + return 0; +} + +/* + * Fix problems that the verifiers don't care about. In general these are + * errors that don't cause problems elsewhere in the kernel that we can easily + * detect, so we don't check them all that rigorously. + */ + +/* Make sure block and extent counts are ok. */ +STATIC int +xrep_inode_blockcounts( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + xfs_filblks_t count; + xfs_filblks_t acount; + xfs_extnum_t nextents; + int error; + + trace_xrep_inode_blockcounts(sc); + + /* Set data fork counters from the data fork mappings. */ + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, + &nextents, &count); + if (error) + return error; + if (xfs_is_reflink_inode(sc->ip)) { + /* + * data fork blockcount can exceed physical storage if a user + * reflinks the same block over and over again. + */ + ; + } else if (XFS_IS_REALTIME_INODE(sc->ip)) { + if (count >= sc->mp->m_sb.sb_rblocks) + return -EFSCORRUPTED; + } else { + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + } + error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents); + if (error) + return error; + sc->ip->i_df.if_nextents = nextents; + + /* Set attr fork counters from the attr fork mappings. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); + if (ifp) { + error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, + &nextents, &acount); + if (error) + return error; + if (count >= sc->mp->m_sb.sb_dblocks) + return -EFSCORRUPTED; + error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK, + nextents); + if (error) + return error; + ifp->if_nextents = nextents; + } else { + acount = 0; + } + + sc->ip->i_nblocks = count + acount; + return 0; +} + +/* Check for invalid uid/gid/prid. */ +STATIC void +xrep_inode_ids( + struct xfs_scrub *sc) +{ + bool dirty = false; + + trace_xrep_inode_ids(sc); + + if (!uid_valid(VFS_I(sc->ip)->i_uid)) { + i_uid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_UQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_USER); + } + + if (!gid_valid(VFS_I(sc->ip)->i_gid)) { + i_gid_write(VFS_I(sc->ip), 0); + dirty = true; + if (XFS_IS_GQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP); + } + + if (sc->ip->i_projid == -1U) { + sc->ip->i_projid = 0; + dirty = true; + if (XFS_IS_PQUOTA_ON(sc->mp)) + xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ); + } + + /* strip setuid/setgid if we touched any of the ids */ + if (dirty) + VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID); +} + +static inline void +xrep_clamp_timestamp( + struct xfs_inode *ip, + struct timespec64 *ts) +{ + ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC); + *ts = timestamp_truncate(*ts, VFS_I(ip)); +} + +/* Nanosecond counters can't have more than 1 billion. */ +STATIC void +xrep_inode_timestamps( + struct xfs_inode *ip) +{ + struct timespec64 tstamp; + struct inode *inode = VFS_I(ip); + + tstamp = inode_get_atime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_atime_to_ts(inode, tstamp); + + tstamp = inode_get_mtime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_mtime_to_ts(inode, tstamp); + + tstamp = inode_get_ctime(inode); + xrep_clamp_timestamp(ip, &tstamp); + inode_set_ctime_to_ts(inode, tstamp); + + xrep_clamp_timestamp(ip, &ip->i_crtime); +} + +/* Fix inode flags that don't make sense together. 
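+ * Conflicts are resolved in favor of the file mode: flags that only make
+ * sense on directories or regular files are dropped when the mode says
+ * the inode is something else.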
*/ +STATIC void +xrep_inode_flags( + struct xfs_scrub *sc) +{ + uint16_t mode; + + trace_xrep_inode_flags(sc); + + mode = VFS_I(sc->ip)->i_mode; + + /* Clear junk flags */ + if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY) + sc->ip->i_diflags &= ~XFS_DIFLAG_ANY; + + /* NEWRTBM only applies to realtime bitmaps */ + if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino) + sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM; + else + sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM; + + /* These only make sense for directories. */ + if (!S_ISDIR(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT | + XFS_DIFLAG_EXTSZINHERIT | + XFS_DIFLAG_PROJINHERIT | + XFS_DIFLAG_NOSYMLINKS); + + /* These only make sense for files. */ + if (!S_ISREG(mode)) + sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME | + XFS_DIFLAG_EXTSIZE); + + /* These only make sense for non-rt files. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM; + + /* Immutable and append only? Drop the append. */ + if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) && + (sc->ip->i_diflags & XFS_DIFLAG_APPEND)) + sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND; + + /* Clear junk flags. */ + if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY; + + /* No reflink flag unless we support it and it's a file. */ + if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode)) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* DAX only applies to files and dirs. */ + if (!(S_ISREG(mode) || S_ISDIR(mode))) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX; + + /* No reflink files on the realtime device. */ + if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; +} + +/* + * Fix size problems with block/node format directories. If we fail to find + * the extent list, just bail out and let the bmapbtd repair functions clean + * up that mess. + */ +STATIC void +xrep_inode_blockdir_size( + struct xfs_scrub *sc) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + struct xfs_ifork *ifp; + xfs_fileoff_t off; + int error; + + trace_xrep_inode_blockdir_size(sc); + + error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK); + if (error) + return; + + /* Find the last block before 32G; this is the dir size. */ + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE); + if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) { + /* zero-extents directory? */ + return; + } + + off = got.br_startoff + got.br_blockcount; + sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE, + XFS_FSB_TO_B(sc->mp, off)); +} + +/* Fix size problems with short format directories. */ +STATIC void +xrep_inode_sfdir_size( + struct xfs_scrub *sc) +{ + struct xfs_ifork *ifp; + + trace_xrep_inode_sfdir_size(sc); + + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + sc->ip->i_disk_size = ifp->if_bytes; +} + +/* + * Fix any irregularities in a directory inode's size now that we can iterate + * extent maps and access other regular inode data. + */ +STATIC void +xrep_inode_dir_size( + struct xfs_scrub *sc) +{ + trace_xrep_inode_dir_size(sc); + + switch (sc->ip->i_df.if_format) { + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + xrep_inode_blockdir_size(sc); + break; + case XFS_DINODE_FMT_LOCAL: + xrep_inode_sfdir_size(sc); + break; + } +} + +/* Fix extent size hint problems. */ +STATIC void +xrep_inode_extsize( + struct xfs_scrub *sc) +{ + /* Fix misaligned extent size hints on a directory. 
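+	 * A directory that passes both RTINHERIT and EXTSZINHERIT to new
+	 * children must carry a hint that is a multiple of the realtime
+	 * extent size; otherwise realtime files created in it would inherit
+	 * a misaligned hint.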
*/ + if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) && + (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && + xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) { + sc->ip->i_extsize = 0; + sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT; + } +} + +/* Fix any irregularities in an inode that the verifiers don't catch. */ +STATIC int +xrep_inode_problems( + struct xfs_scrub *sc) +{ + int error; + + error = xrep_inode_blockcounts(sc); + if (error) + return error; + xrep_inode_timestamps(sc->ip); + xrep_inode_flags(sc); + xrep_inode_ids(sc); + /* + * We can now do a better job fixing the size of a directory now that + * we can scan the data fork extents than we could in xrep_dinode_size. + */ + if (S_ISDIR(VFS_I(sc->ip)->i_mode)) + xrep_inode_dir_size(sc); + xrep_inode_extsize(sc); + + trace_xrep_inode_fixed(sc); + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair an inode's fields. */ +int +xrep_inode( + struct xfs_scrub *sc) +{ + int error = 0; + + /* + * No inode? That means we failed the _iget verifiers. Repair all + * the things that the inode verifiers care about, then retry _iget. + */ + if (!sc->ip) { + struct xrep_inode *ri = sc->buf; + + ASSERT(ri != NULL); + + error = xrep_dinode_problems(ri); + if (error) + return error; + + /* By this point we had better have a working incore inode. */ + if (!sc->ip) + return -EFSCORRUPTED; + } + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* If we found corruption of any kind, try to fix it. */ + if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) { + error = xrep_inode_problems(sc); + if (error) + return error; + } + + /* See if we can clear the reflink flag. */ + if (xfs_is_reflink_inode(sc->ip)) { + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + return xrep_defer_finish(sc); +} diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 021f6ec72e87..25392dea326d 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -176,6 +176,16 @@ xrep_roll_ag_trans( return 0; } +/* Roll the scrub transaction, holding the primary metadata locked. */ +int +xrep_roll_trans( + struct xfs_scrub *sc) +{ + if (!sc->ip) + return xrep_roll_ag_trans(sc); + return xfs_trans_roll_inode(&sc->tp, sc->ip); +} + /* Finish all deferred work attached to the repair transaction. */ int xrep_defer_finish( @@ -740,6 +750,38 @@ xrep_ino_dqattach( } #endif /* CONFIG_XFS_QUOTA */ +/* + * Ensure that the inode being repaired is ready to handle a certain number of + * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode + * being repaired and have joined it to the scrub transaction. + */ +int +xrep_ino_ensure_extent_count( + struct xfs_scrub *sc, + int whichfork, + xfs_extnum_t nextents) +{ + xfs_extnum_t max_extents; + bool inode_has_nrext64; + + inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip); + max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork); + if (nextents <= max_extents) + return 0; + if (inode_has_nrext64) + return -EFSCORRUPTED; + if (!xfs_has_large_extent_counts(sc->mp)) + return -EFSCORRUPTED; + + max_extents = xfs_iext_max_nextents(true, whichfork); + if (nextents > max_extents) + return -EFSCORRUPTED; + + sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return 0; +} + /* * Initialize all the btree cursors for an AG repair except for the btree that * we're rebuilding. 
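For reference, a minimal sketch of how a fork repairer is expected to consume the helper added above. This mirrors the xrep_ino_ensure_extent_count() call site in xrep_inode_blockcounts() earlier in this patch; error handling is abbreviated and no new interfaces are implied:

	xfs_extnum_t	nextents;
	xfs_filblks_t	count;
	int		error;

	/* Count the mappings that the data fork should have... */
	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
			&nextents, &count);
	if (error)
		return error;

	/*
	 * ...then make sure the ondisk extent counter can hold that many
	 * records, upgrading the inode to large extent counts (NREXT64)
	 * if the filesystem supports it.
	 */
	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
	if (error)
		return error;
	sc->ip->i_df.if_nextents = nextents;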
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 93814acc678a..a513b84f5330 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -30,11 +30,22 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); +int xrep_roll_trans(struct xfs_scrub *sc); int xrep_defer_finish(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, enum xfs_ag_resv_type type); xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc); +static inline int +xrep_trans_commit( + struct xfs_scrub *sc) +{ + int error = xfs_trans_commit(sc->tp); + + sc->tp = NULL; + return error; +} + struct xbitmap; struct xagb_bitmap; @@ -66,11 +77,16 @@ int xrep_ino_dqattach(struct xfs_scrub *sc); # define xrep_ino_dqattach(sc) (0) #endif /* CONFIG_XFS_QUOTA */ +int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, + xfs_extnum_t nextents); int xrep_reset_perag_resv(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); +struct xfs_imap; +int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); + void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); /* Metadata revalidators */ @@ -88,6 +104,7 @@ int xrep_agi(struct xfs_scrub *sc); int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); int xrep_refcountbt(struct xfs_scrub *sc); +int xrep_inode(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -133,6 +150,8 @@ xrep_setup_nothing( } #define xrep_setup_ag_allocbt xrep_setup_nothing +#define xrep_setup_inode(sc, imap) ((void)0) + #define xrep_revalidate_allocbt (NULL) #define xrep_revalidate_iallocbt (NULL) @@ -144,6 +163,7 @@ xrep_setup_nothing( #define xrep_allocbt xrep_notsupported #define xrep_iallocbt xrep_notsupported #define xrep_refcountbt xrep_notsupported +#define xrep_inode xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6ff4dc57095f..7e903a0fde6c 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -282,7 +282,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_inode, .scrub = xchk_inode, - .repair = xrep_notsupported, + .repair = xrep_inode, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3f7af4430951..6041c716242a 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1393,6 +1393,135 @@ DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks); DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block); +DECLARE_EVENT_CLASS(xrep_dinode_class, + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), + TP_ARGS(sc, dip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(uint16_t, mode) + __field(uint8_t, version) + __field(uint8_t, format) + __field(uint32_t, uid) + __field(uint32_t, gid) + __field(uint64_t, size) + __field(uint64_t, nblocks) + __field(uint32_t, extsize) + __field(uint32_t, nextents) + __field(uint16_t, anextents) + __field(uint8_t, forkoff) + __field(uint8_t, aformat) + __field(uint16_t, flags) + __field(uint32_t, gen) + __field(uint64_t, flags2) + __field(uint32_t, cowextsize) + ), + TP_fast_assign( + __entry->dev = 
sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->mode = be16_to_cpu(dip->di_mode); + __entry->version = dip->di_version; + __entry->format = dip->di_format; + __entry->uid = be32_to_cpu(dip->di_uid); + __entry->gid = be32_to_cpu(dip->di_gid); + __entry->size = be64_to_cpu(dip->di_size); + __entry->nblocks = be64_to_cpu(dip->di_nblocks); + __entry->extsize = be32_to_cpu(dip->di_extsize); + __entry->nextents = be32_to_cpu(dip->di_nextents); + __entry->anextents = be16_to_cpu(dip->di_anextents); + __entry->forkoff = dip->di_forkoff; + __entry->aformat = dip->di_aformat; + __entry->flags = be16_to_cpu(dip->di_flags); + __entry->gen = be32_to_cpu(dip->di_gen); + __entry->flags2 = be64_to_cpu(dip->di_flags2); + __entry->cowextsize = be32_to_cpu(dip->di_cowextsize); + ), + TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->mode, + __entry->version, + __entry->format, + __entry->uid, + __entry->gid, + __entry->size, + __entry->nblocks, + __entry->extsize, + __entry->nextents, + __entry->anextents, + __entry->forkoff, + __entry->aformat, + __entry->flags, + __entry->gen, + __entry->flags2, + __entry->cowextsize) +) + +#define DEFINE_REPAIR_DINODE_EVENT(name) \ +DEFINE_EVENT(xrep_dinode_class, name, \ + TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \ + TP_ARGS(sc, dip)) +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir); +DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed); + +DECLARE_EVENT_CLASS(xrep_inode_class, + TP_PROTO(struct xfs_scrub *sc), + TP_ARGS(sc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_rfsblock_t, nblocks) + __field(uint16_t, flags) + __field(uint64_t, flags2) + __field(uint32_t, nextents) + __field(uint8_t, format) + __field(uint32_t, anextents) + __field(uint8_t, aformat) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->ino = sc->sm->sm_ino; + __entry->size = sc->ip->i_disk_size; + __entry->nblocks = sc->ip->i_nblocks; + __entry->flags = sc->ip->i_diflags; + __entry->flags2 = sc->ip->i_diflags2; + __entry->nextents = sc->ip->i_df.if_nextents; + __entry->format = sc->ip->i_df.if_format; + __entry->anextents = sc->ip->i_af.if_nextents; + __entry->aformat = sc->ip->i_af.if_format; + ), + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->nblocks, + __entry->flags, + __entry->flags2, + __entry->nextents, + __entry->format, + __entry->anextents, + __entry->aformat) +) + +#define DEFINE_REPAIR_INODE_EVENT(name) \ +DEFINE_EVENT(xrep_inode_class, name, \ + TP_PROTO(struct xfs_scrub *sc), \ + TP_ARGS(sc)) +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size); 
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size); +DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ From e744cef206055954517648070d2b3aaa3d2515ba Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:37 -0800 Subject: [PATCH 071/123] xfs: zap broken inode forks Determine if inode fork damage is responsible for the inode being unable to pass the ifork verifiers in xfs_iget and zap the fork contents if this is true. Once this is done the fork will be empty but we'll be able to construct an in-core inode, and a subsequent call to the inode fork repair ioctl will search the rmapbt to rebuild the records that were in the fork. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_attr_leaf.c | 13 +- fs/xfs/libxfs/xfs_attr_leaf.h | 3 +- fs/xfs/libxfs/xfs_bmap.c | 22 +- fs/xfs/libxfs/xfs_bmap.h | 2 + fs/xfs/libxfs/xfs_dir2_priv.h | 3 +- fs/xfs/libxfs/xfs_dir2_sf.c | 13 +- fs/xfs/libxfs/xfs_inode_fork.c | 33 +- fs/xfs/libxfs/xfs_shared.h | 2 +- fs/xfs/libxfs/xfs_symlink_remote.c | 8 +- fs/xfs/scrub/inode_repair.c | 713 ++++++++++++++++++++++++++++- fs/xfs/scrub/trace.h | 42 ++ 11 files changed, 808 insertions(+), 46 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 654e17e6610d..5d1ab4978f32 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -1040,23 +1040,16 @@ xfs_attr_shortform_allfit( return xfs_attr_shortform_bytesfit(dp, bytes); } -/* Verify the consistency of an inline attribute fork. */ +/* Verify the consistency of a raw inline attribute fork. */ xfs_failaddr_t xfs_attr_shortform_verify( - struct xfs_inode *ip) + struct xfs_attr_shortform *sfp, + size_t size) { - struct xfs_attr_shortform *sfp; struct xfs_attr_sf_entry *sfep; struct xfs_attr_sf_entry *next_sfep; char *endp; - struct xfs_ifork *ifp; int i; - int64_t size; - - ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); - ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); - sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = ifp->if_bytes; /* * Give up if the attribute is way too short. diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 368f4d9fa1d5..ce6743463c86 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -56,7 +56,8 @@ int xfs_attr_sf_findname(struct xfs_da_args *args, unsigned int *basep); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); -xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_shortform *sfp, + size_t size); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e308d2f44a3c..a073ca877ced 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6168,19 +6168,18 @@ xfs_bmap_finish_one( return error; } -/* Check that an inode's extent does not have invalid flags or bad ranges. */ +/* Check that an extent does not have invalid flags or bad ranges. 
*/ xfs_failaddr_t -xfs_bmap_validate_extent( - struct xfs_inode *ip, +xfs_bmap_validate_extent_raw( + struct xfs_mount *mp, + bool rtfile, int whichfork, struct xfs_bmbt_irec *irec) { - struct xfs_mount *mp = ip->i_mount; - if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount)) return __this_address; - if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) { + if (rtfile && whichfork == XFS_DATA_FORK) { if (!xfs_verify_rtbext(mp, irec->br_startblock, irec->br_blockcount)) return __this_address; @@ -6210,3 +6209,14 @@ xfs_bmap_intent_destroy_cache(void) kmem_cache_destroy(xfs_bmap_intent_cache); xfs_bmap_intent_cache = NULL; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + return xfs_bmap_validate_extent_raw(ip->i_mount, + XFS_IS_REALTIME_INODE(ip), whichfork, irec); +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e33470e39728..8518324db285 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -263,6 +263,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, + int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 7404a9ff1a92..1db2e60ba827 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); -extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, int64_t size); int xfs_dir2_sf_entsize(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, int len); void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8cd37e6e9d38..870ef1d1ebe4 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -707,11 +707,10 @@ xfs_dir2_sf_check( /* Verify the consistency of an inline directory. */ xfs_failaddr_t xfs_dir2_sf_verify( - struct xfs_inode *ip) + struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *sfp, + int64_t size) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - struct xfs_dir2_sf_hdr *sfp; struct xfs_dir2_sf_entry *sfep; struct xfs_dir2_sf_entry *next_sfep; char *endp; @@ -719,15 +718,9 @@ xfs_dir2_sf_verify( int i; int i8count; int offset; - int64_t size; int error; uint8_t filetype; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - - sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; - size = ifp->if_bytes; - /* * Give up if the directory is way too short. 
*/ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5a2e7ddfa76d..dad8ea832c20 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -702,12 +702,22 @@ xfs_ifork_verify_local_data( xfs_failaddr_t fa = NULL; switch (VFS_I(ip)->i_mode & S_IFMT) { - case S_IFDIR: - fa = xfs_dir2_sf_verify(ip); + case S_IFDIR: { + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + struct xfs_dir2_sf_hdr *sfp; + + sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; + fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes); break; - case S_IFLNK: - fa = xfs_symlink_shortform_verify(ip); + } + case S_IFLNK: { + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); + + fa = xfs_symlink_shortform_verify(ifp->if_u1.if_data, + ifp->if_bytes); break; + } default: break; } @@ -729,11 +739,20 @@ xfs_ifork_verify_local_attr( struct xfs_ifork *ifp = &ip->i_af; xfs_failaddr_t fa; - if (!xfs_inode_has_attr_fork(ip)) + if (!xfs_inode_has_attr_fork(ip)) { fa = __this_address; - else - fa = xfs_attr_shortform_verify(ip); + } else { + struct xfs_attr_shortform *sfp; + struct xfs_ifork *ifp; + int64_t size; + ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); + ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); + sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + size = ifp->if_bytes; + + fa = xfs_attr_shortform_verify(sfp, size); + } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", ifp->if_u1.if_data, ifp->if_bytes, fa); diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index c4381388c0c1..4220d3584c1b 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -139,7 +139,7 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, uint32_t size, struct xfs_buf *bp); void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp); -xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip); +xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size); /* Computed inode geometry for the filesystem. */ struct xfs_ino_geometry { diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index bdc777b9ec4a..3c96d1d617fb 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -202,15 +202,11 @@ xfs_symlink_local_to_remote( */ xfs_failaddr_t xfs_symlink_shortform_verify( - struct xfs_inode *ip) + void *sfp, + int64_t size) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - char *sfp = (char *)ifp->if_u1.if_data; - int size = ifp->if_bytes; char *endp = sfp + size; - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - /* * Zero length symlinks should never occur in memory as they are * never allowed to exist on disk. 
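The net effect of the refactoring above is that the shortform verifiers now take a raw buffer and length instead of an incore inode, so callers can validate an ondisk fork before any inode is constructed. A minimal sketch of the new calling convention, mirroring xrep_dinode_check_dfork() in the hunk below; dip, mp, and mode stand for the ondisk inode, the mount, and the file mode, and the local names are illustrative:

	void		*p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
	int64_t		size = be64_to_cpu(dip->di_size);
	xfs_failaddr_t	fa = NULL;
	bool		zap_data_fork = false;

	/* Validate a local-format fork without touching the inode cache. */
	if (S_ISDIR(mode))
		fa = xfs_dir2_sf_verify(mp, p, size);
	else if (S_ISLNK(mode))
		fa = xfs_symlink_shortform_verify(p, size);
	if (fa)
		zap_data_fork = true;	/* fork contents are garbage */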
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index f88d282fdfa5..66949cc3d7cc 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -22,8 +22,11 @@ #include "xfs_ialloc.h" #include "xfs_da_format.h" #include "xfs_reflink.h" +#include "xfs_alloc.h" #include "xfs_rmap.h" +#include "xfs_rmap_btree.h" #include "xfs_bmap.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap_util.h" #include "xfs_dir2.h" #include "xfs_dir2_priv.h" @@ -31,6 +34,8 @@ #include "xfs_quota.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_attr_leaf.h" +#include "xfs_log_priv.h" #include "xfs_health.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" @@ -71,6 +76,16 @@ * * - Invalid user, group, or project IDs (aka -1U) will be reset to zero. * Setuid and setgid bits are cleared. + * + * - Data and attr forks are reset to extents format with zero extents if the + * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta + * repair functions to recover the space mapping. + * + * - ACLs will not be recovered if the attr fork is zapped or the extended + * attribute structure itself requires salvaging. + * + * - If the attr fork is zapped, the user and group ids are reset to root and + * the setuid and setgid bits are removed. */ /* @@ -84,8 +99,33 @@ struct xrep_inode { struct xfs_scrub *sc; + /* Blocks in use on the data device by data extents or bmbt blocks. */ + xfs_rfsblock_t data_blocks; + + /* Blocks in use on the rt device. */ + xfs_rfsblock_t rt_blocks; + + /* Blocks in use by the attr fork. */ + xfs_rfsblock_t attr_blocks; + + /* Number of data device extents for the data fork. */ + xfs_extnum_t data_extents; + + /* + * Number of realtime device extents for the data fork. If + * data_extents and rt_extents indicate that the data fork has extents + * on both devices, we'll just back away slowly. + */ + xfs_extnum_t rt_extents; + + /* Number of (data device) extents for the attr fork. */ + xfs_aextnum_t attr_extents; + /* Sick state to set after zapping parts of the inode. */ unsigned int ino_sick_mask; + + /* Must we remove all access from this file? */ + bool zap_acls; }; /* @@ -190,9 +230,10 @@ xrep_dinode_header( /* Turn di_mode into /something/ recognizable. */ STATIC void xrep_dinode_mode( - struct xfs_scrub *sc, + struct xrep_inode *ri, struct xfs_dinode *dip) { + struct xfs_scrub *sc = ri->sc; uint16_t mode = be16_to_cpu(dip->di_mode); trace_xrep_dinode_mode(sc, dip); @@ -205,13 +246,15 @@ xrep_dinode_mode( dip->di_mode = cpu_to_be16(mode); dip->di_uid = 0; dip->di_gid = 0; + ri->zap_acls = true; } /* Fix any conflicting flags that the verifiers complain about. */ STATIC void xrep_dinode_flags( struct xfs_scrub *sc, - struct xfs_dinode *dip) + struct xfs_dinode *dip, + bool isrt) { struct xfs_mount *mp = sc->mp; uint64_t flags2 = be64_to_cpu(dip->di_flags2); @@ -220,6 +263,11 @@ xrep_dinode_flags( trace_xrep_dinode_flags(sc, dip); + if (isrt) + flags |= XFS_DIFLAG_REALTIME; + else + flags &= ~XFS_DIFLAG_REALTIME; + /* * For regular files on a reflink filesystem, set the REFLINK flag to * protect shared extents. A later stage will actually check those @@ -377,6 +425,657 @@ xrep_dinode_extsize_hints( } } +/* Count extents and blocks for an inode given an rmap. */ +STATIC int +xrep_dinode_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_inode *ri = priv; + int error = 0; + + if (xchk_should_terminate(ri->sc, &error)) + return error; + + /* We only care about this inode. 
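+	 * xfs_rmap_query_all visits every record in the AG, so skip the
+	 * records that belong to other owners.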
*/ + if (rec->rm_owner != ri->sc->sm->sm_ino) + return 0; + + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) { + ri->attr_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->attr_extents++; + + return 0; + } + + ri->data_blocks += rec->rm_blockcount; + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK)) + ri->data_extents++; + + return 0; +} + +/* Count extents and blocks for an inode from all AG rmap data. */ +STATIC int +xrep_dinode_count_ag_rmaps( + struct xrep_inode *ri, + struct xfs_perag *pag) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag); + error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(ri->sc->tp, agf); + return error; +} + +/* Count extents and blocks for a given inode from all rmap data. */ +STATIC int +xrep_dinode_count_rmaps( + struct xrep_inode *ri) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error; + + if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp)) + return -EOPNOTSUPP; + + for_each_perag(ri->sc->mp, agno, pag) { + error = xrep_dinode_count_ag_rmaps(ri, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + /* Can't have extents on both the rt and the data device. */ + if (ri->data_extents && ri->rt_extents) + return -EFSCORRUPTED; + + trace_xrep_dinode_count_rmaps(ri->sc, + ri->data_blocks, ri->rt_blocks, ri->attr_blocks, + ri->data_extents, ri->rt_extents, ri->attr_extents); + return 0; +} + +/* Return true if this extents-format ifork looks like garbage. */ +STATIC bool +xrep_dinode_bad_extents_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmbt_irec new; + struct xfs_bmbt_rec *dp; + xfs_extnum_t nex; + bool isrt; + unsigned int i; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex > dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + dp = XFS_DFORK_PTR(dip, whichfork); + + isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME); + for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + + xfs_bmbt_disk_get_all(dp, &new); + fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork, + &new); + if (fa) + return true; + } + + return false; +} + +/* Return true if this btree-format ifork looks like garbage. 
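+ * The root block header must fit inside the fork area, and every key and
+ * pointer in the root must point within the filesystem.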
*/ +STATIC bool +xrep_dinode_bad_bmbt_fork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + unsigned int dfork_size, + int whichfork) +{ + struct xfs_bmdr_block *dfp; + xfs_extnum_t nex; + unsigned int i; + unsigned int dmxr; + unsigned int nrecs; + unsigned int level; + + nex = xfs_dfork_nextents(dip, whichfork); + if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec)) + return true; + + if (dfork_size < sizeof(struct xfs_bmdr_block)) + return true; + + dfp = XFS_DFORK_PTR(dip, whichfork); + nrecs = be16_to_cpu(dfp->bb_numrecs); + level = be16_to_cpu(dfp->bb_level); + + if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size) + return true; + if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork)) + return true; + + dmxr = xfs_bmdr_maxrecs(dfork_size, 0); + for (i = 1; i <= nrecs; i++) { + struct xfs_bmbt_key *fkp; + xfs_bmbt_ptr_t *fpp; + xfs_fileoff_t fileoff; + xfs_fsblock_t fsbno; + + fkp = XFS_BMDR_KEY_ADDR(dfp, i); + fileoff = be64_to_cpu(fkp->br_startoff); + if (!xfs_verify_fileoff(sc->mp, fileoff)) + return true; + + fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr); + fsbno = be64_to_cpu(*fpp); + if (!xfs_verify_fsbno(sc->mp, fsbno)) + return true; + } + + return false; +} + +/* + * Check the data fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_dfork( + struct xfs_scrub *sc, + struct xfs_dinode *dip, + uint16_t mode) +{ + void *dfork_ptr; + int64_t data_size; + unsigned int fmt; + unsigned int dfork_size; + + /* + * Verifier functions take signed int64_t, so check for bogus negative + * values first. + */ + data_size = be64_to_cpu(dip->di_size); + if (data_size < 0) + return true; + + fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK); + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (fmt != XFS_DINODE_FMT_DEV) + return true; + break; + case S_IFREG: + if (fmt == XFS_DINODE_FMT_LOCAL) + return true; + fallthrough; + case S_IFLNK: + case S_IFDIR: + switch (fmt) { + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return true; + } + break; + default: + return true; + } + + dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK); + dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + + switch (fmt) { + case XFS_DINODE_FMT_DEV: + break; + case XFS_DINODE_FMT_LOCAL: + /* dir/symlink structure cannot be larger than the fork */ + if (data_size > dfork_size) + return true; + /* directory structure must pass verification. */ + if (S_ISDIR(mode) && + xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL) + return true; + /* symlink structure must pass verification. 
*/ + if (S_ISLNK(mode) && + xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL) + return true; + break; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size, + XFS_DATA_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +static void +xrep_dinode_set_data_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_nextents = cpu_to_be64(nextents); + else + dip->di_nextents = cpu_to_be32(nextents); +} + +static void +xrep_dinode_set_attr_nextents( + struct xfs_dinode *dip, + xfs_extnum_t nextents) +{ + if (xfs_dinode_has_large_extent_counts(dip)) + dip->di_big_anextents = cpu_to_be32(nextents); + else + dip->di_anextents = cpu_to_be16(nextents); +} + +/* Reset the data fork to something sane. */ +STATIC void +xrep_dinode_zap_dfork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_dfork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED; + + xrep_dinode_set_data_nextents(dip, 0); + ri->data_blocks = 0; + ri->rt_blocks = 0; + + /* Special files always get reset to DEV */ + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + dip->di_format = XFS_DINODE_FMT_DEV; + dip->di_size = 0; + return; + } + + /* + * If we have data extents, reset to an empty map and hope the user + * will run the bmapbtd checker next. + */ + if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) { + dip->di_format = XFS_DINODE_FMT_EXTENTS; + return; + } + + /* Otherwise, reset the local format to the minimum. */ + switch (mode & S_IFMT) { + case S_IFLNK: + xrep_dinode_zap_symlink(ri, dip); + break; + case S_IFDIR: + xrep_dinode_zap_dir(ri, dip); + break; + } +} + +/* + * Check the attr fork for things that will fail the ifork verifiers or the + * ifork formatters. + */ +STATIC bool +xrep_dinode_check_afork( + struct xfs_scrub *sc, + struct xfs_dinode *dip) +{ + struct xfs_attr_shortform *afork_ptr; + size_t attr_size; + unsigned int afork_size; + + if (XFS_DFORK_BOFF(dip) == 0) + return dip->di_aformat != XFS_DINODE_FMT_EXTENTS || + xfs_dfork_attr_extents(dip) != 0; + + afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + + switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) { + case XFS_DINODE_FMT_LOCAL: + /* Fork has to be large enough to extract the xattr size. */ + if (afork_size < sizeof(struct xfs_attr_sf_hdr)) + return true; + + /* xattr structure cannot be larger than the fork */ + attr_size = be16_to_cpu(afork_ptr->hdr.totsize); + if (attr_size > afork_size) + return true; + + /* xattr structure must pass verification. */ + return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL; + case XFS_DINODE_FMT_EXTENTS: + if (xrep_dinode_bad_extents_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + case XFS_DINODE_FMT_BTREE: + if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size, + XFS_ATTR_FORK)) + return true; + break; + default: + return true; + } + + return false; +} + +/* + * Reset the attr fork to empty. Since the attr fork could have contained + * ACLs, make the file readable only by root. 
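+ * We cannot tell whether the destroyed xattrs included ACLs or other
+ * security attributes, so revoke all discretionary access until the
+ * administrator can inspect the file.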
+ */ +STATIC void +xrep_dinode_zap_afork( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_scrub *sc = ri->sc; + + trace_xrep_dinode_zap_afork(sc, dip); + + ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED; + + dip->di_aformat = XFS_DINODE_FMT_EXTENTS; + xrep_dinode_set_attr_nextents(dip, 0); + ri->attr_blocks = 0; + + /* + * If the data fork is in btree format, removing the attr fork entirely + * might cause verifier failures if the next level down in the bmbt + * could now fit in the data fork area. + */ + if (dip->di_format != XFS_DINODE_FMT_BTREE) + dip->di_forkoff = 0; + dip->di_mode = cpu_to_be16(mode & ~0777); + dip->di_uid = 0; + dip->di_gid = 0; +} + +/* Make sure the fork offset is a sensible value. */ +STATIC void +xrep_dinode_ensure_forkoff( + struct xrep_inode *ri, + struct xfs_dinode *dip, + uint16_t mode) +{ + struct xfs_bmdr_block *bmdr; + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t attr_extents, data_extents; + size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1); + unsigned int lit_sz = XFS_LITINO(sc->mp); + unsigned int afork_min, dfork_min; + + trace_xrep_dinode_ensure_forkoff(sc, dip); + + /* + * Before calling this function, xrep_dinode_core ensured that both + * forks actually fit inside their respective literal areas. If this + * was not the case, the fork was reset to FMT_EXTENTS with zero + * records. If the rmapbt scan found attr or data fork blocks, this + * will be noted in the dinode_stats, and we must leave enough room + * for the bmap repair code to reconstruct the mapping structure. + * + * First, compute the minimum space required for the attr fork. + */ + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform xattr structure at all, that + * means the attr fork area was exactly large enough to fit + * the sf structure. + */ + afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_EXTENTS: + attr_extents = xfs_dfork_attr_extents(dip); + if (attr_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents; + } else if (ri->attr_extents > 0) { + /* + * The attr fork thinks it has zero extents, but we + * found some xattr extents. We need to leave enough + * empty space here so that the incore attr fork will + * get created (and hence trigger the attr fork bmap + * repairer). + */ + afork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + afork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK); + afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + /* We should never see any other formats. */ + afork_min = 0; + break; + } + + /* Compute the minimum space required for the data fork. */ + switch (dip->di_format) { + case XFS_DINODE_FMT_DEV: + dfork_min = sizeof(__be32); + break; + case XFS_DINODE_FMT_UUID: + dfork_min = sizeof(uuid_t); + break; + case XFS_DINODE_FMT_LOCAL: + /* + * If we still have a shortform data fork at all, that means + * the data fork area was large enough to fit whatever was in + * there. 
+ */ + dfork_min = be64_to_cpu(dip->di_size); + break; + case XFS_DINODE_FMT_EXTENTS: + data_extents = xfs_dfork_data_extents(dip); + if (data_extents) { + /* + * We must maintain sufficient space to hold the entire + * extent map array in the data fork. Note that we + * previously zapped the fork if it had no chance of + * fitting in the inode. + */ + dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents; + } else if (ri->data_extents > 0 || ri->rt_extents > 0) { + /* + * The data fork thinks it has zero extents, but we + * found some data extents. We need to leave enough + * empty space here so that the data fork bmap repair + * will recover the mappings. + */ + dfork_min = bmdr_minsz; + } else { + /* No extents on disk or found in rmapbt. */ + dfork_min = 0; + } + break; + case XFS_DINODE_FMT_BTREE: + /* Must have space for btree header and key/pointers. */ + bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK); + dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr); + break; + default: + dfork_min = 0; + break; + } + + /* + * Round all values up to the nearest 8 bytes, because that is the + * precision of di_forkoff. + */ + afork_min = roundup(afork_min, 8); + dfork_min = roundup(dfork_min, 8); + bmdr_minsz = roundup(bmdr_minsz, 8); + + ASSERT(dfork_min <= lit_sz); + ASSERT(afork_min <= lit_sz); + + /* + * If the data fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork up. + */ + if (dip->di_format == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_data_extents(dip) == 0 && + (ri->data_extents > 0 || ri->rt_extents > 0) && + bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) { + if (bmdr_minsz + afork_min > lit_sz) { + /* + * The attr for and the stub fork we need to recover + * the data fork won't both fit. Zap the attr fork. + */ + xrep_dinode_zap_afork(ri, dip, mode); + afork_min = bmdr_minsz; + } else { + void *before, *after; + + /* Otherwise, just slide the attr fork up. */ + before = XFS_DFORK_APTR(dip); + dip->di_forkoff = bmdr_minsz >> 3; + after = XFS_DFORK_APTR(dip); + memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp)); + } + } + + /* + * If the attr fork was zapped and we don't have enough space for the + * recovery fork, move the attr fork down. + */ + if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS && + xfs_dfork_attr_extents(dip) == 0 && + ri->attr_extents > 0 && + bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) { + if (dip->di_format == XFS_DINODE_FMT_BTREE) { + /* + * If the data fork is in btree format then we can't + * adjust forkoff because that runs the risk of + * violating the extents/btree format transition rules. + */ + } else if (bmdr_minsz + dfork_min > lit_sz) { + /* + * If we can't move the attr fork, too bad, we lose the + * attr fork and leak its blocks. + */ + xrep_dinode_zap_afork(ri, dip, mode); + } else { + /* + * Otherwise, just slide the attr fork down. The attr + * fork is empty, so we don't have any old contents to + * move here. + */ + dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3; + } + } +} + +/* + * Zap the data/attr forks if we spot anything that isn't going to pass the + * ifork verifiers or the ifork formatters, because we need to get the inode + * into good enough shape that the higher level repair functions can run. 
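+ * A zapped fork is left in EXTENTS format with zero records; the bmapbtd
+ * and bmapbta repair functions must then rebuild the mappings from the
+ * rmapbt.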
+ */ +STATIC void +xrep_dinode_zap_forks( + struct xrep_inode *ri, + struct xfs_dinode *dip) +{ + struct xfs_scrub *sc = ri->sc; + xfs_extnum_t data_extents; + xfs_extnum_t attr_extents; + xfs_filblks_t nblocks; + uint16_t mode; + bool zap_datafork = false; + bool zap_attrfork = ri->zap_acls; + + trace_xrep_dinode_zap_forks(sc, dip); + + mode = be16_to_cpu(dip->di_mode); + + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + nblocks = be64_to_cpu(dip->di_nblocks); + + /* Inode counters don't make sense? */ + if (data_extents > nblocks) + zap_datafork = true; + if (attr_extents > nblocks) + zap_attrfork = true; + if (data_extents + attr_extents > nblocks) + zap_datafork = zap_attrfork = true; + + if (!zap_datafork) + zap_datafork = xrep_dinode_check_dfork(sc, dip, mode); + if (!zap_attrfork) + zap_attrfork = xrep_dinode_check_afork(sc, dip); + + /* Zap whatever's bad. */ + if (zap_attrfork) + xrep_dinode_zap_afork(ri, dip, mode); + if (zap_datafork) + xrep_dinode_zap_dfork(ri, dip, mode); + xrep_dinode_ensure_forkoff(ri, dip, mode); + + /* + * Zero di_nblocks if we don't have any extents at all to satisfy the + * buffer verifier. + */ + data_extents = xfs_dfork_data_extents(dip); + attr_extents = xfs_dfork_attr_extents(dip); + if (data_extents + attr_extents == 0) + dip->di_nblocks = 0; +} + /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */ STATIC int xrep_dinode_core( @@ -389,6 +1088,11 @@ xrep_dinode_core( int error; int iget_error; + /* Figure out what this inode had mapped in both forks. */ + error = xrep_dinode_count_rmaps(ri); + if (error) + return error; + /* Read the inode cluster buffer. */ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, @@ -403,10 +1107,11 @@ xrep_dinode_core( /* Fix everything the verifier will complain about. */ dip = xfs_buf_offset(bp, ri->imap.im_boffset); xrep_dinode_header(sc, dip); - xrep_dinode_mode(sc, dip); - xrep_dinode_flags(sc, dip); + xrep_dinode_mode(ri, dip); + xrep_dinode_flags(sc, dip, ri->rt_extents > 0); xrep_dinode_size(ri, dip); xrep_dinode_extsize_hints(sc, dip); + xrep_dinode_zap_forks(ri, dip); /* Write out the inode. 
*/
	trace_xrep_dinode_fixed(sc, dip);
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 6041c716242a..120faa4dce2d 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1469,6 +1469,10 @@ DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints);
 DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink);
 DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir);
 DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff);

 DECLARE_EVENT_CLASS(xrep_inode_class,
 	TP_PROTO(struct xfs_scrub *sc),
 	TP_ARGS(sc),
@@ -1522,6 +1526,44 @@ DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size);
 DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size);
 DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed);

+TRACE_EVENT(xrep_dinode_count_rmaps,
+	TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks,
+		 xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks,
+		 xfs_extnum_t data_extents, xfs_extnum_t rt_extents,
+		 xfs_aextnum_t attr_extents),
+	TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents,
+		rt_extents, attr_extents),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_rfsblock_t, data_blocks)
+		__field(xfs_rfsblock_t, rt_blocks)
+		__field(xfs_rfsblock_t, attr_blocks)
+		__field(xfs_extnum_t, data_extents)
+		__field(xfs_extnum_t, rt_extents)
+		__field(xfs_aextnum_t, attr_extents)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->ino = sc->sm->sm_ino;
+		__entry->data_blocks = data_blocks;
+		__entry->rt_blocks = rt_blocks;
+		__entry->attr_blocks = attr_blocks;
+		__entry->data_extents = data_extents;
+		__entry->rt_extents = rt_extents;
+		__entry->attr_extents = attr_extents;
+	),
+	TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->data_blocks,
+		  __entry->rt_blocks,
+		  __entry->attr_blocks,
+		  __entry->data_extents,
+		  __entry->rt_extents,
+		  __entry->attr_extents)
+);
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 #endif /* _TRACE_XFS_SCRUB_TRACE_H */

From 6c7289528d3c91855d78c56bb35fa360ed9a40bd Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Fri, 15 Dec 2023 10:03:37 -0800
Subject: [PATCH 072/123] xfs: abort directory parent scrub scans if we
 encounter a zapped directory

In a previous patch, we added some code to perform sufficient repairs
to an ondisk inode record such that the inode cache would be willing to
load the inode. If the broken inode was a shortform directory, that
code resets the directory to something plausible, which is to say an
empty subdirectory of the root. The telltale sign that something is
seriously wrong is the broken link count. Such directories look clean,
but they shouldn't participate in a filesystem scan to find or confirm
a directory parent pointer. Create a predicate that identifies such
directories, and abort the scrub when one is found.

Found by fuzzing xfs/1554 with multithreaded xfs_scrub enabled and
u3.bmx[0].startblock = zeroes.

Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/common.c | 1 + fs/xfs/scrub/common.h | 2 ++ fs/xfs/scrub/dir.c | 26 ++++++++++++++++++++++++++ fs/xfs/scrub/parent.c | 17 +++++++++++++++++ 4 files changed, 46 insertions(+) diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index f0207e71e5dc..81f2b96bb5a7 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -25,6 +25,7 @@ #include "xfs_trans_priv.h" #include "xfs_da_format.h" #include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" #include "xfs_attr.h" #include "xfs_reflink.h" #include "xfs_ag.h" diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index c69cacb0b696..ec5755266259 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -198,6 +198,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) XFS_SCRUB_OFLAG_XCORRUPT); } +bool xchk_dir_looks_zapped(struct xfs_inode *dp); + #ifdef CONFIG_XFS_ONLINE_REPAIR /* Decide if a repair is required. */ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm) diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index b366fab699ac..d86ab51af928 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -798,3 +798,29 @@ xchk_directory( xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED); return 0; } + +/* + * Decide if this directory has been zapped to satisfy the inode and ifork + * verifiers. Checking and repairing should be postponed until the directory + * is fixed. + */ +bool +xchk_dir_looks_zapped( + struct xfs_inode *dp) +{ + /* Repair zapped this dir's data fork a short time ago */ + if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) + return true; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for the bmapbtd + * scrubber to reconstruct the block mappings. Directories always + * contain some content, so this is a clear sign of a zapped directory. + * The state checked by xfs_ifork_zapped is not persisted, so this is + * the secondary strategy if repairs are interrupted by a crash or an + * unmount. + */ + return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS && + dp->i_df.if_nextents == 0; +} diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index e6155d86f791..7db873672146 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -156,6 +156,16 @@ xchk_parent_validate( goto out_rele; } + /* + * We cannot yet validate this parent pointer if the directory looks as + * though it has been zapped by the inode record repair code. + */ + if (xchk_dir_looks_zapped(dp)) { + error = -EBUSY; + xchk_set_incomplete(sc); + goto out_unlock; + } + /* Look for a directory entry in the parent pointing to the child. */ error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc); if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) @@ -217,6 +227,13 @@ xchk_parent( */ error = xchk_parent_validate(sc, parent_ino); } while (error == -EAGAIN); + if (error == -EBUSY) { + /* + * We could not scan a directory, so we marked the check + * incomplete. No further error return is necessary. + */ + return 0; + } return error; } From 66da11280f7ecd77abd999c469efc0dd643f26f5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:38 -0800 Subject: [PATCH 073/123] xfs: reintroduce reaping of file metadata blocks to xrep_reap_extents Back in commit a55e07308831b ("xfs: only allow reaping of per-AG blocks in xrep_reap_extents"), we removed from the reaping code the ability to handle bmbt blocks. 
At the time, the reaping code only walked single blocks, didn't correctly detect crosslinked blocks, and the special casing made the function hard to understand. It was easier to remove unneeded functionality prior to fixing all the bugs. Now that we've fixed the problems, we want again the ability to reap file metadata blocks. Reintroduce the per-file reaping functionality atop the current implementation. We require that sc->sa is uninitialized, so that we can use it to hold all the per-AG context for a given extent. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/fsb_bitmap.h | 37 ++++++++++++ fs/xfs/scrub/reap.c | 121 ++++++++++++++++++++++++++++++++++++-- fs/xfs/scrub/reap.h | 5 ++ fs/xfs/scrub/repair.h | 1 + 4 files changed, 160 insertions(+), 4 deletions(-) create mode 100644 fs/xfs/scrub/fsb_bitmap.h diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h new file mode 100644 index 000000000000..40b462c1dd0d --- /dev/null +++ b/fs/xfs/scrub/fsb_bitmap.h @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_FSB_BITMAP_H__ +#define __XFS_SCRUB_FSB_BITMAP_H__ + +/* Bitmaps, but for type-checked for xfs_fsblock_t */ + +struct xfsb_bitmap { + struct xbitmap64 fsbitmap; +}; + +static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap) +{ + xbitmap64_init(&bitmap->fsbitmap); +} + +static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap) +{ + xbitmap64_destroy(&bitmap->fsbitmap); +} + +static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap, + xfs_fsblock_t start, xfs_filblks_t len) +{ + return xbitmap64_set(&bitmap->fsbitmap, start, len); +} + +static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap, + xbitmap64_walk_fn fn, void *priv) +{ + return xbitmap64_walk(&bitmap->fsbitmap, fn, priv); +} + +#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */ diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index bfc3583132ac..0d2e32fbb51a 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -38,6 +38,7 @@ #include "scrub/repair.h" #include "scrub/bitmap.h" #include "scrub/agb_bitmap.h" +#include "scrub/fsb_bitmap.h" #include "scrub/reap.h" /* @@ -75,10 +76,10 @@ * with only the same rmap owner but the block is not owned by something with * the same rmap owner, the block will be freed. * - * The caller is responsible for locking the AG headers for the entire rebuild - * operation so that nothing else can sneak in and change the AG state while - * we're not looking. We must also invalidate any buffers associated with - * @bitmap. + * The caller is responsible for locking the AG headers/inode for the entire + * rebuild operation so that nothing else can sneak in and change the incore + * state while we're not looking. We must also invalidate any buffers + * associated with @bitmap. */ /* Information about reaping extents after a repair. */ @@ -501,3 +502,115 @@ xrep_reap_agblocks( return 0; } + +/* + * Break a file metadata extent into sub-extents by fate (crosslinked, not + * crosslinked), and dispose of each sub-extent separately. The extent must + * not cross an AG boundary. 
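+ * (Extents recorded in the bitmap originate from file mappings, and XFS
+ * never allocates an extent that crosses an AG boundary.)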
+ */ +STATIC int +xreap_fsmeta_extent( + uint64_t fsbno, + uint64_t len, + void *priv) +{ + struct xreap_state *rs = priv; + struct xfs_scrub *sc = rs->sc; + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + xfs_agblock_t agbno_next = agbno + len; + int error = 0; + + ASSERT(len <= XFS_MAX_BMBT_EXTLEN); + ASSERT(sc->ip != NULL); + ASSERT(!sc->sa.pag); + + /* + * We're reaping blocks after repairing file metadata, which means that + * we have to init the xchk_ag structure ourselves. + */ + sc->sa.pag = xfs_perag_get(sc->mp, agno); + if (!sc->sa.pag) + return -EFSCORRUPTED; + + error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp); + if (error) + goto out_pag; + + while (agbno < agbno_next) { + xfs_extlen_t aglen; + bool crosslinked; + + error = xreap_agextent_select(rs, agbno, agbno_next, + &crosslinked, &aglen); + if (error) + goto out_agf; + + error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked); + if (error) + goto out_agf; + + if (xreap_want_defer_finish(rs)) { + /* + * Holds the AGF buffer across the deferred chain + * processing. + */ + error = xrep_defer_finish(sc); + if (error) + goto out_agf; + xreap_defer_finish_reset(rs); + } else if (xreap_want_roll(rs)) { + /* + * Hold the AGF buffer across the transaction roll so + * that we don't have to reattach it to the scrub + * context. + */ + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + error = xfs_trans_roll_inode(&sc->tp, sc->ip); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + if (error) + goto out_agf; + xreap_reset(rs); + } + + agbno += aglen; + } + +out_agf: + xfs_trans_brelse(sc->tp, sc->sa.agf_bp); + sc->sa.agf_bp = NULL; +out_pag: + xfs_perag_put(sc->sa.pag); + sc->sa.pag = NULL; + return error; +} + +/* + * Dispose of every block of every fs metadata extent in the bitmap. + * Do not use this to dispose of the mappings in an ondisk inode fork. + */ +int +xrep_reap_fsblocks( + struct xfs_scrub *sc, + struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo) +{ + struct xreap_state rs = { + .sc = sc, + .oinfo = oinfo, + .resv = XFS_AG_RESV_NONE, + }; + int error; + + ASSERT(xfs_has_rmapbt(sc->mp)); + ASSERT(sc->ip != NULL); + + error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs); + if (error) + return error; + + if (xreap_dirty(&rs)) + return xrep_defer_finish(sc); + + return 0; +} diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h index fe24626af164..0b69f16dd98f 100644 --- a/fs/xfs/scrub/reap.h +++ b/fs/xfs/scrub/reap.h @@ -6,7 +6,12 @@ #ifndef __XFS_SCRUB_REAP_H__ #define __XFS_SCRUB_REAP_H__ +struct xagb_bitmap; +struct xfsb_bitmap; + int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap, const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); +int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap, + const struct xfs_owner_info *oinfo); #endif /* __XFS_SCRUB_REAP_H__ */ diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index a513b84f5330..d4ef740c878f 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -48,6 +48,7 @@ xrep_trans_commit( struct xbitmap; struct xagb_bitmap; +struct xfsb_bitmap; int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink); From c3a22c2e4b45fcf3184e7dd1c755e6b45dc9f499 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 15 Dec 2023 10:03:38 -0800 Subject: [PATCH 074/123] xfs: skip the rmapbt search on an empty attr fork unless we know it was zapped The attribute fork scrubber can optionally scan the reverse mapping records of the filesystem to determine if the fork is missing mappings that it should have. However, this is a very expensive operation, so we only want to do this if we suspect that the fork is missing records. For attribute forks the criteria for suspicion is that the attr fork is in EXTENTS format and has zero extents. However, there are several ways that a file can end up in this state through regular filesystem usage. For example, an LSM can set a s_security hook but then decide not to set an ACL; or an attr set can create the attr fork but then the actual set operation fails with ENOSPC; or we can delete all the attrs on a file whose data fork is in btree format, in which case we do not delete the attr fork. We don't want to run the expensive check for any case that can be arrived at through regular operations. However. When online inode repair decides to zap an attribute fork, it cannot determine if it is zapping ACL information. As a precaution it removes all the discretionary access control permissions and sets the user and group ids to zero. Check these three additional conditions to decide if we want to scan the rmap records. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap.c | 101 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 22 deletions(-) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 1487aaf3d95f..8175e8c17c14 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -638,6 +638,82 @@ xchk_bmap_check_ag_rmaps( return error; } +/* + * Decide if we want to scan the reverse mappings to determine if the attr + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_attrfork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_af; + + /* + * If the dinode repair found a bad attr fork, it will reset the fork + * to extents format with zero records and wait for the this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * Files can have an attr fork in EXTENTS format with zero records for + * several reasons: + * + * a) an attr set created a fork but ran out of space + * b) attr replace deleted an old attr but failed during the set step + * c) the data fork was in btree format when all attrs were deleted, so + * the fork was left in place + * d) the inode repair code zapped the fork + * + * Only in case (d) do we want to scan the rmapbt to see if we need to + * rebuild the attr fork. The fork zap code clears all DAC permission + * bits and zeroes the uid and gid, so avoid the scan if any of those + * three conditions are not met. + */ + if ((VFS_I(ip)->i_mode & 0777) != 0) + return false; + if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID)) + return false; + if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID)) + return false; + + return true; +} + +/* + * Decide if we want to scan the reverse mappings to determine if the data + * fork /really/ has zero space mappings. + */ +STATIC bool +xchk_bmap_check_empty_datafork( + struct xfs_inode *ip) +{ + struct xfs_ifork *ifp = &ip->i_df; + + /* Don't support realtime rmap checks yet. 
*/ + if (XFS_IS_REALTIME_INODE(ip)) + return false; + + /* + * If the dinode repair found a bad data fork, it will reset the fork + * to extents format with zero records and wait for this scrubber + * to reconstruct the block mappings. If the fork is not in this + * state, then the fork cannot have been zapped. + */ + if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0) + return false; + + /* + * If we encounter an empty data fork along with evidence that the fork + * might not really be empty, we need to scan the reverse mappings to + * decide if we're going to rebuild the fork. Data forks with nonzero + * file size are scanned. + */ + return i_size_read(VFS_I(ip)) != 0; +} + /* * Decide if we want to walk every rmap btree in the fs to make sure that each * rmap for this file fork has corresponding bmbt entries. @@ -647,7 +723,6 @@ xchk_bmap_want_check_rmaps( struct xchk_bmap_info *info) { struct xfs_scrub *sc = info->sc; - struct xfs_ifork *ifp; if (!xfs_has_rmapbt(sc->mp)) return false; @@ -656,28 +731,10 @@ xchk_bmap_want_check_rmaps( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return false; - /* Don't support realtime rmap checks yet. */ - if (info->is_rt) - return false; + if (info->whichfork == XFS_ATTR_FORK) + return xchk_bmap_check_empty_attrfork(sc->ip); - /* - * The inode repair code zaps broken inode forks by resetting them back - * to EXTENTS format and zero extent records. If we encounter a fork - * in this state along with evidence that the fork isn't supposed to be - * empty, we need to scan the reverse mappings to decide if we're going - * to rebuild the fork. Data forks with nonzero file size are scanned. - * xattr forks are never empty of content, so they are always scanned. - */ - ifp = xfs_ifork_ptr(sc->ip, info->whichfork); - if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) { - if (info->whichfork == XFS_DATA_FORK && - i_size_read(VFS_I(sc->ip)) == 0) - return false; - - return true; - } - - return false; + return xchk_bmap_check_empty_datafork(sc->ip); } /* Make sure each rmap has a corresponding bmbt entry. */ From 8f71bede8efd820627ac05c19eac2758214bc896 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:39 -0800 Subject: [PATCH 075/123] xfs: repair inode fork block mapping data structures Use the reverse-mapping btree information to rebuild an inode block map. Update the btree bulk loading code as necessary to support inode rooted btrees and fix some bitrot problems. Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/libxfs/xfs_bmap_btree.c | 121 ++++- fs/xfs/libxfs/xfs_bmap_btree.h | 5 + fs/xfs/libxfs/xfs_btree_staging.c | 11 +- fs/xfs/libxfs/xfs_btree_staging.h | 2 +- fs/xfs/libxfs/xfs_iext_tree.c | 23 +- fs/xfs/libxfs/xfs_inode_fork.c | 1 + fs/xfs/libxfs/xfs_inode_fork.h | 3 + fs/xfs/scrub/bmap.c | 18 + fs/xfs/scrub/bmap_repair.c | 858 ++++++++++++++++++++++++++++++ fs/xfs/scrub/common.h | 6 +- fs/xfs/scrub/repair.c | 28 + fs/xfs/scrub/repair.h | 6 + fs/xfs/scrub/scrub.c | 4 +- fs/xfs/scrub/trace.h | 34 +- fs/xfs/xfs_trans.c | 62 +++ fs/xfs/xfs_trans.h | 4 + 17 files changed, 1153 insertions(+), 34 deletions(-) create mode 100644 fs/xfs/scrub/bmap_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 561ab59b9422..66c1a5001772 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -183,6 +183,7 @@ ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ + bmap_repair.o \ ialloc_repair.o \ inode_repair.o \ newbt.o \ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8360256cff16..71f2d50f7823 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -15,6 +15,7 @@ #include "xfs_trans.h" #include "xfs_alloc.h" #include "xfs_btree.h" +#include "xfs_btree_staging.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" @@ -288,10 +289,7 @@ xfs_bmbt_get_minrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0) / 2; @@ -306,10 +304,7 @@ xfs_bmbt_get_maxrecs( int level) { if (level == cur->bc_nlevels - 1) { - struct xfs_ifork *ifp; - - ifp = xfs_ifork_ptr(cur->bc_ino.ip, - cur->bc_ino.whichfork); + struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur); return xfs_bmbt_maxrecs(cur->bc_mp, ifp->if_broot_bytes, level == 0); @@ -543,23 +538,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { .keys_contiguous = xfs_bmbt_keys_contiguous, }; -/* - * Allocate a new bmap btree cursor. - */ -struct xfs_btree_cur * /* new bmap btree cursor */ -xfs_bmbt_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* inode owning the btree */ - int whichfork) /* data or attr fork */ +static struct xfs_btree_cur * +xfs_bmbt_init_common( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) { - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); struct xfs_btree_cur *cur; + ASSERT(whichfork != XFS_COW_FORK); cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); - cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; @@ -567,10 +558,30 @@ xfs_bmbt_init_cursor( if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; - cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.ip = ip; cur->bc_ino.allocated = 0; cur->bc_ino.flags = 0; + + return cur; +} + +/* + * Allocate a new bmap btree cursor. 
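+ * The cursor operates on the inode's existing incore fork root; contrast + * this with xfs_bmbt_stage_cursor below, which attaches the cursor to a fake + * root so that a replacement fork can be staged without disturbing the live + * one.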
+ */ +struct xfs_btree_cur * +xfs_bmbt_init_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = xfs_bmbt_init_common(mp, tp, ip, whichfork); + + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork); cur->bc_ino.whichfork = whichfork; return cur; @@ -587,6 +598,76 @@ xfs_bmbt_block_maxrecs( return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); } +/* + * Allocate a new bmap btree cursor for reloading an inode block mapping data + * structure. Note that callers can use the staged cursor to reload extents + * format inode forks if they rebuild the iext tree and commit the staged + * cursor immediately. + */ +struct xfs_btree_cur * +xfs_bmbt_stage_cursor( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xbtree_ifakeroot *ifake) +{ + struct xfs_btree_cur *cur; + struct xfs_btree_ops *ops; + + /* data fork always has larger maxheight */ + cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK); + cur->bc_nlevels = ifake->if_levels; + cur->bc_ino.forksize = ifake->if_fork_size; + + /* Don't let anyone think we're attached to the real fork yet. */ + cur->bc_ino.whichfork = -1; + xfs_btree_stage_ifakeroot(cur, ifake, &ops); + ops->update_cursor = NULL; + return cur; +} + +/* + * Swap in the new inode fork root. Once we pass this point the newly rebuilt + * mappings are in place and we have to kill off any old btree blocks. + */ +void +xfs_bmbt_commit_staged_btree( + struct xfs_btree_cur *cur, + struct xfs_trans *tp, + int whichfork) +{ + struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake; + struct xfs_ifork *ifp; + static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT}; + static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT}; + int flags = XFS_ILOG_CORE; + + ASSERT(cur->bc_flags & XFS_BTREE_STAGING); + ASSERT(whichfork != XFS_COW_FORK); + + /* + * Free any resources hanging off the real fork, then shallow-copy the + * staging fork's contents into the real fork to transfer everything + * we just built. + */ + ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork); + xfs_idestroy_fork(ifp); + memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork)); + + switch (ifp->if_format) { + case XFS_DINODE_FMT_EXTENTS: + flags |= extflag[whichfork]; + break; + case XFS_DINODE_FMT_BTREE: + flags |= brootflag[whichfork]; + break; + default: + ASSERT(0); + break; + } + xfs_trans_log_inode(tp, cur->bc_ino.ip, flags); + xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops); +} + /* * Calculate number of records in a bmap btree block. */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 3e7a40a83835..151b8491f60e 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -11,6 +11,7 @@ struct xfs_btree_block; struct xfs_mount; struct xfs_inode; struct xfs_trans; +struct xbtree_ifakeroot; /* * Btree block header size depends on a superblock flag. 
@@ -106,6 +107,10 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp, + struct xfs_inode *ip, struct xbtree_ifakeroot *ifake); +void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, + struct xfs_trans *tp, int whichfork); extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index 0c978a31e284..e276eba87cb1 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -405,7 +405,7 @@ xfs_btree_bload_prep_block( ASSERT(*bpp == NULL); /* Allocate a new incore btree root block. */ - new_size = bbl->iroot_size(cur, nr_this_block, priv); + new_size = bbl->iroot_size(cur, level, nr_this_block, priv); ifp->if_broot = kmem_zalloc(new_size, 0); ifp->if_broot_bytes = (int)new_size; @@ -596,7 +596,14 @@ xfs_btree_bload_level_geometry( unsigned int desired_npb; unsigned int maxnr; - maxnr = cur->bc_ops->get_maxrecs(cur, level); + /* + * Compute the absolute maximum number of records that we can store in + * the ondisk block or inode root. + */ + if (cur->bc_ops->get_dmaxrecs) + maxnr = cur->bc_ops->get_dmaxrecs(cur, level); + else + maxnr = cur->bc_ops->get_maxrecs(cur, level); /* * Compute the number of blocks we need to fill each block with the diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h index f0a5007284ef..055ea43b1e18 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.h +++ b/fs/xfs/libxfs/xfs_btree_staging.h @@ -53,7 +53,7 @@ typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur, typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, void *priv); typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur, - unsigned int nr_this_level, void *priv); + unsigned int level, unsigned int nr_this_level, void *priv); struct xfs_btree_bload { /* diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 773cf4349428..d062794cc795 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -622,13 +622,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp) } void -xfs_iext_insert( - struct xfs_inode *ip, +xfs_iext_insert_raw( + struct xfs_ifork *ifp, struct xfs_iext_cursor *cur, - struct xfs_bmbt_irec *irec, - int state) + struct xfs_bmbt_irec *irec) { - struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); xfs_fileoff_t offset = irec->br_startoff; struct xfs_iext_leaf *new = NULL; int nr_entries, i; @@ -662,12 +660,23 @@ xfs_iext_insert( xfs_iext_set(cur_rec(cur), irec); ifp->if_bytes += sizeof(struct xfs_iext_rec); - trace_xfs_iext_insert(ip, cur, state, _RET_IP_); - if (new) xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2); } +void +xfs_iext_insert( + struct xfs_inode *ip, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec, + int state) +{ + struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state); + + xfs_iext_insert_raw(ifp, cur, irec); + trace_xfs_iext_insert(ip, cur, state, _RET_IP_); +} + static struct xfs_iext_node * xfs_iext_rebalance_node( struct xfs_iext_node *parent, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index dad8ea832c20..b86d57589f67 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ 
b/fs/xfs/libxfs/xfs_inode_fork.c @@ -520,6 +520,7 @@ xfs_idata_realloc( ifp->if_bytes = new_size; } +/* Free all memory and reset a fork back to its initial state. */ void xfs_idestroy_fork( struct xfs_ifork *ifp) diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 96d307784c85..535be5c03689 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -180,6 +180,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork, const void *data, int64_t size); xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp); +void xfs_iext_insert_raw(struct xfs_ifork *ifp, + struct xfs_iext_cursor *cur, + struct xfs_bmbt_irec *irec); void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *, int); void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *, diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 8175e8c17c14..b169cddde6da 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -50,9 +50,18 @@ xchk_setup_inode_bmap( if (S_ISREG(VFS_I(sc->ip)->i_mode) && sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) { struct address_space *mapping = VFS_I(sc->ip)->i_mapping; + bool is_repair = xchk_could_repair(sc); xchk_ilock(sc, XFS_MMAPLOCK_EXCL); + /* Break all our leases, we're going to mess with things. */ + if (is_repair) { + error = xfs_break_layouts(VFS_I(sc->ip), + &sc->ilock_flags, BREAK_WRITE); + if (error) + goto out; + } + inode_dio_wait(VFS_I(sc->ip)); /* @@ -73,6 +82,15 @@ xchk_setup_inode_bmap( error = filemap_fdatawait_keep_errors(mapping); if (error && (error != -ENOSPC && error != -EIO)) goto out; + + /* Drop the page cache if we're repairing block mappings. */ + if (is_repair) { + error = invalidate_inode_pages2( + VFS_I(sc->ip)->i_mapping); + if (error) + goto out; + } + } /* Got the inode, lock it and we're ready to go. */ diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c new file mode 100644 index 000000000000..a8d6415b1c38 --- /dev/null +++ b/fs/xfs/scrub/bmap_repair.c @@ -0,0 +1,858 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_btree_staging.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_reflink.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/xfile.h" +#include "scrub/xfarray.h" +#include "scrub/newbt.h" +#include "scrub/reap.h" + +/* + * Inode Fork Block Mapping (BMBT) Repair + * ====================================== + * + * Gather all the rmap records for the inode and fork we're fixing, reset the + * incore fork, then recreate the btree. 
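+ * + * A sketch of the flow implemented below: + * + * 1. xrep_bmap_find_mappings: walk every AG's rmapbt for this file's + *    mappings and collect delalloc reservations from the old incore fork. + * 2. xrep_bmap_sort_records: sort the mappings by file offset and reject + *    overlapping ranges. + * 3. xrep_bmap_build_new_fork: bulk-load a staged bmbt (or a plain extents + *    fork) and commit it in place of the old fork. + * 4. xrep_bmap_remove_old_tree: invalidate and reap the old bmbt blocks.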
+ */ + +enum reflink_scan_state { + RLS_IRRELEVANT = -1, /* not applicable to this file */ + RLS_UNKNOWN, /* shared extent scans required */ + RLS_SET_IFLAG, /* iflag must be set */ +}; + +struct xrep_bmap { + /* Old bmbt blocks */ + struct xfsb_bitmap old_bmbt_blocks; + + /* New fork. */ + struct xrep_newbt new_bmapbt; + + /* List of new bmap records. */ + struct xfarray *bmap_records; + + struct xfs_scrub *sc; + + /* How many blocks did we find allocated to this file? */ + xfs_rfsblock_t nblocks; + + /* How many bmbt blocks did we find for this fork? */ + xfs_rfsblock_t old_bmbt_block_count; + + /* get_records()'s position in the bmap record array. */ + xfarray_idx_t array_cur; + + /* How many real (non-hole, non-delalloc) mappings do we have? */ + uint64_t real_mappings; + + /* Which fork are we fixing? */ + int whichfork; + + /* Should the REFLINK flag be set when the repair is over? */ + enum reflink_scan_state reflink_scan; +}; + +/* Is this space extent shared? Flag the inode if it is. */ +STATIC int +xrep_bmap_discover_shared( + struct xrep_bmap *rb, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + struct xfs_scrub *sc = rb->sc; + xfs_agblock_t agbno; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock); + error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount, + &fbno, &flen, false); + if (error) + return error; + + if (fbno != NULLAGBLOCK) + rb->reflink_scan = RLS_SET_IFLAG; + + return 0; +} + +/* Remember this reverse-mapping as a series of bmap records. */ +STATIC int +xrep_bmap_from_rmap( + struct xrep_bmap *rb, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + bool unwritten) +{ + struct xfs_bmbt_irec irec = { + .br_startoff = startoff, + .br_startblock = startblock, + .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM, + }; + struct xfs_bmbt_rec rbe; + struct xfs_scrub *sc = rb->sc; + int error = 0; + + /* + * If we're repairing the data fork of a non-reflinked regular file on + * a reflink filesystem, we need to figure out if this space extent is + * shared. + */ + if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) { + error = xrep_bmap_discover_shared(rb, startblock, blockcount); + if (error) + return error; + } + + do { + xfs_failaddr_t fa; + + irec.br_blockcount = min_t(xfs_filblks_t, blockcount, + XFS_MAX_BMBT_EXTLEN); + + fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec); + if (fa) + return -EFSCORRUPTED; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec); + + if (xchk_should_terminate(sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + + rb->real_mappings++; + + irec.br_startblock += irec.br_blockcount; + irec.br_startoff += irec.br_blockcount; + blockcount -= irec.br_blockcount; + } while (blockcount > 0); + + return 0; +} + +/* Check for any obvious errors or conflicts in the file mapping. */ +STATIC int +xrep_bmap_check_fork_rmap( + struct xrep_bmap *rb, + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec) +{ + struct xfs_scrub *sc = rb->sc; + enum xbtree_recpacking outcome; + int error; + + /* + * Data extents for rt files are never stored on the data device, but + * everything else (xattrs, bmbt blocks) can be. + */ + if (XFS_IS_REALTIME_INODE(sc->ip) && + !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) + return -EFSCORRUPTED; + + /* Check that this is within the AG.
*/ + if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock, + rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* Check the file offset range. */ + if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) && + !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount)) + return -EFSCORRUPTED; + + /* No contradictory flags. */ + if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) && + (rec->rm_flags & XFS_RMAP_UNWRITTEN)) + return -EFSCORRUPTED; + + /* Make sure this isn't free space. */ + error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock, + rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + /* Must not be an inode chunk. */ + error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur, + rec->rm_startblock, rec->rm_blockcount, &outcome); + if (error) + return error; + if (outcome != XBTREE_RECPACKING_EMPTY) + return -EFSCORRUPTED; + + return 0; +} + +/* Record extents that belong to this inode's fork. */ +STATIC int +xrep_bmap_walk_rmap( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_bmap *rb = priv; + struct xfs_mount *mp = cur->bc_mp; + xfs_fsblock_t fsbno; + int error = 0; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + if (rec->rm_owner != rb->sc->ip->i_ino) + return 0; + + error = xrep_bmap_check_fork_rmap(rb, cur, rec); + if (error) + return error; + + /* + * Record all blocks allocated to this file even if the extent isn't + * for the fork we're rebuilding so that we can reset di_nblocks later. + */ + rb->nblocks += rec->rm_blockcount; + + /* If this rmap isn't for the fork we want, we're done. */ + if (rb->whichfork == XFS_DATA_FORK && + (rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rb->whichfork == XFS_ATTR_FORK && + !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, + rec->rm_startblock); + + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) { + rb->old_bmbt_block_count += rec->rm_blockcount; + return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno, + rec->rm_blockcount); + } + + return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno, + rec->rm_blockcount, + rec->rm_flags & XFS_RMAP_UNWRITTEN); +} + +/* + * Compare two block mapping records. We want to sort in order of increasing + * file offset. + */ +static int +xrep_bmap_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_bmbt_rec *ba = a; + const struct xfs_bmbt_rec *bb = b; + xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba); + xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb); + + if (ao > bo) + return 1; + else if (ao < bo) + return -1; + return 0; +} + +/* + * Sort the bmap extents by fork offset or else the records will be in the + * wrong order. Ensure there are no overlaps in the file offset ranges. 
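+ * (For example, a mapping at fileoff 10 with length 5 must be followed by + * one starting at fileoff 15 or later; a successor that starts below that + * claims an already-mapped logical range, which the loop below flags as + * -EFSCORRUPTED.)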
+ */ +STATIC int +xrep_bmap_sort_records( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + xfs_fileoff_t next_off = 0; + xfarray_idx_t array_cur; + int error; + + error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp, + XFARRAY_SORT_KILLABLE); + if (error) + return error; + + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + if (irec.br_startoff < next_off) + return -EFSCORRUPTED; + + next_off = irec.br_startoff + irec.br_blockcount; + } + + return 0; +} + +/* Scan one AG for reverse mappings that we can turn into extent maps. */ +STATIC int +xrep_bmap_scan_ag( + struct xrep_bmap *rb, + struct xfs_perag *pag) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + error = xrep_ag_init(sc, pag, &sc->sa); + if (error) + return error; + + error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb); + xchk_ag_free(sc, &sc->sa); + return error; +} + +/* Find the delalloc extents from the old incore extent tree. */ +STATIC int +xrep_bmap_find_delalloc( + struct xrep_bmap *rb) +{ + struct xfs_bmbt_irec irec; + struct xfs_iext_cursor icur; + struct xfs_bmbt_rec rbe; + struct xfs_inode *ip = rb->sc->ip; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork); + int error = 0; + + /* + * Skip this scan if we don't expect to find delayed allocation + * reservations in this fork. + */ + if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0) + return 0; + + for_each_xfs_iext(ifp, &icur, &irec) { + if (!isnullstartblock(irec.br_startblock)) + continue; + + xfs_bmbt_disk_set_all(&rbe, &irec); + + trace_xrep_bmap_found(ip, rb->whichfork, &irec); + + if (xchk_should_terminate(rb->sc, &error)) + return error; + + error = xfarray_append(rb->bmap_records, &rbe); + if (error) + return error; + } + + return 0; +} + +/* + * Collect block mappings for this fork of this inode and decide if we have + * enough space to rebuild. Caller is responsible for cleaning up the list if + * anything goes wrong. + */ +STATIC int +xrep_bmap_find_mappings( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_perag *pag; + xfs_agnumber_t agno; + int error = 0; + + /* Iterate the rmaps for extents. */ + for_each_perag(sc->mp, agno, pag) { + error = xrep_bmap_scan_ag(rb, pag); + if (error) { + xfs_perag_rele(pag); + return error; + } + } + + return xrep_bmap_find_delalloc(rb); +} + +/* Retrieve real extent mappings for bulk loading the bmap btree. */ +STATIC int +xrep_bmap_get_records( + struct xfs_btree_cur *cur, + unsigned int idx, + struct xfs_btree_block *block, + unsigned int nr_wanted, + void *priv) +{ + struct xfs_bmbt_rec rec; + struct xfs_bmbt_irec *irec = &cur->bc_rec.b; + struct xrep_bmap *rb = priv; + union xfs_btree_rec *block_rec; + unsigned int loaded; + int error; + + for (loaded = 0; loaded < nr_wanted; loaded++, idx++) { + do { + error = xfarray_load(rb->bmap_records, rb->array_cur++, + &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, irec); + } while (isnullstartblock(irec->br_startblock)); + + block_rec = xfs_btree_rec_addr(cur, idx, block); + cur->bc_ops->init_rec_from_cur(cur, block_rec); + } + + return loaded; +} + +/* Feed one of the new btree blocks to the bulk loader. 
*/ +STATIC int +xrep_bmap_claim_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_bmap *rb = priv; + + return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr); +} + +/* Figure out how much space we need to create the incore btree root block. */ +STATIC size_t +xrep_bmap_iroot_size( + struct xfs_btree_cur *cur, + unsigned int level, + unsigned int nr_this_level, + void *priv) +{ + ASSERT(level > 0); + + return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level); +} + +/* Update the inode counters. */ +STATIC int +xrep_bmap_reset_counters( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int64_t delta; + + if (rb->reflink_scan == RLS_SET_IFLAG) + sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK; + + /* + * Update the inode block counts to reflect the extents we found in the + * rmapbt. + */ + delta = ifake->if_blocks - rb->old_bmbt_block_count; + sc->ip->i_nblocks = rb->nblocks + delta; + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + + /* + * Adjust the quota counts by the difference in size between the old + * and new bmbt. + */ + xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta); + return 0; +} + +/* + * Create a new iext tree and load it with block mappings. If the inode is + * in extents format, that's all we need to do to commit the new mappings. + * If it is in btree format, this takes care of preloading the incore tree. + */ +STATIC int +xrep_bmap_extents_load( + struct xrep_bmap *rb) +{ + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec irec; + struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork; + xfarray_idx_t array_cur; + int error; + + ASSERT(ifp->if_bytes == 0); + + /* Add all the mappings (incl. delalloc) to the incore extent tree. */ + xfs_iext_first(ifp, &icur); + foreach_xfarray_idx(rb->bmap_records, array_cur) { + struct xfs_bmbt_rec rec; + + error = xfarray_load(rb->bmap_records, array_cur, &rec); + if (error) + return error; + + xfs_bmbt_disk_get_all(&rec, &irec); + + xfs_iext_insert_raw(ifp, &icur, &irec); + if (!isnullstartblock(irec.br_startblock)) + ifp->if_nextents++; + + xfs_iext_next(ifp, &icur); + } + + return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork, + ifp->if_nextents); +} + +/* + * Reserve new btree blocks, bulk load the bmap records into the ondisk btree, + * and load the incore extent tree. + */ +STATIC int +xrep_bmap_btree_load( + struct xrep_bmap *rb, + struct xfs_btree_cur *bmap_cur) +{ + struct xfs_scrub *sc = rb->sc; + int error; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(bmap_cur, + &rb->new_bmapbt.bload, rb->real_mappings); + if (error) + return error; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * Guess how many blocks we're going to need to rebuild an entire bmap + * from the number of extents we found, and pump up our transaction to + * have sufficient block reservation. We're allowed to exceed file + * quota to repair inconsistent metadata. + */ + error = xfs_trans_reserve_more_inode(sc->tp, sc->ip, + rb->new_bmapbt.bload.nr_blocks, 0, true); + if (error) + return error; + + /* Reserve the space we'll need for the new btree. */ + error = xrep_newbt_alloc_blocks(&rb->new_bmapbt, + rb->new_bmapbt.bload.nr_blocks); + if (error) + return error; + + /* Add all observed bmap records. 
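+ * (The xfarray cursor is rewound to XFARRAY_CURSOR_INIT so the bulk loader + * consumes the sorted records from the start; delalloc records are skipped + * by xrep_bmap_get_records and live only in the incore extent tree.)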
*/ + rb->array_cur = XFARRAY_CURSOR_INIT; + error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb); + if (error) + return error; + + /* + * Load the new bmap records into the new incore extent tree to + * preserve delalloc reservations for regular files. The directory + * code loads the extent tree during xfs_dir_open and assumes + * thereafter that it remains loaded, so we must not violate that + * assumption. + */ + return xrep_bmap_extents_load(rb); +} + +/* + * Use the collected bmap information to stage a new bmap fork. If this is + * successful we'll return with the new fork information logged to the repair + * transaction but not yet committed. The caller must ensure that the inode + * is joined to the transaction; the inode will be joined to a clean + * transaction when the function returns. + */ +STATIC int +xrep_bmap_build_new_fork( + struct xrep_bmap *rb) +{ + struct xfs_owner_info oinfo; + struct xfs_scrub *sc = rb->sc; + struct xfs_btree_cur *bmap_cur; + struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake; + int error; + + error = xrep_bmap_sort_records(rb); + if (error) + return error; + + /* + * Prepare to construct the new fork by initializing the new btree + * structure and creating a fake ifork in the ifakeroot structure. + */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork, + &oinfo); + if (error) + return error; + + rb->new_bmapbt.bload.get_records = xrep_bmap_get_records; + rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block; + rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size; + bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake); + + /* + * Figure out the size and format of the new fork, then fill it with + * all the bmap records we've found. Join the inode to the transaction + * so that we can roll the transaction while holding the inode locked. + */ + if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) { + ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS; + error = xrep_bmap_extents_load(rb); + } else { + ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE; + error = xrep_bmap_btree_load(rb, bmap_cur); + } + if (error) + goto err_cur; + + /* + * Install the new fork in the inode. After this point the old mapping + * data are no longer accessible and the new tree is live. We delete + * the cursor immediately after committing the staged root because the + * staged fork might be in extents format. + */ + xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork); + xfs_btree_del_cursor(bmap_cur, 0); + + /* Reset the inode counters now that we've changed the fork. */ + error = xrep_bmap_reset_counters(rb); + if (error) + goto err_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + error = xrep_newbt_commit(&rb->new_bmapbt); + if (error) + return error; + + return xrep_roll_trans(sc); + +err_cur: + if (bmap_cur) + xfs_btree_del_cursor(bmap_cur, error); +err_newbt: + xrep_newbt_cancel(&rb->new_bmapbt); + return error; +} + +/* + * Now that we've logged the new inode btree, invalidate all of the old blocks + * and free them, if there were any. + */ +STATIC int +xrep_bmap_remove_old_tree( + struct xrep_bmap *rb) +{ + struct xfs_scrub *sc = rb->sc; + struct xfs_owner_info oinfo; + + /* Free the old bmbt blocks if they're not in use. */ + xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork); + return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo); +} + +/* Check for garbage inputs. 
Returns -ECANCELED if there's nothing to do. */ +STATIC int +xrep_bmap_check_inputs( + struct xfs_scrub *sc, + int whichfork) +{ + struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork); + + ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK); + + if (!xfs_has_rmapbt(sc->mp)) + return -EOPNOTSUPP; + + /* No fork means nothing to rebuild. */ + if (!ifp) + return -ECANCELED; + + /* + * We only know how to repair extent mappings, which is to say that we + * only support extents and btree fork format. Repairs to a local + * format fork require a higher level repair function, so we do not + * have any work to do here. + */ + switch (ifp->if_format) { + case XFS_DINODE_FMT_DEV: + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_UUID: + return -ECANCELED; + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + break; + default: + return -EFSCORRUPTED; + } + + if (whichfork == XFS_ATTR_FORK) + return 0; + + /* Only files, symlinks, and directories get to have data forks. */ + switch (VFS_I(sc->ip)->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + /* ok */ + break; + default: + return -EINVAL; + } + + /* Don't know how to rebuild realtime data forks. */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return -EOPNOTSUPP; + + return 0; +} + +/* Set up the initial state of the reflink scan. */ +static inline enum reflink_scan_state +xrep_bmap_init_reflink_scan( + struct xfs_scrub *sc, + int whichfork) +{ + /* cannot share on non-reflink filesystem */ + if (!xfs_has_reflink(sc->mp)) + return RLS_IRRELEVANT; + + /* preserve flag if it's already set */ + if (xfs_is_reflink_inode(sc->ip)) + return RLS_SET_IFLAG; + + /* can only share regular files */ + if (!S_ISREG(VFS_I(sc->ip)->i_mode)) + return RLS_IRRELEVANT; + + /* cannot share attr fork extents */ + if (whichfork != XFS_DATA_FORK) + return RLS_IRRELEVANT; + + /* cannot share realtime extents */ + if (XFS_IS_REALTIME_INODE(sc->ip)) + return RLS_IRRELEVANT; + + return RLS_UNKNOWN; +} + +/* Repair an inode fork. */ +STATIC int +xrep_bmap( + struct xfs_scrub *sc, + int whichfork) +{ + struct xrep_bmap *rb; + char *descr; + unsigned int max_bmbt_recs; + bool large_extcount; + int error = 0; + + error = xrep_bmap_check_inputs(sc, whichfork); + if (error == -ECANCELED) + return 0; + if (error) + return error; + + rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS); + if (!rb) + return -ENOMEM; + rb->sc = sc; + rb->whichfork = whichfork; + rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + + /* Set up enough storage to handle the max records for this fork. */ + large_extcount = xfs_has_large_extent_counts(sc->mp); + max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork); + descr = xchk_xfile_ino_descr(sc, "%s fork mapping records", + whichfork == XFS_DATA_FORK ? "data" : "attr"); + error = xfarray_create(descr, max_bmbt_recs, + sizeof(struct xfs_bmbt_rec), &rb->bmap_records); + kfree(descr); + if (error) + goto out_rb; + + /* Collect all reverse mappings for this fork's extents. */ + xfsb_bitmap_init(&rb->old_bmbt_blocks); + error = xrep_bmap_find_mappings(rb); + if (error) + goto out_bitmap; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Rebuild the bmap information. */ + error = xrep_bmap_build_new_fork(rb); + if (error) + goto out_bitmap; + + /* Kill the old tree. 
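+ * (Blocks recorded in old_bmbt_blocks are freed only if nothing else owns + * them, per the crosslink detection in the reaping code.)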
*/ + error = xrep_bmap_remove_old_tree(rb); + if (error) + goto out_bitmap; + +out_bitmap: + xfsb_bitmap_destroy(&rb->old_bmbt_blocks); + xfarray_destroy(rb->bmap_records); +out_rb: + kfree(rb); + return error; +} + +/* Repair an inode's data fork. */ +int +xrep_bmap_data( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_DATA_FORK); +} + +/* Repair an inode's attr fork. */ +int +xrep_bmap_attr( + struct xfs_scrub *sc) +{ + return xrep_bmap(sc, XFS_ATTR_FORK); +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index ec5755266259..da09580b454a 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -239,7 +239,11 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc); (sc)->mp->m_super->s_id, \ (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \ ##__VA_ARGS__) - +#define xchk_xfile_ino_descr(sc, fmt, ...) \ + kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \ + (sc)->mp->m_super->s_id, \ + (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \ + ##__VA_ARGS__) /* * Setting up a hook to wait for intents to drain is costly -- we have to take diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 25392dea326d..26d65175ae8b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -883,6 +883,34 @@ xrep_reinit_pagi( return 0; } +/* + * Given an active reference to a perag structure, load AG headers and cursors. + * This should only be called to scan an AG while repairing file-based metadata. + */ +int +xrep_ag_init( + struct xfs_scrub *sc, + struct xfs_perag *pag, + struct xchk_ag *sa) +{ + int error; + + ASSERT(!sa->pag); + + error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp); + if (error) + return error; + + error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp); + if (error) + return error; + + /* Grab our own passive reference from the caller's ref. */ + sa->pag = xfs_perag_hold(pag); + xrep_ag_btcur_init(sc, sa); + return 0; +} + /* Reinitialize the per-AG block reservation for the AG we just fixed. 
*/ int xrep_reset_perag_resv( diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index d4ef740c878f..8aa8b8889e39 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -89,6 +89,8 @@ struct xfs_imap; int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap); void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa); +int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag, + struct xchk_ag *sa); /* Metadata revalidators */ @@ -106,6 +108,8 @@ int xrep_allocbt(struct xfs_scrub *sc); int xrep_iallocbt(struct xfs_scrub *sc); int xrep_refcountbt(struct xfs_scrub *sc); int xrep_inode(struct xfs_scrub *sc); +int xrep_bmap_data(struct xfs_scrub *sc); +int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -165,6 +169,8 @@ xrep_setup_nothing( #define xrep_iallocbt xrep_notsupported #define xrep_refcountbt xrep_notsupported #define xrep_inode xrep_notsupported +#define xrep_bmap_data xrep_notsupported +#define xrep_bmap_attr xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7e903a0fde6c..238ead205c52 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -288,13 +288,13 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_data, - .repair = xrep_notsupported, + .repair = xrep_bmap_data, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_attr, - .repair = xrep_notsupported, + .repair = xrep_bmap_attr, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 120faa4dce2d..d6a1f46cf6e9 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1175,7 +1175,7 @@ DEFINE_EVENT(xrep_rmap_class, name, \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); -DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap); TRACE_EVENT(xrep_abt_found, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, @@ -1260,6 +1260,38 @@ TRACE_EVENT(xrep_refc_found, __entry->refcount) ) +TRACE_EVENT(xrep_bmap_found, + TP_PROTO(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec), + TP_ARGS(ip, whichfork, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, whichfork) + __field(xfs_fileoff_t, lblk) + __field(xfs_filblks_t, len) + __field(xfs_fsblock_t, pblk) + __field(int, state) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->whichfork = whichfork; + __entry->lblk = irec->br_startoff; + __entry->len = irec->br_blockcount; + __entry->pblk = irec->br_startblock; + __entry->state = irec->br_state; + ), + TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), + __entry->lblk, + __entry->len, + __entry->pblk, + __entry->state) +); + TRACE_EVENT(xrep_findroot_block, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, uint32_t magic, uint16_t level), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 305c9d07bf1b..12d45e93f07d 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ 
-1236,6 +1236,68 @@ xfs_trans_alloc_inode( return error; } +/* + * Try to reserve more blocks for a transaction. + * + * This is for callers that need to attach resources to a transaction, scan + * those resources to determine the space reservation requirements, and then + * modify the attached resources. In other words, online repair. This can + * fail due to ENOSPC, so the caller must be able to cancel the transaction + * without shutting down the fs. + */ +int +xfs_trans_reserve_more( + struct xfs_trans *tp, + unsigned int blocks, + unsigned int rtextents) +{ + struct xfs_trans_res resv = { }; + + return xfs_trans_reserve(tp, &resv, blocks, rtextents); +} + +/* + * Try to reserve more blocks and file quota for a transaction. Same + * conditions of usage as xfs_trans_reserve_more. + */ +int +xfs_trans_reserve_more_inode( + struct xfs_trans *tp, + struct xfs_inode *ip, + unsigned int dblocks, + unsigned int rblocks, + bool force_quota) +{ + struct xfs_trans_res resv = { }; + struct xfs_mount *mp = ip->i_mount; + unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks); + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + error = xfs_trans_reserve(tp, &resv, dblocks, rtx); + if (error) + return error; + + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return 0; + + if (tp->t_flags & XFS_TRANS_RESERVE) + force_quota = true; + + error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks, + force_quota); + if (!error) + return 0; + + /* Quota failed, give back the new reservation. */ + xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE); + tp->t_blk_res -= dblocks; + xfs_mod_frextents(mp, rtx); + tp->t_rtx_res -= rtx; + return error; +} + /* * Allocate a transaction in preparation for inode creation by reserving quota * against the given dquots. Callers are not required to hold any inode locks. diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 2cb1e143fc49..08ce757c7454 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -164,6 +164,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_reserve_more(struct xfs_trans *tp, + unsigned int blocks, unsigned int rtextents); int xfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -248,6 +250,8 @@ struct xfs_dquot; int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv, unsigned int dblocks, unsigned int rblocks, bool force, struct xfs_trans **tpp); +int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip, + unsigned int dblocks, unsigned int rblocks, bool force_quota); int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, unsigned int dblocks, From 48a72f60861f790dd7c2db04d69c3a1ce606984e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:39 -0800 Subject: [PATCH 076/123] xfs: refactor repair forcing tests into a repair.c helper There are a couple of conditions that userspace can set to force repairs of metadata. These really belong in the repair code and not open-coded into the check code, so refactor them into a helper. Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/repair.c | 22 ++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 ++ fs/xfs/scrub/scrub.c | 14 +------------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 26d65175ae8b..020d49b0f9b9 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -27,6 +27,8 @@ #include "xfs_quota.h" #include "xfs_qm.h" #include "xfs_defer.h" +#include "xfs_errortag.h" +#include "xfs_error.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -940,3 +942,23 @@ xrep_reset_perag_resv( out: return error; } + +/* Decide if we are going to call the repair function for a scrub type. */ +bool +xrep_will_attempt( + struct xfs_scrub *sc) +{ + /* Userspace asked us to rebuild the structure regardless. */ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) + return true; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + return true; + + /* Metadata is corrupt or failed cross-referencing. */ + if (xchk_needs_repair(sc->sm)) + return true; + + return false; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 8aa8b8889e39..51ba3df5998c 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -28,6 +28,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run); +bool xrep_will_attempt(struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); int xrep_roll_trans(struct xfs_scrub *sc); @@ -117,6 +118,7 @@ int xrep_reinit_pagi(struct xfs_scrub *sc); #else #define xrep_ino_dqattach(sc) (0) +#define xrep_will_attempt(sc) (false) static inline int xrep_attempt( diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 238ead205c52..c9e37c688089 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -14,8 +14,6 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_qm.h" -#include "xfs_errortag.h" -#include "xfs_error.h" #include "xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" @@ -550,21 +548,11 @@ xfs_scrub_metadata( xchk_update_health(sc); if (xchk_could_repair(sc)) { - bool needs_fix = xchk_needs_repair(sc->sm); - - /* Userspace asked us to rebuild the structure regardless. */ - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) - needs_fix = true; - - /* Let debug users force us into the repair routines. */ - if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - needs_fix = true; - /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ - if (!needs_fix) { + if (!xrep_will_attempt(sc)) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } From d12bf8bac87a0d93e6e5fab67f399d1e3d3d5767 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:40 -0800 Subject: [PATCH 077/123] xfs: create a ranged query function for refcount btrees Implement ranged queries for refcount records. The next patch will use this to scan refcount data. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_refcount.c | 41 ++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_refcount.h | 10 +++++++++ 2 files changed, 51 insertions(+) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 3a9f22d94444..6709a7f8bad5 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -2031,6 +2031,47 @@ xfs_refcount_has_records( return xfs_btree_has_records(cur, &low, &high, NULL, outcome); } +struct xfs_refcount_query_range_info { + xfs_refcount_query_range_fn fn; + void *priv; +}; + +/* Format btree record and pass to our callback. */ +STATIC int +xfs_refcount_query_range_helper( + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) +{ + struct xfs_refcount_query_range_info *query = priv; + struct xfs_refcount_irec irec; + xfs_failaddr_t fa; + + xfs_refcount_btrec_to_irec(rec, &irec); + fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec); + if (fa) + return xfs_refcount_complain_bad_rec(cur, fa, &irec); + + return query->fn(cur, &irec, query->priv); +} + +/* Find all refcount records between two keys. */ +int +xfs_refcount_query_range( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_brec = { .rc = *low_rec }; + union xfs_btree_irec high_brec = { .rc = *high_rec }; + struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn }; + + return xfs_btree_query_range(cur, &low_brec, &high_brec, + xfs_refcount_query_range_helper, &query); } + int __init xfs_refcount_intent_init_cache(void) { diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 5c207f1c619c..9b56768a590c 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache; int __init xfs_refcount_intent_init_cache(void); void xfs_refcount_intent_destroy_cache(void); +typedef int (*xfs_refcount_query_range_fn)( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv); + +int xfs_refcount_query_range(struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *low_rec, + const struct xfs_refcount_irec *high_rec, + xfs_refcount_query_range_fn fn, void *priv); + #endif /* __XFS_REFCOUNT_H__ */ From dbbdbd0086320a026903ca34efedb6abf55230ed Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:40 -0800 Subject: [PATCH 078/123] xfs: repair problems in CoW forks Try to repair errors that we see in file CoW forks so that we don't do stupid things like remap garbage into a file. There's not a lot we can do with the COW fork -- the ondisk metadata records only that the COW staging extents are owned by the refcount btree, which effectively means that we can't reconstruct this incore structure from scratch. Actually, this is even worse -- we can't touch written extents, because those map space that is actively under writeback, and there's not much to do with delalloc reservations. Hence we can only detect crosslinked unwritten extents and fix them by punching out the problematic parts and replacing them with delalloc extents. Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 1 + fs/xfs/scrub/cow_repair.c | 614 ++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/off_bitmap.h | 37 +++ fs/xfs/scrub/reap.c | 32 ++ fs/xfs/scrub/repair.h | 2 + fs/xfs/scrub/scrub.c | 2 +- fs/xfs/scrub/trace.h | 84 ++++++ 7 files changed, 771 insertions(+), 1 deletion(-) create mode 100644 fs/xfs/scrub/cow_repair.c create mode 100644 fs/xfs/scrub/off_bitmap.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 66c1a5001772..a7830df42c4e 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -184,6 +184,7 @@ xfs-y += $(addprefix scrub/, \ agheader_repair.o \ alloc_repair.o \ bmap_repair.o \ + cow_repair.o \ ialloc_repair.o \ inode_repair.o \ newbt.o \ diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c new file mode 100644 index 000000000000..1e82c727af8e --- /dev/null +++ b/fs/xfs/scrub/cow_repair.c @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2022-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_quota.h" +#include "xfs_ialloc.h" +#include "xfs_ag.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_refcount_btree.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/off_bitmap.h" +#include "scrub/fsb_bitmap.h" +#include "scrub/reap.h" + +/* + * CoW Fork Mapping Repair + * ======================= + * + * Although CoW staging extents are owned by incore CoW inode forks, on disk + * they are owned by the refcount btree. The ondisk metadata does not record + * any ownership information, which limits what we can do to repair the + * mappings in the CoW fork. At most, we can replace ifork mappings that lack + * an entry in the refcount btree or are described by a reverse mapping record + * whose owner is not OWN_COW. + * + * Replacing extents is also tricky -- we can't touch written CoW fork extents + * since they are undergoing writeback, and delalloc extents do not require + * repair since they only exist incore. Hence the most we can do is find the + * bad parts of unwritten mappings, allocate a replacement set of blocks, and + * replace the incore mapping. We use the regular reaping process to unmap + * or free the discarded blocks, as appropriate. + */ +struct xrep_cow { + struct xfs_scrub *sc; + + /* Bitmap of file offset ranges that need replacing. */ + struct xoff_bitmap bad_fileoffs; + + /* Bitmap of fsblocks that were removed from the CoW fork. */ + struct xfsb_bitmap old_cowfork_fsblocks; + + /* CoW fork mappings used to scan for bad CoW staging extents. */ + struct xfs_bmbt_irec irec; + + /* refcount btree block number of irec.br_startblock */ + unsigned int irec_startbno; + + /* refcount btree block number of the next refcount record we expect */ + unsigned int next_bno; +}; + +/* CoW staging extent. */ +struct xrep_cow_extent { + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +/* + * Mark the part of the file range that corresponds to the given physical + * space. 
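+ * A worked example (editor's illustration, not part of the original patch):
+ * if xc->irec maps br_startoff 10 => br_startblock 100 for 8 blocks, then
+ * marking startblock 102 for 2 blocks computes startoff = 10 + (102 - 100)
+ * = 12, so file offsets 12-13 land in the bad_fileoffs bitmap.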
Caller must ensure that the physical range is within xc->irec. + */ +STATIC int +xrep_cow_mark_file_range( + struct xrep_cow *xc, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount) +{ + xfs_fileoff_t startoff; + + startoff = xc->irec.br_startoff + + (startblock - xc->irec.br_startblock); + + trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff, + blockcount); + + return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount); +} + +/* + * Trim @src to fit within the CoW fork mapping being examined, and put the + * result in @dst. + */ +static inline void +xrep_cow_trim_refcount( + struct xrep_cow *xc, + struct xfs_refcount_irec *dst, + const struct xfs_refcount_irec *src) +{ + unsigned int adj; + + memcpy(dst, src, sizeof(*dst)); + + if (dst->rc_startblock < xc->irec_startbno) { + adj = xc->irec_startbno - dst->rc_startblock; + dst->rc_blockcount -= adj; + dst->rc_startblock += adj; + } + + if (dst->rc_startblock + dst->rc_blockcount > + xc->irec_startbno + xc->irec.br_blockcount) { + adj = (dst->rc_startblock + dst->rc_blockcount) - + (xc->irec_startbno + xc->irec.br_blockcount); + dst->rc_blockcount -= adj; + } +} + +/* Mark any shared CoW staging extents. */ +STATIC int +xrep_cow_mark_shared_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + xfs_fsblock_t fsbno; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_SHARED) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + rrec.rc_startblock); + return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount); +} + +/* + * Mark any portion of the CoW fork file offset range where there is not a CoW + * staging extent record in the refcountbt, and keep a record of where we did + * find correct refcountbt records. Staging records are always cleaned out at + * mount time, so any two inodes trying to map the same staging area would have + * already taken the fs down due to refcount btree verifier errors. Hence this + * inode should be the sole creator of the staging extent records ondisk. + */ +STATIC int +xrep_cow_mark_missing_staging( + struct xfs_btree_cur *cur, + const struct xfs_refcount_irec *rec, + void *priv) +{ + struct xrep_cow *xc = priv; + struct xfs_refcount_irec rrec; + int error; + + if (!xfs_refcount_check_domain(rec) || + rec->rc_domain != XFS_REFC_DOMAIN_COW) + return -EFSCORRUPTED; + + xrep_cow_trim_refcount(xc, &rrec, rec); + + if (xc->next_bno >= rrec.rc_startblock) + goto next; + + error = xrep_cow_mark_file_range(xc, + XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, + xc->next_bno), + rrec.rc_startblock - xc->next_bno); + if (error) + return error; + +next: + xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount; + return 0; +} + +/* + * Mark any area that does not correspond to a CoW staging rmap. These are + * cross-linked areas that must be avoided. 
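+ *
+ * For example (editor's note, not part of the original patch): a reverse
+ * mapping whose rm_owner is some inode's data fork overlapping this CoW
+ * staging range means another file claims the same blocks, so the overlap
+ * is queued for replacement.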
+ */
+STATIC int
+xrep_cow_mark_missing_staging_rmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_cow			*xc = priv;
+	xfs_fsblock_t			fsbno;
+	xfs_agblock_t			rec_bno;
+	xfs_extlen_t			rec_len;
+	unsigned int			adj;
+
+	if (rec->rm_owner == XFS_RMAP_OWN_COW)
+		return 0;
+
+	rec_bno = rec->rm_startblock;
+	rec_len = rec->rm_blockcount;
+	if (rec_bno < xc->irec_startbno) {
+		adj = xc->irec_startbno - rec_bno;
+		rec_len -= adj;
+		rec_bno += adj;
+	}
+
+	if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
+		adj = (rec_bno + rec_len) -
+		      (xc->irec_startbno + xc->irec.br_blockcount);
+		rec_len -= adj;
+	}
+
+	fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
+	return xrep_cow_mark_file_range(xc, fsbno, rec_len);
+}
+
+/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad(
+	struct xrep_cow			*xc)
+{
+	struct xfs_refcount_irec	rc_low = { 0 };
+	struct xfs_refcount_irec	rc_high = { 0 };
+	struct xfs_rmap_irec		rm_low = { 0 };
+	struct xfs_rmap_irec		rm_high = { 0 };
+	struct xfs_perag		*pag;
+	struct xfs_scrub		*sc = xc->sc;
+	xfs_agnumber_t			agno;
+	int				error;
+
+	agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
+	xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
+
+	pag = xfs_perag_get(sc->mp, agno);
+	if (!pag)
+		return -EFSCORRUPTED;
+
+	error = xrep_ag_init(sc, pag, &sc->sa);
+	if (error)
+		goto out_pag;
+
+	/* Mark any CoW fork extents that are shared. */
+	rc_low.rc_startblock = xc->irec_startbno;
+	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+			xrep_cow_mark_shared_staging, xc);
+	if (error)
+		goto out_sa;
+
+	/* Make sure there are CoW staging extents for the whole mapping. */
+	rc_low.rc_startblock = xc->irec_startbno;
+	rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+	xc->next_bno = xc->irec_startbno;
+	error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+			xrep_cow_mark_missing_staging, xc);
+	if (error)
+		goto out_sa;
+
+	if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+		error = xrep_cow_mark_file_range(xc,
+				XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
+					       xc->next_bno),
+				xc->irec_startbno + xc->irec.br_blockcount -
+				xc->next_bno);
+		if (error)
+			goto out_sa;
+	}
+
+	/* Mark any area that has an rmap that isn't a COW staging extent. */
+	rm_low.rm_startblock = xc->irec_startbno;
+	memset(&rm_high, 0xFF, sizeof(rm_high));
+	rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+	error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
+			xrep_cow_mark_missing_staging_rmap, xc);
+	if (error)
+		goto out_sa;
+
+	/*
+	 * If userspace is forcing us to rebuild the CoW fork or someone turned
+	 * on the debugging knob, replace everything in the CoW fork.
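+	 * (Editor's note: in that case every block of this unwritten mapping
+	 * is queued for replacement below, even if its refcount and rmap
+	 * records looked fine in the scans above.)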
+	 */
+	if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+	    XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+		error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+				xc->irec.br_blockcount);
+		if (error)
+			goto out_sa;
+	}
+
+out_sa:
+	xchk_ag_free(sc, &sc->sa);
+out_pag:
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Allocate a replacement CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc(
+	struct xfs_scrub	*sc,
+	xfs_extlen_t		maxlen,
+	struct xrep_cow_extent	*repl)
+{
+	struct xfs_alloc_arg	args = {
+		.tp		= sc->tp,
+		.mp		= sc->mp,
+		.oinfo		= XFS_RMAP_OINFO_SKIP_UPDATE,
+		.minlen		= 1,
+		.maxlen		= maxlen,
+		.prod		= 1,
+		.resv		= XFS_AG_RESV_NONE,
+		.datatype	= XFS_ALLOC_USERDATA,
+	};
+	int			error;
+
+	error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
+	if (error)
+		return error;
+
+	error = xfs_alloc_vextent_start_ag(&args,
+			XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
+	if (error)
+		return error;
+	if (args.fsbno == NULLFSBLOCK)
+		return -ENOSPC;
+
+	xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+
+	repl->fsbno = args.fsbno;
+	repl->len = args.len;
+	return 0;
+}
+
+/*
+ * Look up the current CoW fork mapping so that we only allocate enough to
+ * replace a single mapping.  If we don't find a mapping that covers the start
+ * of the file range, or we find a delalloc or written extent, something is
+ * seriously wrong, since we didn't drop the ILOCK.
+ */
+static inline int
+xrep_cow_find_mapping(
+	struct xrep_cow		*xc,
+	struct xfs_iext_cursor	*icur,
+	xfs_fileoff_t		startoff,
+	struct xfs_bmbt_irec	*got)
+{
+	struct xfs_inode	*ip = xc->sc->ip;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+
+	if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got))
+		goto bad;
+
+	if (got->br_startoff > startoff)
+		goto bad;
+
+	if (got->br_blockcount == 0)
+		goto bad;
+
+	if (isnullstartblock(got->br_startblock))
+		goto bad;
+
+	if (xfs_bmap_is_written_extent(got))
+		goto bad;
+
+	return 0;
+bad:
+	ASSERT(0);
+	return -EFSCORRUPTED;
+}
+
+#define REPLACE_LEFT_SIDE	(1U << 0)
+#define REPLACE_RIGHT_SIDE	(1U << 1)
+
+/*
+ * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
+ * beginning of @got with the space described by @repl.
+ */
+static inline void
+xrep_cow_replace_mapping(
+	struct xfs_inode		*ip,
+	struct xfs_iext_cursor		*icur,
+	const struct xfs_bmbt_irec	*got,
+	const struct xrep_cow_extent	*repl)
+{
+	struct xfs_bmbt_irec	new = *got; /* struct copy */
+
+	ASSERT(repl->len > 0);
+	ASSERT(!isnullstartblock(got->br_startblock));
+
+	trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
+
+	if (got->br_blockcount == repl->len) {
+		/*
+		 * The new extent is a complete replacement for the existing
+		 * extent.  Update the COW fork record.
+		 */
+		new.br_startblock = repl->fsbno;
+		xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+		return;
+	}
+
+	/*
+	 * The new extent can replace the beginning of the COW fork record.
+	 * Move the left side of @got upwards, then insert the new record.
+	 */
+	new.br_startoff += repl->len;
+	new.br_startblock += repl->len;
+	new.br_blockcount -= repl->len;
+	xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+
+	new.br_startoff = got->br_startoff;
+	new.br_startblock = repl->fsbno;
+	new.br_blockcount = repl->len;
+	xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
+}
+
+/*
+ * Replace the unwritten CoW staging extent backing the given file range with a
+ * new space extent that isn't as problematic.
+ */
+STATIC int
+xrep_cow_replace_range(
+	struct xrep_cow		*xc,
+	xfs_fileoff_t		startoff,
+	xfs_extlen_t		*blockcount)
+{
+	struct xfs_iext_cursor	icur;
+	struct xrep_cow_extent	repl;
+	struct xfs_bmbt_irec	got;
+	struct xfs_scrub	*sc = xc->sc;
+	xfs_fileoff_t		nextoff;
+	xfs_extlen_t		alloc_len;
+	int			error;
+
+	/*
+	 * Put the existing CoW fork mapping in @got.  If @got ends before
+	 * the requested range, truncate the range so we only replace one
+	 * extent mapping at a time.
+	 */
+	error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
+	if (error)
+		return error;
+	nextoff = min(startoff + *blockcount,
+		      got.br_startoff + got.br_blockcount);
+
+	/*
+	 * Allocate a replacement extent.  If we don't fill all the blocks,
+	 * shorten the quantity that will be deleted in this step.
+	 */
+	alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
+			  nextoff - startoff);
+	error = xrep_cow_alloc(sc, alloc_len, &repl);
+	if (error)
+		return error;
+
+	/*
+	 * Replace the old mapping with the new one, and commit the metadata
+	 * changes made so far.
+	 */
+	xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);
+
+	xfs_inode_set_cowblocks_tag(sc->ip);
+	error = xfs_defer_finish(&sc->tp);
+	if (error)
+		return error;
+
+	/* Note the old CoW staging extents; we'll reap them all later. */
+	error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
+			repl.len);
+	if (error)
+		return error;
+
+	*blockcount = repl.len;
+	return 0;
+}
+
+/*
+ * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc
+ * reservation.
+ */
+STATIC int
+xrep_cow_replace(
+	uint64_t		startoff,
+	uint64_t		blockcount,
+	void			*priv)
+{
+	struct xrep_cow		*xc = priv;
+	int			error = 0;
+
+	while (blockcount > 0) {
+		xfs_extlen_t	len = min_t(xfs_filblks_t, blockcount,
+					    XFS_MAX_BMBT_EXTLEN);
+
+		error = xrep_cow_replace_range(xc, startoff, &len);
+		if (error)
+			break;
+
+		blockcount -= len;
+		startoff += len;
+	}
+
+	return error;
+}
+
+/*
+ * Repair an inode's CoW fork.  The CoW fork is an in-core structure, so
+ * there's no btree to rebuild.  Instead, we replace any mappings that are
+ * cross-linked or lack ondisk CoW fork records in the refcount btree.
+ */
+int
+xrep_bmap_cow(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_cow		*xc;
+	struct xfs_iext_cursor	icur;
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
+	int			error;
+
+	if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp))
+		return -EOPNOTSUPP;
+
+	if (!ifp)
+		return 0;
+
+	/* realtime files aren't supported yet */
+	if (XFS_IS_REALTIME_INODE(sc->ip))
+		return -EOPNOTSUPP;
+
+	/*
+	 * If we're somehow not in extents format, then reinitialize it to
+	 * an empty extent mapping fork and exit.
+	 */
+	if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+		ifp->if_nextents = 0;
+		return 0;
+	}
+
+	xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS);
+	if (!xc)
+		return -ENOMEM;
+
+	xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+	xc->sc = sc;
+	xoff_bitmap_init(&xc->bad_fileoffs);
+	xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+
+	for_each_xfs_iext(ifp, &icur, &xc->irec) {
+		if (xchk_should_terminate(sc, &error))
+			goto out_bitmap;
+
+		/*
+		 * delalloc reservations only exist incore, so there is no
+		 * ondisk metadata that we can examine.  Hence we leave them
+		 * alone.
+		 */
+		if (isnullstartblock(xc->irec.br_startblock))
+			continue;
+
+		/*
+		 * COW fork extents are only in the written state if writeback
+		 * is actively writing to disk.
We cannot restart the write
+		 * at a different disk address since we've already issued the
+		 * IO, so we leave these alone and hope for the best.
+		 */
+		if (xfs_bmap_is_written_extent(&xc->irec))
+			continue;
+
+		error = xrep_cow_find_bad(xc);
+		if (error)
+			goto out_bitmap;
+	}
+
+	/* Replace any bad unwritten mappings with fresh reservations. */
+	error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
+	if (error)
+		goto out_bitmap;
+
+	/*
+	 * Reap as many of the old CoW blocks as we can.  They are owned ondisk
+	 * by the refcount btree, not the inode, so it is correct to treat them
+	 * like inode metadata.
+	 */
+	error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+			&XFS_RMAP_OINFO_COW);
+	if (error)
+		goto out_bitmap;
+
+out_bitmap:
+	xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+	xoff_bitmap_destroy(&xc->bad_fileoffs);
+	kmem_free(xc);
+	return error;
+}
diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h
new file mode 100644
index 000000000000..0d3f9e6c1aad
--- /dev/null
+++ b/fs/xfs/scrub/off_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong
+ */
+#ifndef __XFS_SCRUB_OFF_BITMAP_H__
+#define __XFS_SCRUB_OFF_BITMAP_H__
+
+/* Bitmaps, but type-checked for xfs_fileoff_t */
+
+struct xoff_bitmap {
+	struct xbitmap64	offbitmap;
+};
+
+static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap)
+{
+	xbitmap64_init(&bitmap->offbitmap);
+}
+
+static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap)
+{
+	xbitmap64_destroy(&bitmap->offbitmap);
+}
+
+static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap,
+		xfs_fileoff_t off, xfs_filblks_t len)
+{
+	return xbitmap64_set(&bitmap->offbitmap, off, len);
+}
+
+static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap,
+		xbitmap64_walk_fn fn, void *priv)
+{
+	return xbitmap64_walk(&bitmap->offbitmap, fn, priv);
+}
+
+#endif	/* __XFS_SCRUB_OFF_BITMAP_H__ */
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 0d2e32fbb51a..f99eca799809 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -20,6 +20,7 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_extent_busy.h"
 #include "xfs_ag.h"
@@ -380,6 +381,17 @@ xreap_agextent_iter(
 		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
 
 		rs->force_roll = true;
+
+		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+			/*
+			 * If we're unmapping CoW staging extents, remove the
+			 * records from the refcountbt, which will remove the
+			 * rmap record as well.
+			 */
+			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+			return 0;
+		}
+
 		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
 				*aglenp, rs->oinfo);
 	}
@@ -398,6 +410,26 @@ xreap_agextent_iter(
 		return 0;
 	}
 
+	/*
+	 * If we're getting rid of CoW staging extents, use deferred work items
+	 * to remove the refcountbt records (which removes the rmap records)
+	 * and free the extent.  We're not worried about the system going down
+	 * here because log recovery walks the refcount btree to clean out the
+	 * CoW staging extents.
+	 */
+	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+		ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
+				rs->resv, true);
+		if (error)
+			return error;
+
+		rs->force_roll = true;
+		return 0;
+	}
+
+	/* Put blocks back on the AGFL one at a time.
*/ if (rs->resv == XFS_AG_RESV_AGFL) { ASSERT(*aglenp == 1); diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 51ba3df5998c..f89c8f08b037 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -111,6 +111,7 @@ int xrep_refcountbt(struct xfs_scrub *sc); int xrep_inode(struct xfs_scrub *sc); int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); +int xrep_bmap_cow(struct xfs_scrub *sc); int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -173,6 +174,7 @@ xrep_setup_nothing( #define xrep_inode xrep_notsupported #define xrep_bmap_data xrep_notsupported #define xrep_bmap_attr xrep_notsupported +#define xrep_bmap_cow xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c9e37c688089..e46b3afb5467 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -298,7 +298,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_INODE, .setup = xchk_setup_inode_bmap, .scrub = xchk_bmap_cow, - .repair = xrep_notsupported, + .repair = xrep_bmap_cow, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index d6a1f46cf6e9..3d5c8e748955 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1596,6 +1596,90 @@ TRACE_EVENT(xrep_dinode_count_rmaps, __entry->attr_extents) ); +TRACE_EVENT(xrep_cow_mark_file_range, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock, + xfs_fileoff_t startoff, xfs_filblks_t blockcount), + TP_ARGS(ip, startblock, startoff, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = startoff; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount) +); + +TRACE_EVENT(xrep_cow_replace_mapping, + TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec, + xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount), + TP_ARGS(ip, irec, new_startblock, new_blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, startblock) + __field(xfs_fileoff_t, startoff) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(xfs_fsblock_t, new_startblock) + __field(xfs_extlen_t, new_blockcount) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->ino = ip->i_ino; + __entry->startoff = irec->br_startoff; + __entry->startblock = irec->br_startblock; + __entry->blockcount = irec->br_blockcount; + __entry->state = irec->br_state; + __entry->new_startblock = new_startblock; + __entry->new_blockcount = new_blockcount; + ), + TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->new_startblock, + __entry->new_blockcount) +); + +TRACE_EVENT(xrep_cow_free_staging, + TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno, + xfs_extlen_t blockcount), 
+ TP_ARGS(pag, agbno, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->agbno = agbno; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->blockcount) +); + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ From 41991cf298919de211c63251d72266aff70ecad0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:41 -0800 Subject: [PATCH 079/123] xfs: check rt bitmap file geometry more thoroughly I forgot that the superblock tracks the number of blocks that are in the realtime bitmap, and that the rt bitmap file can have more blocks mapped to the data fork than sb_rbmblocks if growfsrt fails. So. Add to the rtbitmap scrubber an explicit check that sb_rextents and sb_rbmblocks are correct, then adjust the rtbitmap i_size checks to allow for the growfsrt failure case. Finally, flag post-eof blocks in the rtbitmap. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rtbitmap.c | 99 ++++++++++++++++++++++++++++++++++------- 1 file changed, 84 insertions(+), 15 deletions(-) diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index d509a08d3fc3..578b935ca93f 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -14,16 +14,30 @@ #include "xfs_rtbitmap.h" #include "xfs_inode.h" #include "xfs_bmap.h" +#include "xfs_bit.h" #include "scrub/scrub.h" #include "scrub/common.h" +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; +}; + /* Set us up with the realtime metadata locked. */ int xchk_setup_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb; int error; + rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS); + if (!rtb) + return -ENOMEM; + sc->buf = rtb; + error = xchk_trans_alloc(sc, 0); if (error) return error; @@ -37,6 +51,17 @@ xchk_setup_rtbitmap( return error; xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); + + /* + * Now that we've locked the rtbitmap, we can't race with growfsrt + * trying to expand the bitmap or change the size of the rt volume. + * Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rtb->rextslog = xfs_compute_rextslog(rtb->rextents); + rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents); + } return 0; } @@ -67,21 +92,30 @@ STATIC int xchk_rtbitmap_check_extents( struct xfs_scrub *sc) { - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_rtblock_t off; - int nmap; + struct xfs_iext_cursor icur; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; int error = 0; - for (off = 0; off < mp->m_sb.sb_rbmblocks;) { + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; + if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; /* Make sure we have a written extent. 
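		 * (Editor's note: rtbitmap contents must come from real written
		 * blocks; an unwritten or delalloc mapping would read back as
		 * zeroes rather than valid bitmap state, so anything other
		 * than a written extent is flagged as corruption.)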
*/ - nmap = 1; - error = xfs_bmapi_read(mp->m_rbmip, off, - mp->m_sb.sb_rbmblocks - off, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) break; @@ -102,12 +136,48 @@ int xchk_rtbitmap( struct xfs_scrub *sc) { + struct xfs_mount *mp = sc->mp; + struct xchk_rtbitmap *rtb = sc->buf; int error; - /* Is the size of the rtbitmap correct? */ - if (sc->mp->m_rbmip->i_disk_size != - XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) { - xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino); + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rtb->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* Is sb_rextslog correct? */ + if (mp->m_sb.sb_rextslog != rtb->rextslog) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is sb_rbmblocks large enough to handle the current rt volume? In no + * case can we exceed 4bn bitmap blocks since the super field is a u32. + */ + if (rtb->rbmblocks > U32_MAX) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* The bitmap file length must be aligned to an fsblock. */ + if (mp->m_rbmip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + return 0; + } + + /* + * Is the bitmap file itself large enough to handle the rt volume? + * growfsrt expands the bitmap file before updating sb_rextents, so the + * file can be larger than sb_rbmblocks. + */ + if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); return 0; } @@ -120,12 +190,11 @@ xchk_rtbitmap( if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return error; - error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc); + error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) - goto out; + return error; -out: - return error; + return 0; } /* xref check that the extent is not free in the rtbitmap */ From 04f0c3269b41f28c041980a30514850453ded251 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:41 -0800 Subject: [PATCH 080/123] xfs: check rt summary file geometry more thoroughly I forgot that the xfs_mount tracks the size and number of levels in the realtime summary file, and that the rt summary file can have more blocks mapped to the data fork than m_rsumsize implies if growfsrt fails. So. Add to the rtsummary scrubber an explicit check that all the summary geometry values are correct, then adjust the rtsummary i_size checks to allow for the growfsrt failure case. Finally, flag post-eof blocks in the summary file. While we're at it, split the extent map checking so that we only call xfs_bmapi_read once per extent instead of once per rtsummary block. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/rtsummary.c | 137 +++++++++++++++++++++++++++++++-------- 1 file changed, 110 insertions(+), 27 deletions(-) diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index f94800a029f3..b0d90426a5cb 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -31,6 +31,18 @@ * (potentially large) amount of data in pageable memory. 
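 * (Editor's summary of the flow: an earlier xfile pass recomputes the
 * summary from the rtbitmap, then xchk_rtsum_compare() walks the ondisk
 * summary file one block at a time and memcmp()s each block against the
 * computed copy.)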
*/ +struct xchk_rtsummary { + struct xfs_rtalloc_args args; + + uint64_t rextents; + uint64_t rbmblocks; + uint64_t rsumsize; + unsigned int rsumlevels; + + /* Memory buffer for the summary comparison. */ + union xfs_suminfo_raw words[]; +}; + /* Set us up to check the rtsummary file. */ int xchk_setup_rtsummary( @@ -38,8 +50,15 @@ xchk_setup_rtsummary( { struct xfs_mount *mp = sc->mp; char *descr; + struct xchk_rtsummary *rts; int error; + rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize), + XCHK_GFP_FLAGS); + if (!rts) + return -ENOMEM; + sc->buf = rts; + /* * Create an xfile to construct a new rtsummary file. The xfile allows * us to avoid pinning kernel memory for this purpose. @@ -54,11 +73,6 @@ xchk_setup_rtsummary( if (error) return error; - /* Allocate a memory buffer for the summary comparison. */ - sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS); - if (!sc->buf) - return -ENOMEM; - error = xchk_install_live_inode(sc, mp->m_rsumip); if (error) return error; @@ -75,13 +89,29 @@ xchk_setup_rtsummary( */ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); + + /* + * Now that we've locked the rtbitmap and rtsummary, we can't race with + * growfsrt trying to expand the summary or change the size of the rt + * volume. Hence it is safe to compute and check the geometry values. + */ + if (mp->m_sb.sb_rblocks) { + xfs_filblks_t rsumblocks; + int rextslog; + + rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks); + rextslog = xfs_compute_rextslog(rts->rextents); + rts->rsumlevels = rextslog + 1; + rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents); + rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels, + rts->rbmblocks); + rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks); + } return 0; } /* Helper functions to record suminfo words in an xfile. */ -typedef unsigned int xchk_rtsumoff_t; - static inline int xfsum_load( struct xfs_scrub *sc, @@ -192,19 +222,29 @@ STATIC int xchk_rtsum_compare( struct xfs_scrub *sc) { - struct xfs_rtalloc_args args = { - .mp = sc->mp, - .tp = sc->tp, - }; - struct xfs_mount *mp = sc->mp; struct xfs_bmbt_irec map; - xfs_fileoff_t off; - xchk_rtsumoff_t sumoff = 0; - int nmap; + struct xfs_iext_cursor icur; - for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) { - union xfs_suminfo_raw *ondisk_info; - int error = 0; + struct xfs_mount *mp = sc->mp; + struct xfs_inode *ip = sc->ip; + struct xchk_rtsummary *rts = sc->buf; + xfs_fileoff_t off = 0; + xfs_fileoff_t endoff; + xfs_rtsumoff_t sumoff = 0; + int error = 0; + + rts->args.mp = sc->mp; + rts->args.tp = sc->tp; + + /* Mappings may not cross or lie beyond EOF. */ + endoff = XFS_B_TO_FSB(mp, ip->i_disk_size); + if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff); + return 0; + } + + while (off < endoff) { + int nmap = 1; if (xchk_should_terminate(sc, &error)) return error; @@ -212,8 +252,7 @@ xchk_rtsum_compare( return 0; /* Make sure we have a written extent. */ - nmap = 1; - error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap, + error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap, XFS_DATA_FORK); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; @@ -223,24 +262,33 @@ xchk_rtsum_compare( return 0; } + off += map.br_blockcount; + } + + for (off = 0; off < endoff; off++) { + union xfs_suminfo_raw *ondisk_info; + /* Read a block's worth of ondisk rtsummary file. 
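		 * (Editor's note: each pass through this loop must release the
		 * buffer via xfs_rtbuf_cache_relse(&rts->args) before
		 * returning, as the error paths below do.)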
*/ - error = xfs_rtsummary_read_buf(&args, off); + error = xfs_rtsummary_read_buf(&rts->args, off); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error)) return error; /* Read a block's worth of computed rtsummary file. */ - error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize); + error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize); if (error) { - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); return error; } - ondisk_info = xfs_rsumblock_infoptr(&args, 0); - if (memcmp(ondisk_info, sc->buf, - mp->m_blockwsize << XFS_WORDLOG) != 0) + ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0); + if (memcmp(ondisk_info, rts->words, + mp->m_blockwsize << XFS_WORDLOG) != 0) { xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + xfs_rtbuf_cache_relse(&rts->args); + return error; + } - xfs_rtbuf_cache_relse(&args); + xfs_rtbuf_cache_relse(&rts->args); sumoff += mp->m_blockwsize; } @@ -253,8 +301,43 @@ xchk_rtsummary( struct xfs_scrub *sc) { struct xfs_mount *mp = sc->mp; + struct xchk_rtsummary *rts = sc->buf; int error = 0; + /* Is sb_rextents correct? */ + if (mp->m_sb.sb_rextents != rts->rextents) { + xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino); + goto out_rbm; + } + + /* Is m_rsumlevels correct? */ + if (mp->m_rsumlevels != rts->rsumlevels) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* Is m_rsumsize correct? */ + if (mp->m_rsumsize != rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* The summary file length must be aligned to an fsblock. */ + if (mp->m_rsumip->i_disk_size & mp->m_blockmask) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + + /* + * Is the summary file itself large enough to handle the rt volume? + * growfsrt expands the summary file before updating sb_rextents, so + * the file can be larger than rsumsize. + */ + if (mp->m_rsumip->i_disk_size < rts->rsumsize) { + xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino); + goto out_rbm; + } + /* Invoke the fork scrubber. */ error = xchk_metadata_inode_forks(sc); if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) From 20cc0d398e89d1f735c8e2815defc8ba9fdcce3f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:42 -0800 Subject: [PATCH 081/123] xfs: always check the rtbitmap and rtsummary files XFS filesystems always have a realtime bitmap and summary file, even if there has never been a realtime volume attached. Always check them. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/scrub.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index e46b3afb5467..a7019c9bba0c 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -328,14 +328,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, - .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ From 5a8e07e799721ba68dd6d713d4a68598eab3bea1 Mon Sep 17 00:00:00 2001 From: "Darrick J. 
Wong" Date: Fri, 15 Dec 2023 10:03:42 -0800 Subject: [PATCH 082/123] xfs: repair the inode core and forks of a metadata inode Add a helper function to repair the core and forks of a metadata inode, so that we can get move onto the task of repairing higher level metadata that lives in an inode. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/bmap_repair.c | 17 ++++- fs/xfs/scrub/repair.c | 153 +++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 2 + 3 files changed, 168 insertions(+), 4 deletions(-) diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c index a8d6415b1c38..a4bb89fdd510 100644 --- a/fs/xfs/scrub/bmap_repair.c +++ b/fs/xfs/scrub/bmap_repair.c @@ -86,6 +86,9 @@ struct xrep_bmap { /* What d the REFLINK flag be set when the repair is over? */ enum reflink_scan_state reflink_scan; + + /* Do we allow unwritten extents? */ + bool allow_unwritten; }; /* Is this space extent shared? Flag the inode if it is. */ @@ -262,6 +265,10 @@ xrep_bmap_walk_rmap( !(rec->rm_flags & XFS_RMAP_ATTR_FORK)) return 0; + /* Reject unwritten extents if we don't allow those. */ + if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten) + return -EFSCORRUPTED; + fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno, rec->rm_startblock); @@ -780,10 +787,11 @@ xrep_bmap_init_reflink_scan( } /* Repair an inode fork. */ -STATIC int +int xrep_bmap( struct xfs_scrub *sc, - int whichfork) + int whichfork, + bool allow_unwritten) { struct xrep_bmap *rb; char *descr; @@ -803,6 +811,7 @@ xrep_bmap( rb->sc = sc; rb->whichfork = whichfork; rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork); + rb->allow_unwritten = allow_unwritten; /* Set up enough storage to handle the max records for this fork. */ large_extcount = xfs_has_large_extent_counts(sc->mp); @@ -846,7 +855,7 @@ int xrep_bmap_data( struct xfs_scrub *sc) { - return xrep_bmap(sc, XFS_DATA_FORK); + return xrep_bmap(sc, XFS_DATA_FORK, true); } /* Repair an inode's attr fork. */ @@ -854,5 +863,5 @@ int xrep_bmap_attr( struct xfs_scrub *sc) { - return xrep_bmap(sc, XFS_ATTR_FORK); + return xrep_bmap(sc, XFS_ATTR_FORK, false); } diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 020d49b0f9b9..745d5b8f405a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -29,6 +29,7 @@ #include "xfs_defer.h" #include "xfs_errortag.h" #include "xfs_error.h" +#include "xfs_reflink.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -962,3 +963,155 @@ xrep_will_attempt( return false; } + +/* Try to fix some part of a metadata inode by calling another scrubber. */ +STATIC int +xrep_metadata_inode_subtype( + struct xfs_scrub *sc, + unsigned int scrub_type) +{ + __u32 smtype = sc->sm->sm_type; + __u32 smflags = sc->sm->sm_flags; + unsigned int sick_mask = sc->sick_mask; + int error; + + /* + * Let's see if the inode needs repair. We're going to open-code calls + * to the scrub and repair functions so that we can hang on to the + * resources that we already acquired instead of using the standard + * setup/teardown routines. 
+ */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + sc->sm->sm_type = scrub_type; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + default: + ASSERT(0); + error = -EFSCORRUPTED; + } + if (error) + goto out; + + if (!xrep_will_attempt(sc)) + goto out; + + /* + * Repair some part of the inode. This will potentially join the inode + * to the transaction. + */ + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xrep_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xrep_bmap(sc, XFS_DATA_FORK, false); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xrep_bmap(sc, XFS_ATTR_FORK, false); + break; + } + if (error) + goto out; + + /* + * Finish all deferred intent items and then roll the transaction so + * that the inode will not be joined to the transaction when we exit + * the function. + */ + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + error = xfs_trans_roll(&sc->tp); + if (error) + goto out; + + /* + * Clear the corruption flags and re-check the metadata that we just + * repaired. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + + switch (scrub_type) { + case XFS_SCRUB_TYPE_INODE: + error = xchk_inode(sc); + break; + case XFS_SCRUB_TYPE_BMBTD: + error = xchk_bmap_data(sc); + break; + case XFS_SCRUB_TYPE_BMBTA: + error = xchk_bmap_attr(sc); + break; + } + if (error) + goto out; + + /* If corruption persists, the repair has failed. */ + if (xchk_needs_repair(sc->sm)) { + error = -EFSCORRUPTED; + goto out; + } +out: + sc->sick_mask = sick_mask; + sc->sm->sm_type = smtype; + sc->sm->sm_flags = smflags; + return error; +} + +/* + * Repair the ondisk forks of a metadata inode. The caller must ensure that + * sc->ip points to the metadata inode and the ILOCK is held on that inode. + * The inode must not be joined to the transaction before the call, and will + * not be afterwards. + */ +int +xrep_metadata_inode_forks( + struct xfs_scrub *sc) +{ + bool dirty = false; + int error; + + /* Repair the inode record and the data fork. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE); + if (error) + return error; + + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD); + if (error) + return error; + + /* Make sure the attr fork looks ok before we delete it. */ + error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA); + if (error) + return error; + + /* Clear the reflink flag since metadata never shares. */ + if (xfs_is_reflink_inode(sc->ip)) { + dirty = true; + xfs_trans_ijoin(sc->tp, sc->ip, 0); + error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp); + if (error) + return error; + } + + /* + * If we modified the inode, roll the transaction but don't rejoin the + * inode to the new transaction because xrep_bmap_data can do that. 
+ */ + if (dirty) { + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + dirty = false; + } + + return 0; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f89c8f08b037..f0f9c5194e8d 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -82,6 +82,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc); int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork, xfs_extnum_t nextents); int xrep_reset_perag_resv(struct xfs_scrub *sc); +int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten); +int xrep_metadata_inode_forks(struct xfs_scrub *sc); /* Repair setup functions */ int xrep_setup_ag_allocbt(struct xfs_scrub *sc); From a59eb5fc21b2a6dc160ee6cdf77f20bc186a88fd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:43 -0800 Subject: [PATCH 083/123] xfs: create a new inode fork block unmap helper Create a new helper to unmap blocks from an inode's fork. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/libxfs/xfs_bmap.c | 41 +++++++++++++++++++++++++++++++++++++++- fs/xfs/libxfs/xfs_bmap.h | 5 ++--- fs/xfs/xfs_inode.c | 24 ++++------------------- 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a073ca877ced..523926fe50eb 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5239,7 +5239,7 @@ xfs_bmap_del_extent_real( * that value. If not all extents in the block range can be removed then * *done is set. */ -int /* error */ +static int __xfs_bunmapi( struct xfs_trans *tp, /* transaction pointer */ struct xfs_inode *ip, /* incore inode */ @@ -6220,3 +6220,42 @@ xfs_bmap_validate_extent( return xfs_bmap_validate_extent_raw(ip->i_mount, XFS_IS_REALTIME_INODE(ip), whichfork, irec); } + +/* + * Used in xfs_itruncate_extents(). This is the maximum number of extents + * freed from a file in a single transaction. + */ +#define XFS_ITRUNC_MAX_EXTENTS 2 + +/* + * Unmap every extent in part of an inode's fork. We don't do any higher level + * invalidation work at all. 
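+ *
+ * A usage sketch (editor's illustration, mirroring the xfs_itruncate_extents
+ * conversion later in this patch):
+ *
+ *	error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
+ *			XFS_MAX_FILEOFF);
+ *	if (error)
+ *		goto out;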
+ */ +int +xfs_bunmapi_range( + struct xfs_trans **tpp, + struct xfs_inode *ip, + uint32_t flags, + xfs_fileoff_t startoff, + xfs_fileoff_t endoff) +{ + xfs_filblks_t unmap_len = endoff - startoff + 1; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + while (unmap_len > 0) { + ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); + error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS); + if (error) + goto out; + + /* free the just unmapped extents */ + error = xfs_defer_finish(tpp); + if (error) + goto out; + } +out: + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 8518324db285..4b83f6148e00 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -190,9 +190,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); -int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags, - xfs_extnum_t nexts); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); @@ -273,6 +270,8 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags); +int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, + uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ea6b277485a4..1ffc8dfa2a52 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -41,12 +41,6 @@ struct kmem_cache *xfs_inode_cache; -/* - * Used in xfs_itruncate_extents(). This is the maximum number of extents - * freed from a file in a single transaction. - */ -#define XFS_ITRUNC_MAX_EXTENTS 2 - STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *); STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag, struct xfs_inode *); @@ -1346,7 +1340,6 @@ xfs_itruncate_extents_flags( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; xfs_fileoff_t first_unmap_block; - xfs_filblks_t unmap_len; int error = 0; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1378,19 +1371,10 @@ xfs_itruncate_extents_flags( return 0; } - unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1; - while (unmap_len > 0) { - ASSERT(tp->t_highest_agno == NULLAGNUMBER); - error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len, - flags, XFS_ITRUNC_MAX_EXTENTS); - if (error) - goto out; - - /* free the just unmapped extents */ - error = xfs_defer_finish(&tp); - if (error) - goto out; - } + error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block, + XFS_MAX_FILEOFF); + if (error) + goto out; if (whichfork == XFS_DATA_FORK) { /* Remove all pending CoW reservations. */ From ffd37b22bd2b7cca7749c85a0a08268158903e55 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:43 -0800 Subject: [PATCH 084/123] xfs: online repair of realtime bitmaps Fix all the file metadata surrounding the realtime bitmap file, which includes the rt geometry, file size, forks, and space mappings. The bitmap contents themselves cannot be fixed without rt rmap, so that will come later. Signed-off-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 4 + fs/xfs/scrub/repair.h | 7 ++ fs/xfs/scrub/rtbitmap.c | 16 +-- fs/xfs/scrub/rtbitmap.h | 22 ++++ fs/xfs/scrub/rtbitmap_repair.c | 202 +++++++++++++++++++++++++++++++++ fs/xfs/scrub/scrub.c | 2 +- 6 files changed, 245 insertions(+), 8 deletions(-) create mode 100644 fs/xfs/scrub/rtbitmap.h create mode 100644 fs/xfs/scrub/rtbitmap_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index a7830df42c4e..e7727451d1c8 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -192,5 +192,9 @@ xfs-y += $(addprefix scrub/, \ refcount_repair.o \ repair.o \ ) + +xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ + rtbitmap_repair.o \ + ) endif endif diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index f0f9c5194e8d..86cf86037fe0 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -115,6 +115,12 @@ int xrep_bmap_data(struct xfs_scrub *sc); int xrep_bmap_attr(struct xfs_scrub *sc); int xrep_bmap_cow(struct xfs_scrub *sc); +#ifdef CONFIG_XFS_RT +int xrep_rtbitmap(struct xfs_scrub *sc); +#else +# define xrep_rtbitmap xrep_notsupported +#endif /* CONFIG_XFS_RT */ + int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -177,6 +183,7 @@ xrep_setup_nothing( #define xrep_bmap_data xrep_notsupported #define xrep_bmap_attr xrep_notsupported #define xrep_bmap_cow xrep_notsupported +#define xrep_rtbitmap xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 578b935ca93f..441ca9977652 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -17,12 +17,8 @@ #include "xfs_bit.h" #include "scrub/scrub.h" #include "scrub/common.h" - -struct xchk_rtbitmap { - uint64_t rextents; - uint64_t rbmblocks; - unsigned int rextslog; -}; +#include "scrub/repair.h" +#include "scrub/rtbitmap.h" /* Set us up with the realtime metadata locked. */ int @@ -38,7 +34,13 @@ xchk_setup_rtbitmap( return -ENOMEM; sc->buf = rtb; - error = xchk_trans_alloc(sc, 0); + if (xchk_could_repair(sc)) { + error = xrep_setup_rtbitmap(sc, rtb); + if (error) + return error; + } + + error = xchk_trans_alloc(sc, rtb->resblks); if (error) return error; diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h new file mode 100644 index 000000000000..85304ff019e1 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap.h @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_RTBITMAP_H__ +#define __XFS_SCRUB_RTBITMAP_H__ + +struct xchk_rtbitmap { + uint64_t rextents; + uint64_t rbmblocks; + unsigned int rextslog; + unsigned int resblks; +}; + +#ifdef CONFIG_XFS_ONLINE_REPAIR +int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb); +#else +# define xrep_setup_rtbitmap(sc, rtb) (0) +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_RTBITMAP_H__ */ diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c new file mode 100644 index 000000000000..46f5d5f605c9 --- /dev/null +++ b/fs/xfs/scrub/rtbitmap_repair.c @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2020-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/xfile.h" +#include "scrub/rtbitmap.h" + +/* Set up to repair the realtime bitmap file metadata. */ +int +xrep_setup_rtbitmap( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + + /* + * Reserve enough blocks to write out a completely new bmbt for a + * maximally fragmented bitmap file. We do not hold the rtbitmap + * ILOCK yet, so this is entirely speculative. + */ + blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + + rtb->resblks += blocks; + return 0; +} + +/* + * Make sure that the given range of the data fork of the realtime file is + * mapped to written blocks. The caller must ensure that the inode is joined + * to the transaction. + */ +STATIC int +xrep_rtbitmap_data_mappings( + struct xfs_scrub *sc, + xfs_filblks_t len) +{ + struct xfs_bmbt_irec map; + xfs_fileoff_t off = 0; + int error; + + ASSERT(sc->ip != NULL); + + while (off < len) { + int nmaps = 1; + + /* + * If we have a real extent mapping this block then we're + * in ok shape. + */ + error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps, + XFS_DATA_FORK); + if (error) + return error; + if (nmaps == 0) { + ASSERT(nmaps != 0); + return -EFSCORRUPTED; + } + + /* + * Written extents are ok. Holes are not filled because we + * do not know the freespace information. + */ + if (xfs_bmap_is_written_extent(&map) || + map.br_startblock == HOLESTARTBLOCK) { + off = map.br_startoff + map.br_blockcount; + continue; + } + + /* + * If we find a delalloc reservation then something is very + * very wrong. Bail out. + */ + if (map.br_startblock == DELAYSTARTBLOCK) + return -EFSCORRUPTED; + + /* Make sure we're really converting an unwritten extent. */ + if (map.br_state != XFS_EXT_UNWRITTEN) { + ASSERT(map.br_state == XFS_EXT_UNWRITTEN); + return -EFSCORRUPTED; + } + + /* Make sure this block has a real zeroed extent mapped. */ + nmaps = 1; + error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff, + map.br_blockcount, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, + 0, &map, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -EFSCORRUPTED; + + /* Commit new extent and all deferred work. */ + error = xrep_defer_finish(sc); + if (error) + return error; + + off = map.br_startoff + map.br_blockcount; + } + + return 0; +} + +/* Fix broken rt volume geometry. 
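+ * (Editor's example: if sb_rextents is 1000 ondisk but the recomputed value
+ * is 1024, the code below logs the +24 delta via xfs_trans_mod_sb(tp,
+ * XFS_TRANS_SB_REXTENTS, 24) instead of overwriting the field directly.)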
*/ +STATIC int +xrep_rtbitmap_geometry( + struct xfs_scrub *sc, + struct xchk_rtbitmap *rtb) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_trans *tp = sc->tp; + + /* Superblock fields */ + if (mp->m_sb.sb_rextents != rtb->rextents) + xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS, + rtb->rextents - mp->m_sb.sb_rextents); + + if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + rtb->rbmblocks - mp->m_sb.sb_rbmblocks); + + if (mp->m_sb.sb_rextslog != rtb->rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + rtb->rextslog - mp->m_sb.sb_rextslog); + + /* Fix broken isize */ + sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size, + mp->m_sb.sb_blocksize); + + if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) + sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks); + + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + return xrep_roll_trans(sc); +} + +/* Repair the realtime bitmap file metadata. */ +int +xrep_rtbitmap( + struct xfs_scrub *sc) +{ + struct xchk_rtbitmap *rtb = sc->buf; + struct xfs_mount *mp = sc->mp; + unsigned long long blocks = 0; + int error; + + /* Impossibly large rtbitmap means we can't touch the filesystem. */ + if (rtb->rbmblocks > U32_MAX) + return 0; + + /* + * If the size of the rt bitmap file is larger than what we reserved, + * figure out if we need to adjust the block reservation in the + * transaction. + */ + blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks); + if (blocks > UINT_MAX) + return -EOPNOTSUPP; + if (blocks > rtb->resblks) { + error = xfs_trans_reserve_more(sc->tp, blocks, 0); + if (error) + return error; + + rtb->resblks += blocks; + } + + /* Fix inode core and forks. */ + error = xrep_metadata_inode_forks(sc); + if (error) + return error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Ensure no unwritten extents. */ + error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks); + if (error) + return error; + + /* Fix inconsistent bitmap geometry */ + return xrep_rtbitmap_geometry(sc, rtb); +} diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index a7019c9bba0c..c33480894229 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -328,7 +328,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, - .repair = xrep_notsupported, + .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, From 7d1f0e167a067ed741dec08b7614d76893422b04 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:44 -0800 Subject: [PATCH 085/123] xfs: check the ondisk space mapping behind a dquot Each xfs_dquot object caches the file offset and daddr of the ondisk block that backs the dquot. Make sure these cached values are the same as the bmapi data, and that the block state is written. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/quota.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 5671c8153433..59350cd7a325 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -6,6 +6,7 @@ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" +#include "xfs_bit.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" @@ -75,6 +76,47 @@ struct xchk_quota_info { xfs_dqid_t last_id; }; +/* There's a written block backing this dquot, right? 
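+ * (Editor's summary: the checks below require that the cached q_fileoffset
+ * matches the iteration offset, that xfs_bmapi_read() returns exactly one
+ * mapping, that the mapping is a written extent, and that its disk address
+ * matches the cached q_blkno.)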
*/ +STATIC int +xchk_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + xfs_fileoff_t offset) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + if (!xfs_verify_fileoff(mp, offset)) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (dq->q_fileoffset != offset) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps != 1) { + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + return 0; + } + + if (!xfs_verify_fsbno(mp, irec.br_startblock)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + if (!xfs_bmap_is_written_extent(&irec)) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + + return 0; +} + /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( @@ -93,6 +135,17 @@ xchk_quota_item( if (xchk_should_terminate(sc, &error)) return error; + /* + * We want to validate the bmap record for the storage backing this + * dquot, so we need to lock the dquot and the quota file. For quota + * operations, the locking order is first the ILOCK and then the dquot. + * However, dqiterate gave us a locked dquot, so drop the dquot lock to + * get the ILOCK. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_SHARED); + xfs_dqlock(dq); + /* * Except for the root dquot, the actual dquot we got must either have * the same or higher id as we saw before. @@ -103,6 +156,11 @@ xchk_quota_item( sqi->last_id = dq->q_id; + error = xchk_quota_item_bmap(sc, dq, offset); + xchk_iunlock(sc, XFS_ILOCK_SHARED); + if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error)) + return error; + /* * Warn if the hard limits are larger than the fs. * Administrators can do this, though in production this seems From 774b5c0a5152892bf5f43ce560f3a814b1fdf3b7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:44 -0800 Subject: [PATCH 086/123] xfs: check dquot resource timers For each dquot resource, ensure either (a) the resource usage is over the soft limit and there is a nonzero timer; or (b) usage is at or under the soft limit and the timer is unset. (a) is redundant with the dquot buffer verifier, but (b) isn't checked anywhere. Found by fuzzing xfs/426 and noticing that diskdq.btimer = add didn't trip any kind of warning for having a timer set even with no limits. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/scrub/quota.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 59350cd7a325..49835d2840b4 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -117,6 +117,23 @@ xchk_quota_item_bmap( return 0; } +/* Complain if a quota timer is incorrectly set. */ +static inline void +xchk_quota_item_timer( + struct xfs_scrub *sc, + xfs_fileoff_t offset, + const struct xfs_dquot_res *res) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } else { + if (res->timer) + xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + } +} + /* Scrub the fields in an individual quota item. 
*/ STATIC int xchk_quota_item( @@ -224,6 +241,10 @@ xchk_quota_item( dq->q_rtb.count > dq->q_rtb.hardlimit) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); + xchk_quota_item_timer(sc, offset, &dq->q_blk); + xchk_quota_item_timer(sc, offset, &dq->q_ino); + xchk_quota_item_timer(sc, offset, &dq->q_rtb); + out: if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) return -ECANCELED; From 21d7500929c8a0b10e22a6755850c6f9a9280284 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:45 -0800 Subject: [PATCH 087/123] xfs: improve dquot iteration for scrub Upon a closer inspection of the quota record scrubber, I noticed that dqiterate wasn't actually walking all possible dquots for the mapped blocks in the quota file. This is due to xfs_qm_dqget_next skipping all XFS_IS_DQUOT_UNINITIALIZED dquots. For a fsck program, we really want to look at all the dquots, even if all counters and limits in the dquot record are zero. Rewrite the implementation to do this, as well as switching to an iterator paradigm to reduce the number of indirect calls. This enables removal of the old broken dqiterate code from xfs_dquot.c. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 5 +- fs/xfs/libxfs/xfs_format.h | 3 + fs/xfs/scrub/dqiterate.c | 211 +++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/quota.c | 25 +++-- fs/xfs/scrub/quota.h | 34 ++++++ fs/xfs/scrub/trace.c | 2 + fs/xfs/scrub/trace.h | 49 +++++++++ fs/xfs/xfs_dquot.c | 31 ------ fs/xfs/xfs_dquot.h | 5 - 9 files changed, 319 insertions(+), 46 deletions(-) create mode 100644 fs/xfs/scrub/dqiterate.c create mode 100644 fs/xfs/scrub/quota.h diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index e7727451d1c8..cdf81eb180e2 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -176,7 +176,10 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtsummary.o \ ) -xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + dqiterate.o \ + quota.o \ + ) # online repair ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index f16974126ff9..e6ca188e2271 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1272,6 +1272,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds) #define XFS_DQ_GRACE_MIN ((int64_t)0) #define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX) +/* Maximum id value for a quota record */ +#define XFS_DQ_ID_MAX (U32_MAX) + /* * This is the main portion of the on-disk representation of quota information * for a user. We pad this with some more expansion room to construct the on diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c new file mode 100644 index 000000000000..20c4daedd48d --- /dev/null +++ b/fs/xfs/scrub/dqiterate.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_bmap.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" + +/* Initialize a dquot iteration cursor. 
*/ +void +xchk_dqiter_init( + struct xchk_dqiter *cursor, + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + cursor->sc = sc; + cursor->bmap.br_startoff = NULLFILEOFF; + cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK; + cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype); + cursor->id = 0; +} + +/* + * Ensure that the cached data fork mapping for the dqiter cursor is fresh and + * covers the dquot pointed to by the scan cursor. + */ +STATIC int +xchk_dquot_iter_revalidate_bmap( + struct xchk_dqiter *cursor) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + xfs_dqid_t this_id = cursor->id; + int nmaps = 1; + int error; + + fileoff = this_id / qi->qi_dqperchunk; + + /* + * If we have a mapping for cursor->id and it's still fresh, there's + * no need to reread the bmbt. + */ + if (cursor->bmap.br_startoff != NULLFILEOFF && + cursor->if_seq == ifp->if_seq && + cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff) + return 0; + + /* Look up the data fork mapping for the dquot id of interest. */ + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0); + if (error) + return error; + if (!nmaps) { + ASSERT(nmaps > 0); + return -EFSCORRUPTED; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id); + return 0; +} + +/* Advance the dqiter cursor to the next non-sparse region of the quota file. */ +STATIC int +xchk_dquot_iter_advance_bmap( + struct xchk_dqiter *cursor, + uint64_t *next_ondisk_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip, + XFS_DATA_FORK); + xfs_fileoff_t fileoff; + uint64_t next_id; + int nmaps = 1; + int error; + + /* Find the dquot id for the next non-hole mapping. */ + do { + fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount; + if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + error = xfs_bmapi_read(cursor->quota_ip, fileoff, + XFS_MAX_FILEOFF - fileoff, &cursor->bmap, + &nmaps, 0); + if (error) + return error; + if (!nmaps) { + /* Must have reached the end of the mappings. */ + *next_ondisk_id = -1ULL; + return 0; + } + if (cursor->bmap.br_startoff > fileoff) { + ASSERT(cursor->bmap.br_startoff == fileoff); + return -EFSCORRUPTED; + } + } while (!xfs_bmap_is_real_extent(&cursor->bmap)); + + next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk; + if (next_id > XFS_DQ_ID_MAX) { + /* The hole goes beyond the max dquot id, we're done */ + *next_ondisk_id = -1ULL; + return 0; + } + + /* Propose jumping forward to the dquot in the next allocated block. */ + *next_ondisk_id = next_id; + cursor->if_seq = ifp->if_seq; + trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id); + return 0; +} + +/* + * Find the id of the next highest incore dquot. Normally this will correspond + * exactly with the quota file block mappings, but repair might have erased a + * mapping because it was crosslinked; in that case, we need to re-allocate the + * space so that we can reset q_blkno. 
+ */ +STATIC void +xchk_dquot_iter_advance_incore( + struct xchk_dqiter *cursor, + uint64_t *next_incore_id) +{ + struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype); + struct xfs_dquot *dq; + unsigned int nr_found; + + *next_incore_id = -1ULL; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1); + if (nr_found) + *next_incore_id = dq->q_id; + mutex_unlock(&qi->qi_tree_lock); + + trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id); +} + +/* + * Walk all incore dquots of this filesystem. Caller must set *@cursorp to + * zero before the first call, and must not hold the quota file ILOCK. + * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more + * dquots to iterate; or a negative errno. + */ +int +xchk_dquot_iter( + struct xchk_dqiter *cursor, + struct xfs_dquot **dqpp) +{ + struct xfs_mount *mp = cursor->sc->mp; + struct xfs_dquot *dq = NULL; + uint64_t next_ondisk, next_incore = -1ULL; + unsigned int lock_mode; + int error = 0; + + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + next_ondisk = cursor->id; + + /* Revalidate and/or advance the cursor. */ + lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip); + error = xchk_dquot_iter_revalidate_bmap(cursor); + if (!error && !xfs_bmap_is_real_extent(&cursor->bmap)) + error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk); + xfs_iunlock(cursor->quota_ip, lock_mode); + if (error) + return error; + + if (next_ondisk > cursor->id) + xchk_dquot_iter_advance_incore(cursor, &next_incore); + + /* Pick the next dquot in the sequence and return it. */ + cursor->id = min(next_ondisk, next_incore); + if (cursor->id > XFS_DQ_ID_MAX) + return 0; + + trace_xchk_dquot_iter(cursor, cursor->id); + + error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq); + if (error) + return error; + + cursor->id = dq->q_id + 1; + *dqpp = dq; + return 1; +} diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 49835d2840b4..1a65a7502527 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -18,6 +18,7 @@ #include "xfs_bmap.h" #include "scrub/scrub.h" #include "scrub/common.h" +#include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ static inline xfs_dqtype_t @@ -137,11 +138,9 @@ xchk_quota_item_timer( /* Scrub the fields in an individual quota item. */ STATIC int xchk_quota_item( - struct xfs_dquot *dq, - xfs_dqtype_t dqtype, - void *priv) + struct xchk_quota_info *sqi, + struct xfs_dquot *dq) { - struct xchk_quota_info *sqi = priv; struct xfs_scrub *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -270,7 +269,7 @@ xchk_quota_data_fork( return error; /* Check for data fork problems that apply only to quota files. */ - max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); for_each_xfs_iext(ifp, &icur, &irec) { if (xchk_should_terminate(sc, &error)) @@ -297,9 +296,11 @@ int xchk_quota( struct xfs_scrub *sc) { - struct xchk_quota_info sqi; + struct xchk_dqiter cursor = { }; + struct xchk_quota_info sqi = { .sc = sc }; struct xfs_mount *mp = sc->mp; struct xfs_quotainfo *qi = mp->m_quotainfo; + struct xfs_dquot *dq; xfs_dqtype_t dqtype; int error = 0; @@ -318,9 +319,15 @@ xchk_quota( * functions. 
*/ xchk_iunlock(sc, sc->ilock_flags); - sqi.sc = sc; - sqi.last_id = 0; - error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi); + + /* Now look for things that the quota verifiers won't complain about. */ + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xchk_quota_item(&sqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } xchk_ilock(sc, XFS_ILOCK_EXCL); if (error == -ECANCELED) error = 0; diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h new file mode 100644 index 000000000000..5056b7766c4a --- /dev/null +++ b/fs/xfs/scrub/quota.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __XFS_SCRUB_QUOTA_H__ +#define __XFS_SCRUB_QUOTA_H__ + +/* dquot iteration code */ + +struct xchk_dqiter { + struct xfs_scrub *sc; + + /* Quota file that we're walking. */ + struct xfs_inode *quota_ip; + + /* Cached data fork mapping for the dquot. */ + struct xfs_bmbt_irec bmap; + + /* The next dquot to scan. */ + uint64_t id; + + /* Quota type (user/group/project). */ + xfs_dqtype_t dqtype; + + /* Data fork sequence number to detect stale mappings. */ + unsigned int if_seq; +}; + +void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc, + xfs_dqtype_t dqtype); +int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp); + +#endif /* __XFS_SCRUB_QUOTA_H__ */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 29afa4851235..4641522fd907 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -14,9 +14,11 @@ #include "xfs_btree.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" +#include "xfs_quota.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" +#include "scrub/quota.h" /* Figure out which block the btree cursor was pointing to. 
*/ static inline xfs_fsblock_t diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3d5c8e748955..83827f3e165b 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -19,6 +19,7 @@ struct xfile; struct xfarray; struct xfarray_sortinfo; +struct xchk_dqiter; /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -348,6 +349,54 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \ DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error); DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xchk_dqiter_class, + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), + TP_ARGS(cursor, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_dqtype_t, dqtype) + __field(xfs_ino_t, ino) + __field(unsigned long long, cur_id) + __field(unsigned long long, id) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + ), + TP_fast_assign( + __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id; + __entry->startoff = cursor->bmap.br_startoff; + __entry->startblock = cursor->bmap.br_startblock; + __entry->blockcount = cursor->bmap.br_blockcount; + __entry->state = cursor->bmap.br_state; + __entry->id = id; + ), + TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS), + __entry->ino, + __entry->cur_id, + __entry->startoff, + __entry->startblock, + __entry->blockcount, + __entry->state, + __entry->id) +); + +#define DEFINE_SCRUB_DQITER_EVENT(name) \ +DEFINE_EVENT(xchk_dqiter_class, name, \ + TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \ + TP_ARGS(cursor, id)) +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore); +DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter); +#endif /* CONFIG_XFS_QUOTA */ + TRACE_EVENT(xchk_incomplete, TP_PROTO(struct xfs_scrub *sc, void *ret_ip), TP_ARGS(sc, ret_ip), diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a013b87ab8d5..60ec401e26ff 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1362,34 +1362,3 @@ xfs_qm_exit(void) kmem_cache_destroy(xfs_dqtrx_cache); kmem_cache_destroy(xfs_dquot_cache); } - -/* - * Iterate every dquot of a particular type. The caller must ensure that the - * particular quota type is active. iter_fn can return negative error codes, - * or -ECANCELED to indicate that it wants to stop iterating. 
- */ -int -xfs_qm_dqiterate( - struct xfs_mount *mp, - xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, - void *priv) -{ - struct xfs_dquot *dq; - xfs_dqid_t id = 0; - int error; - - do { - error = xfs_qm_dqget_next(mp, id, type, &dq); - if (error == -ENOENT) - return 0; - if (error) - return error; - - error = iter_fn(dq, type, priv); - id = dq->q_id + 1; - xfs_qm_dqput(dq); - } while (error == 0 && id != 0); - - return error; -} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 80c8f851a2f3..8d9d4b0d979d 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -234,11 +234,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } -typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, - xfs_dqtype_t type, void *priv); -int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type, - xfs_qm_dqiterate_fn iter_fn, void *priv); - time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); From a5b91555403e3a09ae00bed85fc78b60801dda24 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 15 Dec 2023 10:03:45 -0800 Subject: [PATCH 088/123] xfs: repair quotas Fix anything that causes the quota verifiers to fail. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- fs/xfs/Makefile | 4 + fs/xfs/scrub/quota.c | 3 +- fs/xfs/scrub/quota.h | 2 + fs/xfs/scrub/quota_repair.c | 575 ++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/repair.h | 7 + fs/xfs/scrub/scrub.c | 6 +- fs/xfs/scrub/trace.c | 1 + fs/xfs/scrub/trace.h | 29 ++ fs/xfs/xfs_dquot.c | 6 +- fs/xfs/xfs_dquot.h | 3 + 10 files changed, 628 insertions(+), 8 deletions(-) create mode 100644 fs/xfs/scrub/quota_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index cdf81eb180e2..fbe3cdc79036 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -199,5 +199,9 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \ rtbitmap_repair.o \ ) + +xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \ + quota_repair.o \ + ) endif endif diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 1a65a7502527..183d531875ea 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -21,7 +21,7 @@ #include "scrub/quota.h" /* Convert a scrub type code to a DQ flag, or return 0 if error. */ -static inline xfs_dqtype_t +xfs_dqtype_t xchk_quota_to_dqtype( struct xfs_scrub *sc) { @@ -328,7 +328,6 @@ xchk_quota( if (error) break; } - xchk_ilock(sc, XFS_ILOCK_EXCL); if (error == -ECANCELED) error = 0; if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h index 5056b7766c4a..6c7134ce2385 100644 --- a/fs/xfs/scrub/quota.h +++ b/fs/xfs/scrub/quota.h @@ -6,6 +6,8 @@ #ifndef __XFS_SCRUB_QUOTA_H__ #define __XFS_SCRUB_QUOTA_H__ +xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc); + /* dquot iteration code */ struct xchk_dqiter { diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c new file mode 100644 index 000000000000..0bab4c30cb85 --- /dev/null +++ b/fs/xfs/scrub/quota_repair.c @@ -0,0 +1,575 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018-2023 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_inode_fork.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_dquot.h" +#include "xfs_dquot_item.h" +#include "xfs_reflink.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/quota.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Quota Repair + * ============ + * + * Quota repairs are fairly simplistic; we fix everything that the dquot + * verifiers complain about, cap any counters or limits that make no sense, + * and schedule a quotacheck if we had to fix anything. We also repair any + * data fork extent records that don't apply to metadata files. + */ + +struct xrep_quota_info { + struct xfs_scrub *sc; + bool need_quotacheck; +}; + +/* + * Allocate a new block into a sparse hole in the quota file backing this + * dquot, initialize the block, and commit the whole mess. + */ +STATIC int +xrep_quota_item_fill_bmap_hole( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + struct xfs_bmbt_irec *irec) +{ + struct xfs_buf *bp; + struct xfs_mount *mp = sc->mp; + int nmaps = 1; + int error; + + xfs_trans_ijoin(sc->tp, sc->ip, 0); + + /* Map a block into the file. */ + error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp), + 0); + if (error) + return error; + + error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0, + irec, &nmaps); + if (error) + return error; + if (nmaps != 1) + return -ENOSPC; + + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock); + + trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id); + + /* Initialize the new block. */ + error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp); + if (error) + return error; + bp->b_ops = &xfs_dquot_buf_ops; + + xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * Finish the mapping transactions and roll one more time to + * disconnect sc->ip from sc->tp. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + return xfs_trans_roll(&sc->tp); +} + +/* Make sure there's a written block backing this dquot */ +STATIC int +xrep_quota_item_bmap( + struct xfs_scrub *sc, + struct xfs_dquot *dq, + bool *dirty) +{ + struct xfs_bmbt_irec irec; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk; + int nmaps = 1; + int error; + + /* The computed file offset should always be valid. */ + if (!xfs_verify_fileoff(mp, offset)) { + ASSERT(xfs_verify_fileoff(mp, offset)); + return -EFSCORRUPTED; + } + dq->q_fileoffset = offset; + + error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0); + if (error) + return error; + + if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) { + /* Hole/delalloc extent; allocate a real block. */ + error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec); + if (error) + return error; + } else if (irec.br_state != XFS_EXT_NORM) { + /* Unwritten extent, which we already took care of? 
*/ + ASSERT(irec.br_state == XFS_EXT_NORM); + return -EFSCORRUPTED; + } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) { + /* + * If the cached daddr is incorrect, repair probably punched a + * hole out of the quota file and filled it back in with a new + * block. Update the block mapping in the dquot. + */ + dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock); + } + + *dirty = true; + return 0; +} + +/* Reset quota timers if incorrectly set. */ +static inline void +xrep_quota_item_timer( + struct xfs_scrub *sc, + const struct xfs_dquot_res *res, + bool *dirty) +{ + if ((res->softlimit && res->count > res->softlimit) || + (res->hardlimit && res->count > res->hardlimit)) { + if (!res->timer) + *dirty = true; + } else { + if (res->timer) + *dirty = true; + } +} + +/* Scrub the fields in an individual quota item. */ +STATIC int +xrep_quota_item( + struct xrep_quota_info *rqi, + struct xfs_dquot *dq) +{ + struct xfs_scrub *sc = rqi->sc; + struct xfs_mount *mp = sc->mp; + xfs_ino_t fs_icount; + bool dirty = false; + int error = 0; + + /* Last chance to abort before we start committing fixes. */ + if (xchk_should_terminate(sc, &error)) + return error; + + /* + * We might need to fix holes in the bmap record for the storage + * backing this dquot, so we need to lock the dquot and the quota file. + * dqiterate gave us a locked dquot, so drop the dquot lock to get the + * ILOCK_EXCL. + */ + xfs_dqunlock(dq); + xchk_ilock(sc, XFS_ILOCK_EXCL); + xfs_dqlock(dq); + + error = xrep_quota_item_bmap(sc, dq, &dirty); + xchk_iunlock(sc, XFS_ILOCK_EXCL); + if (error) + return error; + + /* Check the limits. */ + if (dq->q_blk.softlimit > dq->q_blk.hardlimit) { + dq->q_blk.softlimit = dq->q_blk.hardlimit; + dirty = true; + } + + if (dq->q_ino.softlimit > dq->q_ino.hardlimit) { + dq->q_ino.softlimit = dq->q_ino.hardlimit; + dirty = true; + } + + if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) { + dq->q_rtb.softlimit = dq->q_rtb.hardlimit; + dirty = true; + } + + /* + * Check that usage doesn't exceed physical limits. However, on + * a reflink filesystem we're allowed to exceed physical space + * if there are no quota limits. We don't know what the real number + * is, but we can make quotacheck find out for us. + */ + if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) { + dq->q_blk.reserved -= dq->q_blk.count; + dq->q_blk.reserved += mp->m_sb.sb_dblocks; + dq->q_blk.count = mp->m_sb.sb_dblocks; + rqi->need_quotacheck = true; + dirty = true; + } + fs_icount = percpu_counter_sum(&mp->m_icount); + if (dq->q_ino.count > fs_icount) { + dq->q_ino.reserved -= dq->q_ino.count; + dq->q_ino.reserved += fs_icount; + dq->q_ino.count = fs_icount; + rqi->need_quotacheck = true; + dirty = true; + } + if (dq->q_rtb.count > mp->m_sb.sb_rblocks) { + dq->q_rtb.reserved -= dq->q_rtb.count; + dq->q_rtb.reserved += mp->m_sb.sb_rblocks; + dq->q_rtb.count = mp->m_sb.sb_rblocks; + rqi->need_quotacheck = true; + dirty = true; + } + + xrep_quota_item_timer(sc, &dq->q_blk, &dirty); + xrep_quota_item_timer(sc, &dq->q_ino, &dirty); + xrep_quota_item_timer(sc, &dq->q_rtb, &dirty); + + if (!dirty) + return 0; + + trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id); + + dq->q_flags |= XFS_DQFLAG_DIRTY; + xfs_trans_dqjoin(sc->tp, dq); + if (dq->q_id) { + xfs_qm_adjust_dqlimits(dq); + xfs_qm_adjust_dqtimers(dq); + } + xfs_trans_log_dquot(sc->tp, dq); + error = xfs_trans_roll(&sc->tp); + xfs_dqlock(dq); + return error; +} + +/* Fix a quota timer so that we can pass the verifier. 
*/ +STATIC void +xrep_quota_fix_timer( + struct xfs_mount *mp, + const struct xfs_disk_dquot *ddq, + __be64 softlimit, + __be64 countnow, + __be32 *timer, + time64_t timelimit) +{ + uint64_t soft = be64_to_cpu(softlimit); + uint64_t count = be64_to_cpu(countnow); + time64_t new_timer; + uint32_t t; + + if (!soft || count <= soft || *timer != 0) + return; + + new_timer = xfs_dquot_set_timeout(mp, + ktime_get_real_seconds() + timelimit); + if (ddq->d_type & XFS_DQTYPE_BIGTIME) + t = xfs_dq_unix_to_bigtime(new_timer); + else + t = new_timer; + + *timer = cpu_to_be32(t); +} + +/* Fix anything the verifiers complain about. */ +STATIC int +xrep_quota_block( + struct xfs_scrub *sc, + xfs_daddr_t daddr, + xfs_dqtype_t dqtype, + xfs_dqid_t id) +{ + struct xfs_dqblk *dqblk; + struct xfs_disk_dquot *ddq; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype); + struct xfs_buf *bp = NULL; + enum xfs_blft buftype = 0; + int i; + int error; + + error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr, + qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + /* Failed verifier, retry read with no ops. */ + error = xfs_trans_read_buf(sc->mp, sc->tp, + sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen, + 0, &bp, NULL); + if (error) + return error; + break; + case 0: + dqblk = bp->b_addr; + ddq = &dqblk[0].dd_diskdq; + + /* + * If there's nothing that would impede a dqiterate, we're + * done. + */ + if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype || + id == be32_to_cpu(ddq->d_id)) { + xfs_trans_brelse(sc->tp, bp); + return 0; + } + break; + default: + return error; + } + + /* Something's wrong with the block, fix the whole thing. */ + dqblk = bp->b_addr; + bp->b_ops = &xfs_dquot_buf_ops; + for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) { + ddq = &dqblk->dd_diskdq; + + trace_xrep_disk_dquot(sc->mp, dqtype, id + i); + + ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + ddq->d_version = XFS_DQUOT_VERSION; + ddq->d_type = dqtype; + ddq->d_id = cpu_to_be32(id + i); + + if (xfs_has_bigtime(sc->mp) && ddq->d_id) + ddq->d_type |= XFS_DQTYPE_BIGTIME; + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit, + ddq->d_bcount, &ddq->d_btimer, + defq->blk.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit, + ddq->d_icount, &ddq->d_itimer, + defq->ino.time); + + xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit, + ddq->d_rtbcount, &ddq->d_rtbtimer, + defq->rtb.time); + + /* We only support v5 filesystems so always set these. */ + uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + dqblk->dd_lsn = 0; + } + switch (dqtype) { + case XFS_DQTYPE_USER: + buftype = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_DQTYPE_GROUP: + buftype = XFS_BLFT_GDQUOT_BUF; + break; + case XFS_DQTYPE_PROJ: + buftype = XFS_BLFT_PDQUOT_BUF; + break; + } + xfs_trans_buf_set_type(sc->tp, bp, buftype); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return xrep_roll_trans(sc); +} + +/* + * Repair a quota file's data fork. The function returns with the inode + * joined. 
+ */ +STATIC int +xrep_quota_data_fork( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xfs_bmbt_irec irec = { 0 }; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; + xfs_fileoff_t max_dqid_off; + xfs_fileoff_t off; + xfs_fsblock_t fsbno; + bool truncate = false; + bool joined = false; + int error = 0; + + error = xrep_metadata_inode_forks(sc); + if (error) + goto out; + + /* Check for data fork problems that apply only to quota files. */ + max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk; + ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { + if (isnullstartblock(irec.br_startblock)) { + error = -EFSCORRUPTED; + goto out; + } + + if (irec.br_startoff > max_dqid_off || + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + truncate = true; + break; + } + + /* Convert unwritten extents to real ones. */ + if (irec.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec nrec; + int nmap = 1; + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + error = xfs_bmapi_write(sc->tp, sc->ip, + irec.br_startoff, irec.br_blockcount, + XFS_BMAPI_CONVERT, 0, &nrec, &nmap); + if (error) + goto out; + if (nmap != 1) { + error = -ENOSPC; + goto out; + } + ASSERT(nrec.br_startoff == irec.br_startoff); + ASSERT(nrec.br_blockcount == irec.br_blockcount); + + error = xfs_defer_finish(&sc->tp); + if (error) + goto out; + } + } + + if (!joined) { + xfs_trans_ijoin(sc->tp, sc->ip, 0); + joined = true; + } + + if (truncate) { + /* Erase everything after the block containing the max dquot */ + error = xfs_bunmapi_range(&sc->tp, sc->ip, 0, + max_dqid_off * sc->mp->m_sb.sb_blocksize, + XFS_MAX_FILEOFF); + if (error) + goto out; + + /* Remove all CoW reservations. */ + error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0, + XFS_MAX_FILEOFF, true); + if (error) + goto out; + sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + + /* + * Always re-log the inode so that our permanent transaction + * can keep on rolling it forward in the log. + */ + xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE); + } + + /* Now go fix anything that fails the verifiers. */ + for_each_xfs_iext(ifp, &icur, &irec) { + for (fsbno = irec.br_startblock, off = irec.br_startoff; + fsbno < irec.br_startblock + irec.br_blockcount; + fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB, + off += XFS_DQUOT_CLUSTER_SIZE_FSB) { + error = xrep_quota_block(sc, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + dqtype, off * qi->qi_dqperchunk); + if (error) + goto out; + } + } + +out: + return error; +} + +/* + * Go fix anything in the quota items that we could have been mad about. Now + * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to + * use the regular dquot functions. + */ +STATIC int +xrep_quota_problems( + struct xfs_scrub *sc, + xfs_dqtype_t dqtype) +{ + struct xchk_dqiter cursor = { }; + struct xrep_quota_info rqi = { .sc = sc }; + struct xfs_dquot *dq; + int error; + + xchk_dqiter_init(&cursor, sc, dqtype); + while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) { + error = xrep_quota_item(&rqi, dq); + xfs_qm_dqput(dq); + if (error) + break; + } + if (error) + return error; + + /* Make a quotacheck happen. */ + if (rqi.need_quotacheck) + xrep_force_quotacheck(sc, dqtype); + return 0; +} + +/* Repair all of a quota type's items. 
*/ +int +xrep_quota( + struct xfs_scrub *sc) +{ + xfs_dqtype_t dqtype; + int error; + + dqtype = xchk_quota_to_dqtype(sc); + + /* + * Re-take the ILOCK so that we can fix any problems that we found + * with the data fork mappings, or with the dquot bufs themselves. + */ + if (!(sc->ilock_flags & XFS_ILOCK_EXCL)) + xchk_ilock(sc, XFS_ILOCK_EXCL); + error = xrep_quota_data_fork(sc, dqtype); + if (error) + return error; + + /* + * Finish deferred items and roll the transaction to unjoin the quota + * inode from transaction so that we can unlock the quota inode; we + * play only with dquots from now on. + */ + error = xrep_defer_finish(sc); + if (error) + return error; + error = xfs_trans_roll(&sc->tp); + if (error) + return error; + xchk_iunlock(sc, sc->ilock_flags); + + /* Fix anything the dquot verifiers don't complain about. */ + error = xrep_quota_problems(sc, dqtype); + if (error) + return error; + + return xrep_trans_commit(sc); +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 86cf86037fe0..17114327e6fa 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -121,6 +121,12 @@ int xrep_rtbitmap(struct xfs_scrub *sc); # define xrep_rtbitmap xrep_notsupported #endif /* CONFIG_XFS_RT */ +#ifdef CONFIG_XFS_QUOTA +int xrep_quota(struct xfs_scrub *sc); +#else +# define xrep_quota xrep_notsupported +#endif /* CONFIG_XFS_QUOTA */ + int xrep_reinit_pagf(struct xfs_scrub *sc); int xrep_reinit_pagi(struct xfs_scrub *sc); @@ -184,6 +190,7 @@ xrep_setup_nothing( #define xrep_bmap_attr xrep_notsupported #define xrep_bmap_cow xrep_notsupported #define xrep_rtbitmap xrep_notsupported +#define xrep_quota xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index c33480894229..caf324c2b991 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -340,19 +340,19 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xchk_setup_quota, .scrub = xchk_quota, - .repair = xrep_notsupported, + .repair = xrep_quota, }, [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ .type = ST_FS, diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 4641522fd907..d0e24ffaf754 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -15,6 +15,7 @@ #include "xfs_ag.h" #include "xfs_rtbitmap.h" #include "xfs_quota.h" +#include "xfs_quota_defs.h" #include "scrub/scrub.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 83827f3e165b..6bbb4e8639dc 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -1729,6 +1729,35 @@ TRACE_EVENT(xrep_cow_free_staging, __entry->blockcount) ); +#ifdef CONFIG_XFS_QUOTA +DECLARE_EVENT_CLASS(xrep_dquot_class, + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), + TP_ARGS(mp, type, id), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint8_t, type) + __field(uint32_t, id) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->id = id; + __entry->type = type; + ), + TP_printk("dev %d:%d type %s id 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS), + __entry->id) +); + 
+#define DEFINE_XREP_DQUOT_EVENT(name) \ +DEFINE_EVENT(xrep_dquot_class, name, \ + TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \ + TP_ARGS(mp, type, id)) +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item); +DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot); +DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole); +#endif /* CONFIG_XFS_QUOTA */ + #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ #endif /* _TRACE_XFS_SCRUB_TRACE_H */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 60ec401e26ff..a93ad76f23c5 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -172,14 +172,14 @@ xfs_qm_adjust_dqtimers( /* * initialize a buffer full of dquots and log the whole thing */ -STATIC void +void xfs_qm_init_dquot_blk( struct xfs_trans *tp, - struct xfs_mount *mp, xfs_dqid_t id, xfs_dqtype_t type, struct xfs_buf *bp) { + struct xfs_mount *mp = tp->t_mountp; struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_dqblk *d; xfs_dqid_t curid; @@ -353,7 +353,7 @@ xfs_dquot_disk_alloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp); + xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp); xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 8d9d4b0d979d..956272d9b302 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -237,4 +237,7 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout); time64_t xfs_dquot_set_grace_period(time64_t grace); +void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t + type, struct xfs_buf *bp); + #endif /* __XFS_DQUOT_H__ */
From 07afd3173d0c6d24a47441839a835955ec6cf0d4 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 15 Dec 2023 16:22:33 +0800 Subject: [PATCH 089/123] xfs: add lock protection when removing perag from radix tree
Take mp->m_perag_lock for deletions from the perag radix tree in xfs_initialize_perag to prevent racing with tagging operations. Lookups are fine - they are RCU protected so already deal with the tree changing shape underneath the lookup - but tagging operations require the tree to be stable while the tags are propagated back up to the root.
Right now there's nothing stopping radix tree tagging from operating while a growfs operation is in progress and adding/removing new entries into the radix tree. Hence we can have traversals that require a stable tree occurring at the same time we are removing unused entries from the radix tree, which causes the shape of the tree to change. Likely this hasn't caused a problem in the past because we are only doing append addition and removal so the active AG part of the tree is not changing shape, but that doesn't mean it is safe. Just making the radix tree modifications serialise against each other is obviously correct.
Signed-off-by: Long Li Reviewed-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R
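As a minimal sketch of the rule being enforced (remove_perag_entry() is a hypothetical helper name used only for illustration, not part of this patch), every shape-changing update of the perag radix tree now runs under mp->m_perag_lock so that tag propagation sees a stable tree:

	/* Delete a perag entry with the tree shape held stable. */
	static void
	remove_perag_entry(
		struct xfs_mount	*mp,
		xfs_agnumber_t		agno)
	{
		spin_lock(&mp->m_perag_lock);
		radix_tree_delete(&mp->m_perag_tree, agno);
		spin_unlock(&mp->m_perag_lock);
	}

RCU-protected lookups need no such serialisation; only shape-changing modifications and tag walks must be serialised by mp->m_perag_lock.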
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_ag.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index f62ff125a50a..c730976fdfc0 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -424,13 +424,17 @@ xfs_initialize_perag( out_remove_pag: xfs_defer_drain_free(&pag->pag_intents_drain); + spin_lock(&mp->m_perag_lock); radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); out_free_pag: kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ for (index = first_initialised; index < agcount; index++) { + spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); if (!pag) break; xfs_buf_hash_destroy(pag); From 7823921887750b39d02e6b44faafdd1cc617c651 Mon Sep 17 00:00:00 2001 From: Long Li Date: Fri, 15 Dec 2023 16:22:34 +0800 Subject: [PATCH 090/123] xfs: fix perag leak when growfs fails During growfs, if new ag in memory has been initialized, however sb_agcount has not been updated, if an error occurs at this time it will cause perag leaks as follows, these new AGs will not been freed during umount , because of these new AGs are not visible(that is included in mp->m_sb.sb_agcount). unreferenced object 0xffff88810be40200 (size 512): comm "xfs_growfs", pid 857, jiffies 4294909093 hex dump (first 32 bytes): 00 c0 c1 05 81 88 ff ff 04 00 00 00 00 00 00 00 ................ 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 381741e2): [] __kmalloc+0x386/0x4f0 [] kmem_alloc+0xb5/0x2f0 [] xfs_initialize_perag+0xc5/0x810 [] xfs_growfs_data+0x9bc/0xbc0 [] xfs_file_ioctl+0x5fe/0x14d0 [] __x64_sys_ioctl+0x144/0x1c0 [] do_syscall_64+0x3f/0xe0 [] entry_SYSCALL_64_after_hwframe+0x62/0x6a unreferenced object 0xffff88810be40800 (size 512): comm "xfs_growfs", pid 857, jiffies 4294909093 hex dump (first 32 bytes): 20 00 00 00 00 00 00 00 57 ef be dc 00 00 00 00 .......W....... 10 08 e4 0b 81 88 ff ff 10 08 e4 0b 81 88 ff ff ................ backtrace (crc bde50e2d): [] __kmalloc_node+0x3da/0x540 [] kvmalloc_node+0x99/0x160 [] bucket_table_alloc.isra.0+0x5f/0x400 [] rhashtable_init+0x405/0x760 [] xfs_initialize_perag+0x3a3/0x810 [] xfs_growfs_data+0x9bc/0xbc0 [] xfs_file_ioctl+0x5fe/0x14d0 [] __x64_sys_ioctl+0x144/0x1c0 [] do_syscall_64+0x3f/0xe0 [] entry_SYSCALL_64_after_hwframe+0x62/0x6a Factor out xfs_free_unused_perag_range() from xfs_initialize_perag(), used for freeing unused perag within a specified range in error handling, included in the error path of the growfs failure. Fixes: 1c1c6ebcf528 ("xfs: Replace per-ag array with a radix tree") Signed-off-by: Long Li Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_ag.c | 36 ++++++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_ag.h | 2 ++ fs/xfs/xfs_fsops.c | 5 ++++- 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index c730976fdfc0..39d9525270b7 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -332,6 +332,31 @@ xfs_agino_range( return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last); } +/* + * Free perag within the specified AG range, it is only used to free unused + * perags under the error handling path. 
+ */ +void +xfs_free_unused_perag_range( + struct xfs_mount *mp, + xfs_agnumber_t agstart, + xfs_agnumber_t agend) +{ + struct xfs_perag *pag; + xfs_agnumber_t index; + + for (index = agstart; index < agend; index++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, index); + spin_unlock(&mp->m_perag_lock); + if (!pag) + break; + xfs_buf_hash_destroy(pag); + xfs_defer_drain_free(&pag->pag_intents_drain); + kmem_free(pag); + } +} + int xfs_initialize_perag( struct xfs_mount *mp, @@ -431,16 +456,7 @@ xfs_initialize_perag( kmem_free(pag); out_unwind_new_pags: /* unwind any prior newly initialized pags */ - for (index = first_initialised; index < agcount; index++) { - spin_lock(&mp->m_perag_lock); - pag = radix_tree_delete(&mp->m_perag_tree, index); - spin_unlock(&mp->m_perag_lock); - if (!pag) - break; - xfs_buf_hash_destroy(pag); - xfs_defer_drain_free(&pag->pag_intents_drain); - kmem_free(pag); - } + xfs_free_unused_perag_range(mp, first_initialised, agcount); return error; } diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 67c3260ee789..4b343c4fac28 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -143,6 +143,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA) __XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES) __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET) +void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, + xfs_agnumber_t agend); int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 5e7255e6ad3e..2c3e3fea888d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -157,7 +157,7 @@ xfs_growfs_data_private( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0, 0, &tp); if (error) - return error; + goto out_free_unused_perag; last_pag = xfs_perag_get(mp, oagcount - 1); if (delta > 0) { @@ -231,6 +231,9 @@ xfs_growfs_data_private( out_trans_cancel: xfs_trans_cancel(tp); +out_free_unused_perag: + if (nagcount > oagcount) + xfs_free_unused_perag_range(mp, oagcount, nagcount); return error; }
From b5785f615918423f4ad4abe28eeb2679db8712b0 Mon Sep 17 00:00:00 2001 From: Wang Jinchao Date: Fri, 15 Dec 2023 18:24:34 +0800 Subject: [PATCH 091/123] xfs/health: cleanup, remove duplicated includes
Remove the second copies of: #include "xfs_trans_resv.h" #include "xfs_mount.h"
Signed-off-by: Wang Jinchao Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/scrub/health.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c index df716da11226..531006910ca9 100644 --- a/fs/xfs/scrub/health.c +++ b/fs/xfs/scrub/health.c @@ -10,8 +10,6 @@ #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_btree.h" -#include "xfs_trans_resv.h" -#include "xfs_mount.h" #include "xfs_ag.h" #include "xfs_health.h" #include "scrub/scrub.h"
From 944df75958807d56f2db9fdc769eb15dd9f0366a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:17 +0100 Subject: [PATCH 092/123] xfs: consider minlen sized extents in xfs_rtallocate_extent_block
minlen is the lower bound on the extent length that the caller can accept, and maxlen is at this point the maximal available length. This means a minlen extent is perfectly fine to use, so do it. This matches the equivalent logic in xfs_rtallocate_extent_exact that also accepts a minlen sized extent.
Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R
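A minimal sketch of the corrected boundary condition (can_use_best_candidate() is a hypothetical name used only for illustration; besti == -1 is assumed to mean no candidate was recorded during the scan):

	/* May the post-scan fallback allocate the recorded candidate? */
	static inline bool
	can_use_best_candidate(
		xfs_rtxlen_t	minlen,
		xfs_rtxlen_t	maxlen,
		int		besti)
	{
		/* The old test used minlen < maxlen, refusing minlen == maxlen. */
		return minlen <= maxlen && besti != -1;
	}

Since maxlen has already been clamped to the maximal available length, a request with minlen == maxlen is satisfiable and should not be rejected.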
--- fs/xfs/xfs_rtalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0c9893b9f2a9..b5b596cf043a 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -309,7 +309,7 @@ xfs_rtallocate_extent_block( /* * Searched the whole thing & didn't find a maxlen free extent. */ - if (minlen < maxlen && besti != -1) { + if (minlen <= maxlen && besti != -1) { xfs_rtxlen_t p; /* amount to trim length by */ /*
From 825b49e4dad8eba37d32bd12ceb436f1b0958fde Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:18 +0100 Subject: [PATCH 093/123] xfs: turn the xfs_trans_mod_dquot_byino stub into an inline function
Without this, an upcoming change would cause an unused variable warning when it adds a local variable for the fields value passed to this stub.
Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_quota.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index dcc785fdd345..e0d56489f3b2 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -127,7 +127,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, } #define xfs_trans_dup_dqinfo(tp, tp2) #define xfs_trans_free_dqinfo(tp) -#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0) +static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp, + struct xfs_inode *ip, uint field, int64_t delta) +{ +} #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
From eef519d746bbfb90cbad4077c2d39d7a359c3282 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:19 +0100 Subject: [PATCH 094/123] xfs: remove the xfs_alloc_arg argument to xfs_bmap_btalloc_accounting
xfs_bmap_btalloc_accounting only uses the len field from args, but that has just been propagated to the ap->length field by the caller.
Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 523926fe50eb..cad09a3da35d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3265,8 +3265,7 @@ xfs_bmap_btalloc_select_lengths( /* Update all inode and quota accounting for the allocation we just did. */ static void xfs_bmap_btalloc_accounting( - struct xfs_bmalloca *ap, - struct xfs_alloc_arg *args) + struct xfs_bmalloca *ap) { if (ap->flags & XFS_BMAPI_COWFORK) { /* @@ -3279,7 +3278,7 @@ xfs_bmap_btalloc_accounting( * yet. */ if (ap->wasdel) { - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); return; } @@ -3291,22 +3290,22 @@ xfs_bmap_btalloc_accounting( * This essentially transfers the transaction quota reservation * to that of a delalloc extent. 
*/ - ap->ip->i_delayed_blks += args->len; + ap->ip->i_delayed_blks += ap->length; xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, - -(long)args->len); + -(long)ap->length); return; } /* data/attr fork only */ - ap->ip->i_nblocks += args->len; + ap->ip->i_nblocks += ap->length; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) { - ap->ip->i_delayed_blks -= args->len; - xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len); + ap->ip->i_delayed_blks -= ap->length; + xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); } xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, - args->len); + ap->length); } static int @@ -3380,7 +3379,7 @@ xfs_bmap_process_allocated_extent( ap->offset = orig_offset; else if (ap->offset + ap->length < orig_offset + orig_length) ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap, args); + xfs_bmap_btalloc_accounting(ap); } #ifdef DEBUG
From 58643460546da1dc61593fc6fd78762798b4534f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:20 +0100 Subject: [PATCH 095/123] xfs: also use xfs_bmap_btalloc_accounting for RT allocations
Make xfs_bmap_btalloc_accounting more generic by handling the RT quota reservations and then also use it from xfs_bmap_rtalloc instead of open coding the accounting logic there.
Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R
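The per-case quota field selection that the generalised helper performs reduces to the following sketch (xfs_bmap_alloc_dqfield() is a hypothetical name used only for illustration); note that an attr fork allocation never comes from the realtime device, which is why the isrt computation in the diff below requires !(ap->flags & XFS_BMAPI_ATTRFORK):

	/* Pick the quota accounting field for a completed allocation. */
	static inline uint
	xfs_bmap_alloc_dqfield(
		bool		isrt,
		bool		wasdel)
	{
		if (wasdel)
			return isrt ? XFS_TRANS_DQ_DELRTBCOUNT :
				      XFS_TRANS_DQ_DELBCOUNT;
		return isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
	}

The delayed COW reservation path makes the same split between XFS_TRANS_DQ_RES_RTBLKS and XFS_TRANS_DQ_RES_BLKS.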
--- fs/xfs/libxfs/xfs_bmap.c | 21 ++++++++++++++------- fs/xfs/libxfs/xfs_bmap.h | 2 ++ fs/xfs/xfs_bmap_util.c | 12 +----------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index cad09a3da35d..396f242e5932 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3263,10 +3263,14 @@ xfs_bmap_btalloc_select_lengths( } /* Update all inode and quota accounting for the allocation we just did. */ -static void -xfs_bmap_btalloc_accounting( +void +xfs_bmap_alloc_account( struct xfs_bmalloca *ap) { + bool isrt = XFS_IS_REALTIME_INODE(ap->ip) && + !(ap->flags & XFS_BMAPI_ATTRFORK); + uint fld; + if (ap->flags & XFS_BMAPI_COWFORK) { /* * COW fork blocks are in-core only and thus are treated as @@ -3291,7 +3295,8 @@ xfs_bmap_btalloc_accounting( * to that of a delalloc extent. */ ap->ip->i_delayed_blks += ap->length; - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS, + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ? + XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS, -(long)ap->length); return; } @@ -3302,10 +3307,12 @@ xfs_bmap_btalloc_accounting( if (ap->wasdel) { ap->ip->i_delayed_blks -= ap->length; xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length); + fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT; + } else { + fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; } - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, - ap->length); + + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length); } static int @@ -3379,7 +3386,7 @@ xfs_bmap_process_allocated_extent( ap->offset = orig_offset; else if (ap->offset + ap->length < orig_offset + orig_length) ap->offset = orig_offset + orig_length - ap->length; - xfs_bmap_btalloc_accounting(ap); + xfs_bmap_alloc_account(ap); } #ifdef DEBUG diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 4b83f6148e00..f6b73f1bad5f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) return XFS_DATA_FORK; } +void xfs_bmap_alloc_account(struct xfs_bmalloca *ap); + /* * Special values for xfs_bmbt_irec_t br_startblock field. */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 731260a5af6d..d6432a7ef285 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -168,17 +168,7 @@ xfs_bmap_rtalloc( if (rtx != NULLRTEXTNO) { ap->blkno = xfs_rtx_to_rtb(mp, rtx); ap->length = xfs_rtxlen_to_extlen(mp, ralen); - ap->ip->i_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ap->length; - /* - * Adjust the disk quota also. This was reserved - * earlier. - */ - xfs_trans_mod_dquot_byino(ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, ap->length); + xfs_bmap_alloc_account(ap); return 0; }
From 152e21235727bbfe50ddc79a2d60f6bcf19d1640 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:21 +0100 Subject: [PATCH 096/123] xfs: move xfs_bmap_rtalloc to xfs_rtalloc.c
xfs_bmap_rtalloc is currently in xfs_bmap_util.c, which is a somewhat odd spot for it, given that it is only called from xfs_bmap.c and calls into xfs_rtalloc.c to do the actual work. Move xfs_bmap_rtalloc to xfs_rtalloc.c and mark xfs_rtpick_extent and xfs_rtallocate_extent static now that they aren't called from outside of xfs_rtalloc.c.
Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_bmap_util.c | 131 --------------------------------------- fs/xfs/xfs_rtalloc.c | 135 ++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_rtalloc.h | 37 ----------- 3 files changed, 133 insertions(+), 170 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index d6432a7ef285..c2531c28905c 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -69,137 +69,6 @@ xfs_zero_extent( GFP_NOFS, 0); } -#ifdef CONFIG_XFS_RT -int -xfs_bmap_rtalloc( - struct xfs_bmalloca *ap) -{ - struct xfs_mount *mp = ap->ip->i_mount; - xfs_fileoff_t orig_offset = ap->offset; - xfs_rtxnum_t rtx; - xfs_rtxlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t mod = 0; /* product factor for allocators */ - xfs_rtxlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_extlen_t orig_length = ap->length; - xfs_extlen_t minlen = mp->m_sb.sb_rextsize; - xfs_rtxlen_t raminlen; - bool rtlocked = false; - bool ignore_locality = false; - int error; - - align = xfs_get_extsz_hint(ap->ip); -retry: - prod = xfs_extlen_to_rtxlen(mp, align); - error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, - align, 1, ap->eof, 0, - ap->conv, &ap->offset, &ap->length); - if (error) - return error; - ASSERT(ap->length); - ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); - - /* - * If we shifted the file offset downward to satisfy an extent size - * hint, increase minlen by that amount so that the allocator won't - * give us an allocation that's too short to cover at least one of the - * blocks that the caller asked for. - */ - if (ap->offset != orig_offset) - minlen += orig_offset - ap->offset; - - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - * - * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); - - /* - * Lock out modifications to both the RT bitmap and summary inodes - */ - if (!rtlocked) { - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); - xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); - xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); - rtlocked = true; - } - - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); - if (error) - return error; - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - } else { - ap->blkno = 0; - } - - xfs_bmap_adjacent(ap); - - /* - * Realtime allocation, done through xfs_rtallocate_extent. 
- */ - if (ignore_locality) - rtx = 0; - else - rtx = xfs_rtb_to_rtx(mp, ap->blkno); - raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); - error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, - ap->wasdel, prod, &rtx); - if (error) - return error; - - if (rtx != NULLRTEXTNO) { - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - ap->length = xfs_rtxlen_to_extlen(mp, ralen); - xfs_bmap_alloc_account(ap); - return 0; - } - - if (align > mp->m_sb.sb_rextsize) { - /* - * We previously enlarged the request length to try to satisfy - * an extent size hint. The allocator didn't return anything, - * so reset the parameters to the original values and try again - * without alignment criteria. - */ - ap->offset = orig_offset; - ap->length = orig_length; - minlen = align = mp->m_sb.sb_rextsize; - goto retry; - } - - if (!ignore_locality && ap->blkno != 0) { - /* - * If we can't allocate near a specific rt extent, try again - * without locality criteria. - */ - ignore_locality = true; - goto retry; - } - - ap->blkno = NULLFSBLOCK; - ap->length = 0; - return 0; -} -#endif /* CONFIG_XFS_RT */ - /* * Extent tree block counting routines. */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index b5b596cf043a..35380d0cfe9b 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -14,12 +14,14 @@ #include "xfs_inode.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" #include "xfs_sb.h" #include "xfs_rtbitmap.h" +#include "xfs_quota.h" /* * Read and return the summary information for a given extent size, @@ -1171,7 +1173,7 @@ xfs_growfs_rt( * parameters. The length units are all in realtime extents, as is the * result block number. */ -int +static int xfs_rtallocate_extent( struct xfs_trans *tp, xfs_rtxnum_t start, /* starting rtext number to allocate */ @@ -1419,7 +1421,7 @@ xfs_rtunmount_inodes( * of rtextents and the fraction. * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... */ -int /* error */ +static int xfs_rtpick_extent( xfs_mount_t *mp, /* file system mount point */ xfs_trans_t *tp, /* transaction pointer */ @@ -1458,3 +1460,132 @@ xfs_rtpick_extent( *pick = b; return 0; } + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_fileoff_t orig_offset = ap->offset; + xfs_rtxnum_t rtx; + xfs_rtxlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t mod = 0; /* product factor for allocators */ + xfs_rtxlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_extlen_t orig_length = ap->length; + xfs_extlen_t minlen = mp->m_sb.sb_rextsize; + xfs_rtxlen_t raminlen; + bool rtlocked = false; + bool ignore_locality = false; + int error; + + align = xfs_get_extsz_hint(ap->ip); +retry: + prod = xfs_extlen_to_rtxlen(mp, align); + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0); + + /* + * If we shifted the file offset downward to satisfy an extent size + * hint, increase minlen by that amount so that the allocator won't + * give us an allocation that's too short to cover at least one of the + * blocks that the caller asked for. 
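/*
 * A small worked example of the minlen adjustment described above,
 * using hypothetical numbers (an 8-block extent size hint that pulls
 * the requested offset down from block 10 to block 8); this is a
 * sketch, not part of the patch.
 */
static unsigned long long adjusted_minlen(void)
{
	unsigned long long orig_offset = 10;	/* block the caller wanted */
	unsigned long long offset = 8;		/* offset after hint alignment */
	unsigned long long minlen = 1;		/* caller needs one block */

	if (offset != orig_offset)
		minlen += orig_offset - offset;
	/*
	 * minlen is now 3: blocks 8, 9 and 10 form the smallest aligned
	 * allocation that still covers the block the caller asked for.
	 */
	return minlen;
}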
+ */ + if (ap->offset != orig_offset) + minlen += orig_offset - ap->offset; + + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + div_u64_rem(ap->offset, align, &mod); + if (mod || ap->length % align) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + * + * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); + + /* + * Lock out modifications to both the RT bitmap and summary inodes + */ + if (!rtlocked) { + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM); + xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL); + rtlocked = true; + } + + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->offset == 0) { + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + } else { + ap->blkno = 0; + } + + xfs_bmap_adjacent(ap); + + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + if (ignore_locality) + rtx = 0; + else + rtx = xfs_rtb_to_rtx(mp, ap->blkno); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, + ap->wasdel, prod, &rtx); + if (error) + return error; + + if (rtx != NULLRTEXTNO) { + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); + xfs_bmap_alloc_account(ap); + return 0; + } + + if (align > mp->m_sb.sb_rextsize) { + /* + * We previously enlarged the request length to try to satisfy + * an extent size hint. The allocator didn't return anything, + * so reset the parameters to the original values and try again + * without alignment criteria. + */ + ap->offset = orig_offset; + ap->length = orig_length; + minlen = align = mp->m_sb.sb_rextsize; + goto retry; + } + + if (!ignore_locality && ap->blkno != 0) { + /* + * If we can't allocate near a specific rt extent, try again + * without locality criteria. + */ + ignore_locality = true; + goto retry; + } + + ap->blkno = NULLFSBLOCK; + ap->length = 0; + return 0; +} diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index f7cb9ffe51ca..a6836da9bebe 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -12,27 +12,6 @@ struct xfs_mount; struct xfs_trans; #ifdef CONFIG_XFS_RT -/* - * Function prototypes for exported functions. - */ - -/* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. 
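/*
 * For readers new to the unit convention mentioned in the comment
 * above: realtime extent numbers are block quantities scaled down by
 * sb_rextsize (blocks per rt extent).  A minimal sketch with a
 * hypothetical rextsize of 4; the real conversions are done by
 * helpers such as xfs_rtb_to_rtx() and xfs_rtx_to_rtb().
 */
#include <stdio.h>

int main(void)
{
	unsigned long long rextsize = 4;		/* blocks per rtextent */
	unsigned long long blkno = 40;
	unsigned long long rtx = blkno / rextsize;	/* rtextent 10 */

	printf("block %llu lives in rtextent %llu, which starts at block %llu\n",
	       blkno, rtx, rtx * rextsize);
	return 0;
}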
- */ -int /* error */ -xfs_rtallocate_extent( - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock); /* out: start rtext allocated */ - - /* * Initialize realtime fields in the mount structure. */ @@ -51,20 +30,6 @@ int /* error */ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ -/* - * Pick an extent for allocation at the start of a new realtime file. - * Use the sequence number stored in the atime field of the bitmap inode. - * Translate this to a fraction of the rtextents, and return the product - * of rtextents and the fraction. - * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ... - */ -int /* error */ -xfs_rtpick_extent( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtxlen_t len, /* allocation length (rtextents) */ - xfs_rtxnum_t *pick); /* result rt extent */ - /* * Grow the realtime area of the filesystem. */ @@ -75,8 +40,6 @@ xfs_growfs_rt( int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS) # define xfs_growfs_rt(mp,in) (-ENOSYS) # define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ From ce42b5d37527b282d38413c1b5f7283253f6562d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:22 +0100 Subject: [PATCH 097/123] xfs: return -ENOSPC from xfs_rtallocate_* Just return -ENOSPC instead of returning 0 and setting the return rt extent number to NULLRTEXTNO. This is turn removes all users of NULLRTEXTNO, so remove that as well. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_types.h | 1 - fs/xfs/xfs_rtalloc.c | 211 +++++++++++++------------------------- 2 files changed, 71 insertions(+), 141 deletions(-) diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 035bf703d719..20b5375f2d9c 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -51,7 +51,6 @@ typedef void * xfs_failaddr_t; #define NULLRFSBLOCK ((xfs_rfsblock_t)-1) #define NULLRTBLOCK ((xfs_rtblock_t)-1) #define NULLFILEOFF ((xfs_fileoff_t)-1) -#define NULLRTEXTNO ((xfs_rtxnum_t)-1) #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 35380d0cfe9b..a810482339ad 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -156,17 +156,17 @@ xfs_rtallocate_range( * properly update the summary. */ error = xfs_rtfind_back(args, start, 0, &preblock); - if (error) { + if (error) return error; - } + /* * Find the next allocated block (end of free extent). */ error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1, &postblock); - if (error) { + if (error) return error; - } + /* * Decrement the summary information corresponding to the entire * (old) free extent. 
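/*
 * A compilable sketch of the summary bookkeeping the hunk below
 * touches, assuming (as the XFS_RTBLOCKLOG() calls suggest) that the
 * summary counts free runs by floor(log2(length)).  Carving an
 * allocation out of the middle of a free run drops one count for the
 * old run and adds one for each surviving piece.  The extent numbers
 * are hypothetical.
 */
#include <stdio.h>

/* floor(log2(len)); stands in for XFS_RTBLOCKLOG()/xfs_highbit64() */
static int sumbucket(unsigned long long len)
{
	int log = -1;

	while (len) {
		len >>= 1;
		log++;
	}
	return log;
}

int main(void)
{
	/* free run 100..111 (12 rtextents); allocate 102..106 */
	unsigned long long preblock = 100, postblock = 111;
	unsigned long long start = 102, end = 106;

	printf("-1 in bucket %d\n", sumbucket(postblock + 1 - preblock)); /* 3 */
	printf("+1 in bucket %d\n", sumbucket(start - preblock));         /* 1 */
	printf("+1 in bucket %d\n", sumbucket(postblock - end));          /* 2 */
	return 0;
}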
@@ -174,9 +174,9 @@ xfs_rtallocate_range( error = xfs_rtmodify_summary(args, XFS_RTBLOCKLOG(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); - if (error) { + if (error) return error; - } + /* * If there are blocks not being allocated at the front of the * old extent, add summary data for them to be free. @@ -185,10 +185,10 @@ xfs_rtallocate_range( error = xfs_rtmodify_summary(args, XFS_RTBLOCKLOG(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); - if (error) { + if (error) return error; - } } + /* * If there are blocks not being allocated at the end of the * old extent, add summary data for them to be free. @@ -197,15 +197,14 @@ xfs_rtallocate_range( error = xfs_rtmodify_summary(args, XFS_RTBLOCKLOG(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), 1); - if (error) { + if (error) return error; - } } + /* * Modify the bitmap to mark this extent allocated. */ - error = xfs_rtmodify_range(args, start, len, 0); - return error; + return xfs_rtmodify_range(args, start, len, 0); } /* @@ -267,17 +266,17 @@ xfs_rtallocate_extent_block( * If it's not so then next will contain the first non-free. */ error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); - if (error) { + if (error) return error; - } + if (stat) { /* * i for maxlen is all free, allocate and return that. */ error = xfs_rtallocate_range(args, i, maxlen); - if (error) { + if (error) return error; - } + *len = maxlen; *rtx = i; return 0; @@ -302,9 +301,8 @@ xfs_rtallocate_extent_block( */ if (next < end) { error = xfs_rtfind_forw(args, next, end, &i); - if (error) { + if (error) return error; - } } else break; } @@ -327,9 +325,8 @@ xfs_rtallocate_extent_block( * Allocate besti for bestlen & return that. */ error = xfs_rtallocate_range(args, besti, bestlen); - if (error) { + if (error) return error; - } *len = bestlen; *rtx = besti; return 0; @@ -338,8 +335,7 @@ xfs_rtallocate_extent_block( * Allocation failed. Set *nextp to the next block to try. */ *nextp = next; - *rtx = NULLRTEXTNO; - return 0; + return -ENOSPC; } /* @@ -369,17 +365,16 @@ xfs_rtallocate_extent_exact( * Check if the range in question (for maxlen) is free. */ error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree); - if (error) { + if (error) return error; - } + if (isfree) { /* * If it is, allocate it and return success. */ error = xfs_rtallocate_range(args, start, maxlen); - if (error) { + if (error) return error; - } *len = maxlen; *rtx = start; return 0; @@ -388,33 +383,23 @@ xfs_rtallocate_extent_exact( * If not, allocate what there is, if it's at least minlen. */ maxlen = next - start; - if (maxlen < minlen) { - /* - * Failed, return failure status. - */ - *rtx = NULLRTEXTNO; - return 0; - } + if (maxlen < minlen) + return -ENOSPC; + /* * Trim off tail of extent, if prod is specified. */ if (prod > 1 && (i = maxlen % prod)) { maxlen -= i; - if (maxlen < minlen) { - /* - * Now we can't do it, return failure status. - */ - *rtx = NULLRTEXTNO; - return 0; - } + if (maxlen < minlen) + return -ENOSPC; } /* * Allocate what we can and return it. 
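/*
 * The tail trim above in concrete numbers (hypothetical values, not
 * from the patch).  With prod = 4 and a free stretch of 13 rtextents,
 * the odd tail is cut so the result stays a whole number of
 * extent-size-hint units; only if the trim drops below minlen does
 * the function give up with -ENOSPC.
 */
static int trim_to_prod(void)
{
	unsigned int maxlen = 13, minlen = 3, prod = 4;
	unsigned int i = maxlen % prod;		/* 1 */

	if (prod > 1 && i) {
		maxlen -= i;			/* 12 */
		if (maxlen < minlen)
			return -1;		/* the -ENOSPC case */
	}
	return (int)maxlen;			/* 12 */
}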
*/ error = xfs_rtallocate_range(args, start, maxlen); - if (error) { + if (error) return error; - } *len = maxlen; *rtx = start; return 0; @@ -443,7 +428,6 @@ xfs_rtallocate_extent_near( int j; /* secondary loop control */ int log2len; /* log2 of minlen */ xfs_rtxnum_t n; /* next rtext to try */ - xfs_rtxnum_t r; /* result rtext */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -457,26 +441,18 @@ xfs_rtallocate_extent_near( /* Make sure we don't run off the end of the rt volume. */ maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod); - if (maxlen < minlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (maxlen < minlen) + return -ENOSPC; /* * Try the exact allocation first. */ error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len, - prod, &r); - if (error) { + prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If the exact allocation worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } + + bbno = xfs_rtx_to_rbmblock(mp, start); i = 0; j = -1; @@ -492,9 +468,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1, bbno + i, &maxlog); - if (error) { + if (error) return error; - } + /* * If there are any useful extents starting here, try * allocating one. @@ -513,17 +489,9 @@ xfs_rtallocate_extent_near( */ error = xfs_rtallocate_extent_block(args, bbno + i, minlen, maxavail, len, - &n, prod, &r); - if (error) { + &n, prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If it worked, return it. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } /* * On the negative side of the starting location. @@ -557,17 +525,9 @@ xfs_rtallocate_extent_near( error = xfs_rtallocate_extent_block(args, bbno + j, minlen, maxavail, len, &n, prod, - &r); - if (error) { + rtx); + if (error != -ENOSPC) return error; - } - /* - * If it works, return the extent. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } } } } @@ -601,8 +561,7 @@ xfs_rtallocate_extent_near( else break; } - *rtx = NULLRTEXTNO; - return 0; + return -ENOSPC; } /* @@ -624,7 +583,6 @@ xfs_rtallocate_extent_size( xfs_fileoff_t i; /* bitmap block number */ int l; /* level number (loop control) */ xfs_rtxnum_t n; /* next rtext to be tried */ - xfs_rtxnum_t r; /* result rtext number */ xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0); @@ -647,9 +605,8 @@ xfs_rtallocate_extent_size( * Get the summary for this level/block. */ error = xfs_rtget_summary(args, l, i, &sum); - if (error) { + if (error) return error; - } /* * Nothing there, on to the next block. */ @@ -659,17 +616,9 @@ xfs_rtallocate_extent_size( * Try allocating the extent. */ error = xfs_rtallocate_extent_block(args, i, maxlen, - maxlen, len, &n, prod, &r); - if (error) { + maxlen, len, &n, prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If it worked, return that. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } /* * If the "next block to try" returned from the * allocator is beyond the next bitmap block, @@ -683,10 +632,8 @@ xfs_rtallocate_extent_size( * Didn't find any maxlen blocks. Try smaller ones, unless * we're asking for a fixed size extent. */ - if (minlen > --maxlen) { - *rtx = NULLRTEXTNO; - return 0; - } + if (minlen > --maxlen) + return -ENOSPC; ASSERT(minlen != 0); ASSERT(maxlen != 0); @@ -705,9 +652,9 @@ xfs_rtallocate_extent_size( * Get the summary information for this level/block. 
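/*
 * A self-contained sketch of the calling convention this patch
 * introduces.  Instead of "return 0 and store NULLRTEXTNO in the out
 * parameter", the allocator reports shortage through the return
 * value; scanning callers treat -ENOSPC as "try the next candidate"
 * while anything else, success included, ends the scan.  The helper
 * below is a stand-in, not an XFS function.
 */
#include <errno.h>

static int try_candidate(int i, unsigned long *out)
{
	if (i != 7)
		return -ENOSPC;		/* pretend only slot 7 has space */
	*out = 7;
	return 0;
}

static int scan(int ncandidates, unsigned long *out)
{
	int i, error;

	for (i = 0; i < ncandidates; i++) {
		error = try_candidate(i, out);
		if (error != -ENOSPC)
			return error;	/* 0 on success, or a hard error */
	}
	return -ENOSPC;			/* every candidate was full */
}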
*/ error = xfs_rtget_summary(args, l, i, &sum); - if (error) { + if (error) return error; - } + /* * If nothing there, go on to next. */ @@ -721,17 +668,10 @@ xfs_rtallocate_extent_size( error = xfs_rtallocate_extent_block(args, i, XFS_RTMAX(minlen, 1 << l), XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, prod, &r); - if (error) { + len, &n, prod, rtx); + if (error != -ENOSPC) return error; - } - /* - * If it worked, return that extent. - */ - if (r != NULLRTEXTNO) { - *rtx = r; - return 0; - } + /* * If the "next block to try" returned from the * allocator is beyond the next bitmap block, @@ -744,8 +684,7 @@ xfs_rtallocate_extent_size( /* * Got nothing, return failure. */ - *rtx = NULLRTEXTNO; - return 0; + return -ENOSPC; } /* @@ -1182,14 +1121,13 @@ xfs_rtallocate_extent( xfs_rtxlen_t *len, /* out: actual length allocated */ int wasdel, /* was a delayed allocation extent */ xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtblock) /* out: start rtext allocated */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { struct xfs_rtalloc_args args = { .mp = tp->t_mountp, .tp = tp, }; int error; /* error value */ - xfs_rtxnum_t r; /* result allocated rtext */ ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); ASSERT(minlen > 0 && minlen <= maxlen); @@ -1204,42 +1142,35 @@ xfs_rtallocate_extent( maxlen -= i; if ((i = minlen % prod)) minlen += prod - i; - if (maxlen < minlen) { - *rtblock = NULLRTEXTNO; - return 0; - } + if (maxlen < minlen) + return -ENOSPC; } retry: if (start == 0) { error = xfs_rtallocate_extent_size(&args, minlen, - maxlen, len, prod, &r); + maxlen, len, prod, rtx); } else { error = xfs_rtallocate_extent_near(&args, start, minlen, - maxlen, len, prod, &r); + maxlen, len, prod, rtx); } - xfs_rtbuf_cache_relse(&args); - if (error) + if (error) { + if (error == -ENOSPC && prod > 1) { + prod = 1; + goto retry; + } return error; + } /* * If it worked, update the superblock. */ - if (r != NULLRTEXTNO) { - long slen = (long)*len; - - ASSERT(*len >= minlen && *len <= maxlen); - if (wasdel) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen); - else - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen); - } else if (prod > 1) { - prod = 1; - goto retry; - } - - *rtblock = r; + ASSERT(*len >= minlen && *len <= maxlen); + if (wasdel) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -(long)*len); + else + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -(long)*len); return 0; } @@ -1553,16 +1484,16 @@ xfs_bmap_rtalloc( raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, ap->wasdel, prod, &rtx); - if (error) - return error; - - if (rtx != NULLRTEXTNO) { + if (!error) { ap->blkno = xfs_rtx_to_rtb(mp, rtx); ap->length = xfs_rtxlen_to_extlen(mp, ralen); xfs_bmap_alloc_account(ap); return 0; } + if (error != -ENOSPC) + return error; + if (align > mp->m_sb.sb_rextsize) { /* * We previously enlarged the request length to try to satisfy From db8616e2765a184a3ac7c0d5c901c39f0d3b1570 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:23 +0100 Subject: [PATCH 098/123] xfs: reflow the tail end of xfs_bmap_rtalloc Reorder the tail end of xfs_bmap_rtalloc so that the successfully allocation is in the main path, and the error handling is on a branch. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 60 ++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index a810482339ad..92ff05ce33c7 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1484,39 +1484,39 @@ xfs_bmap_rtalloc( raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, ap->wasdel, prod, &rtx); - if (!error) { - ap->blkno = xfs_rtx_to_rtb(mp, rtx); - ap->length = xfs_rtxlen_to_extlen(mp, ralen); - xfs_bmap_alloc_account(ap); + if (error == -ENOSPC) { + if (align > mp->m_sb.sb_rextsize) { + /* + * We previously enlarged the request length to try to + * satisfy an extent size hint. The allocator didn't + * return anything, so reset the parameters to the + * original values and try again without alignment + * criteria. + */ + ap->offset = orig_offset; + ap->length = orig_length; + minlen = align = mp->m_sb.sb_rextsize; + goto retry; + } + + if (!ignore_locality && ap->blkno != 0) { + /* + * If we can't allocate near a specific rt extent, try + * again without locality criteria. + */ + ignore_locality = true; + goto retry; + } + + ap->blkno = NULLFSBLOCK; + ap->length = 0; return 0; } - - if (error != -ENOSPC) + if (error) return error; - if (align > mp->m_sb.sb_rextsize) { - /* - * We previously enlarged the request length to try to satisfy - * an extent size hint. The allocator didn't return anything, - * so reset the parameters to the original values and try again - * without alignment criteria. - */ - ap->offset = orig_offset; - ap->length = orig_length; - minlen = align = mp->m_sb.sb_rextsize; - goto retry; - } - - if (!ignore_locality && ap->blkno != 0) { - /* - * If we can't allocate near a specific rt extent, try again - * without locality criteria. - */ - ignore_locality = true; - goto retry; - } - - ap->blkno = NULLFSBLOCK; - ap->length = 0; + ap->blkno = xfs_rtx_to_rtb(mp, rtx); + ap->length = xfs_rtxlen_to_extlen(mp, ralen); + xfs_bmap_alloc_account(ap); return 0; } From 676544c27e710aee7f8357f57abd348d98b1ccd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:24 +0100 Subject: [PATCH 099/123] xfs: indicate if xfs_bmap_adjacent changed ap->blkno Add a return value to xfs_bmap_adjacent to indicate if it did change ap->blkno or not. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_bmap.c | 19 ++++++++++++++----- fs/xfs/xfs_bmap_util.h | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 396f242e5932..be7b76ab9fcd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3044,7 +3044,8 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -void +/* returns true if ap->blkno was modified */ +bool xfs_bmap_adjacent( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { @@ -3079,13 +3080,14 @@ xfs_bmap_adjacent( if (adjust && ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) ap->blkno += adjust; + return true; } /* * If not at eof, then compare the two neighbor blocks. * Figure out whether either one gives us a good starting point, * and pick the better one. 
*/ - else if (!ap->eof) { + if (!ap->eof) { xfs_fsblock_t gotbno; /* right side block number */ xfs_fsblock_t gotdiff=0; /* right side difference */ xfs_fsblock_t prevbno; /* left side block number */ @@ -3165,14 +3167,21 @@ xfs_bmap_adjacent( * If both valid, pick the better one, else the only good * one, else ap->blkno is already set (to 0 or the inode block). */ - if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) + if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) { ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; - else if (prevbno != NULLFSBLOCK) + return true; + } + if (prevbno != NULLFSBLOCK) { ap->blkno = prevbno; - else if (gotbno != NULLFSBLOCK) + return true; + } + if (gotbno != NULLFSBLOCK) { ap->blkno = gotbno; + return true; + } } #undef ISVALID + return false; } int diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 6888078f5c31..77ecbb753ef2 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, int rt, int eof, int delay, int convert, xfs_fileoff_t *offp, xfs_extlen_t *lenp); -void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +bool xfs_bmap_adjacent(struct xfs_bmalloca *ap); int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *rec, int *is_empty); From a3e48f68b5f4bc83cdded35be2c4c3cc23eb9e19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:25 +0100 Subject: [PATCH 100/123] xfs: cleanup picking the start extent hint in xfs_bmap_rtalloc Clean up the logical in xfs_bmap_rtalloc that tries to find a rtextent to start the search from by using a separate variable for the hint, not calling xfs_bmap_adjacent when we want to ignore the locality and avoid an extra roundtrip converting between block numbers and RT extent numbers. As a side-effect this doesn't pointlessly call xfs_rtpick_extent and increment the start rtextent hint if we are going to ignore the result anyway. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 92ff05ce33c7..33f558166642 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1398,7 +1398,8 @@ xfs_bmap_rtalloc( { struct xfs_mount *mp = ap->ip->i_mount; xfs_fileoff_t orig_offset = ap->offset; - xfs_rtxnum_t rtx; + xfs_rtxnum_t start; /* allocation hint rtextent no */ + xfs_rtxnum_t rtx; /* actually allocated rtextent no */ xfs_rtxlen_t prod = 0; /* product factor for allocators */ xfs_extlen_t mod = 0; /* product factor for allocators */ xfs_rtxlen_t ralen = 0; /* realtime allocation length */ @@ -1459,30 +1460,24 @@ xfs_bmap_rtalloc( rtlocked = true; } - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->offset == 0) { - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (ignore_locality) { + start = 0; + } else if (xfs_bmap_adjacent(ap)) { + start = xfs_rtb_to_rtx(mp, ap->blkno); + } else if (ap->eof && ap->offset == 0) { + /* + * If it's an allocation to an empty file at offset 0, pick an + * extent that will space things out in the rt area. 
+ */ + error = xfs_rtpick_extent(mp, ap->tp, ralen, &start); if (error) return error; - ap->blkno = xfs_rtx_to_rtb(mp, rtx); } else { - ap->blkno = 0; + start = 0; } - xfs_bmap_adjacent(ap); - - /* - * Realtime allocation, done through xfs_rtallocate_extent. - */ - if (ignore_locality) - rtx = 0; - else - rtx = xfs_rtb_to_rtx(mp, ap->blkno); raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); - error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen, + error = xfs_rtallocate_extent(ap->tp, start, raminlen, ralen, &ralen, ap->wasdel, prod, &rtx); if (error == -ENOSPC) { if (align > mp->m_sb.sb_rextsize) { @@ -1499,7 +1494,7 @@ xfs_bmap_rtalloc( goto retry; } - if (!ignore_locality && ap->blkno != 0) { + if (!ignore_locality && start != 0) { /* * If we can't allocate near a specific rt extent, try * again without locality criteria. From c2adcfa31ff606264fab6e69129d6d45c9ddb7cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:26 +0100 Subject: [PATCH 101/123] xfs: move xfs_rtget_summary to xfs_rtbitmap.c xfs_rtmodify_summary_int is only used inside xfs_rtbitmap.c and to implement xfs_rtget_summary. Move xfs_rtget_summary to xfs_rtbitmap.c as the exported API and mark xfs_rtmodify_summary_int static. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_rtbitmap.c | 14 ++++++++++++++ fs/xfs/libxfs/xfs_rtbitmap.h | 4 ++-- fs/xfs/xfs_rtalloc.c | 16 ---------------- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 30a2844f62e3..e67f6f763f7d 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -519,6 +519,20 @@ xfs_rtmodify_summary( return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL); } +/* + * Read and return the summary information for a given extent size, bitmap block + * combination. + */ +int +xfs_rtget_summary( + struct xfs_rtalloc_args *args, + int log, /* log2 of extent size */ + xfs_fileoff_t bbno, /* bitmap block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); +} + /* Log rtbitmap block from the word @from to the byte before @next. */ static inline void xfs_trans_log_rtbitmap( diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h index 1c84b52de3d4..274dc7dae1fa 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.h +++ b/fs/xfs/libxfs/xfs_rtbitmap.h @@ -321,8 +321,8 @@ int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock); int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, xfs_rtxlen_t len, int val); -int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log, - xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum); +int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log, + xfs_fileoff_t bbno, xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log, xfs_fileoff_t bbno, int delta); int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 33f558166642..0475693bc199 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -23,22 +23,6 @@ #include "xfs_rtbitmap.h" #include "xfs_quota.h" -/* - * Read and return the summary information for a given extent size, - * bitmap block combination. 
- * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -static int -xfs_rtget_summary( - struct xfs_rtalloc_args *args, - int log, /* log2 of extent size */ - xfs_fileoff_t bbno, /* bitmap block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); -} - /* * Return whether there are any free extents in the size range given * by low and high, for the bitmap block bbno. From b271b314119eca1fb98a2c4e15304ce562802f0c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:27 +0100 Subject: [PATCH 102/123] xfs: split xfs_rtmodify_summary_int Inline the logic of xfs_rtmodify_summary_int into xfs_rtmodify_summary and xfs_rtget_summary instead of having a somewhat awkward helper to share a little bit of code. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_rtbitmap.c | 90 +++++++++++++----------------------- 1 file changed, 32 insertions(+), 58 deletions(-) diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e67f6f763f7d..5773e4ea36c6 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -452,63 +452,9 @@ xfs_trans_log_rtsummary( } /* - * Read and/or modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - * - * Summary information is returned in *sum if specified. - * If no delta is specified, returns summary only. + * Modify the summary information for a given extent size, bitmap block + * combination. */ -int -xfs_rtmodify_summary_int( - struct xfs_rtalloc_args *args, - int log, /* log2 of extent size */ - xfs_fileoff_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - struct xfs_mount *mp = args->mp; - int error; - xfs_fileoff_t sb; /* summary fsblock */ - xfs_rtsumoff_t so; /* index into the summary file */ - unsigned int infoword; - - /* - * Compute entry number in the summary file. - */ - so = xfs_rtsumoffs(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = xfs_rtsumoffs_to_block(mp, so); - - error = xfs_rtsummary_read_buf(args, sb); - if (error) - return error; - - /* - * Point to the summary information, modify/log it, and/or copy it out. 
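/*
 * The refactoring pattern of this patch, boiled down to a toy counter
 * (a sketch, not the XFS code).  One helper whose behaviour depends
 * on "delta is nonzero" and "sum is non-NULL" becomes two
 * single-purpose functions, so every call site states its intent.
 */
static int counter;

/* before: delta == 0 means read-only, and sum may be NULL */
static int summary_op(int delta, int *sum)
{
	if (delta)
		counter += delta;
	if (sum)
		*sum = counter;
	return 0;
}

/* after: no flag/NULL protocol left to remember */
static int summary_modify(int delta)
{
	counter += delta;
	return 0;
}

static int summary_get(int *sum)
{
	*sum = counter;
	return 0;
}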
- */ - infoword = xfs_rtsumoffs_to_infoword(mp, so); - if (delta) { - xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta); - - if (mp->m_rsum_cache) { - if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log; - if (val != 0 && log >= mp->m_rsum_cache[bbno]) - mp->m_rsum_cache[bbno] = log + 1; - } - xfs_trans_log_rtsummary(args, infoword); - if (sum) - *sum = val; - } else if (sum) { - *sum = xfs_suminfo_get(args, infoword); - } - return 0; -} - int xfs_rtmodify_summary( struct xfs_rtalloc_args *args, @@ -516,7 +462,28 @@ xfs_rtmodify_summary( xfs_fileoff_t bbno, /* bitmap block number */ int delta) /* in/out: summary block number */ { - return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL); + struct xfs_mount *mp = args->mp; + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + unsigned int infoword; + xfs_suminfo_t val; + int error; + + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); + if (error) + return error; + + infoword = xfs_rtsumoffs_to_infoword(mp, so); + val = xfs_suminfo_add(args, infoword, delta); + + if (mp->m_rsum_cache) { + if (val == 0 && log + 1 == mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log; + if (val != 0 && log >= mp->m_rsum_cache[bbno]) + mp->m_rsum_cache[bbno] = log + 1; + } + + xfs_trans_log_rtsummary(args, infoword); + return 0; } /* @@ -530,7 +497,14 @@ xfs_rtget_summary( xfs_fileoff_t bbno, /* bitmap block number */ xfs_suminfo_t *sum) /* out: summary info for this block */ { - return xfs_rtmodify_summary_int(args, log, bbno, 0, sum); + struct xfs_mount *mp = args->mp; + xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno); + int error; + + error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so)); + if (!error) + *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so)); + return error; } /* Log rtbitmap block from the word @from to the byte before @next. */ From f3e509dd45c226aff268bab3695fded60e18f720 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:28 +0100 Subject: [PATCH 103/123] xfs: invert a check in xfs_rtallocate_extent_block Doing a break in the else side of a conditional is rather silly. Invert the check, break ASAP and unindent the other leg. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0475693bc199..abcbd0c95b30 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -283,12 +283,11 @@ xfs_rtallocate_extent_block( /* * If not done yet, find the start of the next free space. */ - if (next < end) { - error = xfs_rtfind_forw(args, next, end, &i); - if (error) - return error; - } else + if (next >= end) break; + error = xfs_rtfind_forw(args, next, end, &i); + if (error) + return error; } /* * Searched the whole thing & didn't find a maxlen free extent. From 9ade45b08a685e121895228f344af1f8985adb2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:29 +0100 Subject: [PATCH 104/123] xfs: reflow the tail end of xfs_rtallocate_extent_block Change polarity of a check so that the successful case of being able to allocate an extent is in the main path of the function and error handling is on a branch. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 46 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index abcbd0c95b30..c52f479e3abd 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -289,36 +289,38 @@ xfs_rtallocate_extent_block( if (error) return error; } + /* * Searched the whole thing & didn't find a maxlen free extent. */ - if (minlen <= maxlen && besti != -1) { + if (minlen > maxlen || besti == -1) { + /* + * Allocation failed. Set *nextp to the next block to try. + */ + *nextp = next; + return -ENOSPC; + } + + /* + * If size should be a multiple of prod, make that so. + */ + if (prod > 1) { xfs_rtxlen_t p; /* amount to trim length by */ - /* - * If size should be a multiple of prod, make that so. - */ - if (prod > 1) { - div_u64_rem(bestlen, prod, &p); - if (p) - bestlen -= p; - } - - /* - * Allocate besti for bestlen & return that. - */ - error = xfs_rtallocate_range(args, besti, bestlen); - if (error) - return error; - *len = bestlen; - *rtx = besti; - return 0; + div_u64_rem(bestlen, prod, &p); + if (p) + bestlen -= p; } + /* - * Allocation failed. Set *nextp to the next block to try. + * Allocate besti for bestlen & return that. */ - *nextp = next; - return -ENOSPC; + error = xfs_rtallocate_range(args, besti, bestlen); + if (error) + return error; + *len = bestlen; + *rtx = besti; + return 0; } /* From d9498fa8c8580b9cedb764e475503706ba7a0fbf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:30 +0100 Subject: [PATCH 105/123] xfs: merge the calls to xfs_rtallocate_range in xfs_rtallocate_block Use a goto to use a common tail for the case of being able to allocate an extent. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index c52f479e3abd..774e55cd602f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -252,19 +252,15 @@ xfs_rtallocate_extent_block( error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat); if (error) return error; - if (stat) { /* * i for maxlen is all free, allocate and return that. */ - error = xfs_rtallocate_range(args, i, maxlen); - if (error) - return error; - - *len = maxlen; - *rtx = i; - return 0; + bestlen = maxlen; + besti = i; + goto allocate; } + /* * In the case where we have a variable-sized allocation * request, figure out how big this free piece is, @@ -315,6 +311,7 @@ xfs_rtallocate_extent_block( /* * Allocate besti for bestlen & return that. */ +allocate: error = xfs_rtallocate_range(args, besti, bestlen); if (error) return error; From 3c97c9f78d23c7e449fc9a0865b40f44748c3011 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:31 +0100 Subject: [PATCH 106/123] xfs: tidy up xfs_rtallocate_extent_exact Use common code for both xfs_rtallocate_range calls by moving the !isfree logic into the non-default branch. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 774e55cd602f..00083013a010 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -350,32 +350,24 @@ xfs_rtallocate_extent_exact( if (error) return error; - if (isfree) { + if (!isfree) { /* - * If it is, allocate it and return success. + * If not, allocate what there is, if it's at least minlen. */ - error = xfs_rtallocate_range(args, start, maxlen); - if (error) - return error; - *len = maxlen; - *rtx = start; - return 0; - } - /* - * If not, allocate what there is, if it's at least minlen. - */ - maxlen = next - start; - if (maxlen < minlen) - return -ENOSPC; - - /* - * Trim off tail of extent, if prod is specified. - */ - if (prod > 1 && (i = maxlen % prod)) { - maxlen -= i; + maxlen = next - start; if (maxlen < minlen) return -ENOSPC; + + /* + * Trim off tail of extent, if prod is specified. + */ + if (prod > 1 && (i = maxlen % prod)) { + maxlen -= i; + if (maxlen < minlen) + return -ENOSPC; + } } + /* * Allocate what we can and return it. */ From 8ceee72fdb6f26fe924e02b3342353bac5efa42d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:32 +0100 Subject: [PATCH 107/123] xfs: factor out a xfs_rtalloc_sumlevel helper xfs_rtallocate_extent_size has two loops with nearly identical logic in them. Split that logic into a separate xfs_rtalloc_sumlevel helper. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 153 ++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 83 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 00083013a010..cd183d050fd2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -538,6 +538,52 @@ xfs_rtallocate_extent_near( return -ENOSPC; } +static int +xfs_rtalloc_sumlevel( + struct xfs_rtalloc_args *args, + int l, /* level number */ + xfs_rtxlen_t minlen, /* minimum length to allocate */ + xfs_rtxlen_t maxlen, /* maximum length to allocate */ + xfs_rtxlen_t prod, /* extent product factor */ + xfs_rtxlen_t *len, /* out: actual length allocated */ + xfs_rtxnum_t *rtx) /* out: start rtext allocated */ +{ + xfs_fileoff_t i; /* bitmap block number */ + + for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) { + xfs_suminfo_t sum; /* summary information for extents */ + xfs_rtxnum_t n; /* next rtext to be tried */ + int error; + + error = xfs_rtget_summary(args, l, i, &sum); + if (error) + return error; + + /* + * Nothing there, on to the next block. + */ + if (!sum) + continue; + + /* + * Try allocating the extent. + */ + error = xfs_rtallocate_extent_block(args, i, minlen, maxlen, + len, &n, prod, rtx); + if (error != -ENOSPC) + return error; + + /* + * If the "next block to try" returned from the allocator is + * beyond the next bitmap block, skip to that bitmap block. + */ + if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1) + i = xfs_rtx_to_rbmblock(args->mp, n) - 1; + } + + return -ENOSPC; +} + /* * Allocate an extent of length minlen<=len<=maxlen, with no position * specified. 
If we don't get maxlen then use prod to trim @@ -552,12 +598,8 @@ xfs_rtallocate_extent_size( xfs_rtxlen_t prod, /* extent product factor */ xfs_rtxnum_t *rtx) /* out: start rtext allocated */ { - struct xfs_mount *mp = args->mp; int error; - xfs_fileoff_t i; /* bitmap block number */ int l; /* level number (loop control) */ - xfs_rtxnum_t n; /* next rtext to be tried */ - xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0); ASSERT(maxlen % prod == 0); @@ -565,46 +607,23 @@ xfs_rtallocate_extent_size( /* * Loop over all the levels starting with maxlen. - * At each level, look at all the bitmap blocks, to see if there - * are extents starting there that are long enough (>= maxlen). - * Note, only on the initial level can the allocation fail if - * the summary says there's an extent. + * + * At each level, look at all the bitmap blocks, to see if there are + * extents starting there that are long enough (>= maxlen). + * + * Note, only on the initial level can the allocation fail if the + * summary says there's an extent. */ - for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) { - /* - * Loop over all the bitmap blocks. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) - return error; - /* - * Nothing there, on to the next block. - */ - if (!sum) - continue; - /* - * Try allocating the extent. - */ - error = xfs_rtallocate_extent_block(args, i, maxlen, - maxlen, len, &n, prod, rtx); - if (error != -ENOSPC) - return error; - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. - */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) { + error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len, + rtx); + if (error != -ENOSPC) + return error; } + /* - * Didn't find any maxlen blocks. Try smaller ones, unless - * we're asking for a fixed size extent. + * Didn't find any maxlen blocks. Try smaller ones, unless we are + * looking for a fixed size extent. */ if (minlen > --maxlen) return -ENOSPC; @@ -613,51 +632,19 @@ xfs_rtallocate_extent_size( /* * Loop over sizes, from maxlen down to minlen. - * This time, when we do the allocations, allow smaller ones - * to succeed. + * + * This time, when we do the allocations, allow smaller ones to succeed, + * but make sure the specified minlen/maxlen are in the possible range + * for this summary level. */ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { - /* - * Loop over all the bitmap blocks, try an allocation - * starting in that block. - */ - for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - /* - * Get the summary information for this level/block. - */ - error = xfs_rtget_summary(args, l, i, &sum); - if (error) - return error; - - /* - * If nothing there, go on to next. - */ - if (!sum) - continue; - /* - * Try the allocation. Make sure the specified - * minlen/maxlen are in the possible range for - * this summary level. - */ - error = xfs_rtallocate_extent_block(args, i, - XFS_RTMAX(minlen, 1 << l), - XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), - len, &n, prod, rtx); - if (error != -ENOSPC) - return error; - - /* - * If the "next block to try" returned from the - * allocator is beyond the next bitmap block, - * skip to that bitmap block. 
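/*
 * The skip-ahead idiom referenced above, isolated into a sketch with
 * hypothetical sizing.  When a failed probe reports how far it
 * already looked (n), the cursor jumps so the loop increment resumes
 * exactly there instead of re-reading the blocks in between;
 * block_of() stands in for xfs_rtx_to_rbmblock().
 */
static long block_of(long rtextent)
{
	return rtextent / 64;	/* pretend 64 rtextents per bitmap block */
}

static void scan_blocks(long nblocks)
{
	long i, n;

	for (i = 0; i < nblocks; i++) {
		n = (i + 3) * 64;	/* pretend the probe scanned ahead */

		if (block_of(n) > i + 1)
			i = block_of(n) - 1;	/* ++ resumes at block_of(n) */
	}
}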
- */ - if (xfs_rtx_to_rbmblock(mp, n) > i + 1) - i = xfs_rtx_to_rbmblock(mp, n) - 1; - } + error = xfs_rtalloc_sumlevel(args, l, XFS_RTMAX(minlen, 1 << l), + XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), prod, + len, rtx); + if (error != -ENOSPC) + return error; } - /* - * Got nothing, return failure. - */ + return -ENOSPC; } From 3abfe6c2759e2e3000b13f8ce8a1a325e80987a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:33 +0100 Subject: [PATCH 108/123] xfs: remove rt-wrappers from xfs_format.h xfs_format.h has a bunch odd wrappers for helper functions and mount structure access using RT* prefixes. Replace them with their open coded versions (for those that weren't entirely unused) and remove the wrappers. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 8 -------- fs/xfs/libxfs/xfs_rtbitmap.c | 24 ++++++++++++------------ fs/xfs/scrub/rtsummary.c | 2 +- fs/xfs/xfs_rtalloc.c | 6 +++--- 4 files changed, 16 insertions(+), 24 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e6ca188e2271..7d2873a79a48 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1156,20 +1156,12 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) - /* * RT bit manipulation macros. */ #define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) #define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) - /* * Dquot and dquot block format definitions */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 5773e4ea36c6..4185ccf83bab 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -195,7 +195,7 @@ xfs_rtfind_back( /* * Different. Mark where we are and return. */ - i = bit - XFS_RTHIBIT(wdiff); + i = bit - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -233,7 +233,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } @@ -272,7 +272,7 @@ xfs_rtfind_back( /* * Different, mark where we are and return. */ - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + i += XFS_NBWORD - 1 - xfs_highbit32(wdiff); *rtx = start - i + 1; return 0; } else @@ -348,7 +348,7 @@ xfs_rtfind_forw( /* * Different. Mark where we are and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *rtx = start + i - 1; return 0; } @@ -386,7 +386,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } @@ -423,7 +423,7 @@ xfs_rtfind_forw( /* * Different, mark where we are and return. 
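/*
 * User-space approximations of the helpers the deleted wrappers
 * expanded to, assuming a GCC or Clang toolchain and a nonzero
 * argument (the kernel versions come from xfs_bit.h).
 */
#include <assert.h>

static int lowbit32(unsigned int w)	/* index of least significant set bit */
{
	return __builtin_ctz(w);
}

static int highbit32(unsigned int w)	/* index of most significant set bit */
{
	return 31 - __builtin_clz(w);
}

int main(void)
{
	assert(lowbit32(0x28) == 3);	/* 0x28 is binary 101000 */
	assert(highbit32(0x28) == 5);
	return 0;
}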
*/ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *rtx = start + i - 1; return 0; } else @@ -708,7 +708,7 @@ xfs_rtfree_range( */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) { return error; @@ -720,7 +720,7 @@ xfs_rtfree_range( */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), -1); if (error) { return error; @@ -731,7 +731,7 @@ xfs_rtfree_range( * (new) free extent. */ return xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); } @@ -800,7 +800,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i = XFS_RTLOBIT(wdiff) - bit; + i = xfs_lowbit32(wdiff) - bit; *new = start + i; *stat = 0; return 0; @@ -839,7 +839,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; @@ -877,7 +877,7 @@ xfs_rtcheck_range( /* * Different, compute first wrong bit and return. */ - i += XFS_RTLOBIT(wdiff); + i += xfs_lowbit32(wdiff); *new = start + i; *stat = 0; return 0; diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c index b0d90426a5cb..fabd0ed9dfa6 100644 --- a/fs/xfs/scrub/rtsummary.c +++ b/fs/xfs/scrub/rtsummary.c @@ -177,7 +177,7 @@ xchk_rtsum_record_free( /* Compute the relevant location in the rtsum file. */ rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext); - lenlog = XFS_RTBLOCKLOG(rec->ar_extcount); + lenlog = xfs_highbit64(rec->ar_extcount); offs = xfs_rtsumoffs(mp, lenlog, rbmoff); rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index cd183d050fd2..be93542827cf 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -156,7 +156,7 @@ xfs_rtallocate_range( * (old) free extent. */ error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock + 1 - preblock), + xfs_highbit64(postblock + 1 - preblock), xfs_rtx_to_rbmblock(mp, preblock), -1); if (error) return error; @@ -167,7 +167,7 @@ xfs_rtallocate_range( */ if (preblock < start) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(start - preblock), + xfs_highbit64(start - preblock), xfs_rtx_to_rbmblock(mp, preblock), 1); if (error) return error; @@ -179,7 +179,7 @@ xfs_rtallocate_range( */ if (postblock > end) { error = xfs_rtmodify_summary(args, - XFS_RTBLOCKLOG(postblock - end), + xfs_highbit64(postblock - end), xfs_rtx_to_rbmblock(mp, end + 1), 1); if (error) return error; From a39f5ccc30d5a00b7e6d921aa387ad17d1e6d168 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:34 +0100 Subject: [PATCH 109/123] xfs: remove XFS_RTMIN/XFS_RTMAX Use the kernel min/max helpers instead. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_format.h | 6 ------ fs/xfs/libxfs/xfs_rtbitmap.c | 8 ++++---- fs/xfs/xfs_rtalloc.c | 7 ++++--- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 7d2873a79a48..382ab1e71c0b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1156,12 +1156,6 @@ static inline bool xfs_dinode_has_large_extent_counts( #define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ #define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ -/* - * RT bit manipulation macros. - */ -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - /* * Dquot and dquot block format definitions */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 4185ccf83bab..31100120b2c5 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -184,7 +184,7 @@ xfs_rtfind_back( * Calculate first (leftmost) bit number to look at, * and mask for all the relevant bits in this word. */ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0); mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << firstbit; /* @@ -338,7 +338,7 @@ xfs_rtfind_forw( * Calculate last (rightmost) bit number to look at, * and mask for all the relevant bits in this word. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Calculate the difference between the value there @@ -573,7 +573,7 @@ xfs_rtmodify_range( /* * Compute first bit not changed and mask of relevant bits. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; /* * Set/clear the active bits. @@ -787,7 +787,7 @@ xfs_rtcheck_range( /* * Compute first bit not examined. */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + lastbit = min(bit + len, XFS_NBWORD); /* * Mask of relevant bits. */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index be93542827cf..4b8fc8e510ac 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -638,9 +638,10 @@ xfs_rtallocate_extent_size( * for this summary level. */ for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) { - error = xfs_rtalloc_sumlevel(args, l, XFS_RTMAX(minlen, 1 << l), - XFS_RTMIN(maxlen, (1 << (l + 1)) - 1), prod, - len, rtx); + error = xfs_rtalloc_sumlevel(args, l, + max_t(xfs_rtxlen_t, minlen, 1 << l), + min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1), + prod, len, rtx); if (error != -ENOSPC) return error; } From 26e5eed7802299a666714ee511da58d906e8770c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:35 +0100 Subject: [PATCH 110/123] xfs: reorder the minlen and prod calculations in xfs_bmap_rtalloc xfs_bmap_rtalloc is a bit of a mess in terms of calculating the locally need variables. Reorder them a bit so that related code is located next to each other - the raminlen calculation moves up next to where the maximum len is calculated, and all the prod calculation is move into a single place and rearranged so that the real prod calculation only happens when it actually is needed. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 4b8fc8e510ac..ff2d7b237ef6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1375,7 +1375,6 @@ xfs_bmap_rtalloc( align = xfs_get_extsz_hint(ap->ip); retry: - prod = xfs_extlen_to_rtxlen(mp, align); error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 1, ap->eof, 0, ap->conv, &ap->offset, &ap->length); @@ -1393,13 +1392,6 @@ xfs_bmap_rtalloc( if (ap->offset != orig_offset) minlen += orig_offset - ap->offset; - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) - prod = 1; /* * Set ralen to be the actual requested length in rtextents. * @@ -1410,6 +1402,7 @@ xfs_bmap_rtalloc( * adjust the starting point to match it. */ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); + raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); /* * Lock out modifications to both the RT bitmap and summary inodes @@ -1438,7 +1431,16 @@ xfs_bmap_rtalloc( start = 0; } - raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + /* + * Only bother calculating a real prod factor if offset & length are + * perfectly aligned, otherwise it will just get us in trouble. + */ + div_u64_rem(ap->offset, align, &mod); + if (mod || ap->length % align) + prod = 1; + else + prod = xfs_extlen_to_rtxlen(mp, align); + error = xfs_rtallocate_extent(ap->tp, start, raminlen, ralen, &ralen, ap->wasdel, prod, &rtx); if (error == -ENOSPC) { From b6bb34588f4c95a56f23160bf3cadee74fa5480b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:36 +0100 Subject: [PATCH 111/123] xfs: simplify and optimize the RT allocation fallback cascade There are currently multiple levels of fall back if an RT allocation can not be satisfied: 1) xfs_rtallocate_extent extends the minlen and reduces the maxlen due to the extent size hint. If that can't be done, it return -ENOSPC and let's xfs_bmap_rtalloc retry, which then not only drops the extent size hint based alignment, but also the minlen adjustment 2) if xfs_rtallocate_extent gets -ENOSPC from the underlying functions, it only drops the extent size hint based alignment and retries 3) if that still does not succeed, xfs_rtallocate_extent drops the extent size hint (which is a complex no-op at this point) and the minlen using the same code as (1) above 4) if that still doesn't success and the caller wanted an allocation near a blkno, drop that blkno hint. The handling in 1 is rather inefficient as we could just drop the alignment and continue, and 2/3 interact in really weird ways due to the duplicate policy. Move aligning the min and maxlen out of xfs_rtallocate_extent and into a helper called directly by xfs_bmap_rtalloc. This allows just continuing with the allocation if we have to drop the alignment instead of going through the retry loop and also dropping the perfectly usable minlen adjustment that didn't cause the problem, and then just use a single retry that drops both the minlen and alignment requirement when we really are out of space, thus consolidating cases (2) and (3) above. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 58 ++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ff2d7b237ef6..d1fc64a8161f 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1094,21 +1094,6 @@ xfs_rtallocate_extent( ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); ASSERT(minlen > 0 && minlen <= maxlen); - /* - * If prod is set then figure out what to do to minlen and maxlen. - */ - if (prod > 1) { - xfs_rtxlen_t i; - - if ((i = maxlen % prod)) - maxlen -= i; - if ((i = minlen % prod)) - minlen += prod - i; - if (maxlen < minlen) - return -ENOSPC; - } - -retry: if (start == 0) { error = xfs_rtallocate_extent_size(&args, minlen, maxlen, len, prod, rtx); @@ -1117,13 +1102,8 @@ xfs_rtallocate_extent( maxlen, len, prod, rtx); } xfs_rtbuf_cache_relse(&args); - if (error) { - if (error == -ENOSPC && prod > 1) { - prod = 1; - goto retry; - } + if (error) return error; - } /* * If it worked, update the superblock. @@ -1354,6 +1334,35 @@ xfs_rtpick_extent( return 0; } +static void +xfs_rtalloc_align_minmax( + xfs_rtxlen_t *raminlen, + xfs_rtxlen_t *ramaxlen, + xfs_rtxlen_t *prod) +{ + xfs_rtxlen_t newmaxlen = *ramaxlen; + xfs_rtxlen_t newminlen = *raminlen; + xfs_rtxlen_t slack; + + slack = newmaxlen % *prod; + if (slack) + newmaxlen -= slack; + slack = newminlen % *prod; + if (slack) + newminlen += *prod - slack; + + /* + * If adjusting for extent size hint alignment produces an invalid + * min/max len combination, go ahead without it. + */ + if (newmaxlen < newminlen) { + *prod = 1; + return; + } + *ramaxlen = newmaxlen; + *raminlen = newminlen; +} + int xfs_bmap_rtalloc( struct xfs_bmalloca *ap) @@ -1436,10 +1445,13 @@ xfs_bmap_rtalloc( * perfectly aligned, otherwise it will just get us in trouble. */ div_u64_rem(ap->offset, align, &mod); - if (mod || ap->length % align) + if (mod || ap->length % align) { prod = 1; - else + } else { prod = xfs_extlen_to_rtxlen(mp, align); + if (prod > 1) + xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod); + } error = xfs_rtallocate_extent(ap->tp, start, raminlen, ralen, &ralen, ap->wasdel, prod, &rtx); From e1ead237407a7f42957f6108a95cf093ce6c2c5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Dec 2023 05:57:37 +0100 Subject: [PATCH 112/123] xfs: fold xfs_rtallocate_extent into xfs_bmap_rtalloc There isn't really much left in xfs_rtallocate_extent now, fold it into the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R --- fs/xfs/xfs_rtalloc.c | 67 ++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 49 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index d1fc64a8161f..8649d981a097 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1069,53 +1069,6 @@ xfs_growfs_rt( return error; } -/* - * Allocate an extent in the realtime subvolume, with the usual allocation - * parameters. The length units are all in realtime extents, as is the - * result block number. 
- */ -static int -xfs_rtallocate_extent( - struct xfs_trans *tp, - xfs_rtxnum_t start, /* starting rtext number to allocate */ - xfs_rtxlen_t minlen, /* minimum length to allocate */ - xfs_rtxlen_t maxlen, /* maximum length to allocate */ - xfs_rtxlen_t *len, /* out: actual length allocated */ - int wasdel, /* was a delayed allocation extent */ - xfs_rtxlen_t prod, /* extent product factor */ - xfs_rtxnum_t *rtx) /* out: start rtext allocated */ -{ - struct xfs_rtalloc_args args = { - .mp = tp->t_mountp, - .tp = tp, - }; - int error; /* error value */ - - ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL)); - ASSERT(minlen > 0 && minlen <= maxlen); - - if (start == 0) { - error = xfs_rtallocate_extent_size(&args, minlen, - maxlen, len, prod, rtx); - } else { - error = xfs_rtallocate_extent_near(&args, start, minlen, - maxlen, len, prod, rtx); - } - xfs_rtbuf_cache_relse(&args); - if (error) - return error; - - /* - * If it worked, update the superblock. - */ - ASSERT(*len >= minlen && *len <= maxlen); - if (wasdel) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -(long)*len); - else - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -(long)*len); - return 0; -} - /* * Initialize realtime fields in the mount structure. */ @@ -1380,6 +1333,10 @@ xfs_bmap_rtalloc( xfs_rtxlen_t raminlen; bool rtlocked = false; bool ignore_locality = false; + struct xfs_rtalloc_args args = { + .mp = mp, + .tp = ap->tp, + }; int error; align = xfs_get_extsz_hint(ap->ip); @@ -1412,6 +1369,8 @@ xfs_bmap_rtalloc( */ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN)); raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen)); + ASSERT(raminlen > 0); + ASSERT(raminlen <= ralen); /* * Lock out modifications to both the RT bitmap and summary inodes @@ -1453,8 +1412,15 @@ xfs_bmap_rtalloc( xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod); } - error = xfs_rtallocate_extent(ap->tp, start, raminlen, ralen, &ralen, - ap->wasdel, prod, &rtx); + if (start) { + error = xfs_rtallocate_extent_near(&args, start, raminlen, + ralen, &ralen, prod, &rtx); + } else { + error = xfs_rtallocate_extent_size(&args, raminlen, + ralen, &ralen, prod, &rtx); + } + xfs_rtbuf_cache_relse(&args); + if (error == -ENOSPC) { if (align > mp->m_sb.sb_rextsize) { /* @@ -1486,6 +1452,9 @@ xfs_bmap_rtalloc( if (error) return error; + xfs_trans_mod_sb(ap->tp, ap->wasdel ? + XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS, + -(long)ralen); ap->blkno = xfs_rtx_to_rtb(mp, rtx); ap->length = xfs_rtxlen_to_extlen(mp, ralen); xfs_bmap_alloc_account(ap); From 6e145f943bd86be47e54101fa5939f9ed0cb73e5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:34:55 +0100 Subject: [PATCH 113/123] xfs: make if_data a void pointer The xfs_ifork structure currently has a union of the if_root void pointer and the if_data char pointer. In either case it is an opaque pointer that depends on the fork format. Replace the union with a single if_data void pointer as that is what almost all callers want. Only the symlink NULL termination code in xfs_init_local_fork actually needs a new local variable now. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 3 +- fs/xfs/libxfs/xfs_attr_leaf.c | 62 ++++++++++++------------------ fs/xfs/libxfs/xfs_bmap.c | 4 +- fs/xfs/libxfs/xfs_dir2.c | 2 +- fs/xfs/libxfs/xfs_dir2_block.c | 6 +-- fs/xfs/libxfs/xfs_dir2_sf.c | 61 ++++++++++++----------------- fs/xfs/libxfs/xfs_iext_tree.c | 36 ++++++++--------- fs/xfs/libxfs/xfs_inode_fork.c | 53 ++++++++++++------------- fs/xfs/libxfs/xfs_inode_fork.h | 8 ++-- fs/xfs/libxfs/xfs_symlink_remote.c | 4 +- fs/xfs/scrub/attr.c | 10 ++--- fs/xfs/scrub/readdir.c | 6 +-- fs/xfs/scrub/symlink.c | 2 +- fs/xfs/xfs_attr_list.c | 3 +- fs/xfs/xfs_dir2_readdir.c | 6 +-- fs/xfs/xfs_inode.c | 6 +-- fs/xfs/xfs_inode_item.c | 10 ++--- fs/xfs/xfs_symlink.c | 4 +- 18 files changed, 119 insertions(+), 167 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fa49c795f407..7f822e72dfcd 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1049,9 +1049,8 @@ xfs_attr_set( static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { - struct xfs_attr_shortform *sf; + struct xfs_attr_shortform *sf = dp->i_af.if_data; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; return be16_to_cpu(sf->hdr.totsize); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 5d1ab4978f32..3e5377fd4984 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -691,7 +691,7 @@ xfs_attr_shortform_create( if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data; + hdr = ifp->if_data; memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); @@ -712,14 +712,13 @@ xfs_attr_sf_findname( struct xfs_attr_sf_entry **sfep, unsigned int *basep) { - struct xfs_attr_shortform *sf; + struct xfs_attr_shortform *sf = args->dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; unsigned int base = sizeof(struct xfs_attr_sf_hdr); int size = 0; int end; int i; - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; end = sf->hdr.count; for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), @@ -751,29 +750,25 @@ xfs_attr_shortform_add( struct xfs_da_args *args, int forkoff) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_shortform *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; int offset, size; - struct xfs_mount *mp; - struct xfs_inode *dp; - struct xfs_ifork *ifp; trace_xfs_attr_sf_add(args); - dp = args->dp; - mp = dp->i_mount; dp->i_forkoff = forkoff; - ifp = &dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) ASSERT(0); offset = (char *)sfe - (char *)sf; size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; + sf = ifp->if_data; sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); sfe->namelen = args->namelen; @@ -811,20 +806,16 @@ int xfs_attr_sf_removename( struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_attr_shortform *sf = 
dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; int size = 0, end, totsize; unsigned int base; - struct xfs_mount *mp; - struct xfs_inode *dp; int error; trace_xfs_attr_sf_remove(args); - dp = args->dp; - mp = dp->i_mount; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; - error = xfs_attr_sf_findname(args, &sfe, &base); /* @@ -878,18 +869,17 @@ xfs_attr_sf_removename( */ /*ARGSUSED*/ int -xfs_attr_shortform_lookup(xfs_da_args_t *args) +xfs_attr_shortform_lookup( + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; - struct xfs_ifork *ifp; + struct xfs_ifork *ifp = &args->dp->i_af; + struct xfs_attr_shortform *sf = ifp->if_data; + struct xfs_attr_sf_entry *sfe; + int i; trace_xfs_attr_sf_lookup(args); - ifp = &args->dp->i_af; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = xfs_attr_sf_nextentry(sfe), i++) { @@ -909,14 +899,13 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) */ int xfs_attr_shortform_getvalue( - struct xfs_da_args *args) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf; - struct xfs_attr_sf_entry *sfe; - int i; + struct xfs_attr_shortform *sf = args->dp->i_af.if_data; + struct xfs_attr_sf_entry *sfe; + int i; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); - sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = xfs_attr_sf_nextentry(sfe), i++) { @@ -933,25 +922,22 @@ int xfs_attr_shortform_to_leaf( struct xfs_da_args *args) { - struct xfs_inode *dp; - struct xfs_attr_shortform *sf; + struct xfs_inode *dp = args->dp; + struct xfs_ifork *ifp = &dp->i_af; + struct xfs_attr_shortform *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; struct xfs_da_args nargs; char *tmpbuffer; int error, i, size; xfs_dablk_t blkno; struct xfs_buf *bp; - struct xfs_ifork *ifp; trace_xfs_attr_sf_to_leaf(args); - dp = args->dp; - ifp = &dp->i_af; - sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data; size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, ifp->if_u1.if_data, size); + memcpy(tmpbuffer, ifp->if_data, size); sf = (struct xfs_attr_shortform *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index be7b76ab9fcd..98aaca933bdd 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -747,7 +747,7 @@ xfs_bmap_local_to_extents_empty( ASSERT(ifp->if_nextents == 0); xfs_bmap_forkoff_reset(ip, whichfork); - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; ifp->if_format = XFS_DINODE_FMT_EXTENTS; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -832,7 +832,7 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; rec.br_startoff = 0; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index f5462fd582d5..a76673281514 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -196,7 +196,7 @@ xfs_dir_isempty( return 1; if (dp->i_disk_size > xfs_inode_data_fork_size(dp)) return 0; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; return !sfp->count; } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 00f960a703b2..3c256d4cc40b 100644 --- 
a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1089,7 +1089,7 @@ xfs_dir2_sf_to_block( int newoffset; /* offset from current entry */ unsigned int offset = geo->data_entry_offset; xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ - xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data; xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ struct xfs_name name; @@ -1099,10 +1099,8 @@ xfs_dir2_sf_to_block( ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); - oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; - ASSERT(ifp->if_bytes == dp->i_disk_size); - ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(oldsfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); ASSERT(dp->i_df.if_nextents == 0); diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 870ef1d1ebe4..0b63138d2b9f 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -364,25 +364,23 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - xfs_inode_t *dp; /* incore directory inode */ + struct xfs_inode *dp = args->dp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ int pick; /* which algorithm to use */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ trace_xfs_dir2_sf_addname(args); ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT); - dp = args->dp; ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. @@ -462,11 +460,9 @@ xfs_dir2_sf_addname_easy( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - int byteoff; /* byte offset in sf dir */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; + int byteoff = (int)((char *)sfep - (char *)sfp); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; - byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ @@ -475,7 +471,7 @@ xfs_dir2_sf_addname_easy( /* * Need to set up again due to realloc of the inode data. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. @@ -528,11 +524,10 @@ xfs_dir2_sf_addname_hard( /* * Copy the old directory to the stack buffer. */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_disk_size; buf = kmem_alloc(old_isize, 0); oldsfp = (xfs_dir2_sf_hdr_t *)buf; - memcpy(oldsfp, sfp, old_isize); + memcpy(oldsfp, dp->i_df.if_data, old_isize); /* * Loop over the old directory finding the place we're going * to insert the new entry. @@ -560,7 +555,7 @@ xfs_dir2_sf_addname_hard( /* * Reset the pointer since the buffer was reallocated. 
*/ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Copy the first part of the directory, including the header. */ @@ -610,11 +605,10 @@ xfs_dir2_sf_addname_pick( int i; /* entry number */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int size; /* entry's data size */ int used; /* data bytes used */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; size = xfs_dir2_data_entsize(mp, args->namelen); offset = args->geo->data_first_offset; sfep = xfs_dir2_sf_firstentry(sfp); @@ -673,14 +667,13 @@ xfs_dir2_sf_check( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry number */ int i8count; /* number of big inode#s */ xfs_ino_t ino; /* entry inode number */ int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; offset = args->geo->data_first_offset; ino = xfs_dir2_sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; @@ -834,7 +827,7 @@ xfs_dir2_sf_create( /* * Fill in the header, */ - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; sfp->i8count = i8count; /* * Now can put in the inode number, since i8count is set. @@ -857,9 +850,9 @@ xfs_dir2_sf_lookup( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ enum xfs_dacmp cmp; /* comparison result */ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ @@ -870,8 +863,7 @@ xfs_dir2_sf_lookup( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . @@ -933,13 +925,13 @@ xfs_dir2_sf_removename( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int byteoff; /* offset of removed entry */ int entsize; /* this entry's size */ int i; /* shortform entry index */ int newsize; /* new inode size */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_removename(args); @@ -947,8 +939,7 @@ xfs_dir2_sf_removename( oldsize = (int)dp->i_disk_size; ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == oldsize); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Loop over the old directory entries. @@ -989,7 +980,7 @@ xfs_dir2_sf_removename( * Reallocate, making it smaller. */ xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Are we changing inode number size? 
*/ @@ -1012,13 +1003,12 @@ xfs_dir2_sf_replace_needblock( struct xfs_inode *dp, xfs_ino_t inum) { + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int newsize; - struct xfs_dir2_sf_hdr *sfp; if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL) return false; - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF; return inum > XFS_DIR2_MAX_SHORT_INUM && @@ -1034,19 +1024,18 @@ xfs_dir2_sf_replace( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; int i; /* entry index */ xfs_ino_t ino=0; /* entry old inode number */ int i8elevated; /* sf_toino8 set i8count=1 */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ trace_xfs_dir2_sf_replace(args); ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent)); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* @@ -1069,7 +1058,7 @@ xfs_dir2_sf_replace( */ xfs_dir2_sf_toino8(args); i8elevated = 1; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; } else i8elevated = 0; @@ -1150,11 +1139,11 @@ xfs_dir2_sf_toino4( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1168,7 +1157,6 @@ xfs_dir2_sf_toino4( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* @@ -1181,7 +1169,7 @@ xfs_dir2_sf_toino4( * Reset our pointers, the data has moved. */ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. */ @@ -1223,11 +1211,11 @@ xfs_dir2_sf_toino8( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; + struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data; char *buf; /* old dir's buffer */ int i; /* entry index */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ @@ -1241,7 +1229,6 @@ xfs_dir2_sf_toino8( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, 0); - oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* @@ -1254,7 +1241,7 @@ xfs_dir2_sf_toino8( * Reset our pointers, the data has moved. */ oldsfp = (xfs_dir2_sf_hdr_t *)buf; - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp = dp->i_df.if_data; /* * Fill in the new header. 
*/ diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index d062794cc795..f4e6b200cdf8 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -158,7 +158,7 @@ static void * xfs_iext_find_first_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height; if (!ifp->if_height) @@ -176,7 +176,7 @@ static void * xfs_iext_find_last_leaf( struct xfs_ifork *ifp) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -306,7 +306,7 @@ xfs_iext_find_level( xfs_fileoff_t offset, int level) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; if (!ifp->if_height) @@ -402,12 +402,12 @@ xfs_iext_grow( int i; if (ifp->if_height == 1) { - struct xfs_iext_leaf *prev = ifp->if_u1.if_root; + struct xfs_iext_leaf *prev = ifp->if_data; node->keys[0] = xfs_iext_leaf_key(prev, 0); node->ptrs[0] = prev; } else { - struct xfs_iext_node *prev = ifp->if_u1.if_root; + struct xfs_iext_node *prev = ifp->if_data; ASSERT(ifp->if_height > 1); @@ -418,7 +418,7 @@ xfs_iext_grow( for (i = 1; i < KEYS_PER_NODE; i++) node->keys[i] = XFS_IEXT_KEY_INVALID; - ifp->if_u1.if_root = node; + ifp->if_data = node; ifp->if_height++; } @@ -430,7 +430,7 @@ xfs_iext_update_node( int level, void *ptr) { - struct xfs_iext_node *node = ifp->if_u1.if_root; + struct xfs_iext_node *node = ifp->if_data; int height, i; for (height = ifp->if_height; height > level; height--) { @@ -583,11 +583,11 @@ xfs_iext_alloc_root( { ASSERT(ifp->if_bytes == 0); - ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); + ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS); ifp->if_height = 1; /* now that we have a node step into it */ - cur->leaf = ifp->if_u1.if_root; + cur->leaf = ifp->if_data; cur->pos = 0; } @@ -603,9 +603,9 @@ xfs_iext_realloc_root( if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF) new_size = NODE_SIZE; - new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL); + new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL); memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes); - ifp->if_u1.if_root = new; + ifp->if_data = new; cur->leaf = new; } @@ -786,8 +786,8 @@ xfs_iext_remove_node( * If we are at the root and only one entry is left we can just * free this node and update the root pointer. 
*/ - ASSERT(node == ifp->if_u1.if_root); - ifp->if_u1.if_root = node->ptrs[0]; + ASSERT(node == ifp->if_data); + ifp->if_data = node->ptrs[0]; ifp->if_height--; kmem_free(node); } @@ -863,8 +863,8 @@ xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { ifp->if_height--; - kmem_free(ifp->if_u1.if_root); - ifp->if_u1.if_root = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; } void @@ -881,7 +881,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, cur, state, _RET_IP_); ASSERT(ifp->if_height > 0); - ASSERT(ifp->if_u1.if_root != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(xfs_iext_valid(ifp, cur)); xfs_iext_inc_seq(ifp); @@ -1051,9 +1051,9 @@ void xfs_iext_destroy( struct xfs_ifork *ifp) { - xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height); + xfs_iext_destroy_node(ifp->if_data, ifp->if_height); ifp->if_bytes = 0; ifp->if_height = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index b86d57589f67..d23910e503a1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -50,12 +50,15 @@ xfs_init_local_fork( mem_size++; if (size) { - ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS); - memcpy(ifp->if_u1.if_data, data, size); + char *new_data = kmem_alloc(mem_size, KM_NOFS); + + memcpy(new_data, data, size); if (zero_terminate) - ifp->if_u1.if_data[size] = '\0'; + new_data[size] = '\0'; + + ifp->if_data = new_data; } else { - ifp->if_u1.if_data = NULL; + ifp->if_data = NULL; } ifp->if_bytes = size; @@ -125,7 +128,7 @@ xfs_iformat_extents( } ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); @@ -212,7 +215,7 @@ xfs_iformat_btree( ifp->if_broot, size); ifp->if_bytes = 0; - ifp->if_u1.if_root = NULL; + ifp->if_data = NULL; ifp->if_height = 0; return 0; } @@ -509,14 +512,14 @@ xfs_idata_realloc( return; if (new_size == 0) { - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; ifp->if_bytes = 0; return; } - ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); + ifp->if_data = krealloc(ifp->if_data, new_size, + GFP_NOFS | __GFP_NOFAIL); ifp->if_bytes = new_size; } @@ -532,8 +535,8 @@ xfs_idestroy_fork( switch (ifp->if_format) { case XFS_DINODE_FMT_LOCAL: - kmem_free(ifp->if_u1.if_data); - ifp->if_u1.if_data = NULL; + kmem_free(ifp->if_data); + ifp->if_data = NULL; break; case XFS_DINODE_FMT_EXTENTS: case XFS_DINODE_FMT_BTREE: @@ -626,9 +629,9 @@ xfs_iflush_fork( case XFS_DINODE_FMT_LOCAL: if ((iip->ili_fields & dataflag[whichfork]) && (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_data != NULL); ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(cp, ifp->if_data, ifp->if_bytes); } break; @@ -706,17 +709,15 @@ xfs_ifork_verify_local_data( case S_IFDIR: { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_hdr *sfp = ifp->if_data; - sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes); break; } case S_IFLNK: { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); - fa = xfs_symlink_shortform_verify(ifp->if_u1.if_data, - ifp->if_bytes); + fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes); break; } default: @@ -725,7 +726,7 @@ 
xfs_ifork_verify_local_data( if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork", - ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa); + ip->i_df.if_data, ip->i_df.if_bytes, fa); return -EFSCORRUPTED; } @@ -743,20 +744,14 @@ xfs_ifork_verify_local_attr( if (!xfs_inode_has_attr_fork(ip)) { fa = __this_address; } else { - struct xfs_attr_shortform *sfp; - struct xfs_ifork *ifp; - int64_t size; + struct xfs_ifork *ifp = &ip->i_af; - ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL); - ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK); - sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data; - size = ifp->if_bytes; - - fa = xfs_attr_shortform_verify(sfp, size); + ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); + fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes); } if (fa) { xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork", - ifp->if_u1.if_data, ifp->if_bytes, fa); + ifp->if_data, ifp->if_bytes, fa); return -EFSCORRUPTED; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 535be5c03689..7edcf0e8cd53 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -13,14 +13,12 @@ struct xfs_dinode; * File incore extent information, present for each of data & attr forks. */ struct xfs_ifork { - int64_t if_bytes; /* bytes in if_u1 */ + int64_t if_bytes; /* bytes in if_data */ struct xfs_btree_block *if_broot; /* file's incore btree root */ unsigned int if_seq; /* fork mod counter */ int if_height; /* height of the extent tree */ - union { - void *if_root; /* extent tree root */ - char *if_data; /* inline file data */ - } if_u1; + void *if_data; /* extent tree root or + inline data */ xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 3c96d1d617fb..160aa20aa441 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -175,7 +175,7 @@ xfs_symlink_local_to_remote( if (!xfs_has_crc(mp)) { bp->b_ops = NULL; - memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } @@ -191,7 +191,7 @@ xfs_symlink_local_to_remote( buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); - memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); + memcpy(buf, ifp->if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + ifp->if_bytes - 1); } diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 6c16d9530cca..bac6fb2f01d8 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -527,19 +527,15 @@ xchk_xattr_check_sf( struct xfs_scrub *sc) { struct xchk_xattr_buf *ab = sc->buf; - struct xfs_attr_shortform *sf; + struct xfs_ifork *ifp = &sc->ip->i_af; + struct xfs_attr_shortform *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; struct xfs_attr_sf_entry *next; - struct xfs_ifork *ifp; - unsigned char *end; + unsigned char *end = ifp->if_data + ifp->if_bytes; int i; int error = 0; - ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK); - bitmap_zero(ab->usedmap, ifp->if_bytes); - sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data; - end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes; xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr)); sfe = &sf->list[0]; diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c index e51c1544be63..16462332c897 100644 
--- a/fs/xfs/scrub/readdir.c +++ b/fs/xfs/scrub/readdir.c @@ -36,16 +36,14 @@ xchk_dir_walk_sf( struct xfs_mount *mp = dp->i_mount; struct xfs_da_geometry *geo = mp->m_dir_geo; struct xfs_dir2_sf_entry *sfep; - struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_ino_t ino; xfs_dir2_dataptr_t dapos; unsigned int i; int error; ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* dot entry */ dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c index 60643d791d4a..ddff86713df3 100644 --- a/fs/xfs/scrub/symlink.c +++ b/fs/xfs/scrub/symlink.c @@ -61,7 +61,7 @@ xchk_symlink( /* Inline symlink? */ if (ifp->if_format == XFS_DINODE_FMT_LOCAL) { if (len > xfs_inode_data_fork_size(ip) || - len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip))) + len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip))) xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 99bbbe1a0e44..8700b00e154c 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -56,12 +56,11 @@ xfs_attr_shortform_list( struct xfs_attrlist_cursor_kern *cursor = &context->cursor; struct xfs_inode *dp = context->dp; struct xfs_attr_sf_sort *sbuf, *sbp; - struct xfs_attr_shortform *sf; + struct xfs_attr_shortform *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; int sbsize, nsbuf, count, i; int error = 0; - sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data; ASSERT(sf != NULL); if (!sf->hdr.count) return 0; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 57f42c2af0a3..cc6dc56f455d 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -52,7 +52,7 @@ xfs_dir2_sf_getdents( struct xfs_mount *mp = dp->i_mount; xfs_dir2_dataptr_t off; /* current entry's offset */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data; xfs_dir2_dataptr_t dot_offset; xfs_dir2_dataptr_t dotdot_offset; xfs_ino_t ino; @@ -60,9 +60,7 @@ xfs_dir2_sf_getdents( ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL); ASSERT(dp->i_df.if_bytes == dp->i_disk_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(sfp != NULL); /* * If the block number in the offset is out of range, we're done. diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1ffc8dfa2a52..1fd94958aa97 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -872,7 +872,7 @@ xfs_init_new_inode( case S_IFLNK: ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; ip->i_df.if_bytes = 0; - ip->i_df.if_u1.if_root = NULL; + ip->i_df.if_data = NULL; break; default: ASSERT(0); @@ -2378,8 +2378,8 @@ xfs_ifree( * already been freed by xfs_attr_inactive. 
*/ if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { - kmem_free(ip->i_df.if_u1.if_data); - ip->i_df.if_u1.if_data = NULL; + kmem_free(ip->i_df.if_data); + ip->i_df.if_data = NULL; ip->i_df.if_bytes = 0; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b35335e20342..0aee97ba0be8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -352,11 +352,10 @@ xfs_inode_item_format_data_fork( ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV); if ((iip->ili_fields & XFS_ILOG_DDATA) && ip->i_df.if_bytes > 0) { - ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_df.if_data != NULL); ASSERT(ip->i_disk_size > 0); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, - ip->i_df.if_u1.if_data, - ip->i_df.if_bytes); + ip->i_df.if_data, ip->i_df.if_bytes); ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes; ilf->ilf_size++; } else { @@ -431,10 +430,9 @@ xfs_inode_item_format_attr_fork( if ((iip->ili_fields & XFS_ILOG_ADATA) && ip->i_af.if_bytes > 0) { - ASSERT(ip->i_af.if_u1.if_data != NULL); + ASSERT(ip->i_af.if_data != NULL); xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, - ip->i_af.if_u1.if_data, - ip->i_af.if_bytes); + ip->i_af.if_data, ip->i_af.if_bytes); ilf->ilf_asize = (unsigned)ip->i_af.if_bytes; ilf->ilf_size++; } else { diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 7c713727f7fd..92974a4414c8 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -131,10 +131,10 @@ xfs_readlink( * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED * if if_data is junk. */ - if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data)) + if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data)) goto out; - memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1); + memcpy(link, ip->i_df.if_data, pathlen + 1); error = 0; } else { error = xfs_readlink_bmap_ilocked(ip, link); From 45c76a2add55b332d965c901e14004ae0134a67e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:34:56 +0100 Subject: [PATCH 114/123] xfs: return if_data from xfs_idata_realloc Many of the xfs_idata_realloc callers need to set a local pointer to the just reallocated if_data memory. Return the pointer to simplify them a bit and use the opportunity to re-use krealloc for freeing if_data if the size hits 0. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr_leaf.c | 7 +++---- fs/xfs/libxfs/xfs_dir2_sf.c | 25 ++++++++++--------------- fs/xfs/libxfs/xfs_inode_fork.c | 20 ++++++++------------ fs/xfs/libxfs/xfs_inode_fork.h | 2 +- 4 files changed, 22 insertions(+), 32 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 3e5377fd4984..2e3334ac3228 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -690,8 +690,8 @@ xfs_attr_shortform_create( ASSERT(ifp->if_bytes == 0); if (ifp->if_format == XFS_DINODE_FMT_EXTENTS) ifp->if_format = XFS_DINODE_FMT_LOCAL; - xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); - hdr = ifp->if_data; + + hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK); memset(hdr, 0, sizeof(*hdr)); hdr->totsize = cpu_to_be16(sizeof(*hdr)); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); @@ -767,8 +767,7 @@ xfs_attr_shortform_add( offset = (char *)sfe - (char *)sf; size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sf = ifp->if_data; + sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); sfe->namelen = args->namelen; diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 0b63138d2b9f..e1f83fc7b6ad 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -466,12 +466,11 @@ xfs_dir2_sf_addname_easy( /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), + sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen), XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ - sfp = dp->i_df.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. @@ -551,11 +550,8 @@ xfs_dir2_sf_addname_hard( * the data. */ xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK); - xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); - /* - * Reset the pointer since the buffer was reallocated. - */ - sfp = dp->i_df.if_data; + sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK); + /* * Copy the first part of the directory, including the header. */ @@ -820,15 +816,13 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; size = xfs_dir2_sf_hdr_size(i8count); + /* - * Make a buffer for the data. + * Make a buffer for the data and fill in the header. */ - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - /* - * Fill in the header, - */ - sfp = dp->i_df.if_data; + sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK); sfp->i8count = i8count; + /* * Now can put in the inode number, since i8count is set. */ @@ -976,11 +970,12 @@ xfs_dir2_sf_removename( */ sfp->count--; dp->i_disk_size = newsize; + /* * Reallocate, making it smaller. */ - xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = dp->i_df.if_data; + sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); + /* * Are we changing inode number size? */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index d23910e503a1..d8405a8d3c14 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -496,7 +496,7 @@ xfs_iroot_realloc( * byte_diff -- the change in the number of bytes, positive or negative, * requested for the if_data array. 
*/ -void +void * xfs_idata_realloc( struct xfs_inode *ip, int64_t byte_diff, @@ -508,19 +508,15 @@ xfs_idata_realloc( ASSERT(new_size >= 0); ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork)); - if (byte_diff == 0) - return; - - if (new_size == 0) { - kmem_free(ifp->if_data); - ifp->if_data = NULL; - ifp->if_bytes = 0; - return; + if (byte_diff) { + ifp->if_data = krealloc(ifp->if_data, new_size, + GFP_NOFS | __GFP_NOFAIL); + if (new_size == 0) + ifp->if_data = NULL; + ifp->if_bytes = new_size; } - ifp->if_data = krealloc(ifp->if_data, new_size, - GFP_NOFS | __GFP_NOFAIL); - ifp->if_bytes = new_size; + return ifp->if_data; } /* Free all memory and reset a fork back to its initial state. */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7edcf0e8cd53..96303249d28a 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -168,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *); void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, struct xfs_inode_log_item *, int); void xfs_idestroy_fork(struct xfs_ifork *ifp); -void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, +void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff, int whichfork); void xfs_iroot_realloc(struct xfs_inode *, int, int); int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); From 14f2e4ab5d0310c2bb231941d9884fa5bae47fab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:34:57 +0100 Subject: [PATCH 115/123] xfs: move the xfs_attr_sf_lookup tracepoint trace_xfs_attr_sf_lookup is currently only called by xfs_attr_shortform_lookup, which despite its name is a simple helper for xfs_attr_shortform_addname, which has its own tracing. Move the callsite to xfs_attr_shortform_getvalue, which is the closest thing to a high level lookup we have for the Linux xattr API. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr_leaf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 2e3334ac3228..37474af8ee46 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -876,8 +876,6 @@ xfs_attr_shortform_lookup( struct xfs_attr_sf_entry *sfe; int i; - trace_xfs_attr_sf_lookup(args); - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; @@ -905,6 +903,9 @@ xfs_attr_shortform_getvalue( int i; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); + + trace_xfs_attr_sf_lookup(args); + sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; sfe = xfs_attr_sf_nextentry(sfe), i++) { From 6c8d169bbd51fc10d1d0029d495962881315b4c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:34:58 +0100 Subject: [PATCH 116/123] xfs: simplify xfs_attr_sf_findname xfs_attr_sf_findname has the simple job of finding a xfs_attr_sf_entry in the attr fork, but the convoluted calling convention obfuscates that. Return the found entry directly instead of via a pointer argument, as the -ENOATTR/-EEXIST return can be trivially derived from that, and remove the basep argument, as it is equivalent to the offset of sfe in the data if an sfe was found, or to the offset of totsize if none was found. To simplify the totsize computation, add an xfs_attr_sf_endptr helper that returns the imaginary xfs_attr_sf_entry at the end of the current attrs.
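As a rough sketch of the new shape (condensed from the diffs that follow; declarations and locking are elided), both the helper and a typical caller now read naturally:

	/* the lookup: walk the packed entries until a name matches */
	for (sfe = &sf->list[0];
	     sfe < xfs_attr_sf_endptr(sf);
	     sfe = xfs_attr_sf_nextentry(sfe)) {
		if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
				sfe->flags))
			return sfe;
	}
	return NULL;

	/* a caller no longer needs to decode -EEXIST/-ENOATTR: */
	sfe = xfs_attr_sf_findname(args);
	if (!sfe)
		return -ENOATTR;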
Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 7 ++- fs/xfs/libxfs/xfs_attr_leaf.c | 96 +++++++++++++---------------------- fs/xfs/libxfs/xfs_attr_leaf.h | 4 +- fs/xfs/libxfs/xfs_attr_sf.h | 7 +++ 4 files changed, 48 insertions(+), 66 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 7f822e72dfcd..bcf8748cb1a3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -862,8 +862,11 @@ xfs_attr_lookup( if (!xfs_inode_hasattr(dp)) return -ENOATTR; - if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) - return xfs_attr_sf_findname(args, NULL, NULL); + if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) { + if (xfs_attr_sf_findname(args)) + return -EEXIST; + return -ENOATTR; + } if (xfs_attr_is_leaf(dp)) { error = xfs_attr_leaf_hasname(args, &bp); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 37474af8ee46..7a623efd23a6 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -698,47 +698,24 @@ xfs_attr_shortform_create( } /* - * Return -EEXIST if attr is found, or -ENOATTR if not - * args: args containing attribute name and namelen - * sfep: If not null, pointer will be set to the last attr entry found on - -EEXIST. On -ENOATTR pointer is left at the last entry in the list - * basep: If not null, pointer is set to the byte offset of the entry in the - * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of - * the last entry in the list + * Return the entry if the attr in args is found, or NULL if not. */ -int +struct xfs_attr_sf_entry * xfs_attr_sf_findname( - struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep) + struct xfs_da_args *args) { - struct xfs_attr_shortform *sf = args->dp->i_af.if_data; - struct xfs_attr_sf_entry *sfe; - unsigned int base = sizeof(struct xfs_attr_sf_hdr); - int size = 0; - int end; - int i; + struct xfs_attr_shortform *sf = args->dp->i_af.if_data; + struct xfs_attr_sf_entry *sfe; - sfe = &sf->list[0]; - end = sf->hdr.count; - for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe), - base += size, i++) { - size = xfs_attr_sf_entsize(sfe); - if (!xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - continue; - break; + for (sfe = &sf->list[0]; + sfe < xfs_attr_sf_endptr(sf); + sfe = xfs_attr_sf_nextentry(sfe)) { + if (xfs_attr_match(args, sfe->namelen, sfe->nameval, + sfe->flags)) + return sfe; } - if (sfep != NULL) - *sfep = sfe; - - if (basep != NULL) - *basep = base; - - if (i == end) - return -ENOATTR; - return -EEXIST; + return NULL; } /* @@ -755,21 +732,19 @@ xfs_attr_shortform_add( struct xfs_ifork *ifp = &dp->i_af; struct xfs_attr_shortform *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; - int offset, size; + int size; trace_xfs_attr_sf_add(args); dp->i_forkoff = forkoff; ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST) - ASSERT(0); + ASSERT(!xfs_attr_sf_findname(args)); - offset = (char *)sfe - (char *)sf; size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen); sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK); - sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset); + sfe = xfs_attr_sf_endptr(sf); sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; sfe->flags = args->attr_filter; @@ -809,39 +784,38 @@ xfs_attr_sf_removename( struct xfs_mount *mp = dp->i_mount; struct xfs_attr_shortform *sf = dp->i_af.if_data; struct 
xfs_attr_sf_entry *sfe; - int size = 0, end, totsize; - unsigned int base; - int error; + uint16_t totsize = be16_to_cpu(sf->hdr.totsize); + void *next, *end; + int size = 0; trace_xfs_attr_sf_remove(args); - error = xfs_attr_sf_findname(args, &sfe, &base); - - /* - * If we are recovering an operation, finding nothing to - * remove is not an error - it just means there was nothing - * to clean up. - */ - if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY)) - return 0; - if (error != -EEXIST) - return error; - size = xfs_attr_sf_entsize(sfe); + sfe = xfs_attr_sf_findname(args); + if (!sfe) { + /* + * If we are recovering an operation, finding nothing to remove + * is not an error, it just means there was nothing to clean up. + */ + if (args->op_flags & XFS_DA_OP_RECOVERY) + return 0; + return -ENOATTR; + } /* * Fix up the attribute fork data, covering the hole */ - end = base + size; - totsize = be16_to_cpu(sf->hdr.totsize); - if (end != totsize) - memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); + size = xfs_attr_sf_entsize(sfe); + next = xfs_attr_sf_nextentry(sfe); + end = xfs_attr_sf_endptr(sf); + if (next < end) + memmove(sfe, next, end - next); sf->hdr.count--; - be16_add_cpu(&sf->hdr.totsize, -size); + totsize -= size; + sf->hdr.totsize = cpu_to_be16(totsize); /* * Fix up the start offset of the attribute fork */ - totsize -= size; if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index ce6743463c86..56fcd689eedf 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,9 +51,7 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); -int xfs_attr_sf_findname(struct xfs_da_args *args, - struct xfs_attr_sf_entry **sfep, - unsigned int *basep); +struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_shortform *sfp, diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 37578b369d9b..a774d4d87763 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -48,4 +48,11 @@ xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep) return (void *)sfep + xfs_attr_sf_entsize(sfep); } +/* pointer to the space after the last entry, e.g. for adding a new one */ +static inline struct xfs_attr_sf_entry * +xfs_attr_sf_endptr(struct xfs_attr_shortform *sf) +{ + return (void *)sf + be16_to_cpu(sf->hdr.totsize); +} + #endif /* __XFS_ATTR_SF_H__ */ From 22b7b1f597a6a21fb7b3791a55f3a7ae54d2dfe4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:34:59 +0100 Subject: [PATCH 117/123] xfs: remove xfs_attr_shortform_lookup xfs_attr_shortform_lookup is only used by xfs_attr_shortform_addname, which is much better served by calling xfs_attr_sf_findname. Switch it over and remove xfs_attr_shortform_lookup. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 21 +++++++-------------- fs/xfs/libxfs/xfs_attr_leaf.c | 24 ------------------------ fs/xfs/libxfs/xfs_attr_leaf.h | 1 - 3 files changed, 7 insertions(+), 39 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index bcf8748cb1a3..e8b4317da830 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1066,19 +1066,14 @@ xfs_attr_shortform_addname( struct xfs_da_args *args) { int newsize, forkoff; - int error; trace_xfs_attr_sf_addname(args); - error = xfs_attr_shortform_lookup(args); - switch (error) { - case -ENOATTR: - if (args->op_flags & XFS_DA_OP_REPLACE) - return error; - break; - case -EEXIST: + if (xfs_attr_sf_findname(args)) { + int error; + if (!(args->op_flags & XFS_DA_OP_REPLACE)) - return error; + return -EEXIST; error = xfs_attr_sf_removename(args); if (error) @@ -1091,11 +1086,9 @@ xfs_attr_shortform_addname( * around. */ args->op_flags &= ~XFS_DA_OP_REPLACE; - break; - case 0: - break; - default: - return error; + } else { + if (args->op_flags & XFS_DA_OP_REPLACE) + return -ENOATTR; } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 7a623efd23a6..75c597805ffa 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -837,30 +837,6 @@ xfs_attr_sf_removename( return 0; } -/* - * Look up a name in a shortform attribute list structure. - */ -/*ARGSUSED*/ -int -xfs_attr_shortform_lookup( - struct xfs_da_args *args) -{ - struct xfs_ifork *ifp = &args->dp->i_af; - struct xfs_attr_shortform *sf = ifp->if_data; - struct xfs_attr_sf_entry *sfe; - int i; - - ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return -EEXIST; - } - return -ENOATTR; -} - /* * Retrieve the attribute value and length. * diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 56fcd689eedf..35e668ae744f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -47,7 +47,6 @@ struct xfs_attr3_icleaf_hdr { */ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); -int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_sf_removename(struct xfs_da_args *args); From 1fb4b0def7b5a5bf91ad62a112d8d3f6dc76585f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:35:00 +0100 Subject: [PATCH 118/123] xfs: use xfs_attr_sf_findname in xfs_attr_shortform_getvalue xfs_attr_shortform_getvalue duplicates the logic in xfs_attr_sf_findname. Use the helper instead. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr_leaf.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 75c597805ffa..82e183033416 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -848,23 +848,17 @@ int xfs_attr_shortform_getvalue( struct xfs_da_args *args) { - struct xfs_attr_shortform *sf = args->dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; - int i; ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL); trace_xfs_attr_sf_lookup(args); - sfe = &sf->list[0]; - for (i = 0; i < sf->hdr.count; - sfe = xfs_attr_sf_nextentry(sfe), i++) { - if (xfs_attr_match(args, sfe->namelen, sfe->nameval, - sfe->flags)) - return xfs_attr_copy_value(args, - &sfe->nameval[args->namelen], sfe->valuelen); - } - return -ENOATTR; + sfe = xfs_attr_sf_findname(args); + if (!sfe) + return -ENOATTR; + return xfs_attr_copy_value(args, &sfe->nameval[args->namelen], + sfe->valuelen); } /* Convert from using the shortform to the leaf format. */ From 414147225400a0c4562ebfb0fdd40f065099ede4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:35:01 +0100 Subject: [PATCH 119/123] xfs: remove struct xfs_attr_shortform sparse complains about struct xfs_attr_shortform because it embeds a structure with a variable sized array in a variable sized array. Given that xfs_attr_shortform is not a very useful structure, and the dir2 equivalent has been removed a long time ago, remove it as well. Provide a xfs_attr_sf_firstentry helper that returns the first xfs_attr_sf_entry behind a xfs_attr_sf_hdr to replace the structure dereference. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. 
Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 4 ++-- fs/xfs/libxfs/xfs_attr_leaf.c | 37 +++++++++++++++++----------------- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- fs/xfs/libxfs/xfs_attr_sf.h | 13 +++++++++--- fs/xfs/libxfs/xfs_da_format.h | 31 ++++++++++++++++------------ fs/xfs/libxfs/xfs_inode_fork.c | 5 ++--- fs/xfs/libxfs/xfs_ondisk.h | 14 ++++++------- fs/xfs/scrub/attr.c | 9 ++++----- fs/xfs/scrub/inode_repair.c | 4 ++-- fs/xfs/xfs_attr_list.c | 12 +++++------ 10 files changed, 70 insertions(+), 61 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index e8b4317da830..ec4061db7ffc 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1052,9 +1052,9 @@ xfs_attr_set( static inline int xfs_attr_sf_totsize(struct xfs_inode *dp) { - struct xfs_attr_shortform *sf = dp->i_af.if_data; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; - return be16_to_cpu(sf->hdr.totsize); + return be16_to_cpu(sf->totsize); } /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 82e183033416..e1281ab413c8 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -704,10 +704,10 @@ struct xfs_attr_sf_entry * xfs_attr_sf_findname( struct xfs_da_args *args) { - struct xfs_attr_shortform *sf = args->dp->i_af.if_data; + struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; - for (sfe = &sf->list[0]; + for (sfe = xfs_attr_sf_firstentry(sf); sfe < xfs_attr_sf_endptr(sf); sfe = xfs_attr_sf_nextentry(sfe)) { if (xfs_attr_match(args, sfe->namelen, sfe->nameval, @@ -730,7 +730,7 @@ xfs_attr_shortform_add( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_ifork *ifp = &dp->i_af; - struct xfs_attr_shortform *sf = ifp->if_data; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; int size; @@ -750,8 +750,8 @@ xfs_attr_shortform_add( sfe->flags = args->attr_filter; memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); - sf->hdr.count++; - be16_add_cpu(&sf->hdr.totsize, size); + sf->count++; + be16_add_cpu(&sf->totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); @@ -782,9 +782,9 @@ xfs_attr_sf_removename( { struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; - struct xfs_attr_shortform *sf = dp->i_af.if_data; + struct xfs_attr_sf_hdr *sf = dp->i_af.if_data; struct xfs_attr_sf_entry *sfe; - uint16_t totsize = be16_to_cpu(sf->hdr.totsize); + uint16_t totsize = be16_to_cpu(sf->totsize); void *next, *end; int size = 0; @@ -809,9 +809,9 @@ xfs_attr_sf_removename( end = xfs_attr_sf_endptr(sf); if (next < end) memmove(sfe, next, end - next); - sf->hdr.count--; + sf->count--; totsize -= size; - sf->hdr.totsize = cpu_to_be16(totsize); + sf->totsize = cpu_to_be16(totsize); /* * Fix up the start offset of the attribute fork @@ -868,21 +868,21 @@ xfs_attr_shortform_to_leaf( { struct xfs_inode *dp = args->dp; struct xfs_ifork *ifp = &dp->i_af; - struct xfs_attr_shortform *sf = ifp->if_data; + struct xfs_attr_sf_hdr *sf = ifp->if_data; struct xfs_attr_sf_entry *sfe; + int size = be16_to_cpu(sf->totsize); struct xfs_da_args nargs; char *tmpbuffer; - int error, i, size; + int error, i; xfs_dablk_t blkno; struct xfs_buf *bp; trace_xfs_attr_sf_to_leaf(args); - size = be16_to_cpu(sf->hdr.totsize); tmpbuffer = kmem_alloc(size, 0); ASSERT(tmpbuffer != NULL); 
 	memcpy(tmpbuffer, ifp->if_data, size);
-	sf = (struct xfs_attr_shortform *)tmpbuffer;
+	sf = (struct xfs_attr_sf_hdr *)tmpbuffer;
 
 	xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
 	xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -905,8 +905,8 @@ xfs_attr_shortform_to_leaf(
 	nargs.trans = args->trans;
 	nargs.op_flags = XFS_DA_OP_OKNOENT;
 
-	sfe = &sf->list[0];
-	for (i = 0; i < sf->hdr.count; i++) {
+	sfe = xfs_attr_sf_firstentry(sf);
+	for (i = 0; i < sf->count; i++) {
 		nargs.name = sfe->nameval;
 		nargs.namelen = sfe->namelen;
 		nargs.value = &sfe->nameval[nargs.namelen];
@@ -973,10 +973,10 @@ xfs_attr_shortform_allfit(
 /* Verify the consistency of a raw inline attribute fork. */
 xfs_failaddr_t
 xfs_attr_shortform_verify(
-	struct xfs_attr_shortform	*sfp,
+	struct xfs_attr_sf_hdr		*sfp,
 	size_t				size)
 {
-	struct xfs_attr_sf_entry	*sfep;
+	struct xfs_attr_sf_entry	*sfep = xfs_attr_sf_firstentry(sfp);
 	struct xfs_attr_sf_entry	*next_sfep;
 	char				*endp;
 	int				i;
@@ -990,8 +990,7 @@ xfs_attr_shortform_verify(
 	endp = (char *)sfp + size;
 
 	/* Check all reported entries */
-	sfep = &sfp->list[0];
-	for (i = 0; i < sfp->hdr.count; i++) {
+	for (i = 0; i < sfp->count; i++) {
 		/*
 		 * struct xfs_attr_sf_entry has a variable length.
		 * Check the fixed-offset parts of the structure are
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 35e668ae744f..9b9948639c0f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -53,7 +53,7 @@ int	xfs_attr_sf_removename(struct xfs_da_args *args);
 struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args);
 int	xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int	xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
-xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_shortform *sfp,
+xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp,
 		size_t size);
 void	xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
 
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index a774d4d87763..9abf7de95465 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -41,7 +41,14 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
 	return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
 }
 
-/* next entry in struct */
+/* first entry in the SF attr fork */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr)
+{
+	return (struct xfs_attr_sf_entry *)(hdr + 1);
+}
+
+/* next entry after sfep */
 static inline struct xfs_attr_sf_entry *
 xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
 {
@@ -50,9 +57,9 @@ xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
 
 /* pointer to the space after the last entry, e.g. for adding a new one */
 static inline struct xfs_attr_sf_entry *
-xfs_attr_sf_endptr(struct xfs_attr_shortform *sf)
+xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf)
 {
-	return (void *)sf + be16_to_cpu(sf->hdr.totsize);
+	return (void *)sf + be16_to_cpu(sf->totsize);
 }
 
 #endif	/* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index f9015f88eca7..24f9d1461f9a 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -578,20 +578,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
 #define XFS_ATTR_LEAF_MAPSIZE	3	/* how many freespace slots */
 
 /*
- * Entries are packed toward the top as tight as possible.
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as to fit into the
+ * literal area of the inode.
+ *
+ * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header
+ * followed by zero or more xfs_attr_sf_entry structures.
  */
-struct xfs_attr_shortform {
-	struct xfs_attr_sf_hdr {	/* constant-structure header block */
-		__be16	totsize;	/* total bytes in shortform list */
-		__u8	count;		/* count of active entries */
-		__u8	padding;
-	} hdr;
-	struct xfs_attr_sf_entry {
-		uint8_t namelen;	/* actual length of name (no NULL) */
-		uint8_t valuelen;	/* actual length of value (no NULL) */
-		uint8_t flags;		/* flags bits (see xfs_attr_leaf.h) */
-		uint8_t nameval[];	/* name & value bytes concatenated */
-	} list[];			/* variable sized array */
+struct xfs_attr_sf_hdr {	/* constant-structure header block */
+	__be16	totsize;	/* total bytes in shortform list */
+	__u8	count;		/* count of active entries */
+	__u8	padding;
+};
+
+struct xfs_attr_sf_entry {
+	__u8	namelen;	/* actual length of name (no NULL) */
+	__u8	valuelen;	/* actual length of value (no NULL) */
+	__u8	flags;		/* flags bits (XFS_ATTR_*) */
+	__u8	nameval[];	/* name & value bytes concatenated */
 };
 
 typedef struct xfs_attr_leaf_map {	/* RLE map of free bytes */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index d8405a8d3c14..f4569e18a8d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -279,10 +279,9 @@ static uint16_t
 xfs_dfork_attr_shortform_size(
 	struct xfs_dinode		*dip)
 {
-	struct xfs_attr_shortform	*atp =
-		(struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+	struct xfs_attr_sf_hdr		*sf = XFS_DFORK_APTR(dip);
 
-	return be16_to_cpu(atp->hdr.totsize);
+	return be16_to_cpu(sf->totsize);
 }
 
 void
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index d9c988c5ad69..81885a6a028e 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -93,13 +93,13 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen,	8);
 	XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name,	9);
 	XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t,		32);
-	XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform,	4);
-	XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
-	XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count,	2);
-	XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
-	XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
-	XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
-	XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr,		4);
+	XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize,	0);
+	XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count,		2);
+	XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen,	0);
+	XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen,	1);
+	XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags,	2);
+	XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval,	3);
 	XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t,			12);
 	XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t,			16);
 	XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t,		8);
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index bac6fb2f01d8..83c7feb38714 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -528,23 +528,22 @@ xchk_xattr_check_sf(
 {
 	struct xchk_xattr_buf		*ab = sc->buf;
 	struct xfs_ifork		*ifp = &sc->ip->i_af;
-	struct xfs_attr_shortform	*sf = ifp->if_data;
-	struct xfs_attr_sf_entry	*sfe;
+	struct xfs_attr_sf_hdr		*sf = ifp->if_data;
+	struct xfs_attr_sf_entry	*sfe = xfs_attr_sf_firstentry(sf);
 	struct xfs_attr_sf_entry	*next;
 	unsigned char			*end = ifp->if_data + ifp->if_bytes;
 	int				i;
 	int				error = 0;
 
 	bitmap_zero(ab->usedmap, ifp->if_bytes);
-	xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr));
+	xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf));
 
-	sfe = &sf->list[0];
 	if ((unsigned char *)sfe > end) {
 		xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
 		return 0;
 	}
 
-	for (i = 0; i < sf->hdr.count; i++) {
+	for (i = 0; i < sf->count; i++) {
 		unsigned char		*name = sfe->nameval;
 		unsigned char		*value = &sfe->nameval[sfe->namelen];
 
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 66949cc3d7cc..0ca62d59f84a 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -760,7 +760,7 @@ xrep_dinode_check_afork(
 	struct xfs_scrub		*sc,
 	struct xfs_dinode		*dip)
 {
-	struct xfs_attr_shortform	*afork_ptr;
+	struct xfs_attr_sf_hdr		*afork_ptr;
 	size_t				attr_size;
 	unsigned int			afork_size;
 
@@ -778,7 +778,7 @@ xrep_dinode_check_afork(
 		return true;
 
 	/* xattr structure cannot be larger than the fork */
-	attr_size = be16_to_cpu(afork_ptr->hdr.totsize);
+	attr_size = be16_to_cpu(afork_ptr->totsize);
 	if (attr_size > afork_size)
 		return true;
 
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 8700b00e154c..e368ad671e26 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -56,13 +56,13 @@ xfs_attr_shortform_list(
 	struct xfs_attrlist_cursor_kern	*cursor = &context->cursor;
 	struct xfs_inode		*dp = context->dp;
 	struct xfs_attr_sf_sort		*sbuf, *sbp;
-	struct xfs_attr_shortform	*sf = dp->i_af.if_data;
+	struct xfs_attr_sf_hdr		*sf = dp->i_af.if_data;
 	struct xfs_attr_sf_entry	*sfe;
 	int				sbsize, nsbuf, count, i;
 	int				error = 0;
 
 	ASSERT(sf != NULL);
-	if (!sf->hdr.count)
+	if (!sf->count)
 		return 0;
 
 	trace_xfs_attr_list_sf(context);
@@ -78,8 +78,8 @@ xfs_attr_shortform_list(
 	 */
 	if (context->bufsize == 0 ||
 	    (XFS_ISRESET_CURSOR(cursor) &&
-	     (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
-		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+	     (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) {
+		for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
 			if (XFS_IS_CORRUPT(context->dp->i_mount,
 					   !xfs_attr_namecheck(sfe->nameval,
 							       sfe->namelen)))
@@ -108,7 +108,7 @@ xfs_attr_shortform_list(
 	/*
 	 * It didn't all fit, so we have to sort everything on hashval.
 	 */
-	sbsize = sf->hdr.count * sizeof(*sbuf);
+	sbsize = sf->count * sizeof(*sbuf);
 	sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
 
 	/*
@@ -116,7 +116,7 @@ xfs_attr_shortform_list(
 	 * the relevant info from only those that match into a buffer.
 	 */
 	nsbuf = 0;
-	for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+	for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
 		if (unlikely(
 		    ((char *)sfe < (char *)sf) ||
 		    ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {

From 074aea4be1a4074be49a7ec41c674cc02b52fd60 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 20 Dec 2023 07:35:02 +0100
Subject: [PATCH 120/123] xfs: remove xfs_attr_sf_hdr_t

Remove the last two users of the typedef.

Signed-off-by: Christoph Hellwig
Reviewed-by: "Darrick J. Wong"
Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr_leaf.c | 4 ++-- fs/xfs/libxfs/xfs_attr_sf.h | 8 -------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e1281ab413c8..6374bf107242 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -816,7 +816,7 @@ xfs_attr_sf_removename( /* * Fix up the start offset of the attribute fork */ - if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && + if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) { xfs_attr_fork_remove(dp, args->trans); @@ -824,7 +824,7 @@ xfs_attr_sf_removename( xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) || (args->op_flags & XFS_DA_OP_ADDNAME) || !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE); diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h index 9abf7de95465..bc4422223024 100644 --- a/fs/xfs/libxfs/xfs_attr_sf.h +++ b/fs/xfs/libxfs/xfs_attr_sf.h @@ -6,14 +6,6 @@ #ifndef __XFS_ATTR_SF_H__ #define __XFS_ATTR_SF_H__ -/* - * Attribute storage when stored inside the inode. - * - * Small attribute lists are packed as tightly as possible so as - * to fit into the literal area of the inode. - */ -typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t; - /* * We generate this then sort it, attr_list() must return things in hash-order. */ From 378b6aef9de0f7c3d0de309ecc61c11eb29e57da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2023 07:35:03 +0100 Subject: [PATCH 121/123] xfs: turn the XFS_DA_OP_REPLACE checks in xfs_attr_shortform_addname into asserts Since commit deed9512872d ("xfs: Check for -ENOATTR or -EEXIST"), the high-level attr code does a lookup for any attr we're trying to set, and does the checks to handle the create vs replace cases, which thus never hit the low-level attr code. Turn the checks in xfs_attr_shortform_addname as they must never trip. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Chandan Babu R --- fs/xfs/libxfs/xfs_attr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ec4061db7ffc..9976a00a73f9 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1072,8 +1072,7 @@ xfs_attr_shortform_addname( if (xfs_attr_sf_findname(args)) { int error; - if (!(args->op_flags & XFS_DA_OP_REPLACE)) - return -EEXIST; + ASSERT(args->op_flags & XFS_DA_OP_REPLACE); error = xfs_attr_sf_removename(args); if (error) @@ -1087,8 +1086,7 @@ xfs_attr_shortform_addname( */ args->op_flags &= ~XFS_DA_OP_REPLACE; } else { - if (args->op_flags & XFS_DA_OP_REPLACE) - return -ENOATTR; + ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE)); } if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || From 4f6ac47b55e3ce6e982807928d6074ec105ab66e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Dec 2023 07:24:09 +0000 Subject: [PATCH 122/123] xfs: fix a use after free in xfs_defer_finish_recovery dfp will be freed by ->recover_work and thus the tracepoint in case of an error can lead to a use after free. Store the defer ops in a local variable to avoid that. 
Fixes: 7f2f7531e0d4 ("xfs: store an ops pointer in struct xfs_defer_pending")
Reported-by: kernel test robot
Signed-off-by: Christoph Hellwig
Reviewed-by: "Darrick J. Wong"
Signed-off-by: Chandan Babu R
---
 fs/xfs/libxfs/xfs_defer.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index ca7f0ac04896..75c5b3a2c2cb 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -915,12 +915,14 @@ xfs_defer_finish_recovery(
 	struct xfs_defer_pending	*dfp,
 	struct list_head		*capture_list)
 {
+	const struct xfs_defer_op_type	*ops = dfp->dfp_ops;
 	int				error;
 
-	error = dfp->dfp_ops->recover_work(dfp, capture_list);
+	/* dfp is freed by recover_work and must not be accessed afterwards */
+	error = ops->recover_work(dfp, capture_list);
 	if (error)
 		trace_xlog_intent_recovery_failed(mp, error,
-				dfp->dfp_ops->recover_work);
+				ops->recover_work);
 
 	return error;
 }

From bcdfae6ee520b665385020fa3e47633a8af84f12 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 28 Dec 2023 07:24:10 +0000
Subject: [PATCH 123/123] xfs: use the op name in
 trace_xlog_intent_recovery_failed

Instead of tracing the address of the recovery handler, use the name in
the defer op, similar to other defer ops related tracepoints.

Signed-off-by: Christoph Hellwig
Reviewed-by: "Darrick J. Wong"
Signed-off-by: Chandan Babu R
---
 fs/xfs/libxfs/xfs_defer.c |  3 +--
 fs/xfs/xfs_trace.h        | 15 +++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 75c5b3a2c2cb..66a17910d021 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -921,8 +921,7 @@ xfs_defer_finish_recovery(
 	/* dfp is freed by recover_work and must not be accessed afterwards */
 	error = ops->recover_work(dfp, capture_list);
 	if (error)
-		trace_xlog_intent_recovery_failed(mp, error,
-				ops->recover_work);
+		trace_xlog_intent_recovery_failed(mp, ops, error);
 
 	return error;
 }
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0efcdb79d10e..0984a1c884c7 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -67,6 +67,7 @@ struct xfs_buf_log_format;
 struct xfs_inode_log_format;
 struct xfs_bmbt_irec;
 struct xfs_btree_cur;
+struct xfs_defer_op_type;
 struct xfs_refcount_irec;
 struct xfs_fsmap;
 struct xfs_rmap_irec;
@@ -145,21 +146,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
 
 TRACE_EVENT(xlog_intent_recovery_failed,
-	TP_PROTO(struct xfs_mount *mp, int error, void *function),
-	TP_ARGS(mp, error, function),
+	TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
+		 int error),
+	TP_ARGS(mp, ops, error),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__string(name, ops->name)
 		__field(int, error)
-		__field(void *, function)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
+		__assign_str(name, ops->name);
 		__entry->error = error;
-		__entry->function = function;
 	),
-	TP_printk("dev %d:%d error %d function %pS",
+	TP_printk("dev %d:%d optype %s error %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->error, __entry->function)
+		  __get_str(name),
+		  __entry->error)
 );
 
 DECLARE_EVENT_CLASS(xfs_perag_class,