xfs: rework xfs_alloc_vextent()

It's a multiplexing mess that can be greatly simplified, and really
needs to be simplified to allow active per-ag references to
propagate from initial AG selection code the the bmapi code.

This splits the code out into separate a parameter checking
function, an iterator function, and allocation completion functions
and then implements the individual policies using these functions.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
This commit is contained in:
Dave Chinner 2023-02-13 09:14:53 +11:00
parent 76257a1587
commit ecd788a924

View File

@ -3151,29 +3151,20 @@ xfs_alloc_read_agf(
}
/*
* Allocate an extent (variable-size).
* Depending on the allocation type, we either look in a single allocation
* group or loop over the allocation groups to find the result.
* Pre-proces allocation arguments to set initial state that we don't require
* callers to set up correctly, as well as bounds check the allocation args
* that are set up.
*/
int /* error */
xfs_alloc_vextent(
struct xfs_alloc_arg *args) /* allocation argument structure */
static int
xfs_alloc_vextent_check_args(
struct xfs_alloc_arg *args)
{
xfs_agblock_t agsize; /* allocation group size */
int error;
int flags; /* XFS_ALLOC_FLAG_... locking flags */
struct xfs_mount *mp; /* mount structure pointer */
xfs_agnumber_t sagno; /* starting allocation group number */
xfs_alloctype_t type; /* input allocation type */
int bump_rotor = 0;
xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
xfs_agnumber_t minimum_agno = 0;
struct xfs_mount *mp = args->mp;
xfs_agblock_t agsize;
mp = args->mp;
type = args->otype = args->type;
args->otype = args->type;
args->agbno = NULLAGBLOCK;
if (args->tp->t_highest_agno != NULLAGNUMBER)
minimum_agno = args->tp->t_highest_agno;
/*
* Just fix this up, for the case where the last a.g. is shorter
* (or there's only one a.g.) and the caller couldn't easily figure
@ -3195,199 +3186,314 @@ xfs_alloc_vextent(
args->mod >= args->prod) {
args->fsbno = NULLFSBLOCK;
trace_xfs_alloc_vextent_badargs(args);
return 0;
return -ENOSPC;
}
return 0;
}
switch (type) {
case XFS_ALLOCTYPE_THIS_AG:
case XFS_ALLOCTYPE_NEAR_BNO:
case XFS_ALLOCTYPE_THIS_BNO:
/*
* These three force us into a single a.g.
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
args->pag = xfs_perag_get(mp, args->agno);
if (minimum_agno > args->agno) {
trace_xfs_alloc_vextent_skip_deadlock(args);
error = 0;
break;
}
error = xfs_alloc_fix_freelist(args, 0);
if (error) {
trace_xfs_alloc_vextent_nofix(args);
goto error0;
}
if (!args->agbp) {
trace_xfs_alloc_vextent_noagbp(args);
break;
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
if ((error = xfs_alloc_ag_vextent(args)))
goto error0;
break;
case XFS_ALLOCTYPE_START_BNO:
/*
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
xfs_is_inode32(mp)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
mp->m_sb.sb_agcount), 0);
bump_rotor = 1;
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
fallthrough;
case XFS_ALLOCTYPE_FIRST_AG:
/*
* Rotate through the allocation groups looking for a winner.
* If we are blocking, we must obey minimum_agno contraints for
* avoiding ABBA deadlocks on AGF locking.
*/
if (type == XFS_ALLOCTYPE_FIRST_AG) {
/*
* Start with allocation group given by bno.
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_THIS_AG;
sagno = minimum_agno;
flags = 0;
} else {
/*
* Start with the given allocation group.
*/
args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
flags = XFS_ALLOC_FLAG_TRYLOCK;
}
/*
* Loop over allocation groups twice; first time with
* trylock set, second time without.
*/
for (;;) {
args->pag = xfs_perag_get(mp, args->agno);
error = xfs_alloc_fix_freelist(args, flags);
if (error) {
trace_xfs_alloc_vextent_nofix(args);
goto error0;
}
/*
* If we get a buffer back then the allocation will fly.
*/
if (args->agbp) {
if ((error = xfs_alloc_ag_vextent(args)))
goto error0;
break;
}
trace_xfs_alloc_vextent_loopfailed(args);
/*
* Didn't work, figure out the next iteration.
*/
if (args->agno == sagno &&
type == XFS_ALLOCTYPE_START_BNO)
args->type = XFS_ALLOCTYPE_THIS_AG;
/*
* If we are try-locking, we can't deadlock on AGF
* locks, so we can wrap all the way back to the first
* AG. Otherwise, wrap back to the start AG so we can't
* deadlock, and let the end of scan handler decide what
* to do next.
*/
if (++(args->agno) == mp->m_sb.sb_agcount) {
if (flags & XFS_ALLOC_FLAG_TRYLOCK)
args->agno = 0;
else
args->agno = sagno;
}
/*
* Reached the starting a.g., must either be done
* or switch to non-trylock mode.
*/
if (args->agno == sagno) {
if (flags == 0) {
args->agbno = NULLAGBLOCK;
trace_xfs_alloc_vextent_allfailed(args);
break;
}
/*
* Blocking pass next, so we must obey minimum
* agno constraints to avoid ABBA AGF deadlocks.
*/
flags = 0;
if (minimum_agno > sagno)
sagno = minimum_agno;
if (type == XFS_ALLOCTYPE_START_BNO) {
args->agbno = XFS_FSB_TO_AGBNO(mp,
args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
}
}
xfs_perag_put(args->pag);
}
if (bump_rotor) {
if (args->agno == sagno)
mp->m_agfrotor = (mp->m_agfrotor + 1) %
(mp->m_sb.sb_agcount * rotorstep);
else
mp->m_agfrotor = (args->agno * rotorstep + 1) %
(mp->m_sb.sb_agcount * rotorstep);
}
break;
default:
ASSERT(0);
/* NOTREACHED */
}
if (args->agbno == NULLAGBLOCK) {
args->fsbno = NULLFSBLOCK;
} else {
args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
#ifdef DEBUG
ASSERT(args->len >= args->minlen);
ASSERT(args->len <= args->maxlen);
ASSERT(args->agbno % args->alignment == 0);
XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
args->len);
#endif
}
/*
* Post-process allocation results to set the allocated block number correctly
* for the caller.
*
* XXX: xfs_alloc_vextent() should really be returning ENOSPC for ENOSPC, not
* hiding it behind a "successful" NULLFSBLOCK allocation.
*/
static void
xfs_alloc_vextent_set_fsbno(
struct xfs_alloc_arg *args,
xfs_agnumber_t minimum_agno)
{
struct xfs_mount *mp = args->mp;
/*
* We end up here with a locked AGF. If we failed, the caller is likely
* going to try to allocate again with different parameters, and that
* can widen the AGs that are searched for free space. If we have to do
* BMBT block allocation, we have to do a new allocation.
* We can end up here with a locked AGF. If we failed, the caller is
* likely going to try to allocate again with different parameters, and
* that can widen the AGs that are searched for free space. If we have
* to do BMBT block allocation, we have to do a new allocation.
*
* Hence leaving this function with the AGF locked opens up potential
* ABBA AGF deadlocks because a future allocation attempt in this
* transaction may attempt to lock a lower number AGF.
*
* We can't release the AGF until the transaction is commited, so at
* this point we must update the "firstblock" tracker to point at this
* AG if the tracker is empty or points to a lower AG. This allows the
* next allocation attempt to be modified appropriately to avoid
* this point we must update the "first allocation" tracker to point at
* this AG if the tracker is empty or points to a lower AG. This allows
* the next allocation attempt to be modified appropriately to avoid
* deadlocks.
*/
if (args->agbp &&
(args->tp->t_highest_agno == NULLAGNUMBER ||
args->pag->pag_agno > minimum_agno))
args->tp->t_highest_agno = args->pag->pag_agno;
xfs_perag_put(args->pag);
return 0;
error0:
args->agno > minimum_agno))
args->tp->t_highest_agno = args->agno;
/* Allocation failed with ENOSPC if NULLAGBLOCK was returned. */
if (args->agbno == NULLAGBLOCK) {
args->fsbno = NULLFSBLOCK;
return;
}
args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
#ifdef DEBUG
ASSERT(args->len >= args->minlen);
ASSERT(args->len <= args->maxlen);
ASSERT(args->agbno % args->alignment == 0);
XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
#endif
}
/*
* Allocate within a single AG only.
*/
static int
xfs_alloc_vextent_this_ag(
struct xfs_alloc_arg *args,
xfs_agnumber_t minimum_agno)
{
struct xfs_mount *mp = args->mp;
int error;
error = xfs_alloc_vextent_check_args(args);
if (error) {
if (error == -ENOSPC)
return 0;
return error;
}
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
if (minimum_agno > args->agno) {
trace_xfs_alloc_vextent_skip_deadlock(args);
args->fsbno = NULLFSBLOCK;
return 0;
}
args->pag = xfs_perag_get(mp, args->agno);
error = xfs_alloc_fix_freelist(args, 0);
if (error) {
trace_xfs_alloc_vextent_nofix(args);
goto out_error;
}
if (!args->agbp) {
trace_xfs_alloc_vextent_noagbp(args);
args->fsbno = NULLFSBLOCK;
goto out_error;
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
error = xfs_alloc_ag_vextent(args);
xfs_alloc_vextent_set_fsbno(args, minimum_agno);
out_error:
xfs_perag_put(args->pag);
return error;
}
/*
* Iterate all AGs trying to allocate an extent starting from @start_ag.
*
* If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the
* allocation attempts in @start_agno have locality information. If we fail to
* allocate in that AG, then we revert to anywhere-in-AG for all the other AGs
* we attempt to allocation in as there is no locality optimisation possible for
* those allocations.
*
* When we wrap the AG iteration at the end of the filesystem, we have to be
* careful not to wrap into AGs below ones we already have locked in the
* transaction if we are doing a blocking iteration. This will result in an
* out-of-order locking of AGFs and hence can cause deadlocks.
*/
static int
xfs_alloc_vextent_iterate_ags(
struct xfs_alloc_arg *args,
xfs_agnumber_t minimum_agno,
xfs_agnumber_t start_agno,
uint32_t flags)
{
struct xfs_mount *mp = args->mp;
int error = 0;
ASSERT(start_agno >= minimum_agno);
/*
* Loop over allocation groups twice; first time with
* trylock set, second time without.
*/
args->agno = start_agno;
for (;;) {
args->pag = xfs_perag_get(mp, args->agno);
error = xfs_alloc_fix_freelist(args, flags);
if (error) {
trace_xfs_alloc_vextent_nofix(args);
break;
}
/*
* If we get a buffer back then the allocation will fly.
*/
if (args->agbp) {
error = xfs_alloc_ag_vextent(args);
break;
}
trace_xfs_alloc_vextent_loopfailed(args);
/*
* Didn't work, figure out the next iteration.
*/
if (args->agno == start_agno &&
args->otype == XFS_ALLOCTYPE_START_BNO)
args->type = XFS_ALLOCTYPE_THIS_AG;
/*
* If we are try-locking, we can't deadlock on AGF locks so we
* can wrap all the way back to the first AG. Otherwise, wrap
* back to the start AG so we can't deadlock and let the end of
* scan handler decide what to do next.
*/
if (++(args->agno) == mp->m_sb.sb_agcount) {
if (flags & XFS_ALLOC_FLAG_TRYLOCK)
args->agno = 0;
else
args->agno = minimum_agno;
}
/*
* Reached the starting a.g., must either be done
* or switch to non-trylock mode.
*/
if (args->agno == start_agno) {
if (flags == 0) {
args->agbno = NULLAGBLOCK;
trace_xfs_alloc_vextent_allfailed(args);
break;
}
flags = 0;
if (args->otype == XFS_ALLOCTYPE_START_BNO) {
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
}
}
xfs_perag_put(args->pag);
args->pag = NULL;
}
if (args->pag) {
xfs_perag_put(args->pag);
args->pag = NULL;
}
return error;
}
/*
* Iterate from the AGs from the start AG to the end of the filesystem, trying
* to allocate blocks. It starts with a near allocation attempt in the initial
* AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
* back to zero if allowed by previous allocations in this transaction,
* otherwise will wrap back to the start AG and run a second blocking pass to
* the end of the filesystem.
*/
static int
xfs_alloc_vextent_start_ag(
struct xfs_alloc_arg *args,
xfs_agnumber_t minimum_agno)
{
struct xfs_mount *mp = args->mp;
xfs_agnumber_t start_agno;
xfs_agnumber_t rotorstep = xfs_rotorstep;
bool bump_rotor = false;
int error;
error = xfs_alloc_vextent_check_args(args);
if (error) {
if (error == -ENOSPC)
return 0;
return error;
}
if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
xfs_is_inode32(mp)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
mp->m_sb.sb_agcount), 0);
bump_rotor = 1;
}
start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, args->fsbno));
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
args->type = XFS_ALLOCTYPE_NEAR_BNO;
error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
XFS_ALLOC_FLAG_TRYLOCK);
if (bump_rotor) {
if (args->agno == start_agno)
mp->m_agfrotor = (mp->m_agfrotor + 1) %
(mp->m_sb.sb_agcount * rotorstep);
else
mp->m_agfrotor = (args->agno * rotorstep + 1) %
(mp->m_sb.sb_agcount * rotorstep);
}
xfs_alloc_vextent_set_fsbno(args, minimum_agno);
return error;
}
/*
* Iterate from the agno indicated from args->fsbno through to the end of the
* filesystem attempting blocking allocation. This does not wrap or try a second
* pass, so will not recurse into AGs lower than indicated by fsbno.
*/
static int
xfs_alloc_vextent_first_ag(
struct xfs_alloc_arg *args,
xfs_agnumber_t minimum_agno)
{
struct xfs_mount *mp = args->mp;
xfs_agnumber_t start_agno;
int error;
error = xfs_alloc_vextent_check_args(args);
if (error) {
if (error == -ENOSPC)
return 0;
return error;
}
start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, args->fsbno));
args->type = XFS_ALLOCTYPE_THIS_AG;
error = xfs_alloc_vextent_iterate_ags(args, minimum_agno,
start_agno, 0);
xfs_alloc_vextent_set_fsbno(args, minimum_agno);
return error;
}
/*
* Allocate an extent (variable-size).
* Depending on the allocation type, we either look in a single allocation
* group or loop over the allocation groups to find the result.
*/
int
xfs_alloc_vextent(
struct xfs_alloc_arg *args)
{
xfs_agnumber_t minimum_agno = 0;
if (args->tp->t_highest_agno != NULLAGNUMBER)
minimum_agno = args->tp->t_highest_agno;
switch (args->type) {
case XFS_ALLOCTYPE_THIS_AG:
case XFS_ALLOCTYPE_NEAR_BNO:
case XFS_ALLOCTYPE_THIS_BNO:
return xfs_alloc_vextent_this_ag(args, minimum_agno);
case XFS_ALLOCTYPE_START_BNO:
return xfs_alloc_vextent_start_ag(args, minimum_agno);
case XFS_ALLOCTYPE_FIRST_AG:
return xfs_alloc_vextent_first_ag(args, minimum_agno);
default:
ASSERT(0);
/* NOTREACHED */
}
/* Should never get here */
return -EFSCORRUPTED;
}
/* Ensure that the freelist is at full capacity. */
int
xfs_free_extent_fix_freelist(