xfs: use a separate frextents counter for rt extent reservations

As mentioned in the previous commit, the kernel misuses sb_frextents in
the incore mount to reflect both incore reservations made by running
transactions as well as the actual count of free rt extents on disk.
This results in the superblock being written to the log with an
underestimate of the number of rt extents that are marked free in the
rtbitmap.

Teaching XFS to recompute frextents after log recovery avoids
operational problems in the current mount, but it doesn't solve the
problem of us writing undercounted frextents which are then recovered by
an older kernel that doesn't have that fix.

Create an incore percpu counter to mirror the ondisk frextents.  This
new counter will track transaction reservations and the only time we
will touch the incore super counter (i.e the one that gets logged) is
when those transactions commit updates to the rt bitmap.  This is in
contrast to the lazysbcount counters (e.g. fdblocks), where we know that
log recovery will always fix any incorrect counter that we log.
As a bonus, we only take m_sb_lock at transaction commit time.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
This commit is contained in:
Darrick J. Wong 2022-04-12 06:49:42 +10:00 committed by Dave Chinner
parent 5a605fd6cb
commit 2229276c52
8 changed files with 99 additions and 47 deletions

View File

@ -911,6 +911,11 @@ xfs_log_sb(
* reservations that have been taken out percpu counters. If we have an
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
*
* Do not update sb_frextents here because it is not part of the lazy
* sb counters, despite having a percpu counter. It is always kept
* consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
* and hence we don't need have to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);

View File

@ -349,10 +349,7 @@ xfs_fs_counts(
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
spin_lock(&mp->m_sb_lock);
cnt->freertx = mp->m_sb.sb_frextents;
spin_unlock(&mp->m_sb_lock);
cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
}
/*

View File

@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
uint64_t freertx;
if (!XFS_IS_REALTIME_INODE(ip))
return false;
freertx = READ_ONCE(mp->m_sb.sb_frextents);
return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
if (__percpu_counter_compare(&mp->m_frextents,
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip) (false)

View File

@ -1110,24 +1110,33 @@ xfs_fs_writable(
return true;
}
/* Adjust m_fdblocks or m_frextents. */
int
xfs_mod_fdblocks(
xfs_mod_freecounter(
struct xfs_mount *mp,
struct percpu_counter *counter,
int64_t delta,
bool rsvd)
{
int64_t lcounter;
long long res_used;
uint64_t set_aside = 0;
s32 batch;
uint64_t set_aside;
bool has_resv_pool;
ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
has_resv_pool = (counter == &mp->m_fdblocks);
if (rsvd)
ASSERT(has_resv_pool);
if (delta > 0) {
/*
* If the reserve pool is depleted, put blocks back into it
* first. Most of the time the pool is full.
*/
if (likely(mp->m_resblks == mp->m_resblks_avail)) {
percpu_counter_add(&mp->m_fdblocks, delta);
if (likely(!has_resv_pool ||
mp->m_resblks == mp->m_resblks_avail)) {
percpu_counter_add(counter, delta);
return 0;
}
@ -1139,7 +1148,7 @@ xfs_mod_fdblocks(
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
percpu_counter_add(&mp->m_fdblocks, delta);
percpu_counter_add(counter, delta);
}
spin_unlock(&mp->m_sb_lock);
return 0;
@ -1153,7 +1162,7 @@ xfs_mod_fdblocks(
* then make everything serialise as we are real close to
* ENOSPC.
*/
if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@ -1170,9 +1179,10 @@ xfs_mod_fdblocks(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
set_aside = xfs_fdblocks_unavailable(mp);
percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
if (has_resv_pool)
set_aside = xfs_fdblocks_unavailable(mp);
percpu_counter_add_batch(counter, delta, batch);
if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
@ -1183,8 +1193,8 @@ xfs_mod_fdblocks(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
percpu_counter_add(&mp->m_fdblocks, -delta);
if (!rsvd)
percpu_counter_add(counter, -delta);
if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
@ -1201,24 +1211,6 @@ xfs_mod_fdblocks(
return -ENOSPC;
}
int
xfs_mod_frextents(
struct xfs_mount *mp,
int64_t delta)
{
int64_t lcounter;
int ret = 0;
spin_lock(&mp->m_sb_lock);
lcounter = mp->m_sb.sb_frextents + delta;
if (lcounter < 0)
ret = -ENOSPC;
else
mp->m_sb.sb_frextents = lcounter;
spin_unlock(&mp->m_sb_lock);
return ret;
}
/*
* Used to free the superblock along various error paths.
*/

View File

@ -183,6 +183,8 @@ typedef struct xfs_mount {
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
struct percpu_counter m_frextents; /* free rt extent counter */
/*
* Count of data device blocks reserved for delayed allocations,
* including indlen blocks. Does not include allocated CoW staging
@ -494,9 +496,20 @@ xfs_fdblocks_unavailable(
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
int64_t delta, bool rsvd);
static inline int
xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
{
return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
}
static inline int
xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
{
return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
}
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);

View File

@ -1318,6 +1318,7 @@ xfs_rtalloc_reinit_frextents(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
spin_unlock(&mp->m_sb_lock);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
return 0;
}

View File

@ -843,9 +843,11 @@ xfs_fs_statfs(
if (XFS_IS_REALTIME_MOUNT(mp) &&
(ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
s64 freertx;
statp->f_blocks = sbp->sb_rblocks;
statp->f_bavail = statp->f_bfree =
sbp->sb_frextents * sbp->sb_rextsize;
freertx = percpu_counter_sum_positive(&mp->m_frextents);
statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
}
return 0;
@ -1015,8 +1017,14 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
return 0;
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters(
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
}
static void
@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters(
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
percpu_counter_destroy(&mp->m_frextents);
}
static int

View File

@ -498,10 +498,31 @@ xfs_trans_apply_sb_deltas(
be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
}
if (tp->t_frextents_delta)
be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
if (tp->t_res_frextents_delta)
be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
/*
* Updating frextents requires careful handling because it does not
* behave like the lazysb counters because we cannot rely on log
* recovery in older kenels to recompute the value from the rtbitmap.
* This means that the ondisk frextents must be consistent with the
* rtbitmap.
*
* Therefore, log the frextents change to the ondisk superblock and
* update the incore superblock so that future calls to xfs_log_sb
* write the correct value ondisk.
*
* Don't touch m_frextents because it includes incore reservations,
* and those are handled by the unreserve function.
*/
if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
struct xfs_mount *mp = tp->t_mountp;
int64_t rtxdelta;
rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
spin_lock(&mp->m_sb_lock);
be64_add_cpu(&sbp->sb_frextents, rtxdelta);
mp->m_sb.sb_frextents += rtxdelta;
spin_unlock(&mp->m_sb_lock);
}
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@ -614,7 +635,12 @@ xfs_trans_unreserve_and_mod_sb(
if (ifreedelta)
percpu_counter_add(&mp->m_ifree, ifreedelta);
if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
if (rtxdelta) {
error = xfs_mod_frextents(mp, rtxdelta);
ASSERT(!error);
}
if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
/* apply remaining deltas */
@ -622,7 +648,12 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
mp->m_sb.sb_icount += idelta;
mp->m_sb.sb_ifree += ifreedelta;
mp->m_sb.sb_frextents += rtxdelta;
/*
* Do not touch sb_frextents here because we are dealing with incore
* reservation. sb_frextents is not part of the lazy sb counters so it
* must be consistent with the ondisk rtbitmap and must never include
* incore reservations.
*/
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;