mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-18 22:34:48 +00:00
xfs: fixes for 6.4-rc5
This update contains:

- Propagate unlinked inode list corruption back up to log recovery
  (regression fix)
- Improve corruption detection for AGFL entries, AGFL indexes and XEFI
  extents (syzkaller fuzzer oops report)
- Avoid double perag reference release (regression fix)
- Improve extent merging detection in scrub (regression fix)
- Fix a new undefined high bit shift (regression fix)
- Fix for AGF vs inode cluster buffer deadlock (regression fix)

-----BEGIN PGP SIGNATURE-----

iQJIBAABCgAyFiEEmJOoJ8GffZYWSjj/regpR/R1+h0FAmSBcAIUHGRhdmlkQGZy
b21vcmJpdC5jb20ACgkQregpR/R1+h1gEg/+IfG2aNR8P4+rqhOJ2yF5fZYtsqS1
HkOX/N/Q8gBNXMwh3wWoXyBJk7gBhwySwcGvlYXoMZf6+alXGHaTMl8whxmFYaT9
+aPBWo4lXRec6YHx016ZOjnNkLiWhyxdUvh85IFf0EJm5mK9QqjoX+lmbPc7HDzh
0nFL66jaxM8W36QhK0srdwwjD3kNgZ2ZRNonlRULOzyTPpFfh985esTrmfmn3Ulx
xiejw57xdpti9x+Pm5WZjUsW1/gx50hMS+yn/KiIWTQqncIO/OuirZSTrOFtUWTM
xIfMB9xlkdaSmMCUyx2r2RVWJawXP++aT7nbza2eWJa0WSn5kZmHXugzI+V9zUx7
M0oakkOXJl2pYakVr7G8JU4djZkQNu41JkuLVf5U7O3yYRWlXzViAqljd3S2C/+i
pSjG9ram8esd/CAmw/hE6Jvhm6QYS1/D3KQ9Gs6JaptzR8Xjc7t7GEj1T0pMyPem
iZx80C6fi87k/94hQ+HXalrAyJER9EmcQ25yngKucjgfrO0BrzNLGDus4uY0+IzX
Y2T6xcSF/Vhd1soaklRuHryF7Vv7ECCIWVUV2pH7GHZwv0LaXvBnC1CUdYXskXy8
RGlIBL75lBkOfZp0zK/R11sm1qxzXPCayBkZtSglj/RdNaZiFO9uwO139eGl+xP5
ytjMvitThOGoAfU=
=vT74
-----END PGP SIGNATURE-----

Merge tag 'xfs-6.4-rc5-fixes' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Dave Chinner:
 "These are a set of regression fixes discovered on recent kernels. I
  was hoping to send this to you a week and a half ago, but events out
  of my control delayed finalising the changes until early this week.

  Whilst the diffstat looks large for this stage of the merge window, a
  large chunk of it comes from moving the guts of one function from one
  file to another, i.e. it's the same code, it is just run in a
  different context where it is safe to hold a specific lock. Otherwise
  the individual changes are relatively small and straightforward.

  Summary:

   - Propagate unlinked inode list corruption back up to log recovery
     (regression fix)

   - Improve corruption detection for AGFL entries, AGFL indexes and
     XEFI extents (syzkaller fuzzer oops report)

   - Avoid double perag reference release (regression fix)

   - Improve extent merging detection in scrub (regression fix)

   - Fix a new undefined high bit shift (regression fix)

   - Fix for AGF vs inode cluster buffer deadlock (regression fix)"

* tag 'xfs-6.4-rc5-fixes' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: collect errors from inodegc for unlinked inode recovery
  xfs: validate block number being freed before adding to xefi
  xfs: validity check agbnos on the AGFL
  xfs: fix agf/agfl verification on v4 filesystems
  xfs: fix double xfs_perag_rele() in xfs_filestream_pick_ag()
  xfs: fix broken logic when detecting mergeable bmap records
  xfs: Fix undefined behavior of shift into sign bit
  xfs: fix AGF vs inode cluster buffer deadlock
  xfs: defered work could create precommits
  xfs: restore allocation trylock iteration
  xfs: buffer pins need to hold a buffer reference
commit 79b6fad546
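Editor's note: the recurring change in the diff below is mechanical. Several
fire-and-forget helpers (__xfs_free_extent_later() and friends) gain an int
return so that a bad block number surfaces as -EFSCORRUPTED instead of being
queued for deferred freeing. A minimal standalone sketch of that pattern
follows; blkno_is_valid() and defer_free_extent() are hypothetical stand-ins,
not the real XFS symbols.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define EFSCORRUPTED	117	/* EUCLEAN, as the kernel defines it */
#define FS_BLOCK_COUNT	1024	/* pretend filesystem size in blocks */

static bool blkno_is_valid(unsigned long bno, unsigned long len)
{
	return bno < FS_BLOCK_COUNT && len <= FS_BLOCK_COUNT - bno;
}

/* Before this series the equivalent helper returned void, so a corrupt
 * extent was silently queued; now the caller can cancel the transaction. */
static int defer_free_extent(unsigned long bno, unsigned long len)
{
	if (!blkno_is_valid(bno, len))
		return -EFSCORRUPTED;
	/* ... queue the extent on the deferred-free list ... */
	return 0;
}

int main(void)
{
	int error = defer_free_extent(2048, 1);	/* deliberately out of range */

	if (error)
		fprintf(stderr, "deferred free rejected: %d\n", error);
	return 0;
}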
fs/xfs/libxfs/xfs_ag.c
@@ -984,7 +984,10 @@ xfs_ag_shrink_space(
 		if (err2 != -ENOSPC)
 			goto resv_err;
 
-		__xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true);
+		err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
+				true);
+		if (err2)
+			goto resv_err;
 
 		/*
 		 * Roll the transaction before trying to re-init the per-ag
fs/xfs/libxfs/xfs_alloc.c
@@ -628,6 +628,25 @@ xfs_alloc_fixup_trees(
 	return 0;
 }
 
+/*
+ * We do not verify the AGFL contents against AGF-based index counters here,
+ * even though we may have access to the perag that contains shadow copies. We
+ * don't know if the AGF based counters have been checked, and if they have
+ * they still may be inconsistent because they haven't yet been reset on the
+ * first allocation after the AGF has been read in.
+ *
+ * This means we can only check that all agfl entries contain valid or null
+ * values because we can't reliably determine the active range to exclude
+ * NULLAGBNO as a valid value.
+ *
+ * However, we can't even do that for v4 format filesystems because there are
+ * old versions of mkfs out there that do not initialise the AGFL to known,
+ * verifiable values. Hence we can't tell the difference between an AGFL block
+ * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
+ *
+ * As a result, we can only fully validate AGFL block numbers when we pull them
+ * from the freelist in xfs_alloc_get_freelist().
+ */
 static xfs_failaddr_t
 xfs_agfl_verify(
 	struct xfs_buf	*bp)
@@ -637,12 +656,6 @@ xfs_agfl_verify(
 	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
 	int		i;
 
-	/*
-	 * There is no verification of non-crc AGFLs because mkfs does not
-	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
-	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
-	 * can't verify just those entries are valid.
-	 */
 	if (!xfs_has_crc(mp))
 		return NULL;
 
@@ -2321,12 +2334,16 @@ xfs_free_agfl_block(
 }
 
 /*
- * Check the agfl fields of the agf for inconsistency or corruption. The purpose
- * is to detect an agfl header padding mismatch between current and early v5
- * kernels. This problem manifests as a 1-slot size difference between the
- * on-disk flcount and the active [first, last] range of a wrapped agfl. This
- * may also catch variants of agfl count corruption unrelated to padding. Either
- * way, we'll reset the agfl and warn the user.
+ * Check the agfl fields of the agf for inconsistency or corruption.
+ *
+ * The original purpose was to detect an agfl header padding mismatch between
+ * current and early v5 kernels. This problem manifests as a 1-slot size
+ * difference between the on-disk flcount and the active [first, last] range of
+ * a wrapped agfl.
+ *
+ * However, we need to use these same checks to catch agfl count corruptions
+ * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
+ * way, we need to reset the agfl and warn the user.
  *
  * Return true if a reset is required before the agfl can be used, false
  * otherwise.
@@ -2342,10 +2359,6 @@ xfs_agfl_needs_reset(
 	int			agfl_size = xfs_agfl_size(mp);
 	int			active;
 
-	/* no agfl header on v4 supers */
-	if (!xfs_has_crc(mp))
-		return false;
-
 	/*
 	 * The agf read verifier catches severe corruption of these fields.
 	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
@@ -2418,7 +2431,7 @@ xfs_agfl_reset(
 * the real allocation can proceed. Deferring the free disconnects freeing up
 * the AGFL slot from freeing the block.
 */
-STATIC void
+static int
 xfs_defer_agfl_block(
 	struct xfs_trans		*tp,
 	xfs_agnumber_t			agno,
@@ -2437,17 +2450,21 @@ xfs_defer_agfl_block(
 	xefi->xefi_blockcount = 1;
 	xefi->xefi_owner = oinfo->oi_owner;
 
+	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, xefi->xefi_startblock)))
+		return -EFSCORRUPTED;
+
 	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
 
 	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
+	return 0;
 }
 
 /*
 * Add the extent to the list of extents to be freed at transaction end.
 * The list is maintained sorted (by block number).
 */
-void
+int
 __xfs_free_extent_later(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			bno,
@@ -2474,6 +2491,9 @@ __xfs_free_extent_later(
 #endif
 	ASSERT(xfs_extfree_item_cache != NULL);
 
+	if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
+		return -EFSCORRUPTED;
+
 	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
 			GFP_KERNEL | __GFP_NOFAIL);
 	xefi->xefi_startblock = bno;
@@ -2497,6 +2517,7 @@ __xfs_free_extent_later(
 
 	xfs_extent_free_get_group(mp, xefi);
 	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+	return 0;
 }
 
 #ifdef DEBUG
@@ -2657,7 +2678,9 @@ xfs_alloc_fix_freelist(
 			goto out_agbp_relse;
 
 		/* defer agfl frees */
-		xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+		error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
+		if (error)
+			goto out_agbp_relse;
 	}
 
 	targs.tp = tp;
@@ -2767,6 +2790,9 @@ xfs_alloc_get_freelist(
 	 */
 	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
 	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
+	if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
+		return -EFSCORRUPTED;
+
 	be32_add_cpu(&agf->agf_flfirst, 1);
 	xfs_trans_brelse(tp, agflbp);
 	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
@@ -2889,6 +2915,19 @@ xfs_alloc_put_freelist(
 	return 0;
 }
 
+/*
+ * Verify the AGF is consistent.
+ *
+ * We do not verify the AGFL indexes in the AGF are fully consistent here
+ * because of issues with variable on-disk structure sizes. Instead, we check
+ * the agfl indexes for consistency when we initialise the perag from the AGF
+ * information after a read completes.
+ *
+ * If the index is inconsistent, then we mark the perag as needing an AGFL
+ * reset. The first AGFL update performed then resets the AGFL indexes and
+ * refills the AGFL with known good free blocks, allowing the filesystem to
+ * continue operating normally at the cost of a few leaked free space blocks.
+ */
 static xfs_failaddr_t
 xfs_agf_verify(
 	struct xfs_buf		*bp)
@@ -2962,7 +3001,6 @@ xfs_agf_verify(
 		return __this_address;
 
 	return NULL;
-
 }
 
 static void
@@ -3187,7 +3225,8 @@ xfs_alloc_vextent_check_args(
 */
 static int
 xfs_alloc_vextent_prepare_ag(
-	struct xfs_alloc_arg	*args)
+	struct xfs_alloc_arg	*args,
+	uint32_t		flags)
 {
 	bool			need_pag = !args->pag;
 	int			error;
@@ -3196,7 +3235,7 @@ xfs_alloc_vextent_prepare_ag(
 		args->pag = xfs_perag_get(args->mp, args->agno);
 
 	args->agbp = NULL;
-	error = xfs_alloc_fix_freelist(args, 0);
+	error = xfs_alloc_fix_freelist(args, flags);
 	if (error) {
 		trace_xfs_alloc_vextent_nofix(args);
 		if (need_pag)
@@ -3336,7 +3375,7 @@ xfs_alloc_vextent_this_ag(
 		return error;
 	}
 
-	error = xfs_alloc_vextent_prepare_ag(args);
+	error = xfs_alloc_vextent_prepare_ag(args, 0);
 	if (!error && args->agbp)
 		error = xfs_alloc_ag_vextent_size(args);
 
@@ -3380,7 +3419,7 @@ restart:
 	for_each_perag_wrap_range(mp, start_agno, restart_agno,
 			mp->m_sb.sb_agcount, agno, args->pag) {
 		args->agno = agno;
-		error = xfs_alloc_vextent_prepare_ag(args);
+		error = xfs_alloc_vextent_prepare_ag(args, flags);
 		if (error)
 			break;
 		if (!args->agbp) {
@@ -3546,7 +3585,7 @@ xfs_alloc_vextent_exact_bno(
 		return error;
 	}
 
-	error = xfs_alloc_vextent_prepare_ag(args);
+	error = xfs_alloc_vextent_prepare_ag(args, 0);
 	if (!error && args->agbp)
 		error = xfs_alloc_ag_vextent_exact(args);
 
@@ -3587,7 +3626,7 @@ xfs_alloc_vextent_near_bno(
 	if (needs_perag)
 		args->pag = xfs_perag_grab(mp, args->agno);
 
-	error = xfs_alloc_vextent_prepare_ag(args);
+	error = xfs_alloc_vextent_prepare_ag(args, 0);
 	if (!error && args->agbp)
 		error = xfs_alloc_ag_vextent_near(args);
 
fs/xfs/libxfs/xfs_alloc.h
@@ -230,7 +230,7 @@ xfs_buf_to_agfl_bno(
 	return bp->b_addr;
 }
 
-void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
 		xfs_filblks_t len, const struct xfs_owner_info *oinfo,
 		bool skip_discard);
 
@@ -254,14 +254,14 @@ void xfs_extent_free_get_group(struct xfs_mount *mp,
 #define XFS_EFI_ATTR_FORK	(1U << 1) /* freeing attr fork block */
 #define XFS_EFI_BMBT_BLOCK	(1U << 2) /* freeing bmap btree block */
 
-static inline void
+static inline int
 xfs_free_extent_later(
 	struct xfs_trans		*tp,
 	xfs_fsblock_t			bno,
 	xfs_filblks_t			len,
 	const struct xfs_owner_info	*oinfo)
 {
-	__xfs_free_extent_later(tp, bno, len, oinfo, false);
+	return __xfs_free_extent_later(tp, bno, len, oinfo, false);
 }
 
fs/xfs/libxfs/xfs_bmap.c
@@ -572,8 +572,12 @@ xfs_bmap_btree_to_extents(
 	cblock = XFS_BUF_TO_BLOCK(cbp);
 	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
 		return error;
+
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
-	xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+	error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo);
+	if (error)
+		return error;
+
 	ip->i_nblocks--;
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	xfs_trans_binval(tp, cbp);
@@ -5230,10 +5234,12 @@ xfs_bmap_del_extent_real(
 		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
 			xfs_refcount_decrease_extent(tp, del);
 		} else {
-			__xfs_free_extent_later(tp, del->br_startblock,
+			error = __xfs_free_extent_later(tp, del->br_startblock,
 					del->br_blockcount, NULL,
 					(bflags & XFS_BMAPI_NODISCARD) ||
 					del->br_state == XFS_EXT_UNWRITTEN);
+			if (error)
+				goto done;
 		}
 	}
 
fs/xfs/libxfs/xfs_bmap_btree.c
@@ -268,11 +268,14 @@ xfs_bmbt_free_block(
 	struct xfs_trans	*tp = cur->bc_tp;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
 	struct xfs_owner_info	oinfo;
+	int			error;
 
 	xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
-	xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
-	ip->i_nblocks--;
+	error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo);
+	if (error)
+		return error;
 
+	ip->i_nblocks--;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
 	return 0;
fs/xfs/libxfs/xfs_ialloc.c
@@ -1834,7 +1834,7 @@ retry:
 * might be sparse and only free the regions that are allocated as part of the
 * chunk.
 */
-STATIC void
+static int
 xfs_difree_inode_chunk(
 	struct xfs_trans		*tp,
 	xfs_agnumber_t			agno,
@@ -1851,10 +1851,10 @@ xfs_difree_inode_chunk(
 
 	if (!xfs_inobt_issparse(rec->ir_holemask)) {
 		/* not sparse, calculate extent info directly */
-		xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno),
-				  M_IGEO(mp)->ialloc_blks,
-				  &XFS_RMAP_OINFO_INODES);
-		return;
+		return xfs_free_extent_later(tp,
+				XFS_AGB_TO_FSB(mp, agno, sagbno),
+				M_IGEO(mp)->ialloc_blks,
+				&XFS_RMAP_OINFO_INODES);
 	}
 
 	/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1871,6 +1871,8 @@ xfs_difree_inode_chunk(
 				     XFS_INOBT_HOLEMASK_BITS);
 	nextbit = startidx + 1;
 	while (startidx < XFS_INOBT_HOLEMASK_BITS) {
+		int error;
+
 		nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
 					     nextbit);
 		/*
@@ -1896,8 +1898,11 @@ xfs_difree_inode_chunk(
 
 		ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
 		ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
-		xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno),
-				  contigblk, &XFS_RMAP_OINFO_INODES);
+		error = xfs_free_extent_later(tp,
+				XFS_AGB_TO_FSB(mp, agno, agbno),
+				contigblk, &XFS_RMAP_OINFO_INODES);
+		if (error)
+			return error;
 
 		/* reset range to current bit and carry on... */
 		startidx = endidx = nextbit;
@@ -1905,6 +1910,7 @@ xfs_difree_inode_chunk(
 next:
 		nextbit++;
 	}
+	return 0;
 }
 
 STATIC int
@@ -2003,7 +2009,9 @@ xfs_difree_inobt(
 			goto error0;
 		}
 
-		xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+		error = xfs_difree_inode_chunk(tp, pag->pag_agno, &rec);
+		if (error)
+			goto error0;
 	} else {
 		xic->deleted = false;
 
fs/xfs/libxfs/xfs_log_format.h
@@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 {
 #define XFS_ILOG_DOWNER	0x200	/* change the data fork owner on replay */
 #define XFS_ILOG_AOWNER	0x400	/* change the attr fork owner on replay */
 
-
 /*
 * The timestamps are dirty, but not necessarily anything else in the inode
 * core. Unlike the other fields above this one must never make it to disk
@@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 {
 */
 #define XFS_ILOG_TIMESTAMP	0x4000
 
+/*
+ * The version field has been changed, but not necessarily anything else of
+ * interest. This must never make it to disk - it is used purely to ensure that
+ * the inode item ->precommit operation can update the fsync flag triggers
+ * in the inode item correctly.
+ */
+#define XFS_ILOG_IVERSION	0x8000
+
 #define XFS_ILOG_NONCORE	(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
 				 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
 				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
fs/xfs/libxfs/xfs_refcount.c
@@ -1151,8 +1151,10 @@ xfs_refcount_adjust_extents(
 			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
 					cur->bc_ag.pag->pag_agno,
 					tmp.rc_startblock);
-			xfs_free_extent_later(cur->bc_tp, fsbno,
+			error = xfs_free_extent_later(cur->bc_tp, fsbno,
 					tmp.rc_blockcount, NULL);
+			if (error)
+				goto out_error;
 		}
 
 		(*agbno) += tmp.rc_blockcount;
@@ -1210,8 +1212,10 @@ xfs_refcount_adjust_extents(
 			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
 					cur->bc_ag.pag->pag_agno,
 					ext.rc_startblock);
-			xfs_free_extent_later(cur->bc_tp, fsbno,
+			error = xfs_free_extent_later(cur->bc_tp, fsbno,
 					ext.rc_blockcount, NULL);
+			if (error)
+				goto out_error;
 		}
 
 skip:
@@ -1976,7 +1980,10 @@ xfs_refcount_recover_cow_leftovers(
 				rr->rr_rrec.rc_blockcount);
 
 		/* Free the block. */
-		xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL);
+		error = xfs_free_extent_later(tp, fsb,
+				rr->rr_rrec.rc_blockcount, NULL);
+		if (error)
+			goto out_trans;
 
 		error = xfs_trans_commit(tp);
 		if (error)
fs/xfs/libxfs/xfs_trans_inode.c
@@ -40,9 +40,8 @@ xfs_trans_ijoin(
 	iip->ili_lock_flags = lock_flags;
 	ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
 
-	/*
-	 * Get a log_item_desc to point at the new item.
-	 */
+	/* Reset the per-tx dirty context and add the item to the tx. */
+	iip->ili_dirty_flags = 0;
 	xfs_trans_add_item(tp, &iip->ili_item);
 }
 
@@ -76,17 +75,10 @@ xfs_trans_ichgtime(
 /*
 * This is called to mark the fields indicated in fieldmask as needing to be
 * logged when the transaction is committed. The inode must already be
- * associated with the given transaction.
- *
- * The values for fieldmask are defined in xfs_inode_item.h. We always log all
- * of the core inode if any of it has changed, and we always log all of the
- * inline data/extents/b-tree root if any of them has changed.
- *
- * Grab and pin the cluster buffer associated with this inode to avoid RMW
- * cycles at inode writeback time. Avoid the need to add error handling to every
- * xfs_trans_log_inode() call by shutting down on read error. This will cause
- * transactions to fail and everything to error out, just like if we return a
- * read error in a dirty transaction and cancel it.
+ * associated with the given transaction. All we do here is record where the
+ * inode was dirtied and mark the transaction and inode log item dirty;
+ * everything else is done in the ->precommit log item operation after the
+ * changes in the transaction have been completed.
 */
 void
 xfs_trans_log_inode(
@@ -96,7 +88,6 @@ xfs_trans_log_inode(
 {
 	struct xfs_inode_log_item *iip = ip->i_itemp;
 	struct inode		*inode = VFS_I(ip);
-	uint			iversion_flags = 0;
 
 	ASSERT(iip);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -104,18 +95,6 @@ xfs_trans_log_inode(
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
 
-	/*
-	 * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
-	 * don't matter - we either will need an extra transaction in 24 hours
-	 * to log the timestamps, or will clear already cleared fields in the
-	 * worst case.
-	 */
-	if (inode->i_state & I_DIRTY_TIME) {
-		spin_lock(&inode->i_lock);
-		inode->i_state &= ~I_DIRTY_TIME;
-		spin_unlock(&inode->i_lock);
-	}
-
 	/*
 	 * First time we log the inode in a transaction, bump the inode change
 	 * counter if it is configured for this to occur. While we have the
@@ -128,86 +107,10 @@ xfs_trans_log_inode(
 	if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
 		if (IS_I_VERSION(inode) &&
 		    inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
-			iversion_flags = XFS_ILOG_CORE;
+			flags |= XFS_ILOG_IVERSION;
 	}
 
-	/*
-	 * If we're updating the inode core or the timestamps and it's possible
-	 * to upgrade this inode to bigtime format, do so now.
-	 */
-	if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
-	    xfs_has_bigtime(ip->i_mount) &&
-	    !xfs_inode_has_bigtime(ip)) {
-		ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
-		flags |= XFS_ILOG_CORE;
-	}
-
-	/*
-	 * Inode verifiers do not check that the extent size hint is an integer
-	 * multiple of the rt extent size on a directory with both rtinherit
-	 * and extszinherit flags set. If we're logging a directory that is
-	 * misconfigured in this way, clear the hint.
-	 */
-	if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
-	    (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
-	    (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
-		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
-				   XFS_DIFLAG_EXTSZINHERIT);
-		ip->i_extsize = 0;
-		flags |= XFS_ILOG_CORE;
-	}
-
-	/*
-	 * Record the specific change for fdatasync optimisation. This allows
-	 * fdatasync to skip log forces for inodes that are only timestamp
-	 * dirty.
-	 */
-	spin_lock(&iip->ili_lock);
-	iip->ili_fsync_fields |= flags;
-
-	if (!iip->ili_item.li_buf) {
-		struct xfs_buf	*bp;
-		int		error;
-
-		/*
-		 * We hold the ILOCK here, so this inode is not going to be
-		 * flushed while we are here. Further, because there is no
-		 * buffer attached to the item, we know that there is no IO in
-		 * progress, so nothing will clear the ili_fields while we read
-		 * in the buffer. Hence we can safely drop the spin lock and
-		 * read the buffer knowing that the state will not change from
-		 * here.
-		 */
-		spin_unlock(&iip->ili_lock);
-		error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
-		if (error) {
-			xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
-			return;
-		}
-
-		/*
-		 * We need an explicit buffer reference for the log item but
-		 * don't want the buffer to remain attached to the transaction.
-		 * Hold the buffer but release the transaction reference once
-		 * we've attached the inode log item to the buffer log item
-		 * list.
-		 */
-		xfs_buf_hold(bp);
-		spin_lock(&iip->ili_lock);
-		iip->ili_item.li_buf = bp;
-		bp->b_flags |= _XBF_INODES;
-		list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
-		xfs_trans_brelse(tp, bp);
-	}
-
-	/*
-	 * Always OR in the bits from the ili_last_fields field. This is to
-	 * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
-	 * in the eventual clearing of the ili_fields bits. See the big comment
-	 * in xfs_iflush() for an explanation of this coordination mechanism.
-	 */
-	iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
-	spin_unlock(&iip->ili_lock);
+	iip->ili_dirty_flags |= flags;
 }
 
 int
fs/xfs/scrub/bmap.c
@@ -769,14 +769,14 @@ xchk_are_bmaps_contiguous(
 * mapping or false if there are no more mappings. Caller must ensure that
 * @info.icur is zeroed before the first call.
 */
-static int
+static bool
 xchk_bmap_iext_iter(
 	struct xchk_bmap_info	*info,
 	struct xfs_bmbt_irec	*irec)
 {
 	struct xfs_bmbt_irec	got;
 	struct xfs_ifork	*ifp;
+	xfs_filblks_t		prev_len;
 	unsigned int		nr = 0;
 
 	ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
@@ -790,12 +790,12 @@ xchk_bmap_iext_iter(
 				irec->br_startoff);
 		return false;
 	}
 	nr++;
 
 	/*
 	 * Iterate subsequent iextent records and merge them with the one
 	 * that we just read, if possible.
 	 */
+	prev_len = irec->br_blockcount;
 	while (xfs_iext_peek_next_extent(ifp, &info->icur, &got)) {
 		if (!xchk_are_bmaps_contiguous(irec, &got))
 			break;
@@ -805,20 +805,21 @@ xchk_bmap_iext_iter(
 					got.br_startoff);
 			return false;
 		}
 
 		/*
 		 * Notify the user of mergeable records in the data or attr
 		 * forks. CoW forks only exist in memory so we ignore them.
 		 */
 		if (info->whichfork != XFS_COW_FORK &&
-		    irec->br_blockcount + got.br_blockcount > BMBT_BLOCKCOUNT_MASK)
+		    prev_len + got.br_blockcount > BMBT_BLOCKCOUNT_MASK)
 			xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
+		nr++;
 
 		irec->br_blockcount += got.br_blockcount;
+		prev_len = got.br_blockcount;
 		xfs_iext_next(ifp, &info->icur);
-		nr++;
 	}
 
 	/*
 	 * If the merged mapping could be expressed with fewer bmbt records
 	 * than we actually found, notify the user that this fork could be
 	 * optimized. CoW forks only exist in memory so we ignore them.
 	 */
 	if (nr > 1 && info->whichfork != XFS_COW_FORK &&
 	    howmany_64(irec->br_blockcount, XFS_MAX_BMBT_EXTLEN) < nr)
 		xchk_ino_set_preen(info->sc, info->sc->ip->i_ino);
 
 	return true;
 }
fs/xfs/scrub/scrub.h
@@ -105,10 +105,10 @@ struct xfs_scrub {
 };
 
 /* XCHK state flags grow up from zero, XREP state flags grow down from 2^31 */
-#define XCHK_TRY_HARDER		(1 << 0)  /* can't get resources, try again */
-#define XCHK_FSGATES_DRAIN	(1 << 2)  /* defer ops draining enabled */
-#define XCHK_NEED_DRAIN		(1 << 3)  /* scrub needs to drain defer ops */
-#define XREP_ALREADY_FIXED	(1 << 31) /* checking our repair work */
+#define XCHK_TRY_HARDER		(1U << 0)  /* can't get resources, try again */
+#define XCHK_FSGATES_DRAIN	(1U << 2)  /* defer ops draining enabled */
+#define XCHK_NEED_DRAIN		(1U << 3)  /* scrub needs to drain defer ops */
+#define XREP_ALREADY_FIXED	(1U << 31) /* checking our repair work */
 
 /*
 * The XCHK_FSGATES* flags reflect functionality in the main filesystem that
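Editor's note: the scrub.h hunk above is the entirety of the undefined
high-bit-shift fix. `1 << 31` shifts a signed int into its sign bit, which is
undefined behaviour in C (C11 6.5.7p4), while `1U << 31` is well defined. A
standalone illustration, not XFS code:

#include <stdio.h>

int main(void)
{
	/* Undefined: the literal 1 is a signed int, and shifting a bit
	 * into the sign bit is undefined behaviour. */
	/* unsigned int bad = 1 << 31; */

	/* Well defined: make the operand unsigned before shifting, which
	 * is exactly what the XREP_ALREADY_FIXED change above does. */
	unsigned int good = 1U << 31;

	printf("0x%x\n", good);
	return 0;
}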
fs/xfs/xfs_buf_item.c
@@ -452,10 +452,18 @@ xfs_buf_item_format(
 * This is called to pin the buffer associated with the buf log item in memory
 * so it cannot be written out.
 *
- * We also always take a reference to the buffer log item here so that the bli
- * is held while the item is pinned in memory. This means that we can
- * unconditionally drop the reference count a transaction holds when the
- * transaction is completed.
+ * We take a reference to the buffer log item here so that the BLI life cycle
+ * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
+ * inserted into the AIL.
+ *
+ * We also need to take a reference to the buffer itself as the BLI unpin
+ * processing requires accessing the buffer after the BLI has dropped the final
+ * BLI reference. See xfs_buf_item_unpin() for an explanation.
+ * If unpins race to drop the final BLI reference and only the BLI owns a
+ * reference to the buffer, then the loser of the race can have the buffer
+ * freed from under it (e.g. on shutdown). Taking a buffer reference per pin
+ * count ensures the life cycle of the buffer extends for as long as we hold
+ * the buffer pin reference in xfs_buf_item_unpin().
 */
 STATIC void
 xfs_buf_item_pin(
@@ -470,13 +478,30 @@ xfs_buf_item_pin(
 
 	trace_xfs_buf_item_pin(bip);
 
+	xfs_buf_hold(bip->bli_buf);
 	atomic_inc(&bip->bli_refcount);
 	atomic_inc(&bip->bli_buf->b_pin_count);
 }
 
 /*
- * This is called to unpin the buffer associated with the buf log item which
- * was previously pinned with a call to xfs_buf_item_pin().
+ * This is called to unpin the buffer associated with the buf log item which was
+ * previously pinned with a call to xfs_buf_item_pin(). We enter this function
+ * with a buffer pin count, a buffer reference and a BLI reference.
+ *
+ * We must drop the BLI reference before we unpin the buffer because the AIL
+ * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
+ * refcount drops to zero, the bli could still be AIL resident and the buffer
+ * submitted for I/O at any point before we return. This can result in IO
+ * completion freeing the buffer while we are still trying to access it here.
+ * This race condition can also occur in shutdown situations where we abort and
+ * unpin buffers from contexts other than journal IO completion.
+ *
+ * Hence we have to hold a buffer reference per pin count to ensure that the
+ * buffer cannot be freed until we have finished processing the unpin operation.
+ * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
+ * are done processing the buffer state. In the case of an abort (remove =
+ * true) then we re-use the current pin reference as the IO reference we hand
+ * off to IO failure handling.
 */
 STATIC void
 xfs_buf_item_unpin(
@@ -493,24 +518,18 @@ xfs_buf_item_unpin(
 
 	trace_xfs_buf_item_unpin(bip);
 
-	/*
-	 * Drop the bli ref associated with the pin and grab the hold required
-	 * for the I/O simulation failure in the abort case. We have to do this
-	 * before the pin count drops because the AIL doesn't acquire a bli
-	 * reference. Therefore if the refcount drops to zero, the bli could
-	 * still be AIL resident and the buffer submitted for I/O (and freed on
-	 * completion) at any point before we return. This can be removed once
-	 * the AIL properly holds a reference on the bli.
-	 */
 	freed = atomic_dec_and_test(&bip->bli_refcount);
-	if (freed && !stale && remove)
-		xfs_buf_hold(bp);
 	if (atomic_dec_and_test(&bp->b_pin_count))
 		wake_up_all(&bp->b_waiters);
 
-	/* nothing to do but drop the pin count if the bli is active */
-	if (!freed)
+	/*
+	 * Nothing to do but drop the buffer pin reference if the BLI is
+	 * still active.
+	 */
+	if (!freed) {
+		xfs_buf_rele(bp);
 		return;
+	}
 
 	if (stale) {
 		ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -522,6 +541,15 @@ xfs_buf_item_unpin(
 
 		trace_xfs_buf_item_unpin_stale(bip);
 
+		/*
+		 * The buffer has been locked and referenced since it was marked
+		 * stale so we own both lock and reference exclusively here. We
+		 * do not need the pin reference any more, so drop it now so
+		 * that we only have one reference to drop once item completion
+		 * processing is complete.
+		 */
+		xfs_buf_rele(bp);
+
 		/*
 		 * If we get called here because of an IO error, we may or may
 		 * not have the item on the AIL. xfs_trans_ail_delete() will
@@ -538,16 +566,30 @@ xfs_buf_item_unpin(
 			ASSERT(bp->b_log_item == NULL);
 		}
 		xfs_buf_relse(bp);
-	} else if (remove) {
+		return;
+	}
+
+	if (remove) {
 		/*
-		 * The buffer must be locked and held by the caller to simulate
-		 * an async I/O failure. We acquired the hold for this case
-		 * before the buffer was unpinned.
+		 * We need to simulate an async IO failure here to ensure that
+		 * the correct error completion is run on this buffer. This
+		 * requires a reference to the buffer and for the buffer to be
+		 * locked. We can safely pass ownership of the pin reference to
+		 * the IO to ensure that nothing can free the buffer while we
+		 * wait for the lock and then run the IO failure completion.
		 */
 		xfs_buf_lock(bp);
 		bp->b_flags |= XBF_ASYNC;
 		xfs_buf_ioend_fail(bp);
+		return;
 	}
+
+	/*
+	 * BLI has no more active references - it will be moved to the AIL to
+	 * manage the remaining BLI/buffer life cycle. There is nothing left for
+	 * us to do here so drop the pin reference to the buffer.
	 */
+	xfs_buf_rele(bp);
 }
 
 STATIC uint
fs/xfs/xfs_filestream.c
@@ -78,7 +78,6 @@ restart:
 		*longest = 0;
 		err = xfs_bmap_longest_free_extent(pag, NULL, longest);
 		if (err) {
-			xfs_perag_rele(pag);
 			if (err != -EAGAIN)
 				break;
 			/* Couldn't lock the AGF, skip this AG. */
fs/xfs/xfs_icache.c
@@ -454,6 +454,27 @@ xfs_inodegc_queue_all(
 	return ret;
 }
 
+/* Wait for all queued work and collect errors */
+static int
+xfs_inodegc_wait_all(
+	struct xfs_mount	*mp)
+{
+	int			cpu;
+	int			error = 0;
+
+	flush_workqueue(mp->m_inodegc_wq);
+	for_each_online_cpu(cpu) {
+		struct xfs_inodegc	*gc;
+
+		gc = per_cpu_ptr(mp->m_inodegc, cpu);
+		if (gc->error && !error)
+			error = gc->error;
+		gc->error = 0;
+	}
+
+	return error;
+}
+
 /*
 * Check the validity of the inode we just found in the cache
 */
@@ -1491,15 +1512,14 @@ xfs_blockgc_free_space(
 	if (error)
 		return error;
 
-	xfs_inodegc_flush(mp);
-	return 0;
+	return xfs_inodegc_flush(mp);
 }
 
 /*
 * Reclaim all the free space that we can by scheduling the background blockgc
 * and inodegc workers immediately and waiting for them all to clear.
 */
-void
+int
 xfs_blockgc_flush_all(
 	struct xfs_mount	*mp)
 {
@@ -1520,7 +1540,7 @@ xfs_blockgc_flush_all(
 	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
 		flush_delayed_work(&pag->pag_blockgc_work);
 
-	xfs_inodegc_flush(mp);
+	return xfs_inodegc_flush(mp);
 }
 
 /*
@@ -1842,13 +1862,17 @@ xfs_inodegc_set_reclaimable(
 * This is the last chance to make changes to an otherwise unreferenced file
 * before incore reclamation happens.
 */
-static void
+static int
 xfs_inodegc_inactivate(
 	struct xfs_inode	*ip)
 {
+	int			error;
+
 	trace_xfs_inode_inactivating(ip);
-	xfs_inactive(ip);
+	error = xfs_inactive(ip);
 	xfs_inodegc_set_reclaimable(ip);
+	return error;
+
 }
 
 void
@@ -1880,8 +1904,12 @@ xfs_inodegc_worker(
 
 	WRITE_ONCE(gc->shrinker_hits, 0);
 	llist_for_each_entry_safe(ip, n, node, i_gclist) {
+		int	error;
+
 		xfs_iflags_set(ip, XFS_INACTIVATING);
-		xfs_inodegc_inactivate(ip);
+		error = xfs_inodegc_inactivate(ip);
+		if (error && !gc->error)
+			gc->error = error;
 	}
 
 	memalloc_nofs_restore(nofs_flag);
@@ -1905,13 +1933,13 @@ xfs_inodegc_push(
 * Force all currently queued inode inactivation work to run immediately and
 * wait for the work to finish.
 */
-void
+int
 xfs_inodegc_flush(
 	struct xfs_mount	*mp)
 {
 	xfs_inodegc_push(mp);
 	trace_xfs_inodegc_flush(mp, __return_address);
-	flush_workqueue(mp->m_inodegc_wq);
+	return xfs_inodegc_wait_all(mp);
 }
 
 /*
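Editor's note: the shape of the new error plumbing in xfs_icache.c above is
"first error wins": each per-cpu worker records only the first failure it
sees, and the flush path sweeps and clears the per-cpu slots after draining
the queue. A hedged sketch of that collection pattern, with a plain array
standing in for the per-cpu gc->error fields:

#include <errno.h>
#include <stdio.h>

#define NR_WORKERS 4

/* One slot per worker, mirroring the per-cpu gc->error field. */
static int worker_error[NR_WORKERS];

/* Workers record only their first failure, as xfs_inodegc_worker() does. */
static void record_error(int worker, int error)
{
	if (error && !worker_error[worker])
		worker_error[worker] = error;
}

/* After all queued work has drained, sweep the slots: report the first
 * error found and reset every slot, as xfs_inodegc_wait_all() does. */
static int collect_errors(void)
{
	int error = 0;

	for (int i = 0; i < NR_WORKERS; i++) {
		if (worker_error[i] && !error)
			error = worker_error[i];
		worker_error[i] = 0;
	}
	return error;
}

int main(void)
{
	record_error(2, -EIO);
	record_error(2, -ENOSPC);	/* ignored: slot already holds -EIO */
	printf("flush result: %d\n", collect_errors());
	return 0;
}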
fs/xfs/xfs_icache.h
@@ -62,7 +62,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp,
 		unsigned int iwalk_flags);
 int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
 int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
-void xfs_blockgc_flush_all(struct xfs_mount *mp);
+int xfs_blockgc_flush_all(struct xfs_mount *mp);
 
 void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
@@ -80,7 +80,7 @@ void xfs_blockgc_start(struct xfs_mount *mp);
 
 void xfs_inodegc_worker(struct work_struct *work);
 void xfs_inodegc_push(struct xfs_mount *mp);
-void xfs_inodegc_flush(struct xfs_mount *mp);
+int xfs_inodegc_flush(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
 void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
fs/xfs/xfs_inode.c
@@ -1620,16 +1620,7 @@ xfs_inactive_ifree(
 	 */
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 
-	/*
-	 * Just ignore errors at this point. There is nothing we can do except
-	 * to try to keep going. Make sure it's not a silent error.
-	 */
-	error = xfs_trans_commit(tp);
-	if (error)
-		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
-			__func__, error);
-
-	return 0;
+	return xfs_trans_commit(tp);
 }
 
 /*
@@ -1693,12 +1684,12 @@ xfs_inode_needs_inactive(
 * now be truncated. Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
-void
+int
 xfs_inactive(
 	xfs_inode_t	*ip)
 {
 	struct xfs_mount	*mp;
-	int			error;
+	int			error = 0;
 	int			truncate = 0;
 
 	/*
@@ -1736,7 +1727,7 @@ xfs_inactive(
 		 * reference to the inode at this point anyways.
 		 */
 		if (xfs_can_free_eofblocks(ip, true))
-			xfs_free_eofblocks(ip);
+			error = xfs_free_eofblocks(ip);
 
 		goto out;
 	}
@@ -1773,7 +1764,7 @@ xfs_inactive(
 	/*
 	 * Free the inode.
 	 */
-	xfs_inactive_ifree(ip);
+	error = xfs_inactive_ifree(ip);
 
 out:
 	/*
@@ -1781,6 +1772,7 @@ out:
 	 * the attached dquots.
 	 */
 	xfs_qm_dqdetach(ip);
+	return error;
 }
 
 /*
fs/xfs/xfs_inode.h
@@ -470,7 +470,7 @@ enum layout_break_reason {
 	(xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
 
 int		xfs_release(struct xfs_inode *ip);
-void		xfs_inactive(struct xfs_inode *ip);
+int		xfs_inactive(struct xfs_inode *ip);
 int		xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
 			   struct xfs_inode **ipp, struct xfs_name *ci_name);
 int		xfs_create(struct mnt_idmap *idmap,
fs/xfs/xfs_inode_item.c
@@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
 	return container_of(lip, struct xfs_inode_log_item, ili_item);
 }
 
+static uint64_t
+xfs_inode_item_sort(
+	struct xfs_log_item	*lip)
+{
+	return INODE_ITEM(lip)->ili_inode->i_ino;
+}
+
+/*
+ * Prior to finally logging the inode, we have to ensure that all the
+ * per-modification inode state changes are applied. This includes VFS inode
+ * state updates, format conversions, verifier state synchronisation and
+ * ensuring the inode buffer remains in memory whilst the inode is dirty.
+ *
+ * We have to be careful when we grab the inode cluster buffer due to lock
+ * ordering constraints. The unlinked inode modifications (xfs_iunlink_item)
+ * require AGI -> inode cluster buffer lock order. The inode cluster buffer is
+ * not locked until ->precommit, so it happens after everything else has been
+ * modified.
+ *
+ * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we
+ * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we
+ * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because
+ * it can be called on an inode (e.g. via bumplink/droplink) before we take the
+ * AGF lock modifying directory blocks.
+ *
+ * Rather than force a complete rework of all the transactions to call
+ * xfs_trans_log_inode() once and once only at the end of every transaction, we
+ * move the pinning of the inode cluster buffer to a ->precommit operation. This
+ * matches how the xfs_iunlink_item locks the inode cluster buffer, and it
+ * ensures that the inode cluster buffer locking is always done last in a
+ * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode
+ * cluster buffer.
+ *
+ * If we return the inode number as the precommit sort key then we'll also
+ * guarantee that the inode cluster buffer locking order is the same for all
+ * the inodes and unlink items in the transaction.
+ */
+static int
+xfs_inode_item_precommit(
+	struct xfs_trans	*tp,
+	struct xfs_log_item	*lip)
+{
+	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+	struct xfs_inode	*ip = iip->ili_inode;
+	struct inode		*inode = VFS_I(ip);
+	unsigned int		flags = iip->ili_dirty_flags;
+
+	/*
+	 * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
+	 * don't matter - we either will need an extra transaction in 24 hours
+	 * to log the timestamps, or will clear already cleared fields in the
+	 * worst case.
+	 */
+	if (inode->i_state & I_DIRTY_TIME) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+	}
+
+	/*
+	 * If we're updating the inode core or the timestamps and it's possible
+	 * to upgrade this inode to bigtime format, do so now.
+	 */
+	if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+	    xfs_has_bigtime(ip->i_mount) &&
+	    !xfs_inode_has_bigtime(ip)) {
+		ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
+		flags |= XFS_ILOG_CORE;
+	}
+
+	/*
+	 * Inode verifiers do not check that the extent size hint is an integer
+	 * multiple of the rt extent size on a directory with both rtinherit
+	 * and extszinherit flags set. If we're logging a directory that is
+	 * misconfigured in this way, clear the hint.
+	 */
+	if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+	    (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+	    (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+				   XFS_DIFLAG_EXTSZINHERIT);
+		ip->i_extsize = 0;
+		flags |= XFS_ILOG_CORE;
+	}
+
+	/*
+	 * Record the specific change for fdatasync optimisation. This allows
+	 * fdatasync to skip log forces for inodes that are only timestamp
+	 * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
+	 * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
+	 * (ili_fields) correctly tracks that the version has changed.
+	 */
+	spin_lock(&iip->ili_lock);
+	iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
+	if (flags & XFS_ILOG_IVERSION)
+		flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
+
+	if (!iip->ili_item.li_buf) {
+		struct xfs_buf	*bp;
+		int		error;
+
+		/*
+		 * We hold the ILOCK here, so this inode is not going to be
+		 * flushed while we are here. Further, because there is no
+		 * buffer attached to the item, we know that there is no IO in
+		 * progress, so nothing will clear the ili_fields while we read
+		 * in the buffer. Hence we can safely drop the spin lock and
+		 * read the buffer knowing that the state will not change from
+		 * here.
+		 */
+		spin_unlock(&iip->ili_lock);
+		error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
+		if (error)
+			return error;
+
+		/*
+		 * We need an explicit buffer reference for the log item but
+		 * don't want the buffer to remain attached to the transaction.
+		 * Hold the buffer but release the transaction reference once
+		 * we've attached the inode log item to the buffer log item
+		 * list.
+		 */
+		xfs_buf_hold(bp);
+		spin_lock(&iip->ili_lock);
+		iip->ili_item.li_buf = bp;
+		bp->b_flags |= _XBF_INODES;
+		list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+		xfs_trans_brelse(tp, bp);
+	}
+
+	/*
+	 * Always OR in the bits from the ili_last_fields field. This is to
+	 * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+	 * in the eventual clearing of the ili_fields bits. See the big comment
+	 * in xfs_iflush() for an explanation of this coordination mechanism.
+	 */
+	iip->ili_fields |= (flags | iip->ili_last_fields);
+	spin_unlock(&iip->ili_lock);
+
+	/*
+	 * We are done with the log item transaction dirty state, so clear it so
+	 * that it doesn't pollute future transactions.
+	 */
+	iip->ili_dirty_flags = 0;
+	return 0;
+}
+
 /*
 * The logged size of an inode fork is always the current size of the inode
 * fork. This means that when an inode fork is relogged, the size of the logged
@@ -662,6 +809,8 @@ xfs_inode_item_committing(
 }
 
 static const struct xfs_item_ops xfs_inode_item_ops = {
+	.iop_sort	= xfs_inode_item_sort,
+	.iop_precommit	= xfs_inode_item_precommit,
 	.iop_size	= xfs_inode_item_size,
 	.iop_format	= xfs_inode_item_format,
 	.iop_pin	= xfs_inode_item_pin,
fs/xfs/xfs_inode_item.h
@@ -17,6 +17,7 @@ struct xfs_inode_log_item {
 	struct xfs_log_item	ili_item;	   /* common portion */
 	struct xfs_inode	*ili_inode;	   /* inode ptr */
 	unsigned short		ili_lock_flags;	   /* inode lock flags */
+	unsigned int		ili_dirty_flags;   /* dirty in current tx */
 	/*
 	 * The ili_lock protects the interactions between the dirty state and
 	 * the flush state of the inode log item. This allows us to do atomic
fs/xfs/xfs_log_recover.c
@@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket(
 			 * just to flush the inodegc queue and wait for it to
 			 * complete.
 			 */
-			xfs_inodegc_flush(mp);
+			error = xfs_inodegc_flush(mp);
+			if (error)
+				break;
 		}
 
 		prev_agino = agino;
@@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket(
 	}
 
 	if (prev_ip) {
+		int	error2;
+
 		ip->i_prev_unlinked = prev_agino;
 		xfs_irele(prev_ip);
+
+		error2 = xfs_inodegc_flush(mp);
+		if (error2 && !error)
+			return error2;
 	}
-	xfs_inodegc_flush(mp);
 	return error;
 }
 
@@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag(
 			 * bucket and remaining inodes on it unreferenced and
 			 * unfreeable.
 			 */
-			xfs_inodegc_flush(pag->pag_mount);
 			xlog_recover_clear_agi_bucket(pag, bucket);
 		}
 	}
@@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks(
 
 	for_each_perag(log->l_mp, agno, pag)
 		xlog_recover_iunlink_ag(pag);
-
-	/*
-	 * Flush the pending unlinked inodes to ensure that the inactivations
-	 * are fully completed on disk and the incore inodes can be reclaimed
-	 * before we signal that recovery is complete.
-	 */
-	xfs_inodegc_flush(log->l_mp);
 }
 
 STATIC void
fs/xfs/xfs_mount.h
@@ -62,6 +62,7 @@ struct xfs_error_cfg {
 struct xfs_inodegc {
 	struct llist_head	list;
 	struct delayed_work	work;
+	int			error;
 
 	/* approximate count of inodes in the list */
 	unsigned int		items;
fs/xfs/xfs_reflink.c
@@ -616,8 +616,10 @@ xfs_reflink_cancel_cow_blocks(
 			xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
 					del.br_blockcount);
 
-			xfs_free_extent_later(*tpp, del.br_startblock,
+			error = xfs_free_extent_later(*tpp, del.br_startblock,
 					del.br_blockcount, NULL);
+			if (error)
+				break;
 
 			/* Roll the transaction */
 			error = xfs_defer_finish(tpp);
fs/xfs/xfs_super.c
@@ -1100,6 +1100,7 @@ xfs_inodegc_init_percpu(
 #endif
 		init_llist_head(&gc->list);
 		gc->items = 0;
+		gc->error = 0;
 		INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
 	}
 	return 0;
fs/xfs/xfs_trans.c
@@ -290,7 +290,9 @@ retry:
 		 * Do not perform a synchronous scan because callers can hold
 		 * other locks.
 		 */
-		xfs_blockgc_flush_all(mp);
+		error = xfs_blockgc_flush_all(mp);
+		if (error)
+			return error;
 		want_retry = false;
 		goto retry;
 	}
@@ -970,6 +972,11 @@ __xfs_trans_commit(
 		error = xfs_defer_finish_noroll(&tp);
 		if (error)
 			goto out_unreserve;
+
+		/* Run precommits from final tx in defer chain. */
+		error = xfs_trans_run_precommits(tp);
+		if (error)
+			goto out_unreserve;
 	}
 
 	/*