xfs: attach dquot buffer to dquot log item buffer

Ever since 6.12-rc1, I've observed a pile of warnings from the kernel
when running fstests with quotas enabled:

WARNING: CPU: 1 PID: 458580 at mm/page_alloc.c:4221 __alloc_pages_noprof+0xc9c/0xf18
CPU: 1 UID: 0 PID: 458580 Comm: xfsaild/sda3 Tainted: G        W          6.12.0-rc6-djwa #rc6 6ee3e0e531f6457e2d26aa008a3b65ff184b377c
<snip>
Call trace:
 __alloc_pages_noprof+0xc9c/0xf18
 alloc_pages_mpol_noprof+0x94/0x240
 alloc_pages_noprof+0x68/0xf8
 new_slab+0x3e0/0x568
 ___slab_alloc+0x5a0/0xb88
 __slab_alloc.constprop.0+0x7c/0xf8
 __kmalloc_noprof+0x404/0x4d0
 xfs_buf_get_map+0x594/0xde0 [xfs 384cb02810558b4c490343c164e9407332118f88]
 xfs_buf_read_map+0x64/0x2e0 [xfs 384cb02810558b4c490343c164e9407332118f88]
 xfs_trans_read_buf_map+0x1dc/0x518 [xfs 384cb02810558b4c490343c164e9407332118f88]
 xfs_qm_dqflush+0xac/0x468 [xfs 384cb02810558b4c490343c164e9407332118f88]
 xfs_qm_dquot_logitem_push+0xe4/0x148 [xfs 384cb02810558b4c490343c164e9407332118f88]
 xfsaild+0x3f4/0xde8 [xfs 384cb02810558b4c490343c164e9407332118f88]
 kthread+0x110/0x128
 ret_from_fork+0x10/0x20
---[ end trace 0000000000000000 ]---

This corresponds to the line:

	WARN_ON_ONCE(current->flags & PF_MEMALLOC);

within the NOFAIL checks.  What's happening here is that the XFS AIL is
trying to write a disk quota update back into the filesystem, but for
that it needs to read the ondisk buffer for the dquot.  The buffer is
not in memory anymore, probably because it was evicted.  Regardless, the
buffer cache tries to allocate a new buffer, but those allocations are
NOFAIL.  The AIL thread has marked itself PF_MEMALLOC (aka noreclaim)
since commit 43ff2122e6 ("xfs: on-stack delayed write buffer lists"),
presumably because memory reclaim can push on XFS, which in turn pushes
on the AIL.
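
For reference, the AIL thread marks itself that way when it starts up;
roughly (paraphrasing fs/xfs/xfs_trans_ail.c):

	/* xfsaild() startup: exempt this thread from memory reclaim */
	current->flags |= PF_MEMALLOC;

which is exactly the combination that the NOFAIL sanity check above
refuses to allow.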

An easy way to fix this probably would have been to drop the NOFAIL flag
from the xfs_buf allocation and open code a retry loop, but then there's
still the problem that for bs > ps filesystems (block size larger than
page size), the buffer itself could
require up to 64k worth of pages.
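
For illustration only, that rejected alternative would have looked
something like the following hypothetical sketch; it is not part of
this patch, it reuses xfs_dquot_read_buf() purely for illustration, and
it assumes the buffer read could start returning -ENOMEM once
__GFP_NOFAIL is dropped:

	/* hypothetical retry loop in the AIL push path -- not what we do */
	do {
		error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
		if (error == -ENOMEM)
			memalloc_retry_wait(GFP_NOFS);
	} while (error == -ENOMEM);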

Inode items had similar behavior (multi-page cluster buffers that we
don't want to allocate in the AIL) which we solved by making transaction
precommit attach the inode cluster buffers to the dirty log item.  Let's
solve the dquot problem in the same way.

So: make a real precommit handler that reads the dquot buffer and
attaches it to the log item; pass that buffer to dqflush in the push
method; and have the iodone function detach the buffer once we've
flushed everything.  Add a state flag to the log item to track when a
thread has entered the precommit -> push mechanism, so that the detach
is skipped if the dquot turns out to be very busy (we don't hold the
dquot lock between log item commit and AIL push).
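
In code terms, the new flow is roughly as follows (simplified outline
of the helpers added below; locking and error handling elided):

	/* transaction precommit, via ->iop_precommit: read the buffer
	 * once, take a hold on it, and mark the log item dirty */
	error = xfs_dquot_attach_buf(tp, dqp);

	/* AIL push: reuse the held buffer instead of allocating one */
	error = xfs_dquot_use_attached_buf(dqp, &bp);
	if (!error && bp)
		error = xfs_qm_dqflush(dqp, bp);	/* clears qli_dirty */

	/* buffer iodone runs xfs_qm_dqflush_done(), which drops the
	 * hold on li_buf only if qli_dirty is still clear */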

Reading and attaching the dquot buffer in the precommit hook is inspired
by the work done for inode cluster buffers some time ago.

Cc: <stable@vger.kernel.org> # v6.12
Fixes: 903edea6c5 ("mm: warn about illegal __GFP_NOFAIL usage in a more appropriate location and manner")
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
(commit acc8f8628c, parent ec88b41b93; Darrick J. Wong, 2024-12-02 10:57:38 -08:00; 6 changed files with 169 additions and 24 deletions)

fs/xfs/xfs_dquot.c

@@ -68,6 +68,30 @@ xfs_dquot_mark_sick(
	}
}

/*
 * Detach the dquot buffer if it's still attached, because we can get called
 * through dqpurge after a log shutdown. Caller must hold the dqflock or have
 * otherwise isolated the dquot.
 */
void
xfs_dquot_detach_buf(
	struct xfs_dquot	*dqp)
{
	struct xfs_dq_logitem	*qlip = &dqp->q_logitem;
	struct xfs_buf		*bp = NULL;

	spin_lock(&qlip->qli_lock);
	if (qlip->qli_item.li_buf) {
		bp = qlip->qli_item.li_buf;
		qlip->qli_item.li_buf = NULL;
	}
	spin_unlock(&qlip->qli_lock);
	if (bp) {
		list_del_init(&qlip->qli_item.li_bio_list);
		xfs_buf_rele(bp);
	}
}

/*
 * This is called to free all the memory associated with a dquot
 */
@@ -76,6 +100,7 @@ xfs_qm_dqdestroy(
	struct xfs_dquot	*dqp)
{
	ASSERT(list_empty(&dqp->q_lru));
	ASSERT(dqp->q_logitem.qli_item.li_buf == NULL);

	kvfree(dqp->q_logitem.qli_item.li_lv_shadow);
	mutex_destroy(&dqp->q_qlock);
@@ -1146,6 +1171,7 @@ xfs_qm_dqflush_done(
		container_of(lip, struct xfs_dq_logitem, qli_item);
	struct xfs_dquot	*dqp = qlip->qli_dquot;
	struct xfs_ail		*ailp = lip->li_ailp;
	struct xfs_buf		*bp = NULL;
	xfs_lsn_t		tail_lsn;

	/*
@@ -1171,6 +1197,20 @@ xfs_qm_dqflush_done(
		}
	}

	/*
	 * If this dquot hasn't been dirtied since initiating the last dqflush,
	 * release the buffer reference. We already unlinked this dquot item
	 * from the buffer.
	 */
	spin_lock(&qlip->qli_lock);
	if (!qlip->qli_dirty) {
		bp = lip->li_buf;
		lip->li_buf = NULL;
	}
	spin_unlock(&qlip->qli_lock);
	if (bp)
		xfs_buf_rele(bp);

	/*
	 * Release the dq's flush lock since we're done with it.
	 */
@@ -1197,7 +1237,7 @@ xfs_buf_dquot_io_fail(
	spin_lock(&bp->b_mount->m_ail->ail_lock);
	list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
		xfs_set_li_failed(lip, bp);
		set_bit(XFS_LI_FAILED, &lip->li_flags);
	spin_unlock(&bp->b_mount->m_ail->ail_lock);
}

@@ -1249,6 +1289,7 @@ int
xfs_dquot_read_buf(
	struct xfs_trans	*tp,
	struct xfs_dquot	*dqp,
	xfs_buf_flags_t		xbf_flags,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = dqp->q_mount;
@@ -1256,7 +1297,7 @@ xfs_dquot_read_buf(
	int			error;

	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
			mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
			mp->m_quotainfo->qi_dqchunklen, xbf_flags,
			&bp, &xfs_dquot_buf_ops);
	if (error == -EAGAIN)
		return error;
@@ -1275,6 +1316,77 @@ xfs_dquot_read_buf(
	return error;
}

/*
 * Attach a dquot buffer to this dquot to avoid allocating a buffer during a
 * dqflush, since dqflush can be called from reclaim context.
 */
int
xfs_dquot_attach_buf(
	struct xfs_trans	*tp,
	struct xfs_dquot	*dqp)
{
	struct xfs_dq_logitem	*qlip = &dqp->q_logitem;
	struct xfs_log_item	*lip = &qlip->qli_item;
	int			error;

	spin_lock(&qlip->qli_lock);
	if (!lip->li_buf) {
		struct xfs_buf	*bp = NULL;

		spin_unlock(&qlip->qli_lock);
		error = xfs_dquot_read_buf(tp, dqp, 0, &bp);
		if (error)
			return error;

		/*
		 * Attach the dquot to the buffer so that the AIL does not have
		 * to read the dquot buffer to push this item.
		 */
		xfs_buf_hold(bp);
		spin_lock(&qlip->qli_lock);
		lip->li_buf = bp;
		xfs_trans_brelse(tp, bp);
	}
	qlip->qli_dirty = true;
	spin_unlock(&qlip->qli_lock);
	return 0;
}

/*
 * Get a new reference to the dquot buffer attached to this dquot for a dqflush
 * operation.
 *
 * Returns 0 and a NULL bp if none was attached to the dquot; 0 and a locked
 * bp; or -EAGAIN if the buffer could not be locked.
 */
int
xfs_dquot_use_attached_buf(
	struct xfs_dquot	*dqp,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp = dqp->q_logitem.qli_item.li_buf;

	/*
	 * A NULL buffer can happen if the dquot dirty flag was set but the
	 * filesystem shut down before transaction commit happened. In that
	 * case we're not going to flush anyway.
	 */
	if (!bp) {
		ASSERT(xfs_is_shutdown(dqp->q_mount));
		*bpp = NULL;
		return 0;
	}

	if (!xfs_buf_trylock(bp))
		return -EAGAIN;

	xfs_buf_hold(bp);
	*bpp = bp;
	return 0;
}

/*
 * Write a modified dquot to disk.
 * The dquot must be locked and the flush lock too taken by caller.
@@ -1289,7 +1401,8 @@ xfs_qm_dqflush(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = dqp->q_mount;
	struct xfs_log_item	*lip = &dqp->q_logitem.qli_item;
	struct xfs_dq_logitem	*qlip = &dqp->q_logitem;
	struct xfs_log_item	*lip = &qlip->qli_item;
	struct xfs_dqblk	*dqblk;
	xfs_failaddr_t		fa;
	int			error;
@@ -1319,8 +1432,15 @@ xfs_qm_dqflush(
	 */
	dqp->q_flags &= ~XFS_DQFLAG_DIRTY;

	xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
			&lip->li_lsn);

	/*
	 * We hold the dquot lock, so nobody can dirty it while we're
	 * scheduling the write out. Clear the dirty-since-flush flag.
	 */
	spin_lock(&qlip->qli_lock);
	qlip->qli_dirty = false;
	spin_unlock(&qlip->qli_lock);

	xfs_trans_ail_copy_lsn(mp->m_ail, &qlip->qli_flush_lsn, &lip->li_lsn);

	/*
	 * copy the lsn into the on-disk dquot now while we have the in memory

fs/xfs/xfs_dquot.h

@@ -215,7 +215,7 @@ void xfs_dquot_to_disk(struct xfs_disk_dquot *ddqp, struct xfs_dquot *dqp);
void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
int xfs_dquot_read_buf(struct xfs_trans *tp, struct xfs_dquot *dqp,
		struct xfs_buf **bpp);
		xfs_buf_flags_t flags, struct xfs_buf **bpp);
int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf *bp);
void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
void xfs_qm_adjust_dqtimers(struct xfs_dquot *d);
@@ -239,6 +239,10 @@ void xfs_dqlockn(struct xfs_dqtrx *q);
void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);

int xfs_dquot_attach_buf(struct xfs_trans *tp, struct xfs_dquot *dqp);
int xfs_dquot_use_attached_buf(struct xfs_dquot *dqp, struct xfs_buf **bpp);
void xfs_dquot_detach_buf(struct xfs_dquot *dqp);

static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
	xfs_dqlock(dqp);

fs/xfs/xfs_dquot_item.c

@@ -123,8 +123,9 @@ xfs_qm_dquot_logitem_push(
		__releases(&lip->li_ailp->ail_lock)
		__acquires(&lip->li_ailp->ail_lock)
{
	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
	struct xfs_buf		*bp = lip->li_buf;
	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
	struct xfs_dquot	*dqp = qlip->qli_dquot;
	struct xfs_buf		*bp;
	uint			rval = XFS_ITEM_SUCCESS;
	int			error;

@@ -155,11 +156,10 @@ xfs_qm_dquot_logitem_push(
	spin_unlock(&lip->li_ailp->ail_lock);

	error = xfs_dquot_read_buf(NULL, dqp, &bp);
	if (error) {
		if (error == -EAGAIN)
			rval = XFS_ITEM_LOCKED;
	error = xfs_dquot_use_attached_buf(dqp, &bp);
	if (error == -EAGAIN) {
		xfs_dqfunlock(dqp);
		rval = XFS_ITEM_LOCKED;
		goto out_relock_ail;
	}

@@ -207,12 +207,10 @@ xfs_qm_dquot_logitem_committing(
}

#ifdef DEBUG_EXPENSIVE
static int
xfs_qm_dquot_logitem_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
static void
xfs_qm_dquot_logitem_precommit_check(
	struct xfs_dquot	*dqp)
{
	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
	struct xfs_mount	*mp = dqp->q_mount;
	struct xfs_disk_dquot	ddq = { };
	xfs_failaddr_t		fa;
@@ -228,13 +226,24 @@ xfs_qm_dquot_logitem_precommit(
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		ASSERT(fa == NULL);
	}
	return 0;
}
#else
# define xfs_qm_dquot_logitem_precommit	NULL
# define xfs_qm_dquot_logitem_precommit_check(...)	((void)0)
#endif

static int
xfs_qm_dquot_logitem_precommit(
	struct xfs_trans	*tp,
	struct xfs_log_item	*lip)
{
	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
	struct xfs_dquot	*dqp = qlip->qli_dquot;

	xfs_qm_dquot_logitem_precommit_check(dqp);
	return xfs_dquot_attach_buf(tp, dqp);
}

static const struct xfs_item_ops xfs_dquot_item_ops = {
	.iop_size	= xfs_qm_dquot_logitem_size,
	.iop_precommit	= xfs_qm_dquot_logitem_precommit,
@@ -259,5 +268,7 @@ xfs_qm_dquot_logitem_init(
	xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
			&xfs_dquot_item_ops);
	spin_lock_init(&lp->qli_lock);
	lp->qli_dquot = dqp;
	lp->qli_dirty = false;
}

fs/xfs/xfs_dquot_item.h

@@ -14,6 +14,13 @@ struct xfs_dq_logitem {
	struct xfs_log_item	qli_item;	/* common portion */
	struct xfs_dquot	*qli_dquot;	/* dquot ptr */
	xfs_lsn_t		qli_flush_lsn;	/* lsn at last flush */

	/*
	 * We use this spinlock to coordinate access to the li_buf pointer in
	 * the log item and the qli_dirty flag.
	 */
	spinlock_t		qli_lock;
	bool			qli_dirty;	/* dirtied since last flush? */
};

void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);

fs/xfs/xfs_qm.c

@@ -148,7 +148,7 @@ xfs_qm_dqpurge(
		 * We don't care about getting disk errors here. We need
		 * to purge this dquot anyway, so we go ahead regardless.
		 */
		error = xfs_dquot_read_buf(NULL, dqp, &bp);
		error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
		if (error == -EAGAIN) {
			xfs_dqfunlock(dqp);
			dqp->q_flags &= ~XFS_DQFLAG_FREEING;
@@ -168,6 +168,7 @@ xfs_qm_dqpurge(
		}
		xfs_dqflock(dqp);
	}

	xfs_dquot_detach_buf(dqp);
out_funlock:
	ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -505,7 +506,7 @@ xfs_qm_dquot_isolate(
		/* we have to drop the LRU lock to flush the dquot */
		spin_unlock(&lru->lock);

		error = xfs_dquot_read_buf(NULL, dqp, &bp);
		error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
		if (error) {
			xfs_dqfunlock(dqp);
			goto out_unlock_dirty;
@@ -523,6 +524,8 @@ xfs_qm_dquot_isolate(
		xfs_buf_relse(bp);
		goto out_unlock_dirty;
	}

	xfs_dquot_detach_buf(dqp);
	xfs_dqfunlock(dqp);

	/*
@@ -1510,7 +1513,7 @@ xfs_qm_flush_one(
		goto out_unlock;
	}

	error = xfs_dquot_read_buf(NULL, dqp, &bp);
	error = xfs_dquot_read_buf(NULL, dqp, XBF_TRYLOCK, &bp);
	if (error)
		goto out_unlock;

fs/xfs/xfs_trans_ail.c

@@ -360,7 +360,7 @@ xfsaild_resubmit_item(
	/* protected by ail_lock */
	list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
		if (bp->b_flags & _XBF_INODES)
		if (bp->b_flags & (_XBF_INODES | _XBF_DQUOTS))
			clear_bit(XFS_LI_FAILED, &lip->li_flags);
		else
			xfs_clear_li_failed(lip);