2018-06-06 02:42:14 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2008-10-30 06:06:08 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
|
|
|
|
* All Rights Reserved.
|
|
|
|
*/
|
|
|
|
#include "xfs.h"
|
|
|
|
#include "xfs_fs.h"
|
2019-06-29 02:25:35 +00:00
|
|
|
#include "xfs_shared.h"
|
2013-08-12 10:49:26 +00:00
|
|
|
#include "xfs_format.h"
|
2013-10-22 23:50:10 +00:00
|
|
|
#include "xfs_log_format.h"
|
|
|
|
#include "xfs_trans_resv.h"
|
2008-10-30 06:06:08 +00:00
|
|
|
#include "xfs_mount.h"
|
|
|
|
#include "xfs_inode.h"
|
2013-10-22 23:50:10 +00:00
|
|
|
#include "xfs_trans.h"
|
|
|
|
#include "xfs_trans_priv.h"
|
2008-10-30 06:06:08 +00:00
|
|
|
#include "xfs_inode_item.h"
|
2009-06-08 13:33:32 +00:00
|
|
|
#include "xfs_quota.h"
|
2009-12-14 23:14:59 +00:00
|
|
|
#include "xfs_trace.h"
|
2012-10-08 10:56:09 +00:00
|
|
|
#include "xfs_icache.h"
|
2013-08-12 10:49:45 +00:00
|
|
|
#include "xfs_bmap_util.h"
|
2014-07-24 09:49:28 +00:00
|
|
|
#include "xfs_dquot_item.h"
|
|
|
|
#include "xfs_dquot.h"
|
2016-10-03 16:11:46 +00:00
|
|
|
#include "xfs_reflink.h"
|
2020-05-14 21:01:19 +00:00
|
|
|
#include "xfs_ialloc.h"
|
2021-06-02 00:48:24 +00:00
|
|
|
#include "xfs_ag.h"
|
xfs: xfs_is_shutdown vs xlog_is_shutdown cage fight
I've been chasing a recent resurgence in generic/388 recovery
failure and/or corruption events. The events have largely been
uninitialised inode chunks being tripped over in log recovery
such as:
XFS (pmem1): User initiated shutdown received.
pmem1: writeback error on inode 12621949, offset 1019904, sector 12968096
XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/xfs/xfs_fsops.c:500). Shutting down filesystem.
XFS (pmem1): Please unmount the filesystem and rectify the problem(s)
XFS (pmem1): Unmounting Filesystem
XFS (pmem1): Mounting V5 Filesystem
XFS (pmem1): Starting recovery (logdev: internal)
XFS (pmem1): bad inode magic/vsn daddr 8723584 #0 (magic=1818)
XFS (pmem1): Metadata corruption detected at xfs_inode_buf_verify+0x180/0x190, xfs_inode block 0x851c80 xfs_inode_buf_verify
XFS (pmem1): Unmount and run xfs_repair
XFS (pmem1): First 128 bytes of corrupted metadata buffer:
00000000: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000010: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000020: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000030: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000040: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000050: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000060: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000070: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
XFS (pmem1): metadata I/O error in "xlog_recover_items_pass2+0x52/0xc0" at daddr 0x851c80 len 32 error 117
XFS (pmem1): log mount/recovery failed: error -117
XFS (pmem1): log mount failed
There have been isolated random other issues, too - xfs_repair fails
because it finds some corruption in symlink blocks, rmap
inconsistencies, etc - but they are nowhere near as common as the
uninitialised inode chunk failure.
The problem has clearly happened at runtime before recovery has run;
I can see the ICREATE log item in the log shortly before the
actively recovered range of the log. This means the ICREATE was
definitely created and written to the log, but for some reason the
tail of the log has been moved past the ordered buffer log item that
tracks INODE_ALLOC buffers and, supposedly, prevents the tail of the
log moving past the ICREATE log item before the inode chunk buffer
is written to disk.
Tracing the fsstress processes that are running when the filesystem
shut down immediately pin-pointed the problem:
user shutdown marks xfs_mount as shutdown
godown-213341 [008] 6398.022871: console: [ 6397.915392] XFS (pmem1): User initiated shutdown received.
.....
aild tries to push ordered inode cluster buffer
xfsaild/pmem1-213314 [001] 6398.022974: xfs_buf_trylock: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 16 pincount 0 lock 0 flags DONE|INODES|PAGES caller xfs_inode_item_push+0x8e
xfsaild/pmem1-213314 [001] 6398.022976: xfs_ilock_nowait: dev 259:1 ino 0x851c80 flags ILOCK_SHARED caller xfs_iflush_cluster+0xae
xfs_iflush_cluster() checks xfs_is_shutdown(), returns true,
calls xfs_iflush_abort() to kill writeback of the inode.
Inode is removed from AIL, drops cluster buffer reference.
xfsaild/pmem1-213314 [001] 6398.022977: xfs_ail_delete: dev 259:1 lip 0xffff88880247ed80 old lsn 7/20344 new lsn 7/21000 type XFS_LI_INODE flags IN_AIL
xfsaild/pmem1-213314 [001] 6398.022978: xfs_buf_rele: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 17 pincount 0 lock 0 flags DONE|INODES|PAGES caller xfs_iflush_abort+0xd7
.....
All inodes on cluster buffer are aborted, then the cluster buffer
itself is aborted and removed from the AIL *without writeback*:
xfsaild/pmem1-213314 [001] 6398.023011: xfs_buf_error_relse: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_ioend_fail+0x33
xfsaild/pmem1-213314 [001] 6398.023012: xfs_ail_delete: dev 259:1 lip 0xffff8888053efde8 old lsn 7/20344 new lsn 7/20344 type XFS_LI_BUF flags IN_AIL
The inode buffer was at 7/20344 when it was removed from the AIL.
xfsaild/pmem1-213314 [001] 6398.023012: xfs_buf_item_relse: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_item_done+0x31
xfsaild/pmem1-213314 [001] 6398.023012: xfs_buf_rele: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_item_relse+0x39
.....
Userspace is still running, doing stuff. An fsstress process runs
syncfs() or sync() and we end up in sync_fs_one_sb() which issues
a log force. This pushes on the CIL:
fsstress-213322 [001] 6398.024430: xfs_fs_sync_fs: dev 259:1 m_features 0x20000000019ff6e9 opstate (clean|shutdown|inodegc|blockgc) s_flags 0x70810000 caller sync_fs_one_sb+0x26
fsstress-213322 [001] 6398.024430: xfs_log_force: dev 259:1 lsn 0x0 caller xfs_fs_sync_fs+0x82
fsstress-213322 [001] 6398.024430: xfs_log_force: dev 259:1 lsn 0x5f caller xfs_log_force+0x7c
<...>-194402 [001] 6398.024467: kmem_alloc: size 176 flags 0x14 caller xlog_cil_push_work+0x9f
And the CIL fills up iclogs with pending changes. This picks up
the current tail from the AIL:
<...>-194402 [001] 6398.024497: xlog_iclog_get_space: dev 259:1 state XLOG_STATE_ACTIVE refcnt 1 offset 0 lsn 0x0 flags caller xlog_write+0x149
<...>-194402 [001] 6398.024498: xlog_iclog_switch: dev 259:1 state XLOG_STATE_ACTIVE refcnt 1 offset 0 lsn 0x700005408 flags caller xlog_state_get_iclog_space+0x37e
<...>-194402 [001] 6398.024521: xlog_iclog_release: dev 259:1 state XLOG_STATE_WANT_SYNC refcnt 1 offset 32256 lsn 0x700005408 flags caller xlog_write+0x5f9
<...>-194402 [001] 6398.024522: xfs_log_assign_tail_lsn: dev 259:1 new tail lsn 7/21000, old lsn 7/20344, last sync 7/21448
And it moves the tail of the log to 7/21000 from 7/20344. This
*moves the tail of the log beyond the ICREATE transaction* that was
at 7/20344 and pinned by the inode cluster buffer that was cancelled
above.
....
godown-213341 [008] 6398.027005: xfs_force_shutdown: dev 259:1 tag logerror flags log_io|force_umount file fs/xfs/xfs_fsops.c line_num 500
godown-213341 [008] 6398.027022: console: [ 6397.915406] pmem1: writeback error on inode 12621949, offset 1019904, sector 12968096
godown-213341 [008] 6398.030551: console: [ 6397.919546] XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/
And finally the log itself is now shutdown, stopping all further
writes to the log. But this is too late to prevent the corruption
that moving the tail of the log forwards after we start cancelling
writeback causes.
The fundamental problem here is that we are using the wrong shutdown
checks for log items. We've long conflated mount shutdown with log
shutdown state, and I started separating that recently with the
atomic shutdown state changes in commit b36d4651e165 ("xfs: make
forced shutdown processing atomic"). The changes in that commit
series are directly responsible for being able to diagnose this
issue because it clearly separated mount shutdown from log shutdown.
Essentially, once we start cancelling writeback of log items and
removing them from the AIL because the filesystem is shut down, we
*cannot* update the journal because we may have cancelled the items
that pin the tail of the log. That moves the tail of the log
forwards without having written the metadata back, hence we have
corrupt in-memory state, and writing to the journal propagates that
to the on-disk state.
What commit b36d4651e165 makes clear is that log item state needs to
change relative to log shutdown, not mount shutdown. IOWs, anything
that aborts metadata writeback needs to check log shutdown state
because log items directly affect log consistency. Having them check
mount shutdown state introduces the above race condition where we
cancel metadata writeback before the log shuts down.
To fix this, this patch works through all log items and converts
shutdown checks to use xlog_is_shutdown() rather than
xfs_is_shutdown(), so that we don't start aborting metadata
writeback before we shut off journal writes.
AFAICT, this race condition is a zero day IO error handling bug in
XFS that dates back to the introduction of XLOG_IO_ERROR,
XLOG_STATE_IOERROR and XFS_FORCED_SHUTDOWN back in January 1997.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-03-17 16:09:13 +00:00
|
|
|
#include "xfs_log_priv.h"
|
2024-02-22 20:32:43 +00:00
|
|
|
#include "xfs_health.h"
|
2024-11-04 04:18:50 +00:00
|
|
|
#include "xfs_da_format.h"
|
|
|
|
#include "xfs_dir2.h"
|
|
|
|
#include "xfs_metafile.h"
|
2008-10-30 06:06:08 +00:00
|
|
|
|
2017-12-11 11:35:19 +00:00
|
|
|
#include <linux/iversion.h>
|
2008-10-30 06:06:18 +00:00
|
|
|
|
2021-06-01 20:49:52 +00:00
|
|
|
/* Radix tree tags for incore inode tree. */
|
|
|
|
|
|
|
|
/* inode is to be reclaimed */
|
|
|
|
#define XFS_ICI_RECLAIM_TAG 0
|
|
|
|
/* Inode has speculative preallocations (posteof or cow) to clean. */
|
|
|
|
#define XFS_ICI_BLOCKGC_TAG 1
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The goal for walking incore inodes. These can correspond with incore inode
|
|
|
|
* radix tree tags when convenient. Avoid existing XFS_IWALK namespace.
|
|
|
|
*/
|
|
|
|
enum xfs_icwalk_goal {
|
|
|
|
/* Goals directly associated with tagged inodes. */
|
|
|
|
XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG,
|
2021-05-31 18:32:02 +00:00
|
|
|
XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG,
|
2021-06-01 20:49:52 +00:00
|
|
|
};
|
|
|
|
|
2021-05-31 18:31:59 +00:00
|
|
|
static int xfs_icwalk(struct xfs_mount *mp,
|
2021-06-07 16:34:51 +00:00
|
|
|
enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
|
2021-05-31 18:31:59 +00:00
|
|
|
static int xfs_icwalk_ag(struct xfs_perag *pag,
|
2021-06-07 16:34:51 +00:00
|
|
|
enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
|
2021-06-01 20:29:41 +00:00
|
|
|
|
2021-05-31 18:31:57 +00:00
|
|
|
/*
|
2021-06-07 16:34:51 +00:00
|
|
|
* Private inode cache walk flags for struct xfs_icwalk. Must not
|
|
|
|
* coincide with XFS_ICWALK_FLAGS_VALID.
|
2021-05-31 18:31:57 +00:00
|
|
|
*/
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
/* Stop scanning after icw_scan_limit inodes. */
|
|
|
|
#define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28)
|
|
|
|
|
2021-06-07 16:34:50 +00:00
|
|
|
#define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27)
|
2021-06-07 16:34:51 +00:00
|
|
|
#define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */
|
2021-06-07 16:34:50 +00:00
|
|
|
|
2021-08-06 18:05:36 +00:00
|
|
|
#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \
|
2021-06-07 16:34:51 +00:00
|
|
|
XFS_ICWALK_FLAG_RECLAIM_SICK | \
|
|
|
|
XFS_ICWALK_FLAG_UNION)
|
2021-05-31 18:31:57 +00:00
|
|
|
|
2024-08-29 04:08:40 +00:00
|
|
|
/* Marks for the perag xarray */
|
|
|
|
#define XFS_PERAG_RECLAIM_MARK XA_MARK_0
|
|
|
|
#define XFS_PERAG_BLOCKGC_MARK XA_MARK_1
|
|
|
|
|
|
|
|
static inline xa_mark_t ici_tag_to_mark(unsigned int tag)
|
|
|
|
{
|
|
|
|
if (tag == XFS_ICI_RECLAIM_TAG)
|
|
|
|
return XFS_PERAG_RECLAIM_MARK;
|
|
|
|
ASSERT(tag == XFS_ICI_BLOCKGC_TAG);
|
|
|
|
return XFS_PERAG_BLOCKGC_MARK;
|
|
|
|
}
|
|
|
|
|
2012-10-08 10:56:11 +00:00
|
|
|
/*
|
|
|
|
* Allocate and initialise an xfs_inode.
|
|
|
|
*/
|
xfs: recovery of swap extents operations for CRC filesystems
This is the recovery side of the btree block owner change operation
performed by swapext on CRC enabled filesystems. We detect that an
owner change is needed by the flag that has been placed on the inode
log format flag field. Because the inode recovery is being replayed
after the buffers that make up the BMBT in the given checkpoint, we
can walk all the buffers and directly modify them when we see the
flag set on an inode.
Because the inode can be relogged and hence present in multiple
checkpoints with the "change owner" flag set, we could do multiple
passes across the inode to do this change. While this isn't optimal,
we can't directly ignore the flag as there may be multiple
independent swap extent operations being replayed on the same inode
in different checkpoints so we can't ignore them.
Further, because the owner change operation uses ordered buffers, we
might have buffers that are newer on disk than the current
checkpoint and so already have the owner changed in them. Hence we
cannot just peek at a buffer in the tree and check that it has the
correct owner and assume that the change was completed.
So, for the moment just brute force the owner change every time we
see an inode with the flag set. Note that we have to be careful here
because the owner of the buffers may point to either the old owner
or the new owner. Currently the verifier can't verify the owner
directly, so there is no failure case here right now. If we verify
the owner exactly in future, then we'll have to take this into
account.
This was tested in terms of normal operation via xfstests - all of
the fsr tests now pass without failure. However, we really need to
modify xfs/227 to stress v3 inodes correctly to ensure we fully
cover this case for v5 filesystems.
In terms of recovery testing, I used a hacked version of xfs_fsr
that held the temp inode open for a few seconds before exiting so
that the filesystem could be shut down with an open owner change
recovery flags set on at least the temp inode. fsr leaves the temp
inode unlinked and in btree format, so this was necessary for the
owner change to be reliably replayed.
logprint confirmed the tmp inode in the log had the correct flag set:
INO: cnt:3 total:3 a:0x69e9e0 len:56 a:0x69ea20 len:176 a:0x69eae0 len:88
INODE: #regs:3 ino:0x44 flags:0x209 dsize:88
^^^^^
0x200 is set, indicating a data fork owner change needed to be
replayed on inode 0x44. A printk in the recovery code confirmed that
the inode change was recovered:
XFS (vdc): Mounting Filesystem
XFS (vdc): Starting recovery (logdev: internal)
recovering owner change ino 0x44
XFS (vdc): Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!
Use of these features in this kernel is at your own risk!
XFS (vdc): Ending recovery (logdev: internal)
The script used to test this was:
$ cat ./recovery-fsr.sh
#!/bin/bash
dev=/dev/vdc
mntpt=/mnt/scratch
testfile=$mntpt/testfile
umount $mntpt
mkfs.xfs -f -m crc=1 $dev
mount $dev $mntpt
chmod 777 $mntpt
for i in `seq 10000 -1 0`; do
xfs_io -f -d -c "pwrite $(($i * 4096)) 4096" $testfile > /dev/null 2>&1
done
xfs_bmap -vp $testfile |head -20
xfs_fsr -d -v $testfile &
sleep 10
/home/dave/src/xfstests-dev/src/godown -f $mntpt
wait
umount $mntpt
xfs_logprint -t $dev |tail -20
time mount $dev $mntpt
xfs_bmap -vp $testfile
umount $mntpt
$
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
2013-08-30 00:23:45 +00:00
|
|
|
struct xfs_inode *
|
2012-10-08 10:56:11 +00:00
|
|
|
xfs_inode_alloc(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
xfs_ino_t ino)
|
|
|
|
{
|
|
|
|
struct xfs_inode *ip;
|
|
|
|
|
|
|
|
/*
|
2020-07-22 16:23:04 +00:00
|
|
|
* XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
|
|
|
|
* and return NULL here on ENOMEM.
|
2012-10-08 10:56:11 +00:00
|
|
|
*/
|
2022-03-22 21:41:03 +00:00
|
|
|
ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
|
2020-07-22 16:23:04 +00:00
|
|
|
|
2012-10-08 10:56:11 +00:00
|
|
|
if (inode_init_always(mp->m_super, VFS_I(ip))) {
|
2021-10-12 18:09:23 +00:00
|
|
|
kmem_cache_free(xfs_inode_cache, ip);
|
2012-10-08 10:56:11 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2024-06-11 12:06:25 +00:00
|
|
|
/* VFS doesn't initialise i_mode! */
|
2016-02-09 05:54:58 +00:00
|
|
|
VFS_I(ip)->i_mode = 0;
|
2024-08-22 13:50:18 +00:00
|
|
|
mapping_set_folio_min_order(VFS_I(ip)->i_mapping,
|
|
|
|
M_IGEO(mp)->min_folio_order);
|
2016-02-09 05:54:58 +00:00
|
|
|
|
2015-10-12 07:21:22 +00:00
|
|
|
XFS_STATS_INC(mp, vn_active);
|
2012-10-08 10:56:11 +00:00
|
|
|
ASSERT(atomic_read(&ip->i_pincount) == 0);
|
|
|
|
ASSERT(ip->i_ino == 0);
|
|
|
|
|
|
|
|
/* initialise the xfs inode */
|
|
|
|
ip->i_ino = ino;
|
|
|
|
ip->i_mount = mp;
|
|
|
|
memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
|
2016-10-03 16:11:32 +00:00
|
|
|
ip->i_cowfp = NULL;
|
xfs: make inode attribute forks a permanent part of struct xfs_inode
Syzkaller reported a UAF bug a while back:
==================================================================
BUG: KASAN: use-after-free in xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
Read of size 4 at addr ffff88802cec919c by task syz-executor262/2958
CPU: 2 PID: 2958 Comm: syz-executor262 Not tainted
5.15.0-0.30.3-20220406_1406 #3
Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7860+a7792d29
04/01/2014
Call Trace:
<TASK>
__dump_stack lib/dump_stack.c:88 [inline]
dump_stack_lvl+0x82/0xa9 lib/dump_stack.c:106
print_address_description.constprop.9+0x21/0x2d5 mm/kasan/report.c:256
__kasan_report mm/kasan/report.c:442 [inline]
kasan_report.cold.14+0x7f/0x11b mm/kasan/report.c:459
xfs_ilock_attr_map_shared+0xe3/0xf6 fs/xfs/xfs_inode.c:127
xfs_attr_get+0x378/0x4c2 fs/xfs/libxfs/xfs_attr.c:159
xfs_xattr_get+0xe3/0x150 fs/xfs/xfs_xattr.c:36
__vfs_getxattr+0xdf/0x13d fs/xattr.c:399
cap_inode_need_killpriv+0x41/0x5d security/commoncap.c:300
security_inode_need_killpriv+0x4c/0x97 security/security.c:1408
dentry_needs_remove_privs.part.28+0x21/0x63 fs/inode.c:1912
dentry_needs_remove_privs+0x80/0x9e fs/inode.c:1908
do_truncate+0xc3/0x1e0 fs/open.c:56
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
RIP: 0033:0x7f7ef4bb753d
Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73
01 c3 48 8b 0d 1b 79 2c 00 f7 d8 64 89 01 48
RSP: 002b:00007f7ef52c2ed8 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
RAX: ffffffffffffffda RBX: 0000000000404148 RCX: 00007f7ef4bb753d
RDX: 00007f7ef4bb753d RSI: 0000000000000000 RDI: 0000000020004fc0
RBP: 0000000000404140 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0030656c69662f2e
R13: 00007ffd794db37f R14: 00007ffd794db470 R15: 00007f7ef52c2fc0
</TASK>
Allocated by task 2953:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track mm/kasan/common.c:46 [inline]
set_alloc_info mm/kasan/common.c:434 [inline]
__kasan_slab_alloc+0x68/0x7c mm/kasan/common.c:467
kasan_slab_alloc include/linux/kasan.h:254 [inline]
slab_post_alloc_hook mm/slab.h:519 [inline]
slab_alloc_node mm/slub.c:3213 [inline]
slab_alloc mm/slub.c:3221 [inline]
kmem_cache_alloc+0x11b/0x3eb mm/slub.c:3226
kmem_cache_zalloc include/linux/slab.h:711 [inline]
xfs_ifork_alloc+0x25/0xa2 fs/xfs/libxfs/xfs_inode_fork.c:287
xfs_bmap_add_attrfork+0x3f2/0x9b1 fs/xfs/libxfs/xfs_bmap.c:1098
xfs_attr_set+0xe38/0x12a7 fs/xfs/libxfs/xfs_attr.c:746
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_setxattr+0x11b/0x177 fs/xattr.c:180
__vfs_setxattr_noperm+0x128/0x5e0 fs/xattr.c:214
__vfs_setxattr_locked+0x1d4/0x258 fs/xattr.c:275
vfs_setxattr+0x154/0x33d fs/xattr.c:301
setxattr+0x216/0x29f fs/xattr.c:575
__do_sys_fsetxattr fs/xattr.c:632 [inline]
__se_sys_fsetxattr fs/xattr.c:621 [inline]
__x64_sys_fsetxattr+0x243/0x2fe fs/xattr.c:621
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
Freed by task 2949:
kasan_save_stack+0x19/0x38 mm/kasan/common.c:38
kasan_set_track+0x1c/0x21 mm/kasan/common.c:46
kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:360
____kasan_slab_free mm/kasan/common.c:366 [inline]
____kasan_slab_free mm/kasan/common.c:328 [inline]
__kasan_slab_free+0xe2/0x10e mm/kasan/common.c:374
kasan_slab_free include/linux/kasan.h:230 [inline]
slab_free_hook mm/slub.c:1700 [inline]
slab_free_freelist_hook mm/slub.c:1726 [inline]
slab_free mm/slub.c:3492 [inline]
kmem_cache_free+0xdc/0x3ce mm/slub.c:3508
xfs_attr_fork_remove+0x8d/0x132 fs/xfs/libxfs/xfs_attr_leaf.c:773
xfs_attr_sf_removename+0x5dd/0x6cb fs/xfs/libxfs/xfs_attr_leaf.c:822
xfs_attr_remove_iter+0x68c/0x805 fs/xfs/libxfs/xfs_attr.c:1413
xfs_attr_remove_args+0xb1/0x10d fs/xfs/libxfs/xfs_attr.c:684
xfs_attr_set+0xf1e/0x12a7 fs/xfs/libxfs/xfs_attr.c:802
xfs_xattr_set+0xeb/0x1a9 fs/xfs/xfs_xattr.c:59
__vfs_removexattr+0x106/0x16a fs/xattr.c:468
cap_inode_killpriv+0x24/0x47 security/commoncap.c:324
security_inode_killpriv+0x54/0xa1 security/security.c:1414
setattr_prepare+0x1a6/0x897 fs/attr.c:146
xfs_vn_change_ok+0x111/0x15e fs/xfs/xfs_iops.c:682
xfs_vn_setattr_size+0x5f/0x15a fs/xfs/xfs_iops.c:1065
xfs_vn_setattr+0x125/0x2ad fs/xfs/xfs_iops.c:1093
notify_change+0xae5/0x10a1 fs/attr.c:410
do_truncate+0x134/0x1e0 fs/open.c:64
handle_truncate fs/namei.c:3084 [inline]
do_open fs/namei.c:3432 [inline]
path_openat+0x30ab/0x396d fs/namei.c:3561
do_filp_open+0x1c4/0x290 fs/namei.c:3588
do_sys_openat2+0x60d/0x98c fs/open.c:1212
do_sys_open+0xcf/0x13c fs/open.c:1228
do_syscall_x64 arch/x86/entry/common.c:50 [inline]
do_syscall_64+0x3a/0x7e arch/x86/entry/common.c:80
entry_SYSCALL_64_after_hwframe+0x44/0x0
The buggy address belongs to the object at ffff88802cec9188
which belongs to the cache xfs_ifork of size 40
The buggy address is located 20 bytes inside of
40-byte region [ffff88802cec9188, ffff88802cec91b0)
The buggy address belongs to the page:
page:00000000c3af36a1 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x2cec9
flags: 0xfffffc0000200(slab|node=0|zone=1|lastcpupid=0x1fffff)
raw: 000fffffc0000200 ffffea00009d2580 0000000600000006 ffff88801a9ffc80
raw: 0000000000000000 0000000080490049 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
ffff88802cec9080: fb fb fb fc fc fa fb fb fb fb fc fc fb fb fb fb
ffff88802cec9100: fb fc fc fb fb fb fb fb fc fc fb fb fb fb fb fc
>ffff88802cec9180: fc fa fb fb fb fb fc fc fa fb fb fb fb fc fc fb
^
ffff88802cec9200: fb fb fb fb fc fc fb fb fb fb fb fc fc fb fb fb
ffff88802cec9280: fb fb fc fc fa fb fb fb fb fc fc fa fb fb fb fb
==================================================================
The root cause of this bug is the unlocked access to xfs_inode.i_afp
from the getxattr code paths while trying to determine which ILOCK mode
to use to stabilize the xattr data. Unfortunately, the VFS does not
acquire i_rwsem when vfs_getxattr (or listxattr) call into the
filesystem, which means that getxattr can race with a removexattr that's
tearing down the attr fork and crash:
xfs_attr_set: xfs_attr_get:
xfs_attr_fork_remove: xfs_ilock_attr_map_shared:
xfs_idestroy_fork(ip->i_afp);
kmem_cache_free(xfs_ifork_cache, ip->i_afp);
if (ip->i_afp &&
ip->i_afp = NULL;
xfs_need_iread_extents(ip->i_afp))
<KABOOM>
ip->i_forkoff = 0;
Regrettably, the VFS is much more lax about i_rwsem and getxattr than
is immediately obvious -- not only does it not guarantee that we hold
i_rwsem, it actually doesn't guarantee that we *don't* hold it either.
The getxattr system call won't acquire the lock before calling XFS, but
the file capabilities code calls getxattr with and without i_rwsem held
to determine if the "security.capabilities" xattr is set on the file.
Fixing the VFS locking requires a treewide investigation into every code
path that could touch an xattr and what i_rwsem state it expects or sets
up. That could take years or even prove impossible; fortunately, we
can fix this UAF problem inside XFS.
An earlier version of this patch used smp_wmb in xfs_attr_fork_remove to
ensure that i_forkoff is always zeroed before i_afp is set to null and
changed the read paths to use smp_rmb before accessing i_forkoff and
i_afp, which avoided these UAF problems. However, the patch author was
too busy dealing with other problems in the meantime, and by the time he
came back to this issue, the situation had changed a bit.
On a modern system with selinux, each inode will always have at least
one xattr for the selinux label, so it doesn't make much sense to keep
incurring the extra pointer dereference. Furthermore, Allison's
upcoming parent pointer patchset will also cause nearly every inode in
the filesystem to have extended attributes. Therefore, make the inode
attribute fork structure part of struct xfs_inode, at a cost of 40 more
bytes.
This patch adds a clunky if_present field where necessary to maintain
the existing logic of xattr fork null pointer testing in the existing
codebase. The next patch switches the logic over to XFS_IFORK_Q and it
all goes away.
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
2022-07-09 17:56:06 +00:00
|
|
|
memset(&ip->i_af, 0, sizeof(ip->i_af));
|
|
|
|
ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
|
2018-07-17 23:51:50 +00:00
|
|
|
memset(&ip->i_df, 0, sizeof(ip->i_df));
|
2012-10-08 10:56:11 +00:00
|
|
|
ip->i_flags = 0;
|
|
|
|
ip->i_delayed_blks = 0;
|
2021-03-29 18:11:45 +00:00
|
|
|
ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
|
2021-03-29 18:11:40 +00:00
|
|
|
ip->i_nblocks = 0;
|
2021-03-29 18:11:44 +00:00
|
|
|
ip->i_forkoff = 0;
|
2019-04-12 14:40:25 +00:00
|
|
|
ip->i_sick = 0;
|
|
|
|
ip->i_checked = 0;
|
xfs: implement per-inode writeback completion queues
When scheduling writeback of dirty file data in the page cache, XFS uses
IO completion workqueue items to ensure that filesystem metadata only
updates after the write completes successfully. This is essential for
converting unwritten extents to real extents at the right time and
performing COW remappings.
Unfortunately, XFS queues each IO completion work item to an unbounded
workqueue, which means that the kernel can spawn dozens of threads to
try to handle the items quickly. These threads need to take the ILOCK
to update file metadata, which results in heavy ILOCK contention if a
large number of the work items target a single file, which is
inefficient.
Worse yet, the writeback completion threads get stuck waiting for the
ILOCK while holding transaction reservations, which can use up all
available log reservation space. When that happens, metadata updates to
other parts of the filesystem grind to a halt, even if the filesystem
could otherwise have handled it.
Even worse, if one of the things grinding to a halt happens to be a
thread in the middle of a defer-ops finish holding the same ILOCK and
trying to obtain more log reservation having exhausted the permanent
reservation, we now have an ABBA deadlock - writeback completion has a
transaction reserved and wants the ILOCK, and someone else has the ILOCK
and wants a transaction reservation.
Therefore, we create a per-inode writeback io completion queue + work
item. When writeback finishes, it can add the ioend to the per-inode
queue and let the single worker item process that queue. This
dramatically cuts down on the number of kworkers and ILOCK contention in
the system, and seems to have eliminated an occasional deadlock I was
seeing while running generic/476.
Testing with a program that simulates a heavy random-write workload to a
single file demonstrates that the number of kworkers drops from
approximately 120 threads per file to 1, without dramatically changing
write bandwidth or pagecache access latency.
Note that we leave the xfs-conv workqueue's max_active alone because we
still want to be able to run ioend processing for as many inodes as the
system can handle.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
2019-04-15 20:13:20 +00:00
|
|
|
INIT_WORK(&ip->i_ioend_work, xfs_end_io);
|
|
|
|
INIT_LIST_HEAD(&ip->i_ioend_list);
|
|
|
|
spin_lock_init(&ip->i_ioend_lock);
|
2022-07-14 01:46:43 +00:00
|
|
|
ip->i_next_unlinked = NULLAGINO;
|
2023-09-11 15:39:07 +00:00
|
|
|
ip->i_prev_unlinked = 0;
|
2012-10-08 10:56:11 +00:00
|
|
|
|
|
|
|
return ip;
|
|
|
|
}
|
|
|
|
|
|
|
|
STATIC void
|
|
|
|
xfs_inode_free_callback(
|
|
|
|
struct rcu_head *head)
|
|
|
|
{
|
|
|
|
struct inode *inode = container_of(head, struct inode, i_rcu);
|
|
|
|
struct xfs_inode *ip = XFS_I(inode);
|
|
|
|
|
2016-02-09 05:54:58 +00:00
|
|
|
switch (VFS_I(ip)->i_mode & S_IFMT) {
|
2012-10-08 10:56:11 +00:00
|
|
|
case S_IFREG:
|
|
|
|
case S_IFDIR:
|
|
|
|
case S_IFLNK:
|
2020-05-18 17:29:27 +00:00
|
|
|
xfs_idestroy_fork(&ip->i_df);
|
2012-10-08 10:56:11 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2022-07-09 17:56:06 +00:00
|
|
|
xfs_ifork_zap_attr(ip);
|
|
|
|
|
2020-05-18 17:29:27 +00:00
|
|
|
if (ip->i_cowfp) {
|
|
|
|
xfs_idestroy_fork(ip->i_cowfp);
|
2021-10-12 18:09:23 +00:00
|
|
|
kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
|
2020-05-18 17:29:27 +00:00
|
|
|
}
|
2012-10-08 10:56:11 +00:00
|
|
|
if (ip->i_itemp) {
|
2018-05-09 14:47:34 +00:00
|
|
|
ASSERT(!test_bit(XFS_LI_IN_AIL,
|
|
|
|
&ip->i_itemp->ili_item.li_flags));
|
2012-10-08 10:56:11 +00:00
|
|
|
xfs_inode_item_destroy(ip);
|
|
|
|
ip->i_itemp = NULL;
|
|
|
|
}
|
|
|
|
|
2021-10-12 18:09:23 +00:00
|
|
|
kmem_cache_free(xfs_inode_cache, ip);
|
xfs: xfs_inode_free() isn't RCU safe
The xfs_inode freed in xfs_inode_free() has multiple allocated
structures attached to it. We free these in xfs_inode_free() before
we mark the inode as invalid, and before we run call_rcu() to queue
the structure for freeing.
Unfortunately, this freeing can race with other accesses that are in
the RCU current grace period that have found the inode in the radix
tree with a valid state. This includes xfs_iflush_cluster(), which
calls xfs_inode_clean(), and that accesses the inode log item on the
xfs_inode.
The log item structure is freed in xfs_inode_free(), so there is the
possibility we can be accessing freed memory in xfs_iflush_cluster()
after validating the xfs_inode structure as being valid for this RCU
context. Hence we can get spuriously incorrect clean state returned
from such checks. This can lead to use thinking the inode is dirty
when it is, in fact, clean, and so incorrectly attaching it to the
buffer for IO and completion processing.
This then leads to use-after-free situations on the xfs_inode itself
if the IO completes after the current RCU grace period expires. The
buffer callbacks will access the xfs_inode and try to do all sorts
of things it shouldn't with freed memory.
IOWs, xfs_iflush_cluster() only works correctly when racing with
inode reclaim if the inode log item is present and correctly stating
the inode is clean. If the inode is being freed, then reclaim has
already made sure the inode is clean, and hence xfs_iflush_cluster
can skip it. However, we are accessing the inode inode under RCU
read lock protection and so also must ensure that all dynamically
allocated memory we reference in this context is not freed until the
RCU grace period expires.
To fix this, move all the potential memory freeing into
xfs_inode_free_callback() so that we are guarantee RCU protected
lookup code will always have the memory structures it needs
available during the RCU grace period that lookup races can occur
in.
Discovered-by: Brain Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-05-18 04:01:53 +00:00
|
|
|
}
|
|
|
|
|
2016-05-18 04:09:12 +00:00
|
|
|
static void
|
|
|
|
__xfs_inode_free(
|
|
|
|
struct xfs_inode *ip)
|
|
|
|
{
|
|
|
|
/* asserts to verify all state is correct here */
|
|
|
|
ASSERT(atomic_read(&ip->i_pincount) == 0);
|
2020-06-29 21:49:18 +00:00
|
|
|
ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
|
2016-05-18 04:09:12 +00:00
|
|
|
XFS_STATS_DEC(ip->i_mount, vn_active);
|
|
|
|
|
|
|
|
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
|
|
|
|
}
|
|
|
|
|
xfs: xfs_inode_free() isn't RCU safe
The xfs_inode freed in xfs_inode_free() has multiple allocated
structures attached to it. We free these in xfs_inode_free() before
we mark the inode as invalid, and before we run call_rcu() to queue
the structure for freeing.
Unfortunately, this freeing can race with other accesses that are in
the RCU current grace period that have found the inode in the radix
tree with a valid state. This includes xfs_iflush_cluster(), which
calls xfs_inode_clean(), and that accesses the inode log item on the
xfs_inode.
The log item structure is freed in xfs_inode_free(), so there is the
possibility we can be accessing freed memory in xfs_iflush_cluster()
after validating the xfs_inode structure as being valid for this RCU
context. Hence we can get spuriously incorrect clean state returned
from such checks. This can lead to use thinking the inode is dirty
when it is, in fact, clean, and so incorrectly attaching it to the
buffer for IO and completion processing.
This then leads to use-after-free situations on the xfs_inode itself
if the IO completes after the current RCU grace period expires. The
buffer callbacks will access the xfs_inode and try to do all sorts
of things it shouldn't with freed memory.
IOWs, xfs_iflush_cluster() only works correctly when racing with
inode reclaim if the inode log item is present and correctly stating
the inode is clean. If the inode is being freed, then reclaim has
already made sure the inode is clean, and hence xfs_iflush_cluster
can skip it. However, we are accessing the inode inode under RCU
read lock protection and so also must ensure that all dynamically
allocated memory we reference in this context is not freed until the
RCU grace period expires.
To fix this, move all the potential memory freeing into
xfs_inode_free_callback() so that we are guarantee RCU protected
lookup code will always have the memory structures it needs
available during the RCU grace period that lookup races can occur
in.
Discovered-by: Brain Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-05-18 04:01:53 +00:00
|
|
|
void
|
|
|
|
xfs_inode_free(
|
|
|
|
struct xfs_inode *ip)
|
|
|
|
{
|
2020-08-17 23:41:01 +00:00
|
|
|
ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
|
2016-11-09 21:23:22 +00:00
|
|
|
|
2012-10-08 10:56:11 +00:00
|
|
|
/*
|
|
|
|
* Because we use RCU freeing we need to ensure the inode always
|
|
|
|
* appears to be reclaimed with an invalid inode number when in the
|
|
|
|
* free state. The ip->i_flags_lock provides the barrier against lookup
|
|
|
|
* races.
|
|
|
|
*/
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
|
|
|
ip->i_flags = XFS_IRECLAIM;
|
|
|
|
ip->i_ino = 0;
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
|
2016-05-18 04:09:12 +00:00
|
|
|
__xfs_inode_free(ip);
|
2012-10-08 10:56:11 +00:00
|
|
|
}
|
|
|
|
|
2016-05-18 04:20:08 +00:00
|
|
|
/*
|
2020-06-29 21:49:18 +00:00
|
|
|
* Queue background inode reclaim work if there are reclaimable inodes and there
|
|
|
|
* isn't reclaim work already scheduled or in progress.
|
2016-05-18 04:20:08 +00:00
|
|
|
*/
|
|
|
|
static void
|
|
|
|
xfs_reclaim_work_queue(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2024-11-04 04:18:38 +00:00
|
|
|
if (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) {
|
2016-05-18 04:20:08 +00:00
|
|
|
queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
|
|
|
|
msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
/*
|
|
|
|
* Background scanning to trim preallocated space. This is queued based on the
|
|
|
|
* 'speculative_prealloc_lifetime' tunable (5m by default).
|
|
|
|
*/
|
|
|
|
static inline void
|
|
|
|
xfs_blockgc_queue(
|
2016-05-18 04:20:08 +00:00
|
|
|
struct xfs_perag *pag)
|
2021-05-31 18:32:02 +00:00
|
|
|
{
|
2024-11-04 04:18:38 +00:00
|
|
|
struct xfs_mount *mp = pag_mount(pag);
|
2021-08-06 18:05:42 +00:00
|
|
|
|
|
|
|
if (!xfs_is_blockgc_enabled(mp))
|
|
|
|
return;
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
rcu_read_lock();
|
|
|
|
if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
|
2024-11-04 04:18:38 +00:00
|
|
|
queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work,
|
2021-05-31 18:32:02 +00:00
|
|
|
msecs_to_jiffies(xfs_blockgc_secs * 1000));
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Set a tag on both the AG incore inode tree and the AG radix tree. */
|
|
|
|
static void
|
|
|
|
xfs_perag_set_inode_tag(
|
|
|
|
struct xfs_perag *pag,
|
|
|
|
xfs_agino_t agino,
|
|
|
|
unsigned int tag)
|
2016-05-18 04:20:08 +00:00
|
|
|
{
|
2021-05-31 18:32:02 +00:00
|
|
|
bool was_tagged;
|
2016-05-18 04:20:08 +00:00
|
|
|
|
2017-06-08 15:23:07 +00:00
|
|
|
lockdep_assert_held(&pag->pag_ici_lock);
|
2021-05-31 18:32:02 +00:00
|
|
|
|
|
|
|
was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
|
|
|
|
radix_tree_tag_set(&pag->pag_ici_root, agino, tag);
|
|
|
|
|
|
|
|
if (tag == XFS_ICI_RECLAIM_TAG)
|
|
|
|
pag->pag_ici_reclaimable++;
|
|
|
|
|
|
|
|
if (was_tagged)
|
2016-05-18 04:20:08 +00:00
|
|
|
return;
|
|
|
|
|
2024-11-04 04:18:38 +00:00
|
|
|
/* propagate the tag up into the pag xarray tree */
|
|
|
|
xfs_group_set_mark(pag_group(pag), ici_tag_to_mark(tag));
|
2016-05-18 04:20:08 +00:00
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
/* start background work */
|
|
|
|
switch (tag) {
|
|
|
|
case XFS_ICI_RECLAIM_TAG:
|
2024-11-04 04:18:38 +00:00
|
|
|
xfs_reclaim_work_queue(pag_mount(pag));
|
2021-05-31 18:32:02 +00:00
|
|
|
break;
|
|
|
|
case XFS_ICI_BLOCKGC_TAG:
|
|
|
|
xfs_blockgc_queue(pag);
|
|
|
|
break;
|
|
|
|
}
|
2016-05-18 04:20:08 +00:00
|
|
|
|
2023-02-12 22:14:52 +00:00
|
|
|
trace_xfs_perag_set_inode_tag(pag, _RET_IP_);
|
2016-05-18 04:20:08 +00:00
|
|
|
}
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
|
2016-05-18 04:20:08 +00:00
|
|
|
static void
|
2021-05-31 18:32:02 +00:00
|
|
|
xfs_perag_clear_inode_tag(
|
|
|
|
struct xfs_perag *pag,
|
|
|
|
xfs_agino_t agino,
|
|
|
|
unsigned int tag)
|
2016-05-18 04:20:08 +00:00
|
|
|
{
|
2017-06-08 15:23:07 +00:00
|
|
|
lockdep_assert_held(&pag->pag_ici_lock);
|
2021-05-31 18:32:02 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Reclaim can signal (with a null agino) that it cleared its own tag
|
|
|
|
* by removing the inode from the radix tree.
|
|
|
|
*/
|
|
|
|
if (agino != NULLAGINO)
|
|
|
|
radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
|
|
|
|
else
|
|
|
|
ASSERT(tag == XFS_ICI_RECLAIM_TAG);
|
|
|
|
|
|
|
|
if (tag == XFS_ICI_RECLAIM_TAG)
|
|
|
|
pag->pag_ici_reclaimable--;
|
|
|
|
|
|
|
|
if (radix_tree_tagged(&pag->pag_ici_root, tag))
|
2016-05-18 04:20:08 +00:00
|
|
|
return;
|
|
|
|
|
2024-11-04 04:18:38 +00:00
|
|
|
/* clear the tag from the pag xarray */
|
|
|
|
xfs_group_clear_mark(pag_group(pag), ici_tag_to_mark(tag));
|
2023-02-12 22:14:52 +00:00
|
|
|
trace_xfs_perag_clear_inode_tag(pag, _RET_IP_);
|
2021-05-31 18:32:02 +00:00
|
|
|
}
|
2016-05-18 04:20:08 +00:00
|
|
|
|
2024-08-29 04:08:38 +00:00
|
|
|
/*
|
2024-08-29 04:08:39 +00:00
|
|
|
* Find the next AG after @pag, or the first AG if @pag is NULL.
|
2024-08-29 04:08:38 +00:00
|
|
|
*/
|
|
|
|
static struct xfs_perag *
|
2024-08-29 04:08:39 +00:00
|
|
|
xfs_perag_grab_next_tag(
|
2024-08-29 04:08:38 +00:00
|
|
|
struct xfs_mount *mp,
|
2024-08-29 04:08:39 +00:00
|
|
|
struct xfs_perag *pag,
|
2024-08-29 04:08:38 +00:00
|
|
|
int tag)
|
|
|
|
{
|
2024-11-04 04:18:38 +00:00
|
|
|
return to_perag(xfs_group_grab_next_mark(mp,
|
|
|
|
pag ? pag_group(pag) : NULL,
|
|
|
|
ici_tag_to_mark(tag), XG_TYPE_AG));
|
2024-08-29 04:08:38 +00:00
|
|
|
}
|
|
|
|
|
2016-02-09 05:54:58 +00:00
|
|
|
/*
|
|
|
|
* When we recycle a reclaimable inode, we need to re-initialise the VFS inode
|
|
|
|
* part of the structure. This is made more complex by the fact we store
|
|
|
|
* information about the on-disk values in the VFS inode and so we can't just
|
2016-02-09 05:54:58 +00:00
|
|
|
* overwrite the values unconditionally. Hence we save the parameters we
|
2016-02-09 05:54:58 +00:00
|
|
|
* need to retain across reinitialisation, and rewrite them into the VFS inode
|
2016-02-09 05:54:58 +00:00
|
|
|
* after reinitialisation even if it fails.
|
2016-02-09 05:54:58 +00:00
|
|
|
*/
|
|
|
|
static int
|
|
|
|
xfs_reinit_inode(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct inode *inode)
|
|
|
|
{
|
2021-06-18 18:57:05 +00:00
|
|
|
int error;
|
|
|
|
uint32_t nlink = inode->i_nlink;
|
|
|
|
uint32_t generation = inode->i_generation;
|
|
|
|
uint64_t version = inode_peek_iversion(inode);
|
|
|
|
umode_t mode = inode->i_mode;
|
|
|
|
dev_t dev = inode->i_rdev;
|
|
|
|
kuid_t uid = inode->i_uid;
|
|
|
|
kgid_t gid = inode->i_gid;
|
2024-06-11 12:06:23 +00:00
|
|
|
unsigned long state = inode->i_state;
|
2016-02-09 05:54:58 +00:00
|
|
|
|
|
|
|
error = inode_init_always(mp->m_super, inode);
|
|
|
|
|
2016-02-09 05:54:58 +00:00
|
|
|
set_nlink(inode, nlink);
|
2016-02-09 05:54:58 +00:00
|
|
|
inode->i_generation = generation;
|
2017-12-11 11:35:19 +00:00
|
|
|
inode_set_iversion_queried(inode, version);
|
2016-02-09 05:54:58 +00:00
|
|
|
inode->i_mode = mode;
|
2018-01-26 19:24:40 +00:00
|
|
|
inode->i_rdev = dev;
|
2020-02-21 16:31:26 +00:00
|
|
|
inode->i_uid = uid;
|
|
|
|
inode->i_gid = gid;
|
2024-06-11 12:06:23 +00:00
|
|
|
inode->i_state = state;
|
2024-08-22 13:50:18 +00:00
|
|
|
mapping_set_folio_min_order(inode->i_mapping,
|
|
|
|
M_IGEO(mp)->min_folio_order);
|
2016-02-09 05:54:58 +00:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2021-06-18 18:57:05 +00:00
|
|
|
/*
|
|
|
|
* Carefully nudge an inode whose VFS state has been torn down back into a
|
|
|
|
* usable state. Drops the i_flags_lock and the rcu read lock.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
xfs_iget_recycle(
|
|
|
|
struct xfs_perag *pag,
|
|
|
|
struct xfs_inode *ip) __releases(&ip->i_flags_lock)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
struct inode *inode = VFS_I(ip);
|
|
|
|
int error;
|
|
|
|
|
|
|
|
trace_xfs_iget_recycle(ip);
|
|
|
|
|
2022-11-17 21:02:56 +00:00
|
|
|
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
|
|
|
|
return -EAGAIN;
|
|
|
|
|
2021-06-18 18:57:05 +00:00
|
|
|
/*
|
|
|
|
* We need to make it look like the inode is being reclaimed to prevent
|
|
|
|
* the actual reclaim workers from stomping over us while we recycle
|
|
|
|
* the inode. We can't clear the radix tree tag yet as it requires
|
|
|
|
* pag_ici_lock to be held exclusive.
|
|
|
|
*/
|
|
|
|
ip->i_flags |= XFS_IRECLAIM;
|
|
|
|
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
|
|
|
|
error = xfs_reinit_inode(mp, inode);
|
2022-11-17 21:02:56 +00:00
|
|
|
xfs_iunlock(ip, XFS_ILOCK_EXCL);
|
2021-06-18 18:57:05 +00:00
|
|
|
if (error) {
|
|
|
|
/*
|
|
|
|
* Re-initializing the inode failed, and we are in deep
|
|
|
|
* trouble. Try to re-add it to the reclaim list.
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
|
|
|
ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
|
|
|
|
ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
trace_xfs_iget_recycle_fail(ip);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
spin_lock(&pag->pag_ici_lock);
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear the per-lifetime state in the inode as we are now effectively
|
|
|
|
* a new inode and need to return to the initial state before reuse
|
|
|
|
* occurs.
|
|
|
|
*/
|
|
|
|
ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
|
|
|
|
ip->i_flags |= XFS_INEW;
|
|
|
|
xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
|
|
|
|
XFS_ICI_RECLAIM_TAG);
|
|
|
|
inode->i_state = I_NEW;
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
spin_unlock(&pag->pag_ici_lock);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-04-18 00:17:34 +00:00
|
|
|
/*
|
|
|
|
* If we are allocating a new inode, then check what was returned is
|
|
|
|
* actually a free, empty inode. If we are not allocating an inode,
|
|
|
|
* then check we didn't find a free inode.
|
|
|
|
*
|
|
|
|
* Returns:
|
|
|
|
* 0 if the inode free state matches the lookup context
|
|
|
|
* -ENOENT if the inode is free and we are not allocating
|
|
|
|
* -EFSCORRUPTED if there is any state mismatch at all
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
xfs_iget_check_free_state(
|
|
|
|
struct xfs_inode *ip,
|
|
|
|
int flags)
|
|
|
|
{
|
|
|
|
if (flags & XFS_IGET_CREATE) {
|
|
|
|
/* should be a free inode */
|
|
|
|
if (VFS_I(ip)->i_mode != 0) {
|
|
|
|
xfs_warn(ip->i_mount,
|
|
|
|
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
|
|
|
|
ip->i_ino, VFS_I(ip)->i_mode);
|
2024-02-22 20:32:43 +00:00
|
|
|
xfs_agno_mark_sick(ip->i_mount,
|
|
|
|
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
|
|
|
|
XFS_SICK_AG_INOBT);
|
2018-04-18 00:17:34 +00:00
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
2021-03-29 18:11:40 +00:00
|
|
|
if (ip->i_nblocks != 0) {
|
2018-04-18 00:17:34 +00:00
|
|
|
xfs_warn(ip->i_mount,
|
|
|
|
"Corruption detected! Free inode 0x%llx has blocks allocated!",
|
|
|
|
ip->i_ino);
|
2024-02-22 20:32:43 +00:00
|
|
|
xfs_agno_mark_sick(ip->i_mount,
|
|
|
|
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
|
|
|
|
XFS_SICK_AG_INOBT);
|
2018-04-18 00:17:34 +00:00
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* should be an allocated inode */
|
|
|
|
if (VFS_I(ip)->i_mode == 0)
|
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
/* Make all pending inactivation work start immediately. */
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
static bool
|
2021-08-06 18:05:39 +00:00
|
|
|
xfs_inodegc_queue_all(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
struct xfs_inodegc *gc;
|
|
|
|
int cpu;
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
bool ret = false;
|
2021-08-06 18:05:39 +00:00
|
|
|
|
2023-09-11 15:39:03 +00:00
|
|
|
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
|
2021-08-06 18:05:39 +00:00
|
|
|
gc = per_cpu_ptr(mp->m_inodegc, cpu);
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
if (!llist_empty(&gc->list)) {
|
2022-06-16 14:44:31 +00:00
|
|
|
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
ret = true;
|
|
|
|
}
|
2021-08-06 18:05:39 +00:00
|
|
|
}
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
|
|
|
|
return ret;
|
2021-08-06 18:05:39 +00:00
|
|
|
}
|
|
|
|
|
2023-06-05 04:48:15 +00:00
|
|
|
/* Wait for all queued work and collect errors */
|
|
|
|
static int
|
|
|
|
xfs_inodegc_wait_all(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
flush_workqueue(mp->m_inodegc_wq);
|
2023-09-11 15:39:03 +00:00
|
|
|
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
|
2023-06-05 04:48:15 +00:00
|
|
|
struct xfs_inodegc *gc;
|
|
|
|
|
|
|
|
gc = per_cpu_ptr(mp->m_inodegc, cpu);
|
|
|
|
if (gc->error && !error)
|
|
|
|
error = gc->error;
|
|
|
|
gc->error = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2012-10-08 10:56:11 +00:00
|
|
|
/*
|
|
|
|
* Check the validity of the inode we just found it the cache
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
xfs_iget_cache_hit(
|
|
|
|
struct xfs_perag *pag,
|
|
|
|
struct xfs_inode *ip,
|
|
|
|
xfs_ino_t ino,
|
|
|
|
int flags,
|
|
|
|
int lock_flags) __releases(RCU)
|
|
|
|
{
|
|
|
|
struct inode *inode = VFS_I(ip);
|
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* check for re-use of an inode within an RCU grace period due to the
|
|
|
|
* radix tree nodes not being updated yet. We monitor for this by
|
|
|
|
* setting the inode number to zero before freeing the inode structure.
|
|
|
|
* If the inode has been reallocated and set up, then the inode number
|
|
|
|
* will not match, so check for that, too.
|
|
|
|
*/
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
2021-06-18 18:57:06 +00:00
|
|
|
if (ip->i_ino != ino)
|
|
|
|
goto out_skip;
|
2012-10-08 10:56:11 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are racing with another cache hit that is currently
|
|
|
|
* instantiating this inode or currently recycling it out of
|
2021-06-18 18:57:05 +00:00
|
|
|
* reclaimable state, wait for the initialisation to complete
|
2012-10-08 10:56:11 +00:00
|
|
|
* before continuing.
|
|
|
|
*
|
2021-08-06 18:05:39 +00:00
|
|
|
* If we're racing with the inactivation worker we also want to wait.
|
|
|
|
* If we're creating a new file, it's possible that the worker
|
|
|
|
* previously marked the inode as free on disk but hasn't finished
|
|
|
|
* updating the incore state yet. The AGI buffer will be dirty and
|
|
|
|
* locked to the icreate transaction, so a synchronous push of the
|
|
|
|
* inodegc workers would result in deadlock. For a regular iget, the
|
|
|
|
* worker is running already, so we might as well wait.
|
|
|
|
*
|
2012-10-08 10:56:11 +00:00
|
|
|
* XXX(hch): eventually we should do something equivalent to
|
|
|
|
* wait_on_inode to wait for these flags to be cleared
|
|
|
|
* instead of polling for it.
|
|
|
|
*/
|
2021-08-06 18:05:39 +00:00
|
|
|
if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
|
2021-06-18 18:57:06 +00:00
|
|
|
goto out_skip;
|
2012-10-08 10:56:11 +00:00
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
if (ip->i_flags & XFS_NEED_INACTIVE) {
|
|
|
|
/* Unlinked inodes cannot be re-grabbed. */
|
|
|
|
if (VFS_I(ip)->i_nlink == 0) {
|
|
|
|
error = -ENOENT;
|
|
|
|
goto out_error;
|
|
|
|
}
|
|
|
|
goto out_inodegc_flush;
|
|
|
|
}
|
|
|
|
|
2012-10-08 10:56:11 +00:00
|
|
|
/*
|
2018-04-18 00:17:34 +00:00
|
|
|
* Check the inode free state is valid. This also detects lookup
|
|
|
|
* racing with unlinks.
|
2012-10-08 10:56:11 +00:00
|
|
|
*/
|
2018-04-18 00:17:34 +00:00
|
|
|
error = xfs_iget_check_free_state(ip, flags);
|
|
|
|
if (error)
|
2012-10-08 10:56:11 +00:00
|
|
|
goto out_error;
|
|
|
|
|
2021-06-18 18:57:06 +00:00
|
|
|
/* Skip inodes that have no vfs state. */
|
|
|
|
if ((flags & XFS_IGET_INCORE) &&
|
|
|
|
(ip->i_flags & XFS_IRECLAIMABLE))
|
|
|
|
goto out_skip;
|
2017-06-19 15:58:56 +00:00
|
|
|
|
2021-06-18 18:57:06 +00:00
|
|
|
/* The inode fits the selection criteria; process it. */
|
|
|
|
if (ip->i_flags & XFS_IRECLAIMABLE) {
|
2021-06-18 18:57:05 +00:00
|
|
|
/* Drops i_flags_lock and RCU read lock. */
|
|
|
|
error = xfs_iget_recycle(pag, ip);
|
2022-11-17 21:02:56 +00:00
|
|
|
if (error == -EAGAIN)
|
|
|
|
goto out_skip;
|
2021-06-18 18:57:05 +00:00
|
|
|
if (error)
|
|
|
|
return error;
|
2012-10-08 10:56:11 +00:00
|
|
|
} else {
|
|
|
|
/* If the VFS inode is being torn down, pause and try again. */
|
2021-06-18 18:57:06 +00:00
|
|
|
if (!igrab(inode))
|
|
|
|
goto out_skip;
|
2012-10-08 10:56:11 +00:00
|
|
|
|
|
|
|
/* We've got a live one. */
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
rcu_read_unlock();
|
|
|
|
trace_xfs_iget_hit(ip);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (lock_flags != 0)
|
|
|
|
xfs_ilock(ip, lock_flags);
|
|
|
|
|
2017-06-19 15:58:56 +00:00
|
|
|
if (!(flags & XFS_IGET_INCORE))
|
2020-04-30 14:41:37 +00:00
|
|
|
xfs_iflags_clear(ip, XFS_ISTALE);
|
2015-10-12 07:21:22 +00:00
|
|
|
XFS_STATS_INC(mp, xs_ig_found);
|
2012-10-08 10:56:11 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
2021-06-18 18:57:06 +00:00
|
|
|
out_skip:
|
|
|
|
trace_xfs_iget_skip(ip);
|
|
|
|
XFS_STATS_INC(mp, xs_ig_frecycle);
|
|
|
|
error = -EAGAIN;
|
2012-10-08 10:56:11 +00:00
|
|
|
out_error:
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
rcu_read_unlock();
|
|
|
|
return error;
|
2021-08-06 18:05:39 +00:00
|
|
|
|
|
|
|
out_inodegc_flush:
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
rcu_read_unlock();
|
|
|
|
/*
|
|
|
|
* Do not wait for the workers, because the caller could hold an AGI
|
|
|
|
* buffer lock. We're just going to sleep in a loop anyway.
|
|
|
|
*/
|
|
|
|
if (xfs_is_inodegc_enabled(mp))
|
|
|
|
xfs_inodegc_queue_all(mp);
|
|
|
|
return -EAGAIN;
|
2012-10-08 10:56:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
xfs_iget_cache_miss(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_perag *pag,
|
|
|
|
xfs_trans_t *tp,
|
|
|
|
xfs_ino_t ino,
|
|
|
|
struct xfs_inode **ipp,
|
|
|
|
int flags,
|
|
|
|
int lock_flags)
|
|
|
|
{
|
|
|
|
struct xfs_inode *ip;
|
|
|
|
int error;
|
|
|
|
xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
|
|
|
|
|
|
|
|
ip = xfs_inode_alloc(mp, ino);
|
|
|
|
if (!ip)
|
2014-06-25 04:58:08 +00:00
|
|
|
return -ENOMEM;
|
2012-10-08 10:56:11 +00:00
|
|
|
|
2023-02-12 22:14:52 +00:00
|
|
|
error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags);
|
2012-10-08 10:56:11 +00:00
|
|
|
if (error)
|
|
|
|
goto out_destroy;
|
|
|
|
|
2020-05-14 21:01:19 +00:00
|
|
|
/*
|
|
|
|
* For version 5 superblocks, if we are initialising a new inode and we
|
2021-08-19 01:46:52 +00:00
|
|
|
* are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
|
2020-05-14 21:01:19 +00:00
|
|
|
* simply build the new inode core with a random generation number.
|
|
|
|
*
|
|
|
|
* For version 4 (and older) superblocks, log recovery is dependent on
|
2021-03-29 18:11:42 +00:00
|
|
|
* the i_flushiter field being initialised from the current on-disk
|
2020-05-14 21:01:19 +00:00
|
|
|
* value and hence we must also read the inode off disk even when
|
|
|
|
* initializing new inodes.
|
|
|
|
*/
|
2021-08-19 01:46:37 +00:00
|
|
|
if (xfs_has_v3inodes(mp) &&
|
2021-08-19 01:46:52 +00:00
|
|
|
(flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
|
2022-10-05 15:43:22 +00:00
|
|
|
VFS_I(ip)->i_generation = get_random_u32();
|
2020-05-14 21:01:19 +00:00
|
|
|
} else {
|
|
|
|
struct xfs_buf *bp;
|
|
|
|
|
2021-03-29 18:11:37 +00:00
|
|
|
error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
|
2020-05-14 21:01:19 +00:00
|
|
|
if (error)
|
|
|
|
goto out_destroy;
|
|
|
|
|
2021-03-29 18:11:37 +00:00
|
|
|
error = xfs_inode_from_disk(ip,
|
|
|
|
xfs_buf_offset(bp, ip->i_imap.im_boffset));
|
2020-05-14 21:01:19 +00:00
|
|
|
if (!error)
|
|
|
|
xfs_buf_set_ref(bp, XFS_INO_REF);
|
2024-02-22 20:32:43 +00:00
|
|
|
else
|
|
|
|
xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
|
2020-05-14 21:01:19 +00:00
|
|
|
xfs_trans_brelse(tp, bp);
|
|
|
|
|
|
|
|
if (error)
|
|
|
|
goto out_destroy;
|
|
|
|
}
|
|
|
|
|
2012-10-08 10:56:11 +00:00
|
|
|
trace_xfs_iget_miss(ip);
|
|
|
|
|
xfs: catch inode allocation state mismatch corruption
We recently came across a V4 filesystem causing memory corruption
due to a newly allocated inode being setup twice and being added to
the superblock inode list twice. From code inspection, the only way
this could happen is if a newly allocated inode was not marked as
free on disk (i.e. di_mode wasn't zero).
Running the metadump on an upstream debug kernel fails during inode
allocation like so:
XFS: Assertion failed: ip->i_d.di_nblocks == 0, file: fs/xfs/xfs_inod=
e.c, line: 838
------------[ cut here ]------------
kernel BUG at fs/xfs/xfs_message.c:114!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 11 PID: 3496 Comm: mkdir Not tainted 4.16.0-rc5-dgc #442
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/0=
1/2014
RIP: 0010:assfail+0x28/0x30
RSP: 0018:ffffc9000236fc80 EFLAGS: 00010202
RAX: 00000000ffffffea RBX: 0000000000004000 RCX: 0000000000000000
RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff8227211b
RBP: ffffc9000236fce8 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000bec R11: f000000000000000 R12: ffffc9000236fd30
R13: ffff8805c76bab80 R14: ffff8805c77ac800 R15: ffff88083fb12e10
FS: 00007fac8cbff040(0000) GS:ffff88083fd00000(0000) knlGS:0000000000000=
000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fffa6783ff8 CR3: 00000005c6e2b003 CR4: 00000000000606e0
Call Trace:
xfs_ialloc+0x383/0x570
xfs_dir_ialloc+0x6a/0x2a0
xfs_create+0x412/0x670
xfs_generic_create+0x1f7/0x2c0
? capable_wrt_inode_uidgid+0x3f/0x50
vfs_mkdir+0xfb/0x1b0
SyS_mkdir+0xcf/0xf0
do_syscall_64+0x73/0x1a0
entry_SYSCALL_64_after_hwframe+0x42/0xb7
Extracting the inode number we crashed on from an event trace and
looking at it with xfs_db:
xfs_db> inode 184452204
xfs_db> p
core.magic = 0x494e
core.mode = 0100644
core.version = 2
core.format = 2 (extents)
core.nlinkv2 = 1
core.onlink = 0
.....
Confirms that it is not a free inode on disk. xfs_repair
also trips over this inode:
.....
zero length extent (off = 0, fsbno = 0) in ino 184452204
correcting nextents for inode 184452204
bad attribute fork in inode 184452204, would clear attr fork
bad nblocks 1 for inode 184452204, would reset to 0
bad anextents 1 for inode 184452204, would reset to 0
imap claims in-use inode 184452204 is free, would correct imap
would have cleared inode 184452204
.....
disconnected inode 184452204, would move to lost+found
And so we have a situation where the directory structure and the
inobt thinks the inode is free, but the inode on disk thinks it is
still in use. Where this corruption came from is not possible to
diagnose, but we can detect it and prevent the kernel from oopsing
on lookup. The reproducer now results in:
$ sudo mkdir /mnt/scratch/{0,1,2,3,4,5}{0,1,2,3,4,5}
mkdir: cannot create directory =E2=80=98/mnt/scratch/00=E2=80=99: File ex=
ists
mkdir: cannot create directory =E2=80=98/mnt/scratch/01=E2=80=99: File ex=
ists
mkdir: cannot create directory =E2=80=98/mnt/scratch/03=E2=80=99: Structu=
re needs cleaning
mkdir: cannot create directory =E2=80=98/mnt/scratch/04=E2=80=99: Input/o=
utput error
mkdir: cannot create directory =E2=80=98/mnt/scratch/05=E2=80=99: Input/o=
utput error
....
And this corruption shutdown:
[ 54.843517] XFS (loop0): Corruption detected! Free inode 0xafe846c not=
marked free on disk
[ 54.845885] XFS (loop0): Internal error xfs_trans_cancel at line 1023 =
of file fs/xfs/xfs_trans.c. Caller xfs_create+0x425/0x670
[ 54.848994] CPU: 10 PID: 3541 Comm: mkdir Not tainted 4.16.0-rc5-dgc #=
443
[ 54.850753] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO=
S 1.10.2-1 04/01/2014
[ 54.852859] Call Trace:
[ 54.853531] dump_stack+0x85/0xc5
[ 54.854385] xfs_trans_cancel+0x197/0x1c0
[ 54.855421] xfs_create+0x425/0x670
[ 54.856314] xfs_generic_create+0x1f7/0x2c0
[ 54.857390] ? capable_wrt_inode_uidgid+0x3f/0x50
[ 54.858586] vfs_mkdir+0xfb/0x1b0
[ 54.859458] SyS_mkdir+0xcf/0xf0
[ 54.860254] do_syscall_64+0x73/0x1a0
[ 54.861193] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 54.862492] RIP: 0033:0x7fb73bddf547
[ 54.863358] RSP: 002b:00007ffdaa553338 EFLAGS: 00000246 ORIG_RAX: 0000=
000000000053
[ 54.865133] RAX: ffffffffffffffda RBX: 00007ffdaa55449a RCX: 00007fb73=
bddf547
[ 54.866766] RDX: 0000000000000001 RSI: 00000000000001ff RDI: 00007ffda=
a55449a
[ 54.868432] RBP: 00007ffdaa55449a R08: 00000000000001ff R09: 00005623a=
8670dd0
[ 54.870110] R10: 00007fb73be72d5b R11: 0000000000000246 R12: 000000000=
00001ff
[ 54.871752] R13: 00007ffdaa5534b0 R14: 0000000000000000 R15: 00007ffda=
a553500
[ 54.873429] XFS (loop0): xfs_do_force_shutdown(0x8) called from line 1=
024 of file fs/xfs/xfs_trans.c. Return address = ffffffff814cd050
[ 54.882790] XFS (loop0): Corruption of in-memory data detected. Shutt=
ing down filesystem
[ 54.884597] XFS (loop0): Please umount the filesystem and rectify the =
problem(s)
Note that this crash is only possible on v4 filesystemsi or v5
filesystems mounted with the ikeep mount option. For all other V5
filesystems, this problem cannot occur because we don't read inodes
we are allocating from disk - we simply overwrite them with the new
inode information.
Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Tested-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2018-03-23 17:22:53 +00:00
|
|
|
/*
|
2018-04-18 00:17:34 +00:00
|
|
|
* Check the inode free state is valid. This also detects lookup
|
|
|
|
* racing with unlinks.
|
xfs: catch inode allocation state mismatch corruption
We recently came across a V4 filesystem causing memory corruption
due to a newly allocated inode being setup twice and being added to
the superblock inode list twice. From code inspection, the only way
this could happen is if a newly allocated inode was not marked as
free on disk (i.e. di_mode wasn't zero).
Running the metadump on an upstream debug kernel fails during inode
allocation like so:
XFS: Assertion failed: ip->i_d.di_nblocks == 0, file: fs/xfs/xfs_inod=
e.c, line: 838
------------[ cut here ]------------
kernel BUG at fs/xfs/xfs_message.c:114!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 11 PID: 3496 Comm: mkdir Not tainted 4.16.0-rc5-dgc #442
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/0=
1/2014
RIP: 0010:assfail+0x28/0x30
RSP: 0018:ffffc9000236fc80 EFLAGS: 00010202
RAX: 00000000ffffffea RBX: 0000000000004000 RCX: 0000000000000000
RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff8227211b
RBP: ffffc9000236fce8 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000bec R11: f000000000000000 R12: ffffc9000236fd30
R13: ffff8805c76bab80 R14: ffff8805c77ac800 R15: ffff88083fb12e10
FS: 00007fac8cbff040(0000) GS:ffff88083fd00000(0000) knlGS:0000000000000=
000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fffa6783ff8 CR3: 00000005c6e2b003 CR4: 00000000000606e0
Call Trace:
xfs_ialloc+0x383/0x570
xfs_dir_ialloc+0x6a/0x2a0
xfs_create+0x412/0x670
xfs_generic_create+0x1f7/0x2c0
? capable_wrt_inode_uidgid+0x3f/0x50
vfs_mkdir+0xfb/0x1b0
SyS_mkdir+0xcf/0xf0
do_syscall_64+0x73/0x1a0
entry_SYSCALL_64_after_hwframe+0x42/0xb7
Extracting the inode number we crashed on from an event trace and
looking at it with xfs_db:
xfs_db> inode 184452204
xfs_db> p
core.magic = 0x494e
core.mode = 0100644
core.version = 2
core.format = 2 (extents)
core.nlinkv2 = 1
core.onlink = 0
.....
Confirms that it is not a free inode on disk. xfs_repair
also trips over this inode:
.....
zero length extent (off = 0, fsbno = 0) in ino 184452204
correcting nextents for inode 184452204
bad attribute fork in inode 184452204, would clear attr fork
bad nblocks 1 for inode 184452204, would reset to 0
bad anextents 1 for inode 184452204, would reset to 0
imap claims in-use inode 184452204 is free, would correct imap
would have cleared inode 184452204
.....
disconnected inode 184452204, would move to lost+found
And so we have a situation where the directory structure and the
inobt thinks the inode is free, but the inode on disk thinks it is
still in use. Where this corruption came from is not possible to
diagnose, but we can detect it and prevent the kernel from oopsing
on lookup. The reproducer now results in:
$ sudo mkdir /mnt/scratch/{0,1,2,3,4,5}{0,1,2,3,4,5}
mkdir: cannot create directory =E2=80=98/mnt/scratch/00=E2=80=99: File ex=
ists
mkdir: cannot create directory =E2=80=98/mnt/scratch/01=E2=80=99: File ex=
ists
mkdir: cannot create directory =E2=80=98/mnt/scratch/03=E2=80=99: Structu=
re needs cleaning
mkdir: cannot create directory =E2=80=98/mnt/scratch/04=E2=80=99: Input/o=
utput error
mkdir: cannot create directory =E2=80=98/mnt/scratch/05=E2=80=99: Input/o=
utput error
....
And this corruption shutdown:
[ 54.843517] XFS (loop0): Corruption detected! Free inode 0xafe846c not=
marked free on disk
[ 54.845885] XFS (loop0): Internal error xfs_trans_cancel at line 1023 =
of file fs/xfs/xfs_trans.c. Caller xfs_create+0x425/0x670
[ 54.848994] CPU: 10 PID: 3541 Comm: mkdir Not tainted 4.16.0-rc5-dgc #=
443
[ 54.850753] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO=
S 1.10.2-1 04/01/2014
[ 54.852859] Call Trace:
[ 54.853531] dump_stack+0x85/0xc5
[ 54.854385] xfs_trans_cancel+0x197/0x1c0
[ 54.855421] xfs_create+0x425/0x670
[ 54.856314] xfs_generic_create+0x1f7/0x2c0
[ 54.857390] ? capable_wrt_inode_uidgid+0x3f/0x50
[ 54.858586] vfs_mkdir+0xfb/0x1b0
[ 54.859458] SyS_mkdir+0xcf/0xf0
[ 54.860254] do_syscall_64+0x73/0x1a0
[ 54.861193] entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 54.862492] RIP: 0033:0x7fb73bddf547
[ 54.863358] RSP: 002b:00007ffdaa553338 EFLAGS: 00000246 ORIG_RAX: 0000=
000000000053
[ 54.865133] RAX: ffffffffffffffda RBX: 00007ffdaa55449a RCX: 00007fb73=
bddf547
[ 54.866766] RDX: 0000000000000001 RSI: 00000000000001ff RDI: 00007ffda=
a55449a
[ 54.868432] RBP: 00007ffdaa55449a R08: 00000000000001ff R09: 00005623a=
8670dd0
[ 54.870110] R10: 00007fb73be72d5b R11: 0000000000000246 R12: 000000000=
00001ff
[ 54.871752] R13: 00007ffdaa5534b0 R14: 0000000000000000 R15: 00007ffda=
a553500
[ 54.873429] XFS (loop0): xfs_do_force_shutdown(0x8) called from line 1=
024 of file fs/xfs/xfs_trans.c. Return address = ffffffff814cd050
[ 54.882790] XFS (loop0): Corruption of in-memory data detected. Shutt=
ing down filesystem
[ 54.884597] XFS (loop0): Please umount the filesystem and rectify the =
problem(s)
Note that this crash is only possible on v4 filesystemsi or v5
filesystems mounted with the ikeep mount option. For all other V5
filesystems, this problem cannot occur because we don't read inodes
we are allocating from disk - we simply overwrite them with the new
inode information.
Signed-Off-By: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Tested-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2018-03-23 17:22:53 +00:00
|
|
|
*/
|
2018-04-18 00:17:34 +00:00
|
|
|
error = xfs_iget_check_free_state(ip, flags);
|
|
|
|
if (error)
|
2012-10-08 10:56:11 +00:00
|
|
|
goto out_destroy;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Preload the radix tree so we can insert safely under the
|
|
|
|
* write spinlock. Note that we cannot sleep inside the preload
|
2024-01-15 22:59:45 +00:00
|
|
|
* region.
|
2012-10-08 10:56:11 +00:00
|
|
|
*/
|
2024-01-15 22:59:45 +00:00
|
|
|
if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
|
2014-06-25 04:58:08 +00:00
|
|
|
error = -EAGAIN;
|
2012-10-08 10:56:11 +00:00
|
|
|
goto out_destroy;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Because the inode hasn't been added to the radix-tree yet it can't
|
|
|
|
* be found by another thread, so we can do the non-sleeping lock here.
|
|
|
|
*/
|
|
|
|
if (lock_flags) {
|
|
|
|
if (!xfs_ilock_nowait(ip, lock_flags))
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* These values must be set before inserting the inode into the radix
|
|
|
|
* tree as the moment it is inserted a concurrent lookup (allowed by the
|
|
|
|
* RCU locking mechanism) can find it and that lookup must see that this
|
|
|
|
* is an inode currently under construction (i.e. that XFS_INEW is set).
|
|
|
|
* The ip->i_flags_lock that protects the XFS_INEW flag forms the
|
|
|
|
* memory barrier that ensures this detection works correctly at lookup
|
|
|
|
* time.
|
|
|
|
*/
|
|
|
|
if (flags & XFS_IGET_DONTCACHE)
|
2020-04-30 14:41:37 +00:00
|
|
|
d_mark_dontcache(VFS_I(ip));
|
2013-06-27 22:25:07 +00:00
|
|
|
ip->i_udquot = NULL;
|
|
|
|
ip->i_gdquot = NULL;
|
2013-07-11 05:00:40 +00:00
|
|
|
ip->i_pdquot = NULL;
|
2024-05-02 14:48:37 +00:00
|
|
|
xfs_iflags_set(ip, XFS_INEW);
|
2012-10-08 10:56:11 +00:00
|
|
|
|
|
|
|
/* insert the new inode */
|
|
|
|
spin_lock(&pag->pag_ici_lock);
|
|
|
|
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
|
|
|
|
if (unlikely(error)) {
|
|
|
|
WARN_ON(error != -EEXIST);
|
2015-10-12 07:21:22 +00:00
|
|
|
XFS_STATS_INC(mp, xs_ig_dup);
|
2014-06-25 04:58:08 +00:00
|
|
|
error = -EAGAIN;
|
2012-10-08 10:56:11 +00:00
|
|
|
goto out_preload_end;
|
|
|
|
}
|
|
|
|
spin_unlock(&pag->pag_ici_lock);
|
|
|
|
radix_tree_preload_end();
|
|
|
|
|
|
|
|
*ipp = ip;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_preload_end:
|
|
|
|
spin_unlock(&pag->pag_ici_lock);
|
|
|
|
radix_tree_preload_end();
|
|
|
|
if (lock_flags)
|
|
|
|
xfs_iunlock(ip, lock_flags);
|
|
|
|
out_destroy:
|
|
|
|
__destroy_inode(VFS_I(ip));
|
|
|
|
xfs_inode_free(ip);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-06-29 21:49:18 +00:00
|
|
|
* Look up an inode by number in the given file system. The inode is looked up
|
|
|
|
* in the cache held in each AG. If the inode is found in the cache, initialise
|
|
|
|
* the vfs inode if necessary.
|
2012-10-08 10:56:11 +00:00
|
|
|
*
|
2020-06-29 21:49:18 +00:00
|
|
|
* If it is not in core, read it in from the file system's device, add it to the
|
|
|
|
* cache and initialise the vfs inode.
|
2012-10-08 10:56:11 +00:00
|
|
|
*
|
|
|
|
* The inode is locked according to the value of the lock_flags parameter.
|
2020-06-29 21:49:18 +00:00
|
|
|
* Inode lookup is only done during metadata operations and not as part of the
|
|
|
|
* data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
|
2012-10-08 10:56:11 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_iget(
|
2020-06-29 21:49:18 +00:00
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_trans *tp,
|
|
|
|
xfs_ino_t ino,
|
|
|
|
uint flags,
|
|
|
|
uint lock_flags,
|
|
|
|
struct xfs_inode **ipp)
|
2012-10-08 10:56:11 +00:00
|
|
|
{
|
2020-06-29 21:49:18 +00:00
|
|
|
struct xfs_inode *ip;
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
xfs_agino_t agino;
|
|
|
|
int error;
|
2012-10-08 10:56:11 +00:00
|
|
|
|
|
|
|
ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
|
|
|
|
|
|
|
|
/* reject inode numbers outside existing AGs */
|
2024-08-30 22:36:48 +00:00
|
|
|
if (!xfs_verify_ino(mp, ino))
|
2014-06-25 04:58:08 +00:00
|
|
|
return -EINVAL;
|
2012-10-08 10:56:11 +00:00
|
|
|
|
2015-10-12 07:21:22 +00:00
|
|
|
XFS_STATS_INC(mp, xs_ig_attempts);
|
2015-08-28 04:50:56 +00:00
|
|
|
|
2012-10-08 10:56:11 +00:00
|
|
|
/* get the perag structure and ensure that it's inode capable */
|
|
|
|
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
|
|
|
|
agino = XFS_INO_TO_AGINO(mp, ino);
|
|
|
|
|
|
|
|
again:
|
|
|
|
error = 0;
|
|
|
|
rcu_read_lock();
|
|
|
|
ip = radix_tree_lookup(&pag->pag_ici_root, agino);
|
|
|
|
|
|
|
|
if (ip) {
|
|
|
|
error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
|
|
|
|
if (error)
|
|
|
|
goto out_error_or_again;
|
|
|
|
} else {
|
|
|
|
rcu_read_unlock();
|
2017-06-19 15:58:56 +00:00
|
|
|
if (flags & XFS_IGET_INCORE) {
|
2017-10-18 04:37:32 +00:00
|
|
|
error = -ENODATA;
|
2017-06-19 15:58:56 +00:00
|
|
|
goto out_error_or_again;
|
|
|
|
}
|
2015-10-12 07:21:22 +00:00
|
|
|
XFS_STATS_INC(mp, xs_ig_missed);
|
2012-10-08 10:56:11 +00:00
|
|
|
|
|
|
|
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
|
|
|
|
flags, lock_flags);
|
|
|
|
if (error)
|
|
|
|
goto out_error_or_again;
|
|
|
|
}
|
|
|
|
xfs_perag_put(pag);
|
|
|
|
|
|
|
|
*ipp = ip;
|
|
|
|
|
|
|
|
/*
|
2015-02-23 11:38:08 +00:00
|
|
|
* If we have a real type for an on-disk inode, we can setup the inode
|
2021-12-21 17:38:19 +00:00
|
|
|
* now. If it's a new inode being created, xfs_init_new_inode will
|
|
|
|
* handle it.
|
2012-10-08 10:56:11 +00:00
|
|
|
*/
|
2016-02-09 05:54:58 +00:00
|
|
|
if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
|
2015-02-23 11:38:08 +00:00
|
|
|
xfs_setup_existing_inode(ip);
|
2012-10-08 10:56:11 +00:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_error_or_again:
|
2023-04-12 02:00:21 +00:00
|
|
|
if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) &&
|
|
|
|
error == -EAGAIN) {
|
2012-10-08 10:56:11 +00:00
|
|
|
delay(1);
|
|
|
|
goto again;
|
|
|
|
}
|
|
|
|
xfs_perag_put(pag);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2024-11-04 04:18:50 +00:00
|
|
|
/*
|
|
|
|
* Get a metadata inode.
|
|
|
|
*
|
2024-11-04 04:18:51 +00:00
|
|
|
* The metafile type must match the file mode exactly, and for files in the
|
|
|
|
* metadata directory tree, it must match the inode's metatype exactly.
|
2024-11-04 04:18:50 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_trans_metafile_iget(
|
|
|
|
struct xfs_trans *tp,
|
|
|
|
xfs_ino_t ino,
|
|
|
|
enum xfs_metafile_type metafile_type,
|
|
|
|
struct xfs_inode **ipp)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = tp->t_mountp;
|
|
|
|
struct xfs_inode *ip;
|
|
|
|
umode_t mode;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = xfs_iget(mp, tp, ino, 0, 0, &ip);
|
2024-11-04 04:18:52 +00:00
|
|
|
if (error == -EFSCORRUPTED || error == -EINVAL)
|
2024-11-04 04:18:50 +00:00
|
|
|
goto whine;
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
if (VFS_I(ip)->i_nlink == 0)
|
|
|
|
goto bad_rele;
|
|
|
|
|
|
|
|
if (metafile_type == XFS_METAFILE_DIR)
|
|
|
|
mode = S_IFDIR;
|
|
|
|
else
|
|
|
|
mode = S_IFREG;
|
|
|
|
if (inode_wrong_type(VFS_I(ip), mode))
|
|
|
|
goto bad_rele;
|
2024-11-04 04:18:51 +00:00
|
|
|
if (xfs_has_metadir(mp)) {
|
|
|
|
if (!xfs_is_metadir_inode(ip))
|
|
|
|
goto bad_rele;
|
|
|
|
if (metafile_type != ip->i_metatype)
|
|
|
|
goto bad_rele;
|
|
|
|
}
|
2024-11-04 04:18:50 +00:00
|
|
|
|
|
|
|
*ipp = ip;
|
|
|
|
return 0;
|
|
|
|
bad_rele:
|
|
|
|
xfs_irele(ip);
|
|
|
|
whine:
|
2024-11-04 04:18:51 +00:00
|
|
|
xfs_err(mp, "metadata inode 0x%llx type %u is corrupt", ino,
|
|
|
|
metafile_type);
|
2024-11-04 04:18:57 +00:00
|
|
|
xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR);
|
2024-11-04 04:18:50 +00:00
|
|
|
return -EFSCORRUPTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Grab a metadata file if the caller doesn't already have a transaction. */
|
|
|
|
int
|
|
|
|
xfs_metafile_iget(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
xfs_ino_t ino,
|
|
|
|
enum xfs_metafile_type metafile_type,
|
|
|
|
struct xfs_inode **ipp)
|
|
|
|
{
|
|
|
|
struct xfs_trans *tp;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = xfs_trans_alloc_empty(mp, &tp);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
|
|
|
|
xfs_trans_cancel(tp);
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2010-09-24 09:51:50 +00:00
|
|
|
/*
|
|
|
|
* Grab the inode for reclaim exclusively.
|
2020-07-01 17:21:05 +00:00
|
|
|
*
|
|
|
|
* We have found this inode via a lookup under RCU, so the inode may have
|
|
|
|
* already been freed, or it may be in the process of being recycled by
|
|
|
|
* xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
|
|
|
|
* has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
|
|
|
|
* will not be set. Hence we need to check for both these flag conditions to
|
|
|
|
* avoid inodes that are no longer reclaim candidates.
|
|
|
|
*
|
|
|
|
* Note: checking for other state flags here, under the i_flags_lock or not, is
|
|
|
|
* racy and should be avoided. Those races should be resolved only after we have
|
|
|
|
* ensured that we are able to reclaim this inode and the world can see that we
|
|
|
|
* are going to reclaim it.
|
|
|
|
*
|
|
|
|
* Return true if we grabbed it, false otherwise.
|
2010-09-24 09:51:50 +00:00
|
|
|
*/
|
2020-07-01 17:21:05 +00:00
|
|
|
static bool
|
2021-05-31 18:32:02 +00:00
|
|
|
xfs_reclaim_igrab(
|
2021-06-07 16:34:50 +00:00
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2010-09-24 09:51:50 +00:00
|
|
|
{
|
2010-12-17 06:29:43 +00:00
|
|
|
ASSERT(rcu_read_lock_held());
|
|
|
|
|
2010-09-24 09:51:50 +00:00
|
|
|
spin_lock(&ip->i_flags_lock);
|
2010-12-17 06:29:43 +00:00
|
|
|
if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
|
|
|
|
__xfs_iflags_test(ip, XFS_IRECLAIM)) {
|
|
|
|
/* not a reclaim candidate. */
|
2010-09-24 09:51:50 +00:00
|
|
|
spin_unlock(&ip->i_flags_lock);
|
2020-07-01 17:21:05 +00:00
|
|
|
return false;
|
2010-09-24 09:51:50 +00:00
|
|
|
}
|
2021-06-07 16:34:50 +00:00
|
|
|
|
|
|
|
/* Don't reclaim a sick inode unless the caller asked for it. */
|
|
|
|
if (ip->i_sick &&
|
2021-06-07 16:34:51 +00:00
|
|
|
(!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
|
2021-06-07 16:34:50 +00:00
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2010-09-24 09:51:50 +00:00
|
|
|
__xfs_iflags_set(ip, XFS_IRECLAIM);
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
2020-07-01 17:21:05 +00:00
|
|
|
return true;
|
2010-09-24 09:51:50 +00:00
|
|
|
}
|
|
|
|
|
2010-02-06 01:37:26 +00:00
|
|
|
/*
|
2020-06-29 21:49:18 +00:00
|
|
|
* Inode reclaim is non-blocking, so the default action if progress cannot be
|
|
|
|
* made is to "requeue" the inode for reclaim by unlocking it and clearing the
|
|
|
|
* XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
|
|
|
|
* blocking anymore and hence we can wait for the inode to be able to reclaim
|
|
|
|
* it.
|
2010-02-06 01:37:26 +00:00
|
|
|
*
|
2020-06-29 21:49:18 +00:00
|
|
|
* We do no IO here - if callers require inodes to be cleaned they must push the
|
|
|
|
* AIL first to trigger writeback of dirty inodes. This enables writeback to be
|
|
|
|
* done in the background in a non-blocking manner, and enables memory reclaim
|
|
|
|
* to make progress without blocking.
|
2010-02-06 01:37:26 +00:00
|
|
|
*/
|
2020-07-01 17:21:28 +00:00
|
|
|
static void
|
2010-01-10 23:51:45 +00:00
|
|
|
xfs_reclaim_inode(
|
2009-06-08 13:35:14 +00:00
|
|
|
struct xfs_inode *ip,
|
2020-07-01 17:21:05 +00:00
|
|
|
struct xfs_perag *pag)
|
2008-10-30 06:37:03 +00:00
|
|
|
{
|
2016-05-18 04:09:12 +00:00
|
|
|
xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */
|
2010-02-06 01:37:26 +00:00
|
|
|
|
2020-06-29 21:49:17 +00:00
|
|
|
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
|
2020-06-29 21:49:16 +00:00
|
|
|
goto out;
|
2020-08-17 23:41:01 +00:00
|
|
|
if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
|
2020-06-29 21:49:17 +00:00
|
|
|
goto out_iunlock;
|
2008-10-30 06:37:37 +00:00
|
|
|
|
xfs: xfs_is_shutdown vs xlog_is_shutdown cage fight
I've been chasing a recent resurgence in generic/388 recovery
failure and/or corruption events. The events have largely been
uninitialised inode chunks being tripped over in log recovery
such as:
XFS (pmem1): User initiated shutdown received.
pmem1: writeback error on inode 12621949, offset 1019904, sector 12968096
XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/xfs/xfs_fsops.c:500). Shutting down filesystem.
XFS (pmem1): Please unmount the filesystem and rectify the problem(s)
XFS (pmem1): Unmounting Filesystem
XFS (pmem1): Mounting V5 Filesystem
XFS (pmem1): Starting recovery (logdev: internal)
XFS (pmem1): bad inode magic/vsn daddr 8723584 #0 (magic=1818)
XFS (pmem1): Metadata corruption detected at xfs_inode_buf_verify+0x180/0x190, xfs_inode block 0x851c80 xfs_inode_buf_verify
XFS (pmem1): Unmount and run xfs_repair
XFS (pmem1): First 128 bytes of corrupted metadata buffer:
00000000: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000010: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000020: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000030: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000040: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000050: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000060: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
00000070: 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 ................
XFS (pmem1): metadata I/O error in "xlog_recover_items_pass2+0x52/0xc0" at daddr 0x851c80 len 32 error 117
XFS (pmem1): log mount/recovery failed: error -117
XFS (pmem1): log mount failed
There have been isolated random other issues, too - xfs_repair fails
because it finds some corruption in symlink blocks, rmap
inconsistencies, etc - but they are nowhere near as common as the
uninitialised inode chunk failure.
The problem has clearly happened at runtime before recovery has run;
I can see the ICREATE log item in the log shortly before the
actively recovered range of the log. This means the ICREATE was
definitely created and written to the log, but for some reason the
tail of the log has been moved past the ordered buffer log item that
tracks INODE_ALLOC buffers and, supposedly, prevents the tail of the
log moving past the ICREATE log item before the inode chunk buffer
is written to disk.
Tracing the fsstress processes that are running when the filesystem
shut down immediately pin-pointed the problem:
user shutdown marks xfs_mount as shutdown
godown-213341 [008] 6398.022871: console: [ 6397.915392] XFS (pmem1): User initiated shutdown received.
.....
aild tries to push ordered inode cluster buffer
xfsaild/pmem1-213314 [001] 6398.022974: xfs_buf_trylock: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 16 pincount 0 lock 0 flags DONE|INODES|PAGES caller xfs_inode_item_push+0x8e
xfsaild/pmem1-213314 [001] 6398.022976: xfs_ilock_nowait: dev 259:1 ino 0x851c80 flags ILOCK_SHARED caller xfs_iflush_cluster+0xae
xfs_iflush_cluster() checks xfs_is_shutdown(), returns true,
calls xfs_iflush_abort() to kill writeback of the inode.
Inode is removed from AIL, drops cluster buffer reference.
xfsaild/pmem1-213314 [001] 6398.022977: xfs_ail_delete: dev 259:1 lip 0xffff88880247ed80 old lsn 7/20344 new lsn 7/21000 type XFS_LI_INODE flags IN_AIL
xfsaild/pmem1-213314 [001] 6398.022978: xfs_buf_rele: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 17 pincount 0 lock 0 flags DONE|INODES|PAGES caller xfs_iflush_abort+0xd7
.....
All inodes on cluster buffer are aborted, then the cluster buffer
itself is aborted and removed from the AIL *without writeback*:
xfsaild/pmem1-213314 [001] 6398.023011: xfs_buf_error_relse: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_ioend_fail+0x33
xfsaild/pmem1-213314 [001] 6398.023012: xfs_ail_delete: dev 259:1 lip 0xffff8888053efde8 old lsn 7/20344 new lsn 7/20344 type XFS_LI_BUF flags IN_AIL
The inode buffer was at 7/20344 when it was removed from the AIL.
xfsaild/pmem1-213314 [001] 6398.023012: xfs_buf_item_relse: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_item_done+0x31
xfsaild/pmem1-213314 [001] 6398.023012: xfs_buf_rele: dev 259:1 daddr 0x851c80 bbcount 0x20 hold 2 pincount 0 lock 0 flags ASYNC|DONE|STALE|INODES|PAGES caller xfs_buf_item_relse+0x39
.....
Userspace is still running, doing stuff. an fsstress process runs
syncfs() or sync() and we end up in sync_fs_one_sb() which issues
a log force. This pushes on the CIL:
fsstress-213322 [001] 6398.024430: xfs_fs_sync_fs: dev 259:1 m_features 0x20000000019ff6e9 opstate (clean|shutdown|inodegc|blockgc) s_flags 0x70810000 caller sync_fs_one_sb+0x26
fsstress-213322 [001] 6398.024430: xfs_log_force: dev 259:1 lsn 0x0 caller xfs_fs_sync_fs+0x82
fsstress-213322 [001] 6398.024430: xfs_log_force: dev 259:1 lsn 0x5f caller xfs_log_force+0x7c
<...>-194402 [001] 6398.024467: kmem_alloc: size 176 flags 0x14 caller xlog_cil_push_work+0x9f
And the CIL fills up iclogs with pending changes. This picks up
the current tail from the AIL:
<...>-194402 [001] 6398.024497: xlog_iclog_get_space: dev 259:1 state XLOG_STATE_ACTIVE refcnt 1 offset 0 lsn 0x0 flags caller xlog_write+0x149
<...>-194402 [001] 6398.024498: xlog_iclog_switch: dev 259:1 state XLOG_STATE_ACTIVE refcnt 1 offset 0 lsn 0x700005408 flags caller xlog_state_get_iclog_space+0x37e
<...>-194402 [001] 6398.024521: xlog_iclog_release: dev 259:1 state XLOG_STATE_WANT_SYNC refcnt 1 offset 32256 lsn 0x700005408 flags caller xlog_write+0x5f9
<...>-194402 [001] 6398.024522: xfs_log_assign_tail_lsn: dev 259:1 new tail lsn 7/21000, old lsn 7/20344, last sync 7/21448
And it moves the tail of the log to 7/21000 from 7/20344. This
*moves the tail of the log beyond the ICREATE transaction* that was
at 7/20344 and pinned by the inode cluster buffer that was cancelled
above.
....
godown-213341 [008] 6398.027005: xfs_force_shutdown: dev 259:1 tag logerror flags log_io|force_umount file fs/xfs/xfs_fsops.c line_num 500
godown-213341 [008] 6398.027022: console: [ 6397.915406] pmem1: writeback error on inode 12621949, offset 1019904, sector 12968096
godown-213341 [008] 6398.030551: console: [ 6397.919546] XFS (pmem1): Log I/O Error (0x6) detected at xfs_fs_goingdown+0xa3/0xf0 (fs/
And finally the log itself is now shutdown, stopping all further
writes to the log. But this is too late to prevent the corruption
that moving the tail of the log forwards after we start cancelling
writeback causes.
The fundamental problem here is that we are using the wrong shutdown
checks for log items. We've long conflated mount shutdown with log
shutdown state, and I started separating that recently with the
atomic shutdown state changes in commit b36d4651e165 ("xfs: make
forced shutdown processing atomic"). The changes in that commit
series are directly responsible for being able to diagnose this
issue because it clearly separated mount shutdown from log shutdown.
Essentially, once we start cancelling writeback of log items and
removing them from the AIL because the filesystem is shut down, we
*cannot* update the journal because we may have cancelled the items
that pin the tail of the log. That moves the tail of the log
forwards without having written the metadata back, hence we have
corrupt in memory state and writing to the journal propagates that
to the on-disk state.
What commit b36d4651e165 makes clear is that log item state needs to
change relative to log shutdown, not mount shutdown. IOWs, anything
that aborts metadata writeback needs to check log shutdown state
because log items directly affect log consistency. Having them check
mount shutdown state introduces the above race condition where we
cancel metadata writeback before the log shuts down.
To fix this, this patch works through all log items and converts
shutdown checks to use xlog_is_shutdown() rather than
xfs_is_shutdown(), so that we don't start aborting metadata
writeback before we shut off journal writes.
AFAICT, this race condition is a zero day IO error handling bug in
XFS that dates back to the introduction of XLOG_IO_ERROR,
XLOG_STATE_IOERROR and XFS_FORCED_SHUTDOWN back in January 1997.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2022-03-17 16:09:13 +00:00
|
|
|
/*
|
|
|
|
* Check for log shutdown because aborting the inode can move the log
|
|
|
|
* tail and corrupt in memory state. This is fine if the log is shut
|
|
|
|
* down, but if the log is still active and only the mount is shut down
|
|
|
|
* then the in-memory log tail movement caused by the abort can be
|
|
|
|
* incorrectly propagated to disk.
|
|
|
|
*/
|
|
|
|
if (xlog_is_shutdown(ip->i_mount->m_log)) {
|
2010-02-06 01:37:26 +00:00
|
|
|
xfs_iunpin_wait(ip);
|
2022-03-30 01:21:59 +00:00
|
|
|
xfs_iflush_shutdown_abort(ip);
|
2010-02-06 01:37:26 +00:00
|
|
|
goto reclaim;
|
|
|
|
}
|
2020-06-29 21:49:16 +00:00
|
|
|
if (xfs_ipincount(ip))
|
2020-08-17 23:41:01 +00:00
|
|
|
goto out_clear_flush;
|
2020-06-29 21:49:16 +00:00
|
|
|
if (!xfs_inode_clean(ip))
|
2020-08-17 23:41:01 +00:00
|
|
|
goto out_clear_flush;
|
2012-04-23 05:58:35 +00:00
|
|
|
|
2020-08-17 23:41:01 +00:00
|
|
|
xfs_iflags_clear(ip, XFS_IFLUSHING);
|
2010-02-06 01:37:26 +00:00
|
|
|
reclaim:
|
2021-08-06 18:05:39 +00:00
|
|
|
trace_xfs_inode_reclaiming(ip);
|
2016-11-09 21:23:22 +00:00
|
|
|
|
2016-05-18 04:09:12 +00:00
|
|
|
/*
|
|
|
|
* Because we use RCU freeing we need to ensure the inode always appears
|
|
|
|
* to be reclaimed with an invalid inode number when in the free state.
|
2016-11-09 21:23:22 +00:00
|
|
|
* We do this as early as possible under the ILOCK so that
|
2017-08-25 17:05:26 +00:00
|
|
|
* xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
|
|
|
|
* detect races with us here. By doing this, we guarantee that once
|
|
|
|
* xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
|
|
|
|
* it will see either a valid inode that will serialise correctly, or it
|
|
|
|
* will see an invalid inode that it can skip.
|
2016-05-18 04:09:12 +00:00
|
|
|
*/
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
|
|
|
ip->i_flags = XFS_IRECLAIM;
|
|
|
|
ip->i_ino = 0;
|
xfs: only reset incore inode health state flags when reclaiming an inode
While running some fuzz tests on inode metadata, I noticed that the
filesystem health report (as provided by xfs_spaceman) failed to report
the file corruption even when spaceman was run immediately after running
xfs_scrub to detect the corruption. That isn't the intended behavior;
one ought to be able to run scrub to detect errors in the ondisk
metadata and be able to access to those reports for some time after the
scrub.
After running the same sequence through an instrumented kernel, I
discovered the reason why -- scrub igets the file, scans it, marks it
sick, and ireleases the inode. When the VFS lets go of the incore
inode, it moves to RECLAIMABLE state. If spaceman igets the incore
inode before it moves to RECLAIM state, iget reinitializes the VFS
state, clears the sick and checked masks, and hands back the inode. At
this point, the caller has the exact same incore inode, but with all the
health state erased.
In other words, we're erasing the incore inode's health state flags when
we've decided NOT to sever the link between the incore inode and the
ondisk inode. This is wrong, so we need to remove the lines that zero
the fields from xfs_iget_cache_hit.
As a precaution, we add the same lines into xfs_reclaim_inode just after
we sever the link between incore and ondisk inode. Strictly speaking
this isn't necessary because once an inode has gone through reclaim it
must go through xfs_inode_alloc (which also zeroes the state) and
xfs_iget is careful to check for mismatches between the inode it pulls
out of the radix tree and the one it wants.
Fixes: 6772c1f11206 ("xfs: track metadata health status")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
2021-06-07 16:34:49 +00:00
|
|
|
ip->i_sick = 0;
|
|
|
|
ip->i_checked = 0;
|
2016-05-18 04:09:12 +00:00
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
|
xfs: add log item precommit operation
For inodes that are dirty, we have an attached cluster buffer that
we want to use to track the dirty inode through the AIL.
Unfortunately, locking the cluster buffer and adding it to the
transaction when the inode is first logged in a transaction leads to
buffer lock ordering inversions.
The specific problem is ordering against the AGI buffer. When
modifying unlinked lists, the buffer lock order is AGI -> inode
cluster buffer as the AGI buffer lock serialises all access to the
unlinked lists. Unfortunately, functionality like xfs_droplink()
logs the inode before calling xfs_iunlink(), as do various directory
manipulation functions. The inode can be logged way down in the
stack as far as the bmapi routines and hence, without a major
rewrite of lots of APIs there's no way we can avoid the inode being
logged by something until after the AGI has been logged.
As we are going to be using ordered buffers for inode AIL tracking,
there isn't a need to actually lock that buffer against modification
as all the modifications are captured by logging the inode item
itself. Hence we don't actually need to join the cluster buffer into
the transaction until just before it is committed. This means we do
not perturb any of the existing buffer lock orders in transactions,
and the inode cluster buffer is always locked last in a transaction
that doesn't otherwise touch inode cluster buffers.
We do this by introducing a precommit log item method. This commit
just introduces the mechanism; the inode item implementation is in
followup commits.
The precommit items need to be sorted into consistent order as we
may be locking multiple items here. Hence if we have two dirty
inodes in cluster buffers A and B, and some other transaction has
two separate dirty inodes in the same cluster buffers, locking them
in different orders opens us up to ABBA deadlocks. Hence we sort the
items on the transaction based on the presence of a sort log item
method.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
2022-07-14 01:47:26 +00:00
|
|
|
ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL);
|
2010-01-10 23:51:45 +00:00
|
|
|
xfs_iunlock(ip, XFS_ILOCK_EXCL);
|
2010-07-20 07:53:25 +00:00
|
|
|
|
2015-10-12 07:21:22 +00:00
|
|
|
XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
|
2010-07-20 07:53:25 +00:00
|
|
|
/*
|
|
|
|
* Remove the inode from the per-AG radix tree.
|
|
|
|
*
|
|
|
|
* Because radix_tree_delete won't complain even if the item was never
|
|
|
|
* added to the tree assert that it's been there before to catch
|
|
|
|
* problems with the inode life time early on.
|
|
|
|
*/
|
2010-12-16 06:08:41 +00:00
|
|
|
spin_lock(&pag->pag_ici_lock);
|
2010-07-20 07:53:25 +00:00
|
|
|
if (!radix_tree_delete(&pag->pag_ici_root,
|
2016-05-18 04:09:12 +00:00
|
|
|
XFS_INO_TO_AGINO(ip->i_mount, ino)))
|
2010-07-20 07:53:25 +00:00
|
|
|
ASSERT(0);
|
2021-05-31 18:32:02 +00:00
|
|
|
xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
|
2010-12-16 06:08:41 +00:00
|
|
|
spin_unlock(&pag->pag_ici_lock);
|
2010-07-20 07:53:25 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Here we do an (almost) spurious inode lock in order to coordinate
|
|
|
|
* with inode cache radix tree lookups. This is because the lookup
|
|
|
|
* can reference the inodes in the cache without taking references.
|
|
|
|
*
|
|
|
|
* We make that OK here by ensuring that we wait until the inode is
|
2012-02-16 22:01:00 +00:00
|
|
|
* unlocked after the lookup before we go ahead and free it.
|
2010-07-20 07:53:25 +00:00
|
|
|
*/
|
2012-02-16 22:01:00 +00:00
|
|
|
xfs_ilock(ip, XFS_ILOCK_EXCL);
|
2021-05-31 18:31:57 +00:00
|
|
|
ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
|
2012-02-16 22:01:00 +00:00
|
|
|
xfs_iunlock(ip, XFS_ILOCK_EXCL);
|
xfs: Don't allow logging of XFS_ISTALE inodes
In tracking down a problem in this patchset, I discovered we are
reclaiming dirty stale inodes. This wasn't discovered until inodes
were always attached to the cluster buffer and then the rcu callback
that freed inodes was assert failing because the inode still had an
active pointer to the cluster buffer after it had been reclaimed.
Debugging the issue indicated that this was a pre-existing issue
resulting from the way the inodes are handled in xfs_inactive_ifree.
When we free a cluster buffer from xfs_ifree_cluster, all the inodes
in cache are marked XFS_ISTALE. Those that are clean have nothing
else done to them and so eventually get cleaned up by background
reclaim. i.e. it is assumed we'll never dirty/relog an inode marked
XFS_ISTALE.
On journal commit dirty stale inodes as are handled by both
buffer and inode log items to run though xfs_istale_done() and
removed from the AIL (buffer log item commit) or the log item will
simply unpin it because the buffer log item will clean it. What happens
to any specific inode is entirely dependent on which log item wins
the commit race, but the result is the same - stale inodes are
clean, not attached to the cluster buffer, and not in the AIL. Hence
inode reclaim can just free these inodes without further care.
However, if the stale inode is relogged, it gets dirtied again and
relogged into the CIL. Most of the time this isn't an issue, because
relogging simply changes the inode's location in the current
checkpoint. Problems arise, however, when the CIL checkpoints
between two transactions in the xfs_inactive_ifree() deferops
processing. This results in the XFS_ISTALE inode being redirtied
and inserted into the CIL without any of the other stale cluster
buffer infrastructure being in place.
Hence on journal commit, it simply gets unpinned, so it remains
dirty in memory. Everything in inode writeback avoids XFS_ISTALE
inodes so it can't be written back, and it is not tracked in the AIL
so there's not even a trigger to attempt to clean the inode. Hence
the inode just sits dirty in memory until inode reclaim comes along,
sees that it is XFS_ISTALE, and goes to reclaim it. This reclaiming
of a dirty inode caused use after free, list corruptions and other
nasty issues later in this patchset.
Hence this patch addresses a violation of the "never log XFS_ISTALE
inodes" caused by the deferops processing rolling a transaction
and relogging a stale inode in xfs_inactive_free. It also adds a
bunch of asserts to catch this problem in debug kernels so that
we don't reintroduce this problem in future.
Reproducer for this issue was generic/558 on a v4 filesystem.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2020-06-29 21:48:45 +00:00
|
|
|
ASSERT(xfs_inode_clean(ip));
|
2010-07-20 07:53:25 +00:00
|
|
|
|
2016-05-18 04:09:12 +00:00
|
|
|
__xfs_inode_free(ip);
|
2020-07-01 17:21:28 +00:00
|
|
|
return;
|
2012-04-23 05:58:35 +00:00
|
|
|
|
2020-08-17 23:41:01 +00:00
|
|
|
out_clear_flush:
|
|
|
|
xfs_iflags_clear(ip, XFS_IFLUSHING);
|
2020-06-29 21:49:17 +00:00
|
|
|
out_iunlock:
|
2012-04-23 05:58:35 +00:00
|
|
|
xfs_iunlock(ip, XFS_ILOCK_EXCL);
|
2020-06-29 21:49:17 +00:00
|
|
|
out:
|
2020-06-29 21:49:16 +00:00
|
|
|
xfs_iflags_clear(ip, XFS_IRECLAIM);
|
2008-10-30 06:37:37 +00:00
|
|
|
}
|
|
|
|
|
2021-06-07 16:34:50 +00:00
|
|
|
/* Reclaim sick inodes if we're unmounting or the fs went down. */
|
|
|
|
static inline bool
|
|
|
|
xfs_want_reclaim_sick(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2021-08-19 01:46:52 +00:00
|
|
|
return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) ||
|
2021-08-19 01:46:53 +00:00
|
|
|
xfs_is_shutdown(mp);
|
2021-06-07 16:34:50 +00:00
|
|
|
}
|
|
|
|
|
2020-07-01 17:21:28 +00:00
|
|
|
void
|
2008-10-30 06:37:37 +00:00
|
|
|
xfs_reclaim_inodes(
|
2020-07-01 17:21:28 +00:00
|
|
|
struct xfs_mount *mp)
|
2008-10-30 06:37:37 +00:00
|
|
|
{
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk icw = {
|
|
|
|
.icw_flags = 0,
|
2021-06-07 16:34:50 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
if (xfs_want_reclaim_sick(mp))
|
2021-06-07 16:34:51 +00:00
|
|
|
icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
|
2021-06-07 16:34:50 +00:00
|
|
|
|
2024-11-04 04:18:38 +00:00
|
|
|
while (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) {
|
2020-06-29 21:49:16 +00:00
|
|
|
xfs_ail_push_all_sync(mp->m_ail);
|
2021-06-07 16:34:51 +00:00
|
|
|
xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
|
2020-09-09 16:29:16 +00:00
|
|
|
}
|
2010-04-28 23:55:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-06-29 21:49:18 +00:00
|
|
|
* The shrinker infrastructure determines how many inodes we should scan for
|
|
|
|
* reclaim. We want as many clean inodes ready to reclaim as possible, so we
|
|
|
|
* push the AIL here. We also want to proactively free up memory if we can to
|
|
|
|
* minimise the amount of work memory reclaim has to do so we kick the
|
|
|
|
* background reclaim if it isn't already scheduled.
|
2010-04-28 23:55:50 +00:00
|
|
|
*/
|
2013-08-28 00:17:57 +00:00
|
|
|
long
|
2011-07-08 04:14:46 +00:00
|
|
|
xfs_reclaim_inodes_nr(
|
|
|
|
struct xfs_mount *mp,
|
2021-06-18 18:57:06 +00:00
|
|
|
unsigned long nr_to_scan)
|
2010-04-28 23:55:50 +00:00
|
|
|
{
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk icw = {
|
|
|
|
.icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT,
|
2021-06-18 18:57:06 +00:00
|
|
|
.icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan),
|
2021-05-31 18:32:02 +00:00
|
|
|
};
|
|
|
|
|
2021-06-07 16:34:50 +00:00
|
|
|
if (xfs_want_reclaim_sick(mp))
|
2021-06-07 16:34:51 +00:00
|
|
|
icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;
|
2021-06-07 16:34:50 +00:00
|
|
|
|
2011-07-08 04:14:46 +00:00
|
|
|
/* kick background reclaimer and push the AIL */
|
2012-10-08 10:56:05 +00:00
|
|
|
xfs_reclaim_work_queue(mp);
|
2011-07-08 04:14:46 +00:00
|
|
|
xfs_ail_push_all(mp->m_ail);
|
2011-04-08 02:45:07 +00:00
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
|
2020-06-29 21:49:16 +00:00
|
|
|
return 0;
|
2011-07-08 04:14:46 +00:00
|
|
|
}
|
2010-04-28 23:55:50 +00:00
|
|
|
|
2011-07-08 04:14:46 +00:00
|
|
|
/*
|
|
|
|
* Return the number of reclaimable inodes in the filesystem for
|
|
|
|
* the shrinker to determine how much to reclaim.
|
|
|
|
*/
|
2021-06-18 18:57:06 +00:00
|
|
|
long
|
2011-07-08 04:14:46 +00:00
|
|
|
xfs_reclaim_inodes_count(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2024-11-04 04:18:38 +00:00
|
|
|
XA_STATE (xas, &mp->m_groups[XG_TYPE_AG].xa, 0);
|
2021-06-18 18:57:06 +00:00
|
|
|
long reclaimable = 0;
|
2024-08-29 04:08:41 +00:00
|
|
|
struct xfs_perag *pag;
|
2010-04-28 23:55:50 +00:00
|
|
|
|
2024-08-29 04:08:41 +00:00
|
|
|
rcu_read_lock();
|
|
|
|
xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) {
|
|
|
|
trace_xfs_reclaim_inodes_count(pag, _THIS_IP_);
|
2010-07-19 22:07:02 +00:00
|
|
|
reclaimable += pag->pag_ici_reclaimable;
|
2024-08-29 04:08:41 +00:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
2010-04-28 23:55:50 +00:00
|
|
|
return reclaimable;
|
|
|
|
}
|
|
|
|
|
2020-05-21 20:08:49 +00:00
|
|
|
STATIC bool
|
2021-06-07 16:34:51 +00:00
|
|
|
xfs_icwalk_match_id(
|
2012-11-07 17:21:13 +00:00
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2012-11-07 17:21:13 +00:00
|
|
|
{
|
2021-06-07 16:34:51 +00:00
|
|
|
if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
|
|
|
|
!uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
|
2020-05-21 20:08:49 +00:00
|
|
|
return false;
|
2012-11-07 17:21:13 +00:00
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
|
|
|
|
!gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
|
2020-05-21 20:08:49 +00:00
|
|
|
return false;
|
2012-11-06 14:50:45 +00:00
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
|
|
|
|
ip->i_projid != icw->icw_prid)
|
2020-05-21 20:08:49 +00:00
|
|
|
return false;
|
2012-11-06 14:50:45 +00:00
|
|
|
|
2020-05-21 20:08:49 +00:00
|
|
|
return true;
|
2012-11-07 17:21:13 +00:00
|
|
|
}
|
|
|
|
|
2014-07-24 09:44:28 +00:00
|
|
|
/*
|
|
|
|
* A union-based inode filtering algorithm. Process the inode if any of the
|
|
|
|
* criteria match. This is for global/internal scans only.
|
|
|
|
*/
|
2020-05-21 20:08:49 +00:00
|
|
|
STATIC bool
|
2021-06-07 16:34:51 +00:00
|
|
|
xfs_icwalk_match_id_union(
|
2014-07-24 09:44:28 +00:00
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2014-07-24 09:44:28 +00:00
|
|
|
{
|
2021-06-07 16:34:51 +00:00
|
|
|
if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
|
|
|
|
uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
|
2020-05-21 20:08:49 +00:00
|
|
|
return true;
|
2014-07-24 09:44:28 +00:00
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
|
|
|
|
gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
|
2020-05-21 20:08:49 +00:00
|
|
|
return true;
|
2014-07-24 09:44:28 +00:00
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
|
|
|
|
ip->i_projid == icw->icw_prid)
|
2020-05-21 20:08:49 +00:00
|
|
|
return true;
|
2014-07-24 09:44:28 +00:00
|
|
|
|
2020-05-21 20:08:49 +00:00
|
|
|
return false;
|
2014-07-24 09:44:28 +00:00
|
|
|
}
|
|
|
|
|
2020-05-21 20:08:48 +00:00
|
|
|
/*
|
|
|
|
* Is this inode @ip eligible for eof/cow block reclamation, given some
|
2021-06-07 16:34:51 +00:00
|
|
|
* filtering parameters @icw? The inode is eligible if @icw is null or
|
2020-05-21 20:08:48 +00:00
|
|
|
* if the predicate functions match.
|
|
|
|
*/
|
|
|
|
static bool
|
2021-06-07 16:34:51 +00:00
|
|
|
xfs_icwalk_match(
|
2020-05-21 20:08:48 +00:00
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2020-05-21 20:08:48 +00:00
|
|
|
{
|
2020-05-21 20:08:49 +00:00
|
|
|
bool match;
|
2020-05-21 20:08:48 +00:00
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if (!icw)
|
2020-05-21 20:08:48 +00:00
|
|
|
return true;
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
|
|
|
|
match = xfs_icwalk_match_id_union(ip, icw);
|
2020-05-21 20:08:48 +00:00
|
|
|
else
|
2021-06-07 16:34:51 +00:00
|
|
|
match = xfs_icwalk_match_id(ip, icw);
|
2020-05-21 20:08:48 +00:00
|
|
|
if (!match)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* skip the inode if the file size is too small */
|
2021-06-07 16:34:51 +00:00
|
|
|
if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
|
|
|
|
XFS_ISIZE(ip) < icw->icw_min_file_size)
|
2020-05-21 20:08:48 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-07-01 17:21:28 +00:00
|
|
|
/*
|
|
|
|
* This is a fast pass over the inode cache to try to get reclaim moving on as
|
|
|
|
* many inodes as possible in a short period of time. It kicks itself every few
|
|
|
|
* seconds, as well as being kicked by the inode cache shrinker when memory
|
2020-06-29 21:49:18 +00:00
|
|
|
* goes low.
|
2020-07-01 17:21:28 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_reclaim_worker(
|
|
|
|
struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = container_of(to_delayed_work(work),
|
|
|
|
struct xfs_mount, m_reclaim_work);
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
|
2020-07-01 17:21:28 +00:00
|
|
|
xfs_reclaim_work_queue(mp);
|
|
|
|
}
|
|
|
|
|
2012-11-06 14:50:42 +00:00
|
|
|
STATIC int
|
|
|
|
xfs_inode_free_eofblocks(
|
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw,
|
2021-01-26 05:09:49 +00:00
|
|
|
unsigned int *lockflags)
|
2012-11-06 14:50:42 +00:00
|
|
|
{
|
2020-05-21 20:08:48 +00:00
|
|
|
bool wait;
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
|
2014-07-24 09:40:22 +00:00
|
|
|
|
2021-01-23 00:48:43 +00:00
|
|
|
if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
|
|
|
|
return 0;
|
|
|
|
|
2012-11-06 14:50:42 +00:00
|
|
|
/*
|
|
|
|
* If the mapping is dirty the operation can block and wait for some
|
|
|
|
* time. Unless we are waiting, skip it.
|
|
|
|
*/
|
2020-05-21 20:08:48 +00:00
|
|
|
if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
|
2012-11-06 14:50:42 +00:00
|
|
|
return 0;
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if (!xfs_icwalk_match(ip, icw))
|
2020-05-21 20:08:48 +00:00
|
|
|
return 0;
|
2012-11-07 17:21:13 +00:00
|
|
|
|
2017-01-28 07:22:55 +00:00
|
|
|
/*
|
|
|
|
* If the caller is waiting, return -EAGAIN to keep the background
|
|
|
|
* scanner moving and revisit the inode in a subsequent pass.
|
|
|
|
*/
|
2017-01-28 07:22:56 +00:00
|
|
|
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
|
2020-05-21 20:08:48 +00:00
|
|
|
if (wait)
|
|
|
|
return -EAGAIN;
|
|
|
|
return 0;
|
2017-01-28 07:22:55 +00:00
|
|
|
}
|
2021-01-26 05:09:49 +00:00
|
|
|
*lockflags |= XFS_IOLOCK_EXCL;
|
2020-05-21 20:08:48 +00:00
|
|
|
|
2024-06-19 17:32:43 +00:00
|
|
|
if (xfs_can_free_eofblocks(ip))
|
2021-03-23 23:59:31 +00:00
|
|
|
return xfs_free_eofblocks(ip);
|
|
|
|
|
2024-08-13 07:39:42 +00:00
|
|
|
/* inode could be preallocated */
|
2021-03-23 23:59:31 +00:00
|
|
|
trace_xfs_inode_free_eofblocks_invalid(ip);
|
|
|
|
xfs_inode_clear_eofblocks_tag(ip);
|
|
|
|
return 0;
|
2012-11-06 14:50:42 +00:00
|
|
|
}
|
|
|
|
|
2016-10-03 16:11:46 +00:00
|
|
|
static void
|
2021-01-23 00:48:43 +00:00
|
|
|
xfs_blockgc_set_iflag(
|
|
|
|
struct xfs_inode *ip,
|
|
|
|
unsigned long iflag)
|
2012-11-06 14:50:38 +00:00
|
|
|
{
|
2021-01-23 00:48:43 +00:00
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
|
|
|
|
ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
|
2012-11-06 14:50:38 +00:00
|
|
|
|
2016-09-19 01:09:48 +00:00
|
|
|
/*
|
|
|
|
* Don't bother locking the AG and looking up in the radix trees
|
|
|
|
* if we already know that we have the tag set.
|
|
|
|
*/
|
2021-01-23 00:48:43 +00:00
|
|
|
if (ip->i_flags & iflag)
|
2016-09-19 01:09:48 +00:00
|
|
|
return;
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
2021-01-23 00:48:43 +00:00
|
|
|
ip->i_flags |= iflag;
|
2016-09-19 01:09:48 +00:00
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
|
2012-11-06 14:50:38 +00:00
|
|
|
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
|
|
|
|
spin_lock(&pag->pag_ici_lock);
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
|
|
|
|
XFS_ICI_BLOCKGC_TAG);
|
2012-11-06 14:50:38 +00:00
|
|
|
|
|
|
|
spin_unlock(&pag->pag_ici_lock);
|
|
|
|
xfs_perag_put(pag);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2016-10-03 16:11:46 +00:00
|
|
|
xfs_inode_set_eofblocks_tag(
|
2012-11-06 14:50:38 +00:00
|
|
|
xfs_inode_t *ip)
|
2016-10-03 16:11:46 +00:00
|
|
|
{
|
|
|
|
trace_xfs_inode_set_eofblocks_tag(ip);
|
2021-01-23 00:48:43 +00:00
|
|
|
return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
|
2016-10-03 16:11:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2021-01-23 00:48:43 +00:00
|
|
|
xfs_blockgc_clear_iflag(
|
|
|
|
struct xfs_inode *ip,
|
|
|
|
unsigned long iflag)
|
2012-11-06 14:50:38 +00:00
|
|
|
{
|
2021-01-23 00:48:43 +00:00
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
bool clear_tag;
|
|
|
|
|
|
|
|
ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
|
2012-11-06 14:50:38 +00:00
|
|
|
|
2016-09-19 01:09:48 +00:00
|
|
|
spin_lock(&ip->i_flags_lock);
|
2021-01-23 00:48:43 +00:00
|
|
|
ip->i_flags &= ~iflag;
|
|
|
|
clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
|
2016-09-19 01:09:48 +00:00
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
|
2021-01-23 00:48:43 +00:00
|
|
|
if (!clear_tag)
|
|
|
|
return;
|
|
|
|
|
2012-11-06 14:50:38 +00:00
|
|
|
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
|
|
|
|
spin_lock(&pag->pag_ici_lock);
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
|
|
|
|
XFS_ICI_BLOCKGC_TAG);
|
2012-11-06 14:50:38 +00:00
|
|
|
|
|
|
|
spin_unlock(&pag->pag_ici_lock);
|
|
|
|
xfs_perag_put(pag);
|
|
|
|
}
|
|
|
|
|
2016-10-03 16:11:46 +00:00
|
|
|
void
|
|
|
|
xfs_inode_clear_eofblocks_tag(
|
|
|
|
xfs_inode_t *ip)
|
|
|
|
{
|
|
|
|
trace_xfs_inode_clear_eofblocks_tag(ip);
|
2021-01-23 00:48:43 +00:00
|
|
|
return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
|
2016-10-03 16:11:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
xfs: skip background cowblock trims on inodes open for write
The background blockgc scanner runs on a 5m interval by default and
trims preallocation (post-eof and cow fork) from inodes that are
otherwise idle. Idle effectively means that iolock can be acquired
without blocking and that the inode has no dirty pagecache or I/O in
flight.
This simple mechanism and heuristic has worked fairly well for
post-eof speculative preallocations. Support for reflink and COW
fork preallocations came sometime later and plugged into the same
mechanism, with similar heuristics. Some recent testing has shown
that COW fork preallocation may be notably more sensitive to blockgc
processing than post-eof preallocation, however.
For example, consider an 8GB reflinked file with a COW extent size
hint of 1MB. A worst case fully randomized overwrite of this file
results in ~8k extents of an average size of ~1MB. If the same
workload is interrupted a couple times for blockgc processing
(assuming the file goes idle), the resulting extent count explodes
to over 100k extents with an average size <100kB. This is
significantly worse than ideal and essentially defeats the COW
extent size hint mechanism.
While this particular test is instrumented, it reflects a fairly
reasonable pattern in practice where random I/Os might spread out
over a large period of time with varying periods of (in)activity.
For example, consider a cloned disk image file for a VM or container
with long uptime and variable and bursty usage. A background blockgc
scan that races and processes the image file when it happens to be
clean and idle can have a significant effect on the future
fragmentation level of the file, even when still in use.
To help combat this, update the heuristic to skip cowblocks inodes
that are currently opened for write access during non-sync blockgc
scans. This allows COW fork preallocations to persist for as long as
possible unless otherwise needed for functional purposes (i.e. a
sync scan), the file is idle and closed, or the inode is being
evicted from cache. While here, update the comments to help
distinguish performance oriented heuristics from the logic that
exists to maintain functional correctness.
Suggested-by: Darrick Wong <djwong@kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-03 12:47:13 +00:00
|
|
|
* Prepare to free COW fork blocks from an inode.
|
2016-10-03 16:11:46 +00:00
|
|
|
*/
|
2018-01-17 03:03:59 +00:00
|
|
|
static bool
|
|
|
|
xfs_prep_free_cowblocks(
|
xfs: skip background cowblock trims on inodes open for write
The background blockgc scanner runs on a 5m interval by default and
trims preallocation (post-eof and cow fork) from inodes that are
otherwise idle. Idle effectively means that iolock can be acquired
without blocking and that the inode has no dirty pagecache or I/O in
flight.
This simple mechanism and heuristic has worked fairly well for
post-eof speculative preallocations. Support for reflink and COW
fork preallocations came sometime later and plugged into the same
mechanism, with similar heuristics. Some recent testing has shown
that COW fork preallocation may be notably more sensitive to blockgc
processing than post-eof preallocation, however.
For example, consider an 8GB reflinked file with a COW extent size
hint of 1MB. A worst case fully randomized overwrite of this file
results in ~8k extents of an average size of ~1MB. If the same
workload is interrupted a couple times for blockgc processing
(assuming the file goes idle), the resulting extent count explodes
to over 100k extents with an average size <100kB. This is
significantly worse than ideal and essentially defeats the COW
extent size hint mechanism.
While this particular test is instrumented, it reflects a fairly
reasonable pattern in practice where random I/Os might spread out
over a large period of time with varying periods of (in)activity.
For example, consider a cloned disk image file for a VM or container
with long uptime and variable and bursty usage. A background blockgc
scan that races and processes the image file when it happens to be
clean and idle can have a significant effect on the future
fragmentation level of the file, even when still in use.
To help combat this, update the heuristic to skip cowblocks inodes
that are currently opened for write access during non-sync blockgc
scans. This allows COW fork preallocations to persist for as long as
possible unless otherwise needed for functional purposes (i.e. a
sync scan), the file is idle and closed, or the inode is being
evicted from cache. While here, update the comments to help
distinguish performance oriented heuristics from the logic that
exists to maintain functional correctness.
Suggested-by: Darrick Wong <djwong@kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-03 12:47:13 +00:00
|
|
|
struct xfs_inode *ip,
|
|
|
|
struct xfs_icwalk *icw)
|
2016-10-03 16:11:46 +00:00
|
|
|
{
|
xfs: skip background cowblock trims on inodes open for write
The background blockgc scanner runs on a 5m interval by default and
trims preallocation (post-eof and cow fork) from inodes that are
otherwise idle. Idle effectively means that iolock can be acquired
without blocking and that the inode has no dirty pagecache or I/O in
flight.
This simple mechanism and heuristic has worked fairly well for
post-eof speculative preallocations. Support for reflink and COW
fork preallocations came sometime later and plugged into the same
mechanism, with similar heuristics. Some recent testing has shown
that COW fork preallocation may be notably more sensitive to blockgc
processing than post-eof preallocation, however.
For example, consider an 8GB reflinked file with a COW extent size
hint of 1MB. A worst case fully randomized overwrite of this file
results in ~8k extents of an average size of ~1MB. If the same
workload is interrupted a couple times for blockgc processing
(assuming the file goes idle), the resulting extent count explodes
to over 100k extents with an average size <100kB. This is
significantly worse than ideal and essentially defeats the COW
extent size hint mechanism.
While this particular test is instrumented, it reflects a fairly
reasonable pattern in practice where random I/Os might spread out
over a large period of time with varying periods of (in)activity.
For example, consider a cloned disk image file for a VM or container
with long uptime and variable and bursty usage. A background blockgc
scan that races and processes the image file when it happens to be
clean and idle can have a significant effect on the future
fragmentation level of the file, even when still in use.
To help combat this, update the heuristic to skip cowblocks inodes
that are currently opened for write access during non-sync blockgc
scans. This allows COW fork preallocations to persist for as long as
possible unless otherwise needed for functional purposes (i.e. a
sync scan), the file is idle and closed, or the inode is being
evicted from cache. While here, update the comments to help
distinguish performance oriented heuristics from the logic that
exists to maintain functional correctness.
Suggested-by: Darrick Wong <djwong@kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-03 12:47:13 +00:00
|
|
|
bool sync;
|
|
|
|
|
|
|
|
sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
|
|
|
|
|
2016-11-08 01:53:33 +00:00
|
|
|
/*
|
|
|
|
* Just clear the tag if we have an empty cow fork or none at all. It's
|
|
|
|
* possible the inode was fully unshared since it was originally tagged.
|
|
|
|
*/
|
2018-07-17 23:51:51 +00:00
|
|
|
if (!xfs_inode_has_cow_data(ip)) {
|
2016-10-03 16:11:46 +00:00
|
|
|
trace_xfs_inode_free_cowblocks_invalid(ip);
|
|
|
|
xfs_inode_clear_cowblocks_tag(ip);
|
2018-01-17 03:03:59 +00:00
|
|
|
return false;
|
2016-10-03 16:11:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
xfs: skip background cowblock trims on inodes open for write
The background blockgc scanner runs on a 5m interval by default and
trims preallocation (post-eof and cow fork) from inodes that are
otherwise idle. Idle effectively means that iolock can be acquired
without blocking and that the inode has no dirty pagecache or I/O in
flight.
This simple mechanism and heuristic has worked fairly well for
post-eof speculative preallocations. Support for reflink and COW
fork preallocations came sometime later and plugged into the same
mechanism, with similar heuristics. Some recent testing has shown
that COW fork preallocation may be notably more sensitive to blockgc
processing than post-eof preallocation, however.
For example, consider an 8GB reflinked file with a COW extent size
hint of 1MB. A worst case fully randomized overwrite of this file
results in ~8k extents of an average size of ~1MB. If the same
workload is interrupted a couple times for blockgc processing
(assuming the file goes idle), the resulting extent count explodes
to over 100k extents with an average size <100kB. This is
significantly worse than ideal and essentially defeats the COW
extent size hint mechanism.
While this particular test is instrumented, it reflects a fairly
reasonable pattern in practice where random I/Os might spread out
over a large period of time with varying periods of (in)activity.
For example, consider a cloned disk image file for a VM or container
with long uptime and variable and bursty usage. A background blockgc
scan that races and processes the image file when it happens to be
clean and idle can have a significant effect on the future
fragmentation level of the file, even when still in use.
To help combat this, update the heuristic to skip cowblocks inodes
that are currently opened for write access during non-sync blockgc
scans. This allows COW fork preallocations to persist for as long as
possible unless otherwise needed for functional purposes (i.e. a
sync scan), the file is idle and closed, or the inode is being
evicted from cache. While here, update the comments to help
distinguish performance oriented heuristics from the logic that
exists to maintain functional correctness.
Suggested-by: Darrick Wong <djwong@kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-03 12:47:13 +00:00
|
|
|
* A cowblocks trim of an inode can have a significant effect on
|
|
|
|
* fragmentation even when a reasonable COW extent size hint is set.
|
|
|
|
* Therefore, we prefer to not process cowblocks unless they are clean
|
|
|
|
* and idle. We can never process a cowblocks inode that is dirty or has
|
|
|
|
* in-flight I/O under any circumstances, because outstanding writeback
|
|
|
|
* or dio expects targeted COW fork blocks exist through write
|
|
|
|
* completion where they can be remapped into the data fork.
|
|
|
|
*
|
|
|
|
* Therefore, the heuristic used here is to never process inodes
|
|
|
|
* currently opened for write from background (i.e. non-sync) scans. For
|
|
|
|
* sync scans, use the pagecache/dio state of the inode to ensure we
|
|
|
|
* never free COW fork blocks out from under pending I/O.
|
2016-10-03 16:11:46 +00:00
|
|
|
*/
|
xfs: skip background cowblock trims on inodes open for write
The background blockgc scanner runs on a 5m interval by default and
trims preallocation (post-eof and cow fork) from inodes that are
otherwise idle. Idle effectively means that iolock can be acquired
without blocking and that the inode has no dirty pagecache or I/O in
flight.
This simple mechanism and heuristic has worked fairly well for
post-eof speculative preallocations. Support for reflink and COW
fork preallocations came sometime later and plugged into the same
mechanism, with similar heuristics. Some recent testing has shown
that COW fork preallocation may be notably more sensitive to blockgc
processing than post-eof preallocation, however.
For example, consider an 8GB reflinked file with a COW extent size
hint of 1MB. A worst case fully randomized overwrite of this file
results in ~8k extents of an average size of ~1MB. If the same
workload is interrupted a couple times for blockgc processing
(assuming the file goes idle), the resulting extent count explodes
to over 100k extents with an average size <100kB. This is
significantly worse than ideal and essentially defeats the COW
extent size hint mechanism.
While this particular test is instrumented, it reflects a fairly
reasonable pattern in practice where random I/Os might spread out
over a large period of time with varying periods of (in)activity.
For example, consider a cloned disk image file for a VM or container
with long uptime and variable and bursty usage. A background blockgc
scan that races and processes the image file when it happens to be
clean and idle can have a significant effect on the future
fragmentation level of the file, even when still in use.
To help combat this, update the heuristic to skip cowblocks inodes
that are currently opened for write access during non-sync blockgc
scans. This allows COW fork preallocations to persist for as long as
possible unless otherwise needed for functional purposes (i.e. a
sync scan), the file is idle and closed, or the inode is being
evicted from cache. While here, update the comments to help
distinguish performance oriented heuristics from the logic that
exists to maintain functional correctness.
Suggested-by: Darrick Wong <djwong@kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-03 12:47:13 +00:00
|
|
|
if (!sync && inode_is_open_for_write(VFS_I(ip)))
|
|
|
|
return false;
|
xfs: don't free cowblocks from under dirty pagecache on unshare
fallocate unshare mode explicitly breaks extent sharing. When a
command completes, it checks the data fork for any remaining shared
extents to determine whether the reflink inode flag and COW fork
preallocation can be removed. This logic doesn't consider in-core
pagecache and I/O state, however, which means we can unsafely remove
COW fork blocks that are still needed under certain conditions.
For example, consider the following command sequence:
xfs_io -fc "pwrite 0 1k" -c "reflink <file> 0 256k 1k" \
-c "pwrite 0 32k" -c "funshare 0 1k" <file>
This allocates a data block at offset 0, shares it, and then
overwrites it with a larger buffered write. The overwrite triggers
COW fork preallocation, 32 blocks by default, which maps the entire
32k write to delalloc in the COW fork. All but the shared block at
offset 0 remains hole mapped in the data fork. The unshare command
redirties and flushes the folio at offset 0, removing the only
shared extent from the inode. Since the inode no longer maps shared
extents, unshare purges the COW fork before the remaining 28k may
have written back.
This leaves dirty pagecache backed by holes, which writeback quietly
skips, thus leaving clean, non-zeroed pagecache over holes in the
file. To verify, fiemap shows holes in the first 32k of the file and
reads return different data across a remount:
$ xfs_io -c "fiemap -v" <file>
<file>:
EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS
...
1: [8..511]: hole 504
...
$ xfs_io -c "pread -v 4k 8" <file>
00001000: cd cd cd cd cd cd cd cd ........
$ umount <mnt>; mount <dev> <mnt>
$ xfs_io -c "pread -v 4k 8" <file>
00001000: 00 00 00 00 00 00 00 00 ........
To avoid this problem, make unshare follow the same rules used for
background cowblock scanning and never purge the COW fork for inodes
with dirty pagecache or in-flight I/O.
Fixes: 46afb0628b86347 ("xfs: only flush the unshared range in xfs_reflink_unshare")
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-06 11:40:51 +00:00
|
|
|
return xfs_can_free_cowblocks(ip);
|
2018-01-17 03:03:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Automatic CoW Reservation Freeing
|
|
|
|
*
|
|
|
|
* These functions automatically garbage collect leftover CoW reservations
|
|
|
|
* that were made on behalf of a cowextsize hint when we start to run out
|
|
|
|
* of quota or when the reservations sit around for too long. If the file
|
|
|
|
* has dirty pages or is undergoing writeback, its CoW reservations will
|
|
|
|
* be retained.
|
|
|
|
*
|
|
|
|
* The actual garbage collection piggybacks off the same code that runs
|
|
|
|
* the speculative EOF preallocation garbage collector.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_inode_free_cowblocks(
|
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw,
|
2021-01-26 05:09:49 +00:00
|
|
|
unsigned int *lockflags)
|
2018-01-17 03:03:59 +00:00
|
|
|
{
|
2021-01-23 00:48:35 +00:00
|
|
|
bool wait;
|
2018-01-17 03:03:59 +00:00
|
|
|
int ret = 0;
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
|
2021-01-23 00:48:35 +00:00
|
|
|
|
2021-01-23 00:48:43 +00:00
|
|
|
if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
|
|
|
|
return 0;
|
|
|
|
|
xfs: skip background cowblock trims on inodes open for write
The background blockgc scanner runs on a 5m interval by default and
trims preallocation (post-eof and cow fork) from inodes that are
otherwise idle. Idle effectively means that iolock can be acquired
without blocking and that the inode has no dirty pagecache or I/O in
flight.
This simple mechanism and heuristic has worked fairly well for
post-eof speculative preallocations. Support for reflink and COW
fork preallocations came sometime later and plugged into the same
mechanism, with similar heuristics. Some recent testing has shown
that COW fork preallocation may be notably more sensitive to blockgc
processing than post-eof preallocation, however.
For example, consider an 8GB reflinked file with a COW extent size
hint of 1MB. A worst case fully randomized overwrite of this file
results in ~8k extents of an average size of ~1MB. If the same
workload is interrupted a couple times for blockgc processing
(assuming the file goes idle), the resulting extent count explodes
to over 100k extents with an average size <100kB. This is
significantly worse than ideal and essentially defeats the COW
extent size hint mechanism.
While this particular test is instrumented, it reflects a fairly
reasonable pattern in practice where random I/Os might spread out
over a large period of time with varying periods of (in)activity.
For example, consider a cloned disk image file for a VM or container
with long uptime and variable and bursty usage. A background blockgc
scan that races and processes the image file when it happens to be
clean and idle can have a significant effect on the future
fragmentation level of the file, even when still in use.
To help combat this, update the heuristic to skip cowblocks inodes
that are currently opened for write access during non-sync blockgc
scans. This allows COW fork preallocations to persist for as long as
possible unless otherwise needed for functional purposes (i.e. a
sync scan), the file is idle and closed, or the inode is being
evicted from cache. While here, update the comments to help
distinguish performance oriented heuristics from the logic that
exists to maintain functional correctness.
Suggested-by: Darrick Wong <djwong@kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-03 12:47:13 +00:00
|
|
|
if (!xfs_prep_free_cowblocks(ip, icw))
|
2016-10-03 16:11:46 +00:00
|
|
|
return 0;
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if (!xfs_icwalk_match(ip, icw))
|
2020-05-21 20:08:48 +00:00
|
|
|
return 0;
|
2016-10-03 16:11:46 +00:00
|
|
|
|
2021-01-23 00:48:35 +00:00
|
|
|
/*
|
|
|
|
* If the caller is waiting, return -EAGAIN to keep the background
|
|
|
|
* scanner moving and revisit the inode in a subsequent pass.
|
|
|
|
*/
|
2021-01-26 05:09:49 +00:00
|
|
|
if (!(*lockflags & XFS_IOLOCK_EXCL) &&
|
|
|
|
!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
|
2021-01-23 00:48:35 +00:00
|
|
|
if (wait)
|
|
|
|
return -EAGAIN;
|
|
|
|
return 0;
|
|
|
|
}
|
2021-01-26 05:09:49 +00:00
|
|
|
*lockflags |= XFS_IOLOCK_EXCL;
|
|
|
|
|
2021-01-23 00:48:35 +00:00
|
|
|
if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
|
|
|
|
if (wait)
|
2021-01-26 05:09:49 +00:00
|
|
|
return -EAGAIN;
|
|
|
|
return 0;
|
2021-01-23 00:48:35 +00:00
|
|
|
}
|
2021-01-26 05:09:49 +00:00
|
|
|
*lockflags |= XFS_MMAPLOCK_EXCL;
|
2016-10-03 16:11:46 +00:00
|
|
|
|
2018-01-17 03:03:59 +00:00
|
|
|
/*
|
|
|
|
* Check again, nobody else should be able to dirty blocks or change
|
|
|
|
* the reflink iflag now that we have the first two locks held.
|
|
|
|
*/
|
xfs: skip background cowblock trims on inodes open for write
The background blockgc scanner runs on a 5m interval by default and
trims preallocation (post-eof and cow fork) from inodes that are
otherwise idle. Idle effectively means that iolock can be acquired
without blocking and that the inode has no dirty pagecache or I/O in
flight.
This simple mechanism and heuristic has worked fairly well for
post-eof speculative preallocations. Support for reflink and COW
fork preallocations came sometime later and plugged into the same
mechanism, with similar heuristics. Some recent testing has shown
that COW fork preallocation may be notably more sensitive to blockgc
processing than post-eof preallocation, however.
For example, consider an 8GB reflinked file with a COW extent size
hint of 1MB. A worst case fully randomized overwrite of this file
results in ~8k extents of an average size of ~1MB. If the same
workload is interrupted a couple times for blockgc processing
(assuming the file goes idle), the resulting extent count explodes
to over 100k extents with an average size <100kB. This is
significantly worse than ideal and essentially defeats the COW
extent size hint mechanism.
While this particular test is instrumented, it reflects a fairly
reasonable pattern in practice where random I/Os might spread out
over a large period of time with varying periods of (in)activity.
For example, consider a cloned disk image file for a VM or container
with long uptime and variable and bursty usage. A background blockgc
scan that races and processes the image file when it happens to be
clean and idle can have a significant effect on the future
fragmentation level of the file, even when still in use.
To help combat this, update the heuristic to skip cowblocks inodes
that are currently opened for write access during non-sync blockgc
scans. This allows COW fork preallocations to persist for as long as
possible unless otherwise needed for functional purposes (i.e. a
sync scan), the file is idle and closed, or the inode is being
evicted from cache. While here, update the comments to help
distinguish performance oriented heuristics from the logic that
exists to maintain functional correctness.
Suggested-by: Darrick Wong <djwong@kernel.org>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
2024-09-03 12:47:13 +00:00
|
|
|
if (xfs_prep_free_cowblocks(ip, icw))
|
2018-01-17 03:03:59 +00:00
|
|
|
ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
|
2016-10-03 16:11:46 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
xfs_inode_set_cowblocks_tag(
|
|
|
|
xfs_inode_t *ip)
|
|
|
|
{
|
2016-10-24 03:21:00 +00:00
|
|
|
trace_xfs_inode_set_cowblocks_tag(ip);
|
2021-01-23 00:48:43 +00:00
|
|
|
return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
|
2016-10-03 16:11:46 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
xfs_inode_clear_cowblocks_tag(
|
|
|
|
xfs_inode_t *ip)
|
|
|
|
{
|
2016-10-24 03:21:00 +00:00
|
|
|
trace_xfs_inode_clear_cowblocks_tag(ip);
|
2021-01-23 00:48:43 +00:00
|
|
|
return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
|
2016-10-03 16:11:46 +00:00
|
|
|
}
|
2018-05-09 17:03:56 +00:00
|
|
|
|
|
|
|
/* Disable post-EOF and CoW block auto-reclamation. */
|
|
|
|
void
|
2021-01-23 00:48:44 +00:00
|
|
|
xfs_blockgc_stop(
|
2018-05-09 17:03:56 +00:00
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2024-11-04 04:18:39 +00:00
|
|
|
struct xfs_perag *pag = NULL;
|
2021-01-23 00:48:44 +00:00
|
|
|
|
2021-08-06 18:05:42 +00:00
|
|
|
if (!xfs_clear_blockgc_enabled(mp))
|
|
|
|
return;
|
|
|
|
|
2024-11-04 04:18:39 +00:00
|
|
|
while ((pag = xfs_perag_next(mp, pag)))
|
2021-01-23 00:48:44 +00:00
|
|
|
cancel_delayed_work_sync(&pag->pag_blockgc_work);
|
2021-08-06 18:05:42 +00:00
|
|
|
trace_xfs_blockgc_stop(mp, __return_address);
|
2018-05-09 17:03:56 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Enable post-EOF and CoW block auto-reclamation. */
|
|
|
|
void
|
2021-01-23 00:48:44 +00:00
|
|
|
xfs_blockgc_start(
|
2018-05-09 17:03:56 +00:00
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2024-08-29 04:08:39 +00:00
|
|
|
struct xfs_perag *pag = NULL;
|
2021-01-23 00:48:44 +00:00
|
|
|
|
2021-08-06 18:05:42 +00:00
|
|
|
if (xfs_set_blockgc_enabled(mp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
trace_xfs_blockgc_start(mp, __return_address);
|
2024-08-29 04:08:39 +00:00
|
|
|
while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
|
2021-01-23 00:48:44 +00:00
|
|
|
xfs_blockgc_queue(pag);
|
2018-05-09 17:03:56 +00:00
|
|
|
}
|
2021-01-23 00:48:36 +00:00
|
|
|
|
2021-06-02 06:01:44 +00:00
|
|
|
/*
 * Inode states in which block gc must not be attempted: an inode with any
 * of these flags set is skipped by xfs_blockgc_igrab().
 */
#define XFS_BLOCKGC_NOGRAB_IFLAGS \
	(XFS_INEW | XFS_NEED_INACTIVE | XFS_INACTIVATING | \
	 XFS_IRECLAIMABLE | XFS_IRECLAIM)
|
2021-06-01 20:29:41 +00:00
|
|
|
/*
|
2021-05-31 18:31:58 +00:00
|
|
|
* Decide if the given @ip is eligible for garbage collection of speculative
|
|
|
|
* preallocations, and grab it if so. Returns true if it's ready to go or
|
|
|
|
* false if we should just ignore it.
|
2021-06-01 20:29:41 +00:00
|
|
|
*/
|
|
|
|
static bool
|
2021-05-31 18:31:58 +00:00
|
|
|
xfs_blockgc_igrab(
|
2021-05-31 18:31:59 +00:00
|
|
|
struct xfs_inode *ip)
|
2021-06-01 20:29:41 +00:00
|
|
|
{
|
|
|
|
struct inode *inode = VFS_I(ip);
|
|
|
|
|
|
|
|
ASSERT(rcu_read_lock_held());
|
|
|
|
|
|
|
|
/* Check for stale RCU freed inode */
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
|
|
|
if (!ip->i_ino)
|
|
|
|
goto out_unlock_noent;
|
|
|
|
|
2021-06-02 06:01:44 +00:00
|
|
|
if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
|
2021-06-01 20:29:41 +00:00
|
|
|
goto out_unlock_noent;
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
|
|
|
|
/* nothing to sync during shutdown */
|
2021-08-19 01:46:53 +00:00
|
|
|
if (xfs_is_shutdown(ip->i_mount))
|
2021-06-01 20:29:41 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
/* If we can't grab the inode, it must on it's way to reclaim. */
|
|
|
|
if (!igrab(inode))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
/* inode is valid */
|
|
|
|
return true;
|
|
|
|
|
|
|
|
out_unlock_noent:
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2021-01-23 00:48:43 +00:00
|
|
|
/*
 * Scan one incore inode for block preallocations that we can remove.
 *
 * Post-EOF blocks are processed first; COW blocks are only processed when
 * that step did not fail.  Whatever locks the two steps recorded in
 * lockflags are dropped here, and the inode is released with xfs_irele()
 * on every path.
 */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	unsigned int		lockflags = 0;
	int			error;

	error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
	if (!error)
		error = xfs_inode_free_cowblocks(ip, icw, &lockflags);

	if (lockflags)
		xfs_iunlock(ip, lockflags);
	xfs_irele(ip);

	return error;
}
|
|
|
|
|
2021-01-23 00:48:43 +00:00
|
|
|
/* Background worker that trims preallocated space. */
|
|
|
|
void
|
|
|
|
xfs_blockgc_worker(
|
|
|
|
struct work_struct *work)
|
|
|
|
{
|
2021-01-23 00:48:44 +00:00
|
|
|
struct xfs_perag *pag = container_of(to_delayed_work(work),
|
|
|
|
struct xfs_perag, pag_blockgc_work);
|
2024-11-04 04:18:38 +00:00
|
|
|
struct xfs_mount *mp = pag_mount(pag);
|
2021-01-23 00:48:43 +00:00
|
|
|
int error;
|
|
|
|
|
2021-08-06 18:05:42 +00:00
|
|
|
trace_xfs_blockgc_worker(mp, __return_address);
|
|
|
|
|
2021-05-31 18:32:00 +00:00
|
|
|
error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
|
2021-01-23 00:48:43 +00:00
|
|
|
if (error)
|
2021-01-23 00:48:44 +00:00
|
|
|
xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
|
2024-11-04 04:18:38 +00:00
|
|
|
pag_agno(pag), error);
|
2021-01-23 00:48:44 +00:00
|
|
|
xfs_blockgc_queue(pag);
|
2021-01-23 00:48:43 +00:00
|
|
|
}
|
|
|
|
|
2021-01-23 00:48:39 +00:00
|
|
|
/*
|
2021-08-06 18:05:41 +00:00
|
|
|
* Try to free space in the filesystem by purging inactive inodes, eofblocks
|
|
|
|
* and cowblocks.
|
2021-01-23 00:48:39 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_blockgc_free_space(
|
|
|
|
struct xfs_mount *mp,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2021-01-23 00:48:39 +00:00
|
|
|
{
|
2021-08-06 18:05:41 +00:00
|
|
|
int error;
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
|
2021-01-23 00:48:39 +00:00
|
|
|
|
2021-08-06 18:05:41 +00:00
|
|
|
error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
2023-06-05 04:48:15 +00:00
|
|
|
return xfs_inodegc_flush(mp);
|
2021-01-23 00:48:39 +00:00
|
|
|
}
|
|
|
|
|
2021-08-06 18:05:42 +00:00
|
|
|
/*
|
|
|
|
* Reclaim all the free space that we can by scheduling the background blockgc
|
|
|
|
* and inodegc workers immediately and waiting for them all to clear.
|
|
|
|
*/
|
2023-06-05 04:48:15 +00:00
|
|
|
int
|
2021-08-06 18:05:42 +00:00
|
|
|
xfs_blockgc_flush_all(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2024-08-29 04:08:39 +00:00
|
|
|
struct xfs_perag *pag = NULL;
|
2021-08-06 18:05:42 +00:00
|
|
|
|
|
|
|
trace_xfs_blockgc_flush_all(mp, __return_address);
|
|
|
|
|
|
|
|
/*
|
2024-08-29 04:08:39 +00:00
|
|
|
* For each blockgc worker, move its queue time up to now. If it wasn't
|
|
|
|
* queued, it will not be requeued. Then flush whatever is left.
|
2021-08-06 18:05:42 +00:00
|
|
|
*/
|
2024-08-29 04:08:39 +00:00
|
|
|
while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
|
2024-11-04 04:18:38 +00:00
|
|
|
mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0);
|
2021-08-06 18:05:42 +00:00
|
|
|
|
2024-08-29 04:08:39 +00:00
|
|
|
while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG)))
|
2021-08-06 18:05:42 +00:00
|
|
|
flush_delayed_work(&pag->pag_blockgc_work);
|
|
|
|
|
2023-06-05 04:48:15 +00:00
|
|
|
return xfs_inodegc_flush(mp);
|
2021-08-06 18:05:42 +00:00
|
|
|
}
|
|
|
|
|
2021-01-23 00:48:36 +00:00
|
|
|
/*
|
2021-01-23 00:48:37 +00:00
|
|
|
* Run cow/eofblocks scans on the supplied dquots. We don't know exactly which
|
|
|
|
* quota caused an allocation failure, so we make a best effort by including
|
|
|
|
* each quota under low free space conditions (less than 1% free space) in the
|
|
|
|
* scan.
|
2021-01-23 00:48:36 +00:00
|
|
|
*
|
|
|
|
* Callers must not hold any inode's ILOCK. If requesting a synchronous scan
|
2021-06-07 16:34:51 +00:00
|
|
|
* (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
|
2021-01-23 00:48:36 +00:00
|
|
|
* MMAPLOCK.
|
2021-01-23 00:48:36 +00:00
|
|
|
*/
|
2021-01-23 00:48:36 +00:00
|
|
|
int
|
2021-01-23 00:48:37 +00:00
|
|
|
xfs_blockgc_free_dquots(
|
|
|
|
struct xfs_mount *mp,
|
|
|
|
struct xfs_dquot *udqp,
|
|
|
|
struct xfs_dquot *gdqp,
|
|
|
|
struct xfs_dquot *pdqp,
|
2021-06-07 16:34:51 +00:00
|
|
|
unsigned int iwalk_flags)
|
2021-01-23 00:48:36 +00:00
|
|
|
{
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk icw = {0};
|
2021-01-23 00:48:36 +00:00
|
|
|
bool do_work = false;
|
|
|
|
|
2021-01-23 00:48:37 +00:00
|
|
|
if (!udqp && !gdqp && !pdqp)
|
|
|
|
return 0;
|
|
|
|
|
2021-01-23 00:48:36 +00:00
|
|
|
/*
|
2021-01-23 00:48:36 +00:00
|
|
|
* Run a scan to free blocks using the union filter to cover all
|
|
|
|
* applicable quotas in a single scan.
|
2021-01-23 00:48:36 +00:00
|
|
|
*/
|
2021-06-07 16:34:51 +00:00
|
|
|
icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
|
2021-01-23 00:48:36 +00:00
|
|
|
|
2021-01-23 00:48:37 +00:00
|
|
|
if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
|
2021-06-07 16:34:51 +00:00
|
|
|
icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
|
|
|
|
icw.icw_flags |= XFS_ICWALK_FLAG_UID;
|
2021-01-23 00:48:37 +00:00
|
|
|
do_work = true;
|
2021-01-23 00:48:36 +00:00
|
|
|
}
|
|
|
|
|
2021-01-23 00:48:37 +00:00
|
|
|
if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
|
2021-06-07 16:34:51 +00:00
|
|
|
icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
|
|
|
|
icw.icw_flags |= XFS_ICWALK_FLAG_GID;
|
2021-01-23 00:48:37 +00:00
|
|
|
do_work = true;
|
2021-01-23 00:48:36 +00:00
|
|
|
}
|
|
|
|
|
2021-01-23 00:48:37 +00:00
|
|
|
if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
|
2021-06-07 16:34:51 +00:00
|
|
|
icw.icw_prid = pdqp->q_id;
|
|
|
|
icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
|
2021-01-23 00:48:37 +00:00
|
|
|
do_work = true;
|
2021-01-23 00:48:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!do_work)
|
2021-01-23 00:48:36 +00:00
|
|
|
return 0;
|
2021-01-23 00:48:36 +00:00
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
return xfs_blockgc_free_space(mp, &icw);
|
2021-01-23 00:48:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Run cow/eofblocks scans on the quotas attached to the inode. */
|
|
|
|
int
|
|
|
|
xfs_blockgc_free_quota(
|
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
unsigned int iwalk_flags)
|
2021-01-23 00:48:37 +00:00
|
|
|
{
|
|
|
|
return xfs_blockgc_free_dquots(ip->i_mount,
|
|
|
|
xfs_inode_dquot(ip, XFS_DQTYPE_USER),
|
|
|
|
xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
|
2021-06-07 16:34:51 +00:00
|
|
|
xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
|
2021-01-23 00:48:36 +00:00
|
|
|
}
|
2021-06-01 20:29:41 +00:00
|
|
|
|
|
|
|
/* XFS Inode Cache Walking Code */
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
/*
 * Number of inodes fetched per radix tree gang lookup.  Looking inodes up in
 * batches keeps lock traffic and the number of radix tree lookups down; the
 * batch size trades lookup reduction against stack usage.  This is in the
 * reclaim path, so we can't be too greedy.
 */
#define XFS_LOOKUP_BATCH	32
|
|
|
|
|
|
|
|
|
2021-05-31 18:31:58 +00:00
|
|
|
/*
|
|
|
|
* Decide if we want to grab this inode in anticipation of doing work towards
|
2021-05-31 18:32:00 +00:00
|
|
|
* the goal.
|
2021-05-31 18:31:58 +00:00
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
xfs_icwalk_igrab(
|
|
|
|
enum xfs_icwalk_goal goal,
|
2021-06-07 16:34:50 +00:00
|
|
|
struct xfs_inode *ip,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2021-05-31 18:31:58 +00:00
|
|
|
{
|
|
|
|
switch (goal) {
|
|
|
|
case XFS_ICWALK_BLOCKGC:
|
2021-05-31 18:31:59 +00:00
|
|
|
return xfs_blockgc_igrab(ip);
|
2021-05-31 18:32:02 +00:00
|
|
|
case XFS_ICWALK_RECLAIM:
|
2021-06-07 16:34:51 +00:00
|
|
|
return xfs_reclaim_igrab(ip, icw);
|
2021-05-31 18:31:58 +00:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-05-31 18:32:00 +00:00
|
|
|
/*
|
|
|
|
* Process an inode. Each processing function must handle any state changes
|
|
|
|
* made by the icwalk igrab function. Return -EAGAIN to skip an inode.
|
|
|
|
*/
|
2021-05-31 18:32:00 +00:00
|
|
|
static inline int
|
|
|
|
xfs_icwalk_process_inode(
|
|
|
|
enum xfs_icwalk_goal goal,
|
|
|
|
struct xfs_inode *ip,
|
2021-05-31 18:32:02 +00:00
|
|
|
struct xfs_perag *pag,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2021-05-31 18:32:00 +00:00
|
|
|
{
|
2021-05-31 18:32:00 +00:00
|
|
|
int error = 0;
|
2021-05-31 18:32:00 +00:00
|
|
|
|
|
|
|
switch (goal) {
|
|
|
|
case XFS_ICWALK_BLOCKGC:
|
2021-06-07 16:34:51 +00:00
|
|
|
error = xfs_blockgc_scan_inode(ip, icw);
|
2021-05-31 18:32:00 +00:00
|
|
|
break;
|
2021-05-31 18:32:02 +00:00
|
|
|
case XFS_ICWALK_RECLAIM:
|
|
|
|
xfs_reclaim_inode(ip, pag);
|
|
|
|
break;
|
2021-05-31 18:32:00 +00:00
|
|
|
}
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2021-06-01 20:29:41 +00:00
|
|
|
/*
|
2021-05-31 18:32:00 +00:00
|
|
|
* For a given per-AG structure @pag and a goal, grab qualifying inodes and
|
|
|
|
* process them in some manner.
|
2021-06-01 20:29:41 +00:00
|
|
|
*/
|
|
|
|
static int
|
2021-06-02 05:41:25 +00:00
|
|
|
xfs_icwalk_ag(
|
2021-06-01 20:29:41 +00:00
|
|
|
struct xfs_perag *pag,
|
2021-05-31 18:32:00 +00:00
|
|
|
enum xfs_icwalk_goal goal,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2021-06-01 20:29:41 +00:00
|
|
|
{
|
2024-11-04 04:18:38 +00:00
|
|
|
struct xfs_mount *mp = pag_mount(pag);
|
2021-06-01 20:29:41 +00:00
|
|
|
uint32_t first_index;
|
|
|
|
int last_error = 0;
|
|
|
|
int skipped;
|
|
|
|
bool done;
|
|
|
|
int nr_found;
|
|
|
|
|
|
|
|
restart:
|
|
|
|
done = false;
|
|
|
|
skipped = 0;
|
2021-05-31 18:32:02 +00:00
|
|
|
if (goal == XFS_ICWALK_RECLAIM)
|
|
|
|
first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
|
|
|
|
else
|
|
|
|
first_index = 0;
|
2021-06-01 20:29:41 +00:00
|
|
|
nr_found = 0;
|
|
|
|
do {
|
|
|
|
struct xfs_inode *batch[XFS_LOOKUP_BATCH];
|
|
|
|
int error = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
|
2021-08-13 16:16:52 +00:00
|
|
|
nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
|
|
|
|
(void **) batch, first_index,
|
|
|
|
XFS_LOOKUP_BATCH, goal);
|
2021-06-01 20:29:41 +00:00
|
|
|
if (!nr_found) {
|
2021-05-31 18:32:02 +00:00
|
|
|
done = true;
|
2021-06-01 20:29:41 +00:00
|
|
|
rcu_read_unlock();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Grab the inodes before we drop the lock. if we found
|
|
|
|
* nothing, nr == 0 and the loop will be skipped.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < nr_found; i++) {
|
|
|
|
struct xfs_inode *ip = batch[i];
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if (done || !xfs_icwalk_igrab(goal, ip, icw))
|
2021-06-01 20:29:41 +00:00
|
|
|
batch[i] = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the index for the next lookup. Catch
|
|
|
|
* overflows into the next AG range which can occur if
|
|
|
|
* we have inodes in the last block of the AG and we
|
|
|
|
* are currently pointing to the last inode.
|
|
|
|
*
|
|
|
|
* Because we may see inodes that are from the wrong AG
|
|
|
|
* due to RCU freeing and reallocation, only update the
|
|
|
|
* index if it lies in this AG. It was a race that lead
|
|
|
|
* us to see this inode, so another lookup from the
|
|
|
|
* same index will not find it again.
|
|
|
|
*/
|
2024-11-04 04:18:38 +00:00
|
|
|
if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag))
|
2021-06-01 20:29:41 +00:00
|
|
|
continue;
|
|
|
|
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
|
|
|
|
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
|
|
|
|
done = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* unlock now we've grabbed the inodes. */
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
for (i = 0; i < nr_found; i++) {
|
|
|
|
if (!batch[i])
|
|
|
|
continue;
|
2021-05-31 18:32:02 +00:00
|
|
|
error = xfs_icwalk_process_inode(goal, batch[i], pag,
|
2021-06-07 16:34:51 +00:00
|
|
|
icw);
|
2021-06-01 20:29:41 +00:00
|
|
|
if (error == -EAGAIN) {
|
|
|
|
skipped++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (error && last_error != -EFSCORRUPTED)
|
|
|
|
last_error = error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* bail out if the filesystem is corrupted. */
|
|
|
|
if (error == -EFSCORRUPTED)
|
|
|
|
break;
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
|
2021-06-07 16:34:51 +00:00
|
|
|
if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
|
|
|
|
icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
|
|
|
|
if (icw->icw_scan_limit <= 0)
|
2021-05-31 18:32:02 +00:00
|
|
|
break;
|
|
|
|
}
|
2021-06-01 20:29:41 +00:00
|
|
|
} while (nr_found && !done);
|
|
|
|
|
2021-05-31 18:32:02 +00:00
|
|
|
if (goal == XFS_ICWALK_RECLAIM) {
|
|
|
|
if (done)
|
|
|
|
first_index = 0;
|
|
|
|
WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
|
|
|
|
}
|
|
|
|
|
2021-06-01 20:29:41 +00:00
|
|
|
if (skipped) {
|
|
|
|
delay(1);
|
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
return last_error;
|
|
|
|
}
|
|
|
|
|
2021-05-31 18:32:00 +00:00
|
|
|
/* Walk all incore inodes to achieve a given goal. */
|
2021-06-01 20:29:41 +00:00
|
|
|
static int
|
2021-06-02 05:41:25 +00:00
|
|
|
xfs_icwalk(
|
2021-06-01 20:29:41 +00:00
|
|
|
struct xfs_mount *mp,
|
2021-05-31 18:32:00 +00:00
|
|
|
enum xfs_icwalk_goal goal,
|
2021-06-07 16:34:51 +00:00
|
|
|
struct xfs_icwalk *icw)
|
2021-06-01 20:29:41 +00:00
|
|
|
{
|
2024-08-29 04:08:39 +00:00
|
|
|
struct xfs_perag *pag = NULL;
|
2021-06-01 20:29:41 +00:00
|
|
|
int error = 0;
|
|
|
|
int last_error = 0;
|
|
|
|
|
2024-08-29 04:08:39 +00:00
|
|
|
while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) {
|
2021-06-07 16:34:51 +00:00
|
|
|
error = xfs_icwalk_ag(pag, goal, icw);
|
2021-06-01 20:29:41 +00:00
|
|
|
if (error) {
|
|
|
|
last_error = error;
|
2021-08-13 16:16:52 +00:00
|
|
|
if (error == -EFSCORRUPTED) {
|
2023-02-12 22:14:42 +00:00
|
|
|
xfs_perag_rele(pag);
|
2021-06-01 20:29:41 +00:00
|
|
|
break;
|
2021-08-13 16:16:52 +00:00
|
|
|
}
|
2021-06-01 20:29:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return last_error;
|
2021-06-07 16:34:51 +00:00
|
|
|
BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
|
2021-06-01 20:29:41 +00:00
|
|
|
}
|
2021-08-06 18:05:38 +00:00
|
|
|
|
|
|
|
#ifdef DEBUG
/*
 * Debug helper: walk every extent of the given fork of @ip and warn about each
 * one whose start block is a null start block (a delalloc extent).
 */
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;

	/* No such fork: nothing to check. */
	if (!ifp)
		return;

	/* No extent at or after offset zero: nothing to check. */
	if (!xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
		return;

	do {
		if (!isnullstartblock(got.br_startblock))
			continue;

		xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
			ip->i_ino,
			whichfork == XFS_DATA_FORK ? "data" : "cow",
			got.br_startoff, got.br_blockcount);
	} while (xfs_iext_next_extent(ifp, &icur, &got));
}
#else
/* Non-debug builds compile the check away. */
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
/* Schedule the inode for reclaim. */
|
|
|
|
static void
|
|
|
|
xfs_inodegc_set_reclaimable(
|
2021-08-06 18:05:38 +00:00
|
|
|
struct xfs_inode *ip)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
struct xfs_perag *pag;
|
|
|
|
|
2021-08-19 01:46:53 +00:00
|
|
|
if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
|
2021-08-06 18:05:38 +00:00
|
|
|
xfs_check_delalloc(ip, XFS_DATA_FORK);
|
|
|
|
xfs_check_delalloc(ip, XFS_COW_FORK);
|
|
|
|
ASSERT(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
|
|
|
|
spin_lock(&pag->pag_ici_lock);
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
trace_xfs_inode_set_reclaimable(ip);
|
|
|
|
ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
|
|
|
|
ip->i_flags |= XFS_IRECLAIMABLE;
|
2021-08-06 18:05:38 +00:00
|
|
|
xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
|
|
|
|
XFS_ICI_RECLAIM_TAG);
|
|
|
|
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
spin_unlock(&pag->pag_ici_lock);
|
|
|
|
xfs_perag_put(pag);
|
|
|
|
}
|
2021-08-06 18:05:39 +00:00
|
|
|
|
|
|
|
/*
 * Free all speculative preallocations and possibly even the inode itself.
 * This is the last chance to make changes to an otherwise unreferenced file
 * before incore reclamation happens.
 *
 * Returns the result of xfs_inactive().
 */
static int
xfs_inodegc_inactivate(
	struct xfs_inode	*ip)
{
	int			ret;

	trace_xfs_inode_inactivating(ip);
	ret = xfs_inactive(ip);

	/* Hand the inode on to reclaim whether or not inactivation failed. */
	xfs_inodegc_set_reclaimable(ip);

	return ret;
}
|
|
|
|
|
|
|
|
void
|
|
|
|
xfs_inodegc_worker(
|
|
|
|
struct work_struct *work)
|
|
|
|
{
|
2022-06-16 14:44:31 +00:00
|
|
|
struct xfs_inodegc *gc = container_of(to_delayed_work(work),
|
|
|
|
struct xfs_inodegc, work);
|
2021-08-06 18:05:39 +00:00
|
|
|
struct llist_node *node = llist_del_all(&gc->list);
|
|
|
|
struct xfs_inode *ip, *n;
|
2023-09-11 15:39:03 +00:00
|
|
|
struct xfs_mount *mp = gc->mp;
|
2022-12-27 17:41:30 +00:00
|
|
|
unsigned int nofs_flag;
|
2021-08-06 18:05:39 +00:00
|
|
|
|
2023-09-11 15:39:03 +00:00
|
|
|
/*
|
|
|
|
* Clear the cpu mask bit and ensure that we have seen the latest
|
|
|
|
* update of the gc structure associated with this CPU. This matches
|
|
|
|
* with the release semantics used when setting the cpumask bit in
|
|
|
|
* xfs_inodegc_queue.
|
|
|
|
*/
|
|
|
|
cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
|
|
|
|
smp_mb__after_atomic();
|
2023-05-01 23:16:12 +00:00
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
WRITE_ONCE(gc->items, 0);
|
|
|
|
|
|
|
|
if (!node)
|
|
|
|
return;
|
|
|
|
|
2022-12-27 17:41:30 +00:00
|
|
|
/*
|
|
|
|
* We can allocate memory here while doing writeback on behalf of
|
|
|
|
* memory reclaim. To avoid memory allocation deadlocks set the
|
|
|
|
* task-wide nofs context for the following operations.
|
|
|
|
*/
|
|
|
|
nofs_flag = memalloc_nofs_save();
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
ip = llist_entry(node, struct xfs_inode, i_gclist);
|
2023-09-11 15:39:03 +00:00
|
|
|
trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
|
2021-08-06 18:05:39 +00:00
|
|
|
|
2021-08-06 18:05:43 +00:00
|
|
|
WRITE_ONCE(gc->shrinker_hits, 0);
|
2021-08-06 18:05:39 +00:00
|
|
|
llist_for_each_entry_safe(ip, n, node, i_gclist) {
|
2023-06-05 04:48:15 +00:00
|
|
|
int error;
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
xfs_iflags_set(ip, XFS_INACTIVATING);
|
2023-06-05 04:48:15 +00:00
|
|
|
error = xfs_inodegc_inactivate(ip);
|
|
|
|
if (error && !gc->error)
|
|
|
|
gc->error = error;
|
2021-08-06 18:05:39 +00:00
|
|
|
}
|
2022-12-27 17:41:30 +00:00
|
|
|
|
|
|
|
memalloc_nofs_restore(nofs_flag);
|
2021-08-06 18:05:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2022-06-16 14:44:32 +00:00
|
|
|
* Expedite all pending inodegc work to run immediately. This does not wait for
|
|
|
|
* completion of the work.
|
2021-08-06 18:05:39 +00:00
|
|
|
*/
|
|
|
|
void
|
2022-06-16 14:44:32 +00:00
|
|
|
xfs_inodegc_push(
|
2021-08-06 18:05:39 +00:00
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
if (!xfs_is_inodegc_enabled(mp))
|
|
|
|
return;
|
2022-06-16 14:44:32 +00:00
|
|
|
trace_xfs_inodegc_push(mp, __return_address);
|
|
|
|
xfs_inodegc_queue_all(mp);
|
|
|
|
}
|
2021-08-06 18:05:39 +00:00
|
|
|
|
2022-06-16 14:44:32 +00:00
|
|
|
/*
|
|
|
|
* Force all currently queued inode inactivation work to run immediately and
|
|
|
|
* wait for the work to finish.
|
|
|
|
*/
|
2023-06-05 04:48:15 +00:00
|
|
|
int
|
2022-06-16 14:44:32 +00:00
|
|
|
xfs_inodegc_flush(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
xfs_inodegc_push(mp);
|
2021-08-06 18:05:39 +00:00
|
|
|
trace_xfs_inodegc_flush(mp, __return_address);
|
2023-06-05 04:48:15 +00:00
|
|
|
return xfs_inodegc_wait_all(mp);
|
2021-08-06 18:05:39 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush all the pending work and then disable the inode inactivation background
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
* workers and wait for them to stop. Caller must hold sb->s_umount to
|
|
|
|
* coordinate changes in the inodegc_enabled state.
|
2021-08-06 18:05:39 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_inodegc_stop(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
bool rerun;
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
if (!xfs_clear_inodegc_enabled(mp))
|
|
|
|
return;
|
|
|
|
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers ofs xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
/*
|
|
|
|
* Drain all pending inodegc work, including inodes that could be
|
|
|
|
* queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan
|
|
|
|
* threads that sample the inodegc state just prior to us clearing it.
|
|
|
|
* The inodegc flag state prevents new threads from queuing more
|
|
|
|
* inodes, so we queue pending work items and flush the workqueue until
|
|
|
|
* all inodegc lists are empty. IOWs, we cannot use drain_workqueue
|
|
|
|
* here because it does not allow other unserialized mechanisms to
|
|
|
|
* reschedule inodegc work while this draining is in progress.
|
|
|
|
*/
|
2021-08-06 18:05:39 +00:00
|
|
|
xfs_inodegc_queue_all(mp);
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodedgc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers of xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
do {
|
|
|
|
flush_workqueue(mp->m_inodegc_wq);
|
|
|
|
rerun = xfs_inodegc_queue_all(mp);
|
|
|
|
} while (rerun);
|
2021-08-06 18:05:39 +00:00
|
|
|
|
|
|
|
trace_xfs_inodegc_stop(mp, __return_address);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Enable the inode inactivation background workers and schedule deferred inode
|
xfs: fix xfs_inodegc_stop racing with mod_delayed_work
syzbot reported this warning from the faux inodegc shrinker that tries
to kick off inodegc work:
------------[ cut here ]------------
WARNING: CPU: 1 PID: 102 at kernel/workqueue.c:1445 __queue_work+0xd44/0x1120 kernel/workqueue.c:1444
RIP: 0010:__queue_work+0xd44/0x1120 kernel/workqueue.c:1444
Call Trace:
__queue_delayed_work+0x1c8/0x270 kernel/workqueue.c:1672
mod_delayed_work_on+0xe1/0x220 kernel/workqueue.c:1746
xfs_inodegc_shrinker_scan fs/xfs/xfs_icache.c:2212 [inline]
xfs_inodegc_shrinker_scan+0x250/0x4f0 fs/xfs/xfs_icache.c:2191
do_shrink_slab+0x428/0xaa0 mm/vmscan.c:853
shrink_slab+0x175/0x660 mm/vmscan.c:1013
shrink_one+0x502/0x810 mm/vmscan.c:5343
shrink_many mm/vmscan.c:5394 [inline]
lru_gen_shrink_node mm/vmscan.c:5511 [inline]
shrink_node+0x2064/0x35f0 mm/vmscan.c:6459
kswapd_shrink_node mm/vmscan.c:7262 [inline]
balance_pgdat+0xa02/0x1ac0 mm/vmscan.c:7452
kswapd+0x677/0xd60 mm/vmscan.c:7712
kthread+0x2e8/0x3a0 kernel/kthread.c:376
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
This warning corresponds to this code in __queue_work:
/*
* For a draining wq, only works from the same workqueue are
* allowed. The __WQ_DESTROYING helps to spot the issue that
* queues a new work item to a wq after destroy_workqueue(wq).
*/
if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq))))
return;
For this to trip, we must have a thread draining the inodegc workqueue
and a second thread trying to queue inodegc work to that workqueue.
This can happen if freezing or a ro remount race with reclaim poking our
faux inodegc shrinker and another thread dropping an unlinked O_RDONLY
file:
Thread 0 Thread 1 Thread 2
xfs_inodegc_stop
xfs_inodegc_shrinker_scan
xfs_is_inodegc_enabled
<yes, will continue>
xfs_clear_inodegc_enabled
xfs_inodegc_queue_all
<list empty, do not queue inodegc worker>
xfs_inodegc_queue
<add to list>
xfs_is_inodegc_enabled
<no, returns>
drain_workqueue
<set WQ_DRAINING>
llist_empty
<no, will queue list>
mod_delayed_work_on(..., 0)
__queue_work
<sees WQ_DRAINING, kaboom>
In other words, everything between the access to inodegc_enabled state
and the decision to poke the inodegc workqueue requires some kind of
coordination to avoid the WQ_DRAINING state. We could perhaps introduce
a lock here, but we could also try to eliminate WQ_DRAINING from the
picture.
We could replace the drain_workqueue call with a loop that flushes the
workqueue and queues workers as long as there is at least one inode
present in the per-cpu inodegc llists. We've disabled inodegc at this
point, so we know that the number of queued inodes will eventually hit
zero as long as xfs_inodegc_start cannot reactivate the workers.
There are four callers of xfs_inodegc_start. Three of them come from the
VFS with s_umount held: filesystem thawing, failed filesystem freezing,
and the rw remount transition. The fourth caller is mounting rw (no
remount or freezing possible).
There are three callers of xfs_inodegc_stop. One is unmounting (no
remount or thaw possible). Two of them come from the VFS with s_umount
held: fs freezing and ro remount transition.
Hence, it is correct to replace the drain_workqueue call with a loop
that drains the inodegc llists.
Fixes: 6191cf3ad59f ("xfs: flush inodegc workqueue tasks before cancel")
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2023-05-01 23:16:14 +00:00
|
|
|
* inactivation work if there is any. Caller must hold sb->s_umount to
|
|
|
|
* coordinate changes in the inodegc_enabled state.
|
2021-08-06 18:05:39 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_inodegc_start(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
|
|
|
if (xfs_set_inodegc_enabled(mp))
|
|
|
|
return;
|
|
|
|
|
|
|
|
trace_xfs_inodegc_start(mp, __return_address);
|
|
|
|
xfs_inodegc_queue_all(mp);
|
|
|
|
}
|
|
|
|
|
2021-08-06 18:05:41 +00:00
|
|
|
#ifdef CONFIG_XFS_RT
/*
 * Returns true when @ip is a realtime inode and the free realtime extent
 * counter (m_frextents) compares below the XFS_LOWSP_5_PCNT entry of
 * m_low_rtexts; false otherwise.
 */
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	/* Non-realtime inodes short-circuit before the counter is read. */
	return XFS_IS_REALTIME_INODE(ip) &&
	       __percpu_counter_compare(&mp->m_frextents,
				mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
/*
|
|
|
|
* Schedule the inactivation worker when:
|
|
|
|
*
|
|
|
|
* - We've accumulated more than one inode cluster buffer's worth of inodes.
|
2021-08-06 18:05:40 +00:00
|
|
|
* - There is less than 5% free space left.
|
2021-08-06 18:05:40 +00:00
|
|
|
* - Any of the quotas for this inode are near an enforcement limit.
|
2021-08-06 18:05:39 +00:00
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
xfs_inodegc_want_queue_work(
|
|
|
|
struct xfs_inode *ip,
|
|
|
|
unsigned int items)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
|
|
|
|
if (items > mp->m_ino_geo.inodes_per_cluster)
|
|
|
|
return true;
|
|
|
|
|
2021-08-06 18:05:40 +00:00
|
|
|
if (__percpu_counter_compare(&mp->m_fdblocks,
|
|
|
|
mp->m_low_space[XFS_LOWSP_5_PCNT],
|
|
|
|
XFS_FDBLOCKS_BATCH) < 0)
|
|
|
|
return true;
|
|
|
|
|
2021-08-06 18:05:41 +00:00
|
|
|
if (xfs_inodegc_want_queue_rt_file(ip))
|
|
|
|
return true;
|
|
|
|
|
2021-08-06 18:05:40 +00:00
|
|
|
if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
|
|
|
|
return true;
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Upper bound on the number of inodes that can sit on one per-cpu inodegc queue
|
|
|
|
* before the queuing thread is made to wait, to avoid monopolizing the workqueue.
|
|
|
|
*/
|
|
|
|
#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make the frontend wait for inactivations when:
|
|
|
|
*
|
2021-08-06 18:05:43 +00:00
|
|
|
* - Memory shrinkers queued the inactivation worker and it hasn't finished.
|
2021-08-06 18:05:39 +00:00
|
|
|
* - The queue depth exceeds the maximum allowable percpu backlog.
|
|
|
|
*
|
xfs: don't use current->journal_info
syzbot reported an ext4 panic during a page fault where it found a
journal handle when it didn't expect to find one. The structure
it tripped over had a value of 'TRAN' in the first entry in the
structure, and that indicates it tripped over a struct xfs_trans
instead of a jbd2 handle.
The reason for this is that the page fault was taken during a
copy-out to a user buffer from an xfs bulkstat operation. XFS uses
an "empty" transaction context for bulkstat to do automated metadata
buffer cleanup, and so the transaction context is valid across the
copyout of the bulkstat info into the user buffer.
We are using empty transaction contexts like this in XFS to reduce
the risk of failing to release objects we reference during the
operation, especially during error handling. Hence we really need to
ensure that we can take page faults from these contexts without
leaving landmines for the code processing the page fault to trip
over.
However, this same behaviour could happen from any other filesystem
that triggers a page fault or any other exception that is handled
on-stack from within a task context that has current->journal_info
set. Having a page fault from some other filesystem bounce into XFS
where we have to run a transaction isn't a bug at all, but the usage
of current->journal_info means that this could result in corruption of
the outer task's journal_info structure.
The problem is purely that we now have two different contexts that
now think they own current->journal_info. IOWs, no filesystem can
allow page faults or on-stack exceptions while current->journal_info
is set by the filesystem because the exception processing might use
current->journal_info itself.
If we end up with nested XFS transactions whilst holding an empty
transaction, then it isn't an issue as the outer transaction does
not hold a log reservation. If we ignore the current->journal_info
usage, then the only problem that might occur is a deadlock if the
exception tries to take the same locks the upper context holds.
That, however, is not a problem that setting current->journal_info
would solve, so it's largely an irrelevant concern here.
IOWs, we really only use current->journal_info for a warning check
in xfs_vm_writepages() to ensure we aren't doing writeback from a
transaction context. Writeback might need to do allocation, so it
can need to run transactions itself. Hence it's a debug check to
warn us that we've done something silly, and largely it is not all
that useful.
So let's just remove all the use of current->journal_info in XFS and
get rid of all the potential issues from nested contexts where
current->journal_info might get misused by another filesystem
context.
Reported-by: syzbot+cdee56dbcdf0096ef605@syzkaller.appspotmail.com
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Mark Tinguely <mark.tinguely@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-03-18 22:36:28 +00:00
|
|
|
* Note: If we are in a NOFS context here (e.g. current thread is running a
|
|
|
|
* transaction) then we don't want to block here as inodegc progress may require
|
|
|
|
* filesystem resources we hold to make progress and that could result in a
|
|
|
|
* deadlock. Hence we skip out of here if we are in a scoped NOFS context.
|
2021-08-06 18:05:39 +00:00
|
|
|
*/
|
|
|
|
static inline bool
|
|
|
|
xfs_inodegc_want_flush_work(
|
|
|
|
struct xfs_inode *ip,
|
2021-08-06 18:05:43 +00:00
|
|
|
unsigned int items,
|
|
|
|
unsigned int shrinker_hits)
|
2021-08-06 18:05:39 +00:00
|
|
|
{
|
xfs: don't use current->journal_info
syzbot reported an ext4 panic during a page fault where found a
journal handle when it didn't expect to find one. The structure
it tripped over had a value of 'TRAN' in the first entry in the
structure, and that indicates it tripped over a struct xfs_trans
instead of a jbd2 handle.
The reason for this is that the page fault was taken during a
copy-out to a user buffer from an xfs bulkstat operation. XFS uses
an "empty" transaction context for bulkstat to do automated metadata
buffer cleanup, and so the transaction context is valid across the
copyout of the bulkstat info into the user buffer.
We are using empty transaction contexts like this in XFS to reduce
the risk of failing to release objects we reference during the
operation, especially during error handling. Hence we really need to
ensure that we can take page faults from these contexts without
leaving landmines for the code processing the page fault to trip
over.
However, this same behaviour could happen from any other filesystem
that triggers a page fault or any other exception that is handled
on-stack from within a task context that has current->journal_info
set. Having a page fault from some other filesystem bounce into XFS
where we have to run a transaction isn't a bug at all, but the usage
of current->journal_info means that this could result corruption of
the outer task's journal_info structure.
The problem is purely that we now have two different contexts that
now think they own current->journal_info. IOWs, no filesystem can
allow page faults or on-stack exceptions while current->journal_info
is set by the filesystem because the exception processing might use
current->journal_info itself.
If we end up with nested XFS transactions whilst holding an empty
transaction, then it isn't an issue as the outer transaction does
not hold a log reservation. If we ignore the current->journal_info
usage, then the only problem that might occur is a deadlock if the
exception tries to take the same locks the upper context holds.
That, however, is not a problem that setting current->journal_info
would solve, so it's largely an irrelevant concern here.
IOWs, we really only use current->journal_info for a warning check
in xfs_vm_writepages() to ensure we aren't doing writeback from a
transaction context. Writeback might need to do allocation, so it
can need to run transactions itself. Hence it's a debug check to
warn us that we've done something silly, and largely it is not all
that useful.
So let's just remove all the use of current->journal_info in XFS and
get rid of all the potential issues from nested contexts where
current->journal_info might get misused by another filesystem
context.
Reported-by: syzbot+cdee56dbcdf0096ef605@syzkaller.appspotmail.com
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Reviewed-by: Mark Tinguely <mark.tinguely@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-03-18 22:36:28 +00:00
|
|
|
if (current->flags & PF_MEMALLOC_NOFS)
|
2021-08-06 18:05:39 +00:00
|
|
|
return false;
|
|
|
|
|
2021-08-06 18:05:43 +00:00
|
|
|
if (shrinker_hits > 0)
|
|
|
|
return true;
|
|
|
|
|
2021-08-06 18:05:39 +00:00
|
|
|
if (items > XFS_INODEGC_MAX_BACKLOG)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Queue a background inactivation worker if there are inodes that need to be
|
|
|
|
* inactivated and higher level xfs code hasn't disabled the background
|
|
|
|
* workers.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
xfs_inodegc_queue(
|
|
|
|
struct xfs_inode *ip)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
struct xfs_inodegc *gc;
|
|
|
|
int items;
|
2021-08-06 18:05:43 +00:00
|
|
|
unsigned int shrinker_hits;
|
2023-09-11 15:39:03 +00:00
|
|
|
unsigned int cpu_nr;
|
2022-06-16 14:44:31 +00:00
|
|
|
unsigned long queue_delay = 1;
|
2021-08-06 18:05:39 +00:00
|
|
|
|
|
|
|
trace_xfs_inode_set_need_inactive(ip);
|
|
|
|
spin_lock(&ip->i_flags_lock);
|
|
|
|
ip->i_flags |= XFS_NEED_INACTIVE;
|
|
|
|
spin_unlock(&ip->i_flags_lock);
|
|
|
|
|
2023-09-11 15:39:03 +00:00
|
|
|
cpu_nr = get_cpu();
|
|
|
|
gc = this_cpu_ptr(mp->m_inodegc);
|
2021-08-06 18:05:39 +00:00
|
|
|
llist_add(&ip->i_gclist, &gc->list);
|
|
|
|
items = READ_ONCE(gc->items);
|
|
|
|
WRITE_ONCE(gc->items, items + 1);
|
2021-08-06 18:05:43 +00:00
|
|
|
shrinker_hits = READ_ONCE(gc->shrinker_hits);
|
2021-08-06 18:05:39 +00:00
|
|
|
|
2023-09-11 15:39:03 +00:00
|
|
|
/*
|
|
|
|
* Ensure the list add is always seen by anyone who finds the cpumask
|
|
|
|
* bit set. This effectively gives the cpumask bit set operation
|
|
|
|
* release ordering semantics.
|
|
|
|
*/
|
|
|
|
smp_mb__before_atomic();
|
|
|
|
if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
|
|
|
|
cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
|
|
|
|
|
2022-06-16 14:44:31 +00:00
|
|
|
/*
|
|
|
|
* We queue the work while holding the current CPU so that the work
|
|
|
|
* is scheduled to run on this CPU.
|
|
|
|
*/
|
|
|
|
if (!xfs_is_inodegc_enabled(mp)) {
|
2023-09-11 15:39:03 +00:00
|
|
|
put_cpu();
|
2021-08-06 18:05:39 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2022-06-16 14:44:31 +00:00
|
|
|
if (xfs_inodegc_want_queue_work(ip, items))
|
|
|
|
queue_delay = 0;
|
|
|
|
|
|
|
|
trace_xfs_inodegc_queue(mp, __return_address);
|
2023-05-01 23:16:05 +00:00
|
|
|
mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
|
|
|
|
queue_delay);
|
2023-09-11 15:39:03 +00:00
|
|
|
put_cpu();
|
2022-06-16 14:44:31 +00:00
|
|
|
|
2021-08-06 18:05:43 +00:00
|
|
|
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
|
2021-08-06 18:05:39 +00:00
|
|
|
trace_xfs_inodegc_throttle(mp, __return_address);
|
2022-06-16 14:44:31 +00:00
|
|
|
flush_delayed_work(&gc->work);
|
2021-08-06 18:05:39 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We set the inode flag atomically with the radix tree tag. Once we get tag
|
|
|
|
* lookups on the radix tree, this inode flag can go away.
|
|
|
|
*
|
|
|
|
* We always use background reclaim here because even if the inode is clean, it
|
|
|
|
* still may be under IO and hence we have wait for IO completion to occur
|
|
|
|
* before we can reclaim the inode. The background reclaim path handles this
|
|
|
|
* more efficiently than we can here, so simply let background reclaim tear down
|
|
|
|
* all inodes.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
xfs_inode_mark_reclaimable(
|
|
|
|
struct xfs_inode *ip)
|
|
|
|
{
|
|
|
|
struct xfs_mount *mp = ip->i_mount;
|
|
|
|
bool need_inactive;
|
|
|
|
|
|
|
|
XFS_STATS_INC(mp, vn_reclaim);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We should never get here with any of the reclaim flags already set.
|
|
|
|
*/
|
|
|
|
ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));
|
|
|
|
|
|
|
|
need_inactive = xfs_inode_needs_inactive(ip);
|
|
|
|
if (need_inactive) {
|
|
|
|
xfs_inodegc_queue(ip);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Going straight to reclaim, so drop the dquots. */
|
|
|
|
xfs_qm_dqdetach(ip);
|
|
|
|
xfs_inodegc_set_reclaimable(ip);
|
|
|
|
}
|
2021-08-06 18:05:43 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Register a phony shrinker so that we can run background inodegc sooner when
|
|
|
|
* there's memory pressure. Inactivation does not itself free any memory but
|
|
|
|
* it does make inodes reclaimable, which eventually frees memory.
|
|
|
|
*
|
|
|
|
* The count function, seek value, and batch value are crafted to trigger the
|
|
|
|
* scan function during the second round of scanning. Hopefully this means
|
|
|
|
* that we reclaimed enough memory that initiating metadata transactions won't
|
|
|
|
* make things worse.
|
|
|
|
*/
|
|
|
|
#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY)
|
|
|
|
#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
|
|
|
|
|
|
|
|
static unsigned long
|
|
|
|
xfs_inodegc_shrinker_count(
|
|
|
|
struct shrinker *shrink,
|
|
|
|
struct shrink_control *sc)
|
|
|
|
{
|
2023-09-11 09:44:34 +00:00
|
|
|
struct xfs_mount *mp = shrink->private_data;
|
2021-08-06 18:05:43 +00:00
|
|
|
struct xfs_inodegc *gc;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
if (!xfs_is_inodegc_enabled(mp))
|
|
|
|
return 0;
|
|
|
|
|
2023-09-11 15:39:03 +00:00
|
|
|
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
|
2021-08-06 18:05:43 +00:00
|
|
|
gc = per_cpu_ptr(mp->m_inodegc, cpu);
|
|
|
|
if (!llist_empty(&gc->list))
|
|
|
|
return XFS_INODEGC_SHRINKER_COUNT;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long
|
|
|
|
xfs_inodegc_shrinker_scan(
|
|
|
|
struct shrinker *shrink,
|
|
|
|
struct shrink_control *sc)
|
|
|
|
{
|
2023-09-11 09:44:34 +00:00
|
|
|
struct xfs_mount *mp = shrink->private_data;
|
2021-08-06 18:05:43 +00:00
|
|
|
struct xfs_inodegc *gc;
|
|
|
|
int cpu;
|
|
|
|
bool no_items = true;
|
|
|
|
|
|
|
|
if (!xfs_is_inodegc_enabled(mp))
|
|
|
|
return SHRINK_STOP;
|
|
|
|
|
|
|
|
trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
|
|
|
|
|
2023-09-11 15:39:03 +00:00
|
|
|
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
|
2021-08-06 18:05:43 +00:00
|
|
|
gc = per_cpu_ptr(mp->m_inodegc, cpu);
|
|
|
|
if (!llist_empty(&gc->list)) {
|
|
|
|
unsigned int h = READ_ONCE(gc->shrinker_hits);
|
|
|
|
|
|
|
|
WRITE_ONCE(gc->shrinker_hits, h + 1);
|
2022-06-16 14:44:31 +00:00
|
|
|
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
|
2021-08-06 18:05:43 +00:00
|
|
|
no_items = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If there are no inodes to inactivate, we don't want the shrinker
|
|
|
|
* to think there's deferred work to call us back about.
|
|
|
|
*/
|
|
|
|
if (no_items)
|
|
|
|
return LONG_MAX;
|
|
|
|
|
|
|
|
return SHRINK_STOP;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
|
|
|
|
int
|
|
|
|
xfs_inodegc_register_shrinker(
|
|
|
|
struct xfs_mount *mp)
|
|
|
|
{
|
2023-09-11 09:44:34 +00:00
|
|
|
mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
|
|
|
|
"xfs-inodegc:%s",
|
|
|
|
mp->m_super->s_id);
|
|
|
|
if (!mp->m_inodegc_shrinker)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
|
|
|
|
mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
|
|
|
|
mp->m_inodegc_shrinker->seeks = 0;
|
|
|
|
mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
|
|
|
|
mp->m_inodegc_shrinker->private_data = mp;
|
2021-08-06 18:05:43 +00:00
|
|
|
|
2023-09-11 09:44:34 +00:00
|
|
|
shrinker_register(mp->m_inodegc_shrinker);
|
2021-08-06 18:05:43 +00:00
|
|
|
|
2023-09-11 09:44:34 +00:00
|
|
|
return 0;
|
2021-08-06 18:05:43 +00:00
|
|
|
}
|