We still have some races in filesystem methods when exposed to RCU

pathwalk.  This series is a result of code audit (the second round
 of it) and it should deal with most of that stuff.  Exceptions: ntfs3
 ->d_hash()/->d_compare() and ceph_d_revalidate().  Up to maintainers (a
 note for NTFS folks - when documentation says that a method may not block,
 it *does* imply that blocking allocations are to be avoided.  Really).
 
 Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYIAB0WIQQqUNBr3gm4hGXdBJlZ7Krx/gZQ6wUCZdroDAAKCRBZ7Krx/gZQ
 60dKAQCzp6rYr3ye4nylho9Rzu8LEpH04TuNf3C6JuyUaNHxHwEAvNLatZsyFnmV
 Ksp2Rg/IlKPNtQgYJ8xPxv9DFmNe8gI=
 =47Un
 -----END PGP SIGNATURE-----

Merge tag 'pull-fixes.pathwalk-rcu-2' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

Pull RCU pathwalk fixes from Al Viro:
 "We still have some races in filesystem methods when exposed to RCU
  pathwalk. This series is a result of code audit (the second round of
  it) and it should deal with most of that stuff.

  Still pending: ntfs3 ->d_hash()/->d_compare() and ceph_d_revalidate().
  Up to maintainers (a note for NTFS folks - when documentation says
  that a method may not block, it *does* imply that blocking allocations
  are to be avoided. Really)"

[ More explanations for people who aren't familiar with the vagaries of
  RCU path walking: most of it is hidden from filesystems, but if a
  filesystem actively participates in the low-level path walking it
  needs to make sure the fields involved in that walk are RCU-safe.

  That "actively participate in low-level path walking" includes things
  like having its own ->d_hash()/->d_compare() routines, or by having
  its own directory permission function that doesn't just use the common
  helpers.  Having a ->d_revalidate() function will also have this issue.

  Note that instead of making everything RCU safe you can also choose to
  abort the RCU pathwalk if your operation cannot be done safely under
  RCU, but that obviously comes with a performance penalty. One common
  pattern is to allow the simple cases under RCU, and abort only if you
  need to do something more complicated.

  So not everything needs to be RCU-safe, and things like the inode etc
  that the VFS itself maintains obviously already are. But these fixes
  tend to be about properly RCU-delaying things like ->s_fs_info that
  are maintained by the filesystem and that got potentially released too
  early.   - Linus ]

* tag 'pull-fixes.pathwalk-rcu-2' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs:
  ext4_get_link(): fix breakage in RCU mode
  cifs_get_link(): bail out in unsafe case
  fuse: fix UAF in rcu pathwalks
  procfs: make freeing proc_fs_info rcu-delayed
  procfs: move dropping pde and pid from ->evict_inode() to ->free_inode()
  nfs: fix UAF on pathwalk running into umount
  nfs: make nfs_set_verifier() safe for use in RCU pathwalk
  afs: fix __afs_break_callback() / afs_drop_open_mmap() race
  hfsplus: switch to rcu-delayed unloading of nls and freeing ->s_fs_info
  exfat: move freeing sbi, upcase table and dropping nls into rcu-delayed helper
  affs: free affs_sb_info with kfree_rcu()
  rcu pathwalk: prevent bogus hard errors from may_lookup()
  fs/super.c: don't drop ->s_user_ns until we free struct super_block itself
This commit is contained in:
Linus Torvalds 2024-02-25 09:29:05 -08:00
commit 66a97c2ec9
22 changed files with 88 additions and 63 deletions

View File

@ -105,6 +105,7 @@ struct affs_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sb_work; /* superblock flush delayed work */
spinlock_t work_lock; /* protects sb_work and work_queued */
struct rcu_head rcu;
};
#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */

View File

@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb)
affs_brelse(sbi->s_root_bh);
kfree(sbi->s_prefix);
mutex_destroy(&sbi->s_bmlock);
kfree(sbi);
kfree_rcu(sbi, rcu);
}
}

View File

@ -417,13 +417,17 @@ static void afs_add_open_mmap(struct afs_vnode *vnode)
static void afs_drop_open_mmap(struct afs_vnode *vnode)
{
if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1))
return;
down_write(&vnode->volume->open_mmaps_lock);
if (atomic_read(&vnode->cb_nr_mmap) == 0)
read_seqlock_excl(&vnode->cb_lock);
// the only place where ->cb_nr_mmap may hit 0
// see __afs_break_callback() for the other side...
if (atomic_dec_and_test(&vnode->cb_nr_mmap))
list_del_init(&vnode->cb_mmap_link);
read_sequnlock_excl(&vnode->cb_lock);
up_write(&vnode->volume->open_mmaps_lock);
flush_work(&vnode->cb_work);

View File

@ -275,6 +275,7 @@ struct exfat_sb_info {
spinlock_t inode_hash_lock;
struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
struct rcu_head rcu;
};
#define EXFAT_CACHE_VALID 0

View File

@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb,
unsigned int sect_size = sb->s_blocksize;
unsigned int i, index = 0;
u32 chksum = 0;
int ret;
unsigned char skip = false;
unsigned short *upcase_table;
@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb,
if (!bh) {
exfat_err(sb, "failed to read sector(0x%llx)",
(unsigned long long)sector);
ret = -EIO;
goto free_table;
return -EIO;
}
sector++;
for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) {
@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb,
exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)",
index, chksum, utbl_checksum);
ret = -EINVAL;
free_table:
exfat_free_upcase_table(sbi);
return ret;
return -EINVAL;
}
static int exfat_load_default_upcase_table(struct super_block *sb)
{
int i, ret = -EIO;
int i;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
unsigned char skip = false;
unsigned short uni = 0, *upcase_table;
@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb)
return 0;
/* FATAL error: default upcase table has error */
exfat_free_upcase_table(sbi);
return ret;
return -EIO;
}
int exfat_create_upcase_table(struct super_block *sb)

View File

@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb)
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
unload_nls(sbi->nls_io);
exfat_free_upcase_table(sbi);
}
static int exfat_sync_fs(struct super_block *sb, int wait)
@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb)
ret = exfat_load_bitmap(sb);
if (ret) {
exfat_err(sb, "failed to load alloc-bitmap");
goto free_upcase_table;
goto free_bh;
}
ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb)
free_alloc_bitmap:
exfat_free_bitmap(sbi);
free_upcase_table:
exfat_free_upcase_table(sbi);
free_bh:
brelse(sbi->boot_bh);
return ret;
@ -701,12 +696,10 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_root = NULL;
free_table:
exfat_free_upcase_table(sbi);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
check_nls_io:
unload_nls(sbi->nls_io);
return err;
}
@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc)
return 0;
}
static void delayed_free(struct rcu_head *p)
{
struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
unload_nls(sbi->nls_io);
exfat_free_upcase_table(sbi);
exfat_free_sbi(sbi);
}
static void exfat_kill_sb(struct super_block *sb)
{
struct exfat_sb_info *sbi = sb->s_fs_info;
kill_block_super(sb);
if (sbi)
exfat_free_sbi(sbi);
call_rcu(&sbi->rcu, delayed_free);
}
static struct file_system_type exfat_fs_type = {

View File

@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
if (!dentry) {
bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
if (IS_ERR(bh))
return ERR_CAST(bh);
if (!bh || !ext4_buffer_uptodate(bh))
if (IS_ERR(bh) || !bh)
return ERR_PTR(-ECHILD);
if (!ext4_buffer_uptodate(bh)) {
brelse(bh);
return ERR_PTR(-ECHILD);
}
} else {
bh = ext4_bread(NULL, inode, 0, 0);
if (IS_ERR(bh))

View File

@ -474,8 +474,7 @@ static int cuse_send_init(struct cuse_conn *cc)
static void cuse_fc_release(struct fuse_conn *fc)
{
struct cuse_conn *cc = fc_to_cc(fc);
kfree_rcu(cc, fc.rcu);
kfree(fc_to_cc(fc));
}
/**

View File

@ -888,6 +888,7 @@ struct fuse_mount {
/* Entry on fc->mounts */
struct list_head fc_entry;
struct rcu_head rcu;
};
static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)

View File

@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
static void delayed_release(struct rcu_head *p)
{
struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
put_user_ns(fc->user_ns);
fc->release(fc);
}
void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc)
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
put_user_ns(fc->user_ns);
bucket = rcu_dereference_protected(fc->curr_bucket, 1);
if (bucket) {
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
fc->release(fc);
call_rcu(&fc->rcu, delayed_release);
}
}
EXPORT_SYMBOL_GPL(fuse_conn_put);
@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init);
void fuse_free_conn(struct fuse_conn *fc)
{
WARN_ON(!list_empty(&fc->devices));
kfree_rcu(fc, rcu);
kfree(fc);
}
EXPORT_SYMBOL_GPL(fuse_free_conn);
@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb)
void fuse_mount_destroy(struct fuse_mount *fm)
{
fuse_conn_put(fm->fc);
kfree(fm);
kfree_rcu(fm, rcu);
}
EXPORT_SYMBOL(fuse_mount_destroy);

View File

@ -190,6 +190,7 @@ struct hfsplus_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
struct rcu_head rcu;
};
#define HFSPLUS_SB_WRITEBACKUP 0

View File

@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb)
spin_unlock(&sbi->work_lock);
}
static void delayed_free(struct rcu_head *p)
{
struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu);
unload_nls(sbi->nls);
kfree(sbi);
}
static void hfsplus_put_super(struct super_block *sb)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb)
hfs_btree_close(sbi->ext_tree);
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
unload_nls(sbi->nls);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
call_rcu(&sbi->rcu, delayed_free);
}
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)

View File

@ -1717,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap,
{
if (nd->flags & LOOKUP_RCU) {
int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD || !try_to_unlazy(nd))
if (!err) // success, keep going
return 0;
if (!try_to_unlazy(nd))
return -ECHILD; // redo it all non-lazy
if (err != -ECHILD) // hard error
return err;
}
return inode_permission(idmap, nd->inode, MAY_EXEC);

View File

@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp)
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
kfree(clp);
kfree_rcu(clp, rcu);
}
EXPORT_SYMBOL_GPL(nfs_free_client);
@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void)
}
EXPORT_SYMBOL_GPL(nfs_alloc_server);
static void delayed_free(struct rcu_head *p)
{
struct nfs_server *server = container_of(p, struct nfs_server, rcu);
nfs_free_iostats(server->io_stats);
kfree(server);
}
/*
* Free up a server record
*/
@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
nfs_free_iostats(server->io_stats);
put_cred(server->cred);
kfree(server);
nfs_release_automount_timer();
call_rcu(&server->rcu, delayed_free);
}
EXPORT_SYMBOL_GPL(nfs_free_server);

View File

@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry)
static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
{
struct inode *inode = d_inode(dentry);
struct inode *dir = d_inode(dentry->d_parent);
struct inode *dir = d_inode_rcu(dentry->d_parent);
if (!nfs_verify_change_attribute(dir, verf))
if (!dir || !nfs_verify_change_attribute(dir, verf))
return;
if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
nfs_set_verifier_delegated(&verf);

View File

@ -1878,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei)
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->lock);
}
put_pid(pid);
}
struct inode *proc_pid_make_inode(struct super_block *sb,

View File

@ -30,7 +30,6 @@
static void proc_evict_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);
@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode)
clear_inode(inode);
/* Stop tracking associated processes */
if (ei->pid) {
if (ei->pid)
proc_pid_evict_inode(ei);
ei->pid = NULL;
}
/* Let go of any associated proc directory entry */
de = ei->pde;
if (de) {
pde_put(de);
ei->pde = NULL;
}
head = ei->sysctl;
if (head) {
@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_free_inode(struct inode *inode)
{
struct proc_inode *ei = PROC_I(inode);
if (ei->pid)
put_pid(ei->pid);
/* Let go of any associated proc directory entry */
if (ei->pde)
pde_put(ei->pde);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}

View File

@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb)
kill_anon_super(sb);
put_pid_ns(fs_info->pid_ns);
kfree(fs_info);
kfree_rcu(fs_info, rcu);
}
static struct file_system_type proc_fs_type = {

View File

@ -1172,6 +1172,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
{
char *target_path;
if (!dentry)
return ERR_PTR(-ECHILD);
target_path = kmalloc(PATH_MAX, GFP_KERNEL);
if (!target_path)
return ERR_PTR(-ENOMEM);

View File

@ -274,9 +274,10 @@ static void destroy_super_work(struct work_struct *work)
{
struct super_block *s = container_of(work, struct super_block,
destroy_work);
int i;
for (i = 0; i < SB_FREEZE_LEVELS; i++)
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
for (int i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_free_rwsem(&s->s_writers.rw_sem[i]);
kfree(s);
}
@ -296,9 +297,6 @@ static void destroy_unused_super(struct super_block *s)
super_unlock_excl(s);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
shrinker_free(s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
@ -409,9 +407,6 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_dentry_lru.node);
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
security_sb_free(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
}
}

View File

@ -124,6 +124,7 @@ struct nfs_client {
char cl_ipaddr[48];
struct net *cl_net;
struct list_head pending_cb_stateids;
struct rcu_head rcu;
};
/*
@ -265,6 +266,7 @@ struct nfs_server {
const struct cred *cred;
bool has_sec_mnt_opts;
struct kobject kobj;
struct rcu_head rcu;
};
/* Server capabilities */

View File

@ -65,6 +65,7 @@ struct proc_fs_info {
kgid_t pid_gid;
enum proc_hidepid hide_pid;
enum proc_pidonly pidonly;
struct rcu_head rcu;
};
static inline struct proc_fs_info *proc_sb_info(struct super_block *sb)