linux-next/fs/kernfs/kernfs-internal.h
Tejun Heo 4207b556e6 kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id()
The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
which acquires kernfs_idr_lock, which is an non-raw non-IRQ-safe lock. This
can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF
programs including e.g. the ones that attach to functions which are holding
the scheduler rq lock.

Consider the following BPF program:

  SEC("fentry/__set_cpus_allowed_ptr_locked")
  int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
	       struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
  {
	  struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);

	  if (cgrp) {
		  bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
		  bpf_cgroup_release(cgrp);
	  }
	  return 0;
  }

__set_cpus_allowed_ptr_locked() is called with rq lock held and the above
BPF program calls bpf_cgroup_from_id() within leading to the following
lockdep warning:

  =====================================================
  WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
  6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
  -----------------------------------------------------
  repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
  ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70

		and this task is already holding:
  ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
  which would create a new lock dependency:
   (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
  ...
   Possible interrupt unsafe locking scenario:

	 CPU0                    CPU1
	 ----                    ----
    lock(kernfs_idr_lock);
				 local_irq_disable();
				 lock(&rq->__lock);
				 lock(kernfs_idr_lock);
    <Interrupt>
      lock(&rq->__lock);

		 *** DEADLOCK ***
  ...
  Call Trace:
   dump_stack_lvl+0x55/0x70
   dump_stack+0x10/0x20
   __lock_acquire+0x781/0x2a40
   lock_acquire+0xbf/0x1f0
   _raw_spin_lock+0x2f/0x40
   kernfs_find_and_get_node_by_id+0x1e/0x70
   cgroup_get_from_id+0x21/0x240
   bpf_cgroup_from_id+0xe/0x20
   bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
   bpf_trampoline_6442545632+0x4f/0x1000
   __set_cpus_allowed_ptr_locked+0x5/0x5a0
   sched_setaffinity+0x1b3/0x290
   __x64_sys_sched_setaffinity+0x4f/0x60
   do_syscall_64+0x40/0xe0
   entry_SYSCALL_64_after_hwframe+0x46/0x4e

Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of
kernfs_idr_lock.

This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
Combined with the preceding rearrange patch, the net increase is 8 bytes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2024-01-30 15:54:25 -08:00

177 lines
4.6 KiB
C

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* fs/kernfs/kernfs-internal.h - kernfs internal header file
*
* Copyright (c) 2001-3 Patrick Mochel
* Copyright (c) 2007 SUSE Linux Products GmbH
* Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
*/
#ifndef __KERNFS_INTERNAL_H
#define __KERNFS_INTERNAL_H
#include <linux/lockdep.h>
#include <linux/fs.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/kernfs.h>
#include <linux/fs_context.h>
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
struct timespec64 ia_atime;
struct timespec64 ia_mtime;
struct timespec64 ia_ctime;
struct simple_xattrs xattrs;
atomic_t nr_user_xattrs;
atomic_t user_xattr_size;
};
struct kernfs_root {
/* published fields */
struct kernfs_node *kn;
unsigned int flags; /* KERNFS_ROOT_* flags */
/* private fields, do not use outside kernfs proper */
struct idr ino_idr;
u32 last_id_lowbits;
u32 id_highbits;
struct kernfs_syscall_ops *syscall_ops;
/* list of kernfs_super_info of this root, protected by kernfs_rwsem */
struct list_head supers;
wait_queue_head_t deactivate_waitq;
struct rw_semaphore kernfs_rwsem;
struct rw_semaphore kernfs_iattr_rwsem;
struct rw_semaphore kernfs_supers_rwsem;
struct rcu_head rcu;
};
/* +1 to avoid triggering overflow warning when negating it */
#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
/**
* kernfs_root - find out the kernfs_root a kernfs_node belongs to
* @kn: kernfs_node of interest
*
* Return: the kernfs_root @kn belongs to.
*/
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
{
/* if parent exists, it's always a dir; otherwise, @sd is a dir */
if (kn->parent)
kn = kn->parent;
return kn->dir.root;
}
/*
* mount.c
*/
struct kernfs_super_info {
struct super_block *sb;
/*
* The root associated with this super_block. Each super_block is
* identified by the root and ns it's associated with.
*/
struct kernfs_root *root;
/*
* Each sb is associated with one namespace tag, currently the
* network namespace of the task which mounted this kernfs
* instance. If multiple tags become necessary, make the following
* an array and compare kernfs_node tag against every entry.
*/
const void *ns;
/* anchored at kernfs_root->supers, protected by kernfs_rwsem */
struct list_head node;
};
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
{
if (d_really_is_negative(dentry))
return NULL;
return d_inode(dentry)->i_private;
}
static inline void kernfs_set_rev(struct kernfs_node *parent,
struct dentry *dentry)
{
dentry->d_time = parent->dir.rev;
}
static inline void kernfs_inc_rev(struct kernfs_node *parent)
{
parent->dir.rev++;
}
static inline bool kernfs_dir_changed(struct kernfs_node *parent,
struct dentry *dentry)
{
if (parent->dir.rev != dentry->d_time)
return true;
return false;
}
extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache;
/*
* inode.c
*/
extern const struct xattr_handler * const kernfs_xattr_handlers[];
void kernfs_evict_inode(struct inode *inode);
int kernfs_iop_permission(struct mnt_idmap *idmap,
struct inode *inode, int mask);
int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *iattr);
int kernfs_iop_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
/*
* dir.c
*/
extern const struct dentry_operations kernfs_dops;
extern const struct file_operations kernfs_dir_fops;
extern const struct inode_operations kernfs_dir_iops;
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
void kernfs_put_active(struct kernfs_node *kn);
int kernfs_add_one(struct kernfs_node *kn);
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
const char *name, umode_t mode,
kuid_t uid, kgid_t gid,
unsigned flags);
/*
* file.c
*/
extern const struct file_operations kernfs_file_fops;
bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);
/*
* symlink.c
*/
extern const struct inode_operations kernfs_symlink_iops;
/*
* kernfs locks
*/
extern struct kernfs_global_locks *kernfs_locks;
#endif /* __KERNFS_INTERNAL_H */