2019-06-01 08:08:42 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
2013-11-24 14:54:58 +00:00
|
|
|
/*
|
|
|
|
* kernfs.h - pseudo filesystem decoupled from vfs locking
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __LINUX_KERNFS_H
|
|
|
|
#define __LINUX_KERNFS_H
|
|
|
|
|
2013-11-23 22:21:50 +00:00
|
|
|
#include <linux/err.h>
|
2013-11-28 19:54:20 +00:00
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/mutex.h>
|
2013-11-28 19:54:41 +00:00
|
|
|
#include <linux/idr.h>
|
2013-11-28 19:54:29 +00:00
|
|
|
#include <linux/lockdep.h>
|
2013-11-29 22:18:32 +00:00
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/atomic.h>
|
2021-12-09 12:30:08 +00:00
|
|
|
#include <linux/bug.h>
|
|
|
|
#include <linux/types.h>
|
2018-07-20 21:56:47 +00:00
|
|
|
#include <linux/uidgid.h>
|
2014-02-03 19:02:55 +00:00
|
|
|
#include <linux/wait.h>
|
2021-11-18 23:00:08 +00:00
|
|
|
#include <linux/rwsem.h>
|
2022-06-15 02:10:59 +00:00
|
|
|
#include <linux/cache.h>
|
2013-11-23 22:21:49 +00:00
|
|
|
|
2013-11-23 22:21:52 +00:00
|
|
|
struct file;
|
2014-01-17 14:57:49 +00:00
|
|
|
struct dentry;
|
2013-11-23 22:21:52 +00:00
|
|
|
struct iattr;
|
2013-11-28 19:54:20 +00:00
|
|
|
struct seq_file;
|
|
|
|
struct vm_area_struct;
|
2021-12-09 12:30:08 +00:00
|
|
|
struct vm_operations_struct;
|
2013-11-28 19:54:43 +00:00
|
|
|
struct super_block;
|
|
|
|
struct file_system_type;
|
fs: kernfs: add poll file operation
Patch series "psi: pressure stall monitors", v3.
Android is adopting psi to detect and remedy memory pressure that
results in stuttering and decreased responsiveness on mobile devices.
Psi gives us the stall information, but because we're dealing with
latencies in the millisecond range, periodically reading the pressure
files to detect stalls in a timely fashion is not feasible. Psi also
doesn't aggregate its averages at a high enough frequency right now.
This patch series extends the psi interface such that users can
configure sensitive latency thresholds and use poll() and friends to be
notified when these are breached.
As high-frequency aggregation is costly, it implements an aggregation
method that is optimized for fast, short-interval averaging, and makes
the aggregation frequency adaptive, such that high-frequency updates
only happen while monitored stall events are actively occurring.
With these patches applied, Android can monitor for, and ward off,
mounting memory shortages before they cause problems for the user. For
example, using memory stall monitors in userspace low memory killer
daemon (lmkd) we can detect mounting pressure and kill less important
processes before device becomes visibly sluggish.
In our memory stress testing, psi memory monitors produce roughly 10x
fewer false positives compared to vmpressure signals. Having the ability to
specify multiple triggers for the same psi metric allows other parts of
Android framework to monitor memory state of the device and act
accordingly.
The new interface is straightforward. The user opens one of the
pressure files for writing and writes a trigger description into the
file descriptor that defines the stall state - some or full, and the
maximum stall time over a given window of time. E.g.:
/* Signal when stall time exceeds 100ms of a 1s window */
char trigger[] = "full 100000 1000000";
fd = open("/proc/pressure/memory");
write(fd, trigger, sizeof(trigger));
while (poll() >= 0) {
...
}
close(fd);
When the monitored stall state is entered, psi adapts its aggregation
frequency according to what the configured time window requires in order
to emit event signals in a timely fashion. Once the stalling subsides,
aggregation reverts back to normal.
The trigger is associated with the open file descriptor. To stop
monitoring, the user only needs to close the file descriptor and the
trigger is discarded.
Patches 1-4 prepare the psi code for polling support. Patch 5
implements the adaptive polling logic, the pressure growth detection
optimized for short intervals, and hooks up write() and poll() on the
pressure files.
The patches were developed in collaboration with Johannes Weiner.
This patch (of 5):
Kernfs has a standardized poll/notification mechanism for waking all
pollers on all fds when a filesystem node changes. To allow polling for
custom events, add a .poll callback that can override the default.
This is in preparation for pollable cgroup pressure files which have
per-fd trigger configurations.
Link: http://lkml.kernel.org/r/20190124211518.244221-2-surenb@google.com
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-05 23:45:45 +00:00
|
|
|
struct poll_table_struct;
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
struct fs_context;
|
2013-11-23 22:21:52 +00:00
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
struct kernfs_fs_context;
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_open_node;
|
|
|
|
struct kernfs_iattrs;
|
2013-11-29 22:18:32 +00:00
|
|
|
|
2022-06-15 02:10:59 +00:00
|
|
|
/*
|
|
|
|
* NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash
|
|
|
|
* table of locks.
|
|
|
|
* Having a small hash table would impact scalability, since
|
|
|
|
* more and more kernfs_node objects will end up using same lock
|
|
|
|
* and having a very large hash table would waste memory.
|
|
|
|
*
|
|
|
|
* At the moment size of hash table of locks is being set based on
|
|
|
|
* the number of CPUs as follows:
|
|
|
|
*
|
|
|
|
* NR_CPU NR_KERNFS_LOCK_BITS NR_KERNFS_LOCKS
|
|
|
|
* 1 1 2
|
|
|
|
* 2-3 2 4
|
|
|
|
* 4-7 4 16
|
|
|
|
* 8-15 6 64
|
|
|
|
* 16-31 8 256
|
|
|
|
* 32 and more 10 1024
|
|
|
|
*
|
|
|
|
* The above relation between NR_CPU and number of locks is based
|
|
|
|
* on some internal experimentation which involved booting qemu
|
|
|
|
* with different values of smp, performing some sysfs operations
|
|
|
|
* on all CPUs and observing how increase in number of locks impacts
|
|
|
|
* completion time of these sysfs operations on each CPU.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/* SMP: twice the ilog2() of NR_CPUS, with NR_CPUS clamped to at most 32. */
#define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32)))
|
|
|
|
#else
|
|
|
|
/* !CONFIG_SMP: a single bit, which yields two locks below. */
#define NR_KERNFS_LOCK_BITS 1
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Size of each hashed lock table: 2 to the power of NR_KERNFS_LOCK_BITS. */
#define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There's one kernfs_open_file for each open file and one kernfs_open_node
|
|
|
|
* for each kernfs_node with one or more open files.
|
|
|
|
*
|
|
|
|
* filp->private_data points to seq_file whose ->private points to
|
|
|
|
* kernfs_open_file.
|
|
|
|
*
|
|
|
|
* kernfs_open_files are chained at kernfs_open_node->files, which is
|
|
|
|
* protected by kernfs_global_locks.open_file_mutex[i].
|
|
|
|
*
|
|
|
|
* To reduce possible contention in sysfs access, arising due to single
|
|
|
|
* locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node
|
|
|
|
* object address as hash keys to get the index of these locks.
|
|
|
|
*
|
|
|
|
* Hashed mutexes are safe to use here because operations using these don't
|
|
|
|
* rely on global exclusion.
|
|
|
|
*
|
|
|
|
* In future we intend to replace other global locks with hashed ones as well.
|
|
|
|
* kernfs_global_locks acts as a holder for all such hash tables.
|
|
|
|
*/
|
|
|
|
struct kernfs_global_locks {
|
|
|
|
/*
 * NR_KERNFS_LOCKS mutexes indexed by a hash of the kernfs_node address;
 * the selected mutex protects that node's kernfs_open_node->files list.
 */
struct mutex open_file_mutex[NR_KERNFS_LOCKS];
|
|
|
|
};
|
|
|
|
|
2013-11-29 22:18:32 +00:00
|
|
|
/*
 * Node type values. Each is a distinct single bit and all of them fall
 * inside KERNFS_TYPE_MASK (0x000f).
 * NOTE(review): presumably DIR/FILE/LINK pair with the dir/attr/symlink
 * members of the kernfs_node union - confirm against users.
 */
enum kernfs_node_type {
|
2013-12-11 19:11:56 +00:00
|
|
|
KERNFS_DIR = 0x0001,
|
|
|
|
KERNFS_FILE = 0x0002,
|
|
|
|
KERNFS_LINK = 0x0004,
|
2013-11-29 22:18:32 +00:00
|
|
|
};
|
|
|
|
|
2020-03-12 20:03:16 +00:00
|
|
|
/*
 * Low four bits. The enum kernfs_node_type values (0x0001 - 0x0004) all
 * lie within this mask, while the enum kernfs_node_flag values start at
 * 0x0010, outside it.
 */
#define KERNFS_TYPE_MASK 0x000f
|
|
|
|
/*
 * Complement of KERNFS_TYPE_MASK: selects every bit that is not a node
 * type bit. The expansion is parenthesized so that the macro always
 * behaves as a single operand wherever it is used.
 */
#define KERNFS_FLAG_MASK (~KERNFS_TYPE_MASK)
|
|
|
|
/*
 * Upper bound on the number of user xattrs.
 * NOTE(review): the scope of the limit (per node vs. per root) is not
 * visible in this header - confirm at the point of use.
 */
#define KERNFS_MAX_USER_XATTRS 128
|
|
|
|
/* 128 << 10 == 131072, i.e. 128 KiB if the unit is bytes - TODO confirm. */
#define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10)
|
2013-11-29 22:18:32 +00:00
|
|
|
|
|
|
|
enum kernfs_node_flag {
|
2014-02-03 19:09:12 +00:00
|
|
|
KERNFS_ACTIVATED = 0x0010,
|
2013-12-11 19:11:56 +00:00
|
|
|
KERNFS_NS = 0x0020,
|
|
|
|
KERNFS_HAS_SEQ_SHOW = 0x0040,
|
|
|
|
KERNFS_HAS_MMAP = 0x0080,
|
|
|
|
KERNFS_LOCKDEP = 0x0100,
|
2022-08-28 05:04:39 +00:00
|
|
|
KERNFS_HIDDEN = 0x0200,
|
kernfs, sysfs, driver-core: implement kernfs_remove_self() and its wrappers
Sometimes it's necessary to implement a node which wants to delete
nodes including itself. This isn't straightforward because of kernfs
active reference. While a file operation is in progress, an active
reference is held and kernfs_remove() waits for all such references to
drain before completing. For a self-deleting node, this is a deadlock
as kernfs_remove() ends up waiting for an active reference that itself
is sitting on top of.
This currently is worked around in the sysfs layer using
sysfs_schedule_callback() which makes such removals asynchronous.
While it works, it's rather cumbersome and inherently breaks
synchronicity of the operation - the file operation which triggered
the operation may complete before the removal is finished (or even
started) and the removal may fail asynchronously. If a removal
operation is immediately followed by another operation which expects
the specific name to be available (e.g. removal followed by rename
onto the same name), there's no way to make the latter operation
reliable.
The thing is there's no inherent reason for this to be asynchronous.
All that's necessary to do this synchronous is a dedicated operation
which drops its own active ref and deactivates self. This patch
implements kernfs_remove_self() and its wrappers in sysfs and driver
core. kernfs_remove_self() is to be called from one of the file
operations, drops the active ref the task is holding, removes the self
node, and restores active ref to the dead node so that the ref is
balanced afterwards. __kernfs_remove() is updated so that it takes an
early exit if the target node is already fully removed so that the
active ref restored by kernfs_remove_self() after removal doesn't
confuse the deactivation path.
This makes implementing self-deleting nodes very easy. The normal
removal path doesn't even need to be changed to use
kernfs_remove_self() for the self-deleting node. The method can
invoke kernfs_remove_self() on itself before proceeding the normal
removal path. kernfs_remove() invoked on the node by the normal
deletion path will simply be ignored.
This will replace sysfs_schedule_callback(). A subtle feature of
sysfs_schedule_callback() is that it collapses multiple invocations -
even if multiple removals are triggered, the removal callback is run
only once. An equivalent effect can be achieved by testing the return
value of kernfs_remove_self() - only the one which gets %true return
value should proceed with actual deletion. All other instances of
kernfs_remove_self() will wait till the enclosing kernfs operation
which invoked the winning instance of kernfs_remove_self() finishes
and then return %false. This trivially makes all users of
kernfs_remove_self() automatically show correct synchronous behavior
even when there are multiple concurrent operations - all "echo 1 >
delete" instances will finish only after the whole operation is
completed by one of the instances.
Note that manipulation of active ref is implemented in separate public
functions - kernfs_[un]break_active_protection().
kernfs_remove_self() is the only user at the moment but this will be
used to cater to more complex cases.
v2: For !CONFIG_SYSFS, dummy version kernfs_remove_self() was missing
and sysfs_remove_file_self() had incorrect return type. Fix it.
Reported by kbuild test bot.
v3: kernfs_[un]break_active_protection() separated out from
kernfs_remove_self() and exposed as public API.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2014-02-03 19:03:01 +00:00
|
|
|
KERNFS_SUICIDAL = 0x0400,
|
|
|
|
KERNFS_SUICIDED = 0x0800,
|
2015-05-13 21:09:29 +00:00
|
|
|
KERNFS_EMPTY_DIR = 0x1000,
|
2016-12-27 19:49:03 +00:00
|
|
|
KERNFS_HAS_RELEASE = 0x2000,
|
2022-08-28 05:04:37 +00:00
|
|
|
KERNFS_REMOVING = 0x4000,
|
2013-11-29 22:18:32 +00:00
|
|
|
};
|
|
|
|
|
2014-02-03 19:09:12 +00:00
|
|
|
/* @flags for kernfs_create_root() */
|
|
|
|
/* Each flag below is a distinct single bit (0x0001 through 0x0008). */
enum kernfs_root_flag {
|
2014-05-12 17:56:27 +00:00
|
|
|
/*
|
|
|
|
* kernfs_nodes are created in the deactivated state and invisible.
|
|
|
|
* They require explicit kernfs_activate() to become visible. This
|
|
|
|
* can be used to make related nodes become visible atomically
|
|
|
|
* after all nodes are created successfully.
|
|
|
|
*/
|
|
|
|
KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001,
|
|
|
|
|
|
|
|
/*
|
2019-04-02 14:01:46 +00:00
|
|
|
* For regular files, if the opener has CAP_DAC_OVERRIDE, open(2)
|
2014-05-12 17:56:27 +00:00
|
|
|
* succeeds regardless of the RW permissions. sysfs had an extra
|
|
|
|
* layer of enforcement where open(2) fails with -EACCES regardless
|
|
|
|
* of CAP_DAC_OVERRIDE if the permission doesn't have the
|
|
|
|
* respective read or write access at all (none of S_IRUGO or
|
|
|
|
* S_IWUGO) or the respective operation isn't implemented. The
|
|
|
|
* following flag enables that behavior.
|
|
|
|
*/
|
|
|
|
KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002,
|
2017-07-12 18:49:51 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The filesystem supports exportfs operation, so userspace can use
|
|
|
|
* fhandle to access nodes of the fs.
|
|
|
|
*/
|
|
|
|
KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004,
|
2020-03-12 20:03:16 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Support user xattrs to be written to nodes rooted at this root.
|
|
|
|
*/
|
|
|
|
KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008,
|
2014-02-03 19:09:12 +00:00
|
|
|
};
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/* type-specific structures for kernfs_node union members */
|
|
|
|
/* Directory-specific data; embedded in the kernfs_node union as `dir`. */
struct kernfs_elem_dir {
|
2013-11-29 22:18:32 +00:00
|
|
|
/* NOTE(review): presumably a count of child directories - confirm. */
unsigned long subdirs;
|
2013-12-11 19:11:54 +00:00
|
|
|
/* children rbtree starts here and goes through kn->rb */
|
2013-11-29 22:18:32 +00:00
|
|
|
struct rb_root children;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The kernfs hierarchy this directory belongs to. This fits
|
2013-12-11 19:11:53 +00:00
|
|
|
* better directly in kernfs_node but is here to save space.
|
2013-11-29 22:18:32 +00:00
|
|
|
*/
|
|
|
|
struct kernfs_root *root;
|
2021-07-16 09:28:18 +00:00
|
|
|
/*
|
|
|
|
* Monotonic revision counter, used to identify if a directory
|
|
|
|
* node has changed during negative dentry revalidation.
|
|
|
|
*/
|
|
|
|
unsigned long rev;
|
2013-11-29 22:18:32 +00:00
|
|
|
};
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/* Symlink-specific data; embedded in the kernfs_node union as `symlink`. */
struct kernfs_elem_symlink {
|
|
|
|
/* NOTE(review): presumably the node this symlink points to - confirm. */
struct kernfs_node *target_kn;
|
2013-11-29 22:18:32 +00:00
|
|
|
};
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/* Attribute (file) specific data; embedded in the kernfs_node union as `attr`. */
struct kernfs_elem_attr {
|
2013-11-29 22:18:32 +00:00
|
|
|
/* Operations table (struct kernfs_ops) used for this file. */
const struct kernfs_ops *ops;
|
2022-06-15 02:10:56 +00:00
|
|
|
/*
 * __rcu-annotated pointer to the kernfs_open_node; one exists for each
 * kernfs_node that has one or more open files.
 */
struct kernfs_open_node __rcu *open;
|
2013-11-29 22:18:32 +00:00
|
|
|
/* NOTE(review): presumably the size reported for the file - confirm. */
loff_t size;
|
2022-07-05 20:10:26 +00:00
|
|
|
struct kernfs_node *notify_next; /* for kernfs_notify() */
|
2013-11-29 22:18:32 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
2013-12-11 19:11:53 +00:00
|
|
|
* kernfs_node - the building block of kernfs hierarchy. Each and every
|
|
|
|
* kernfs node is represented by single kernfs_node. Most fields are
|
2013-11-29 22:18:32 +00:00
|
|
|
* private to kernfs and shouldn't be accessed directly by kernfs users.
|
|
|
|
*
|
2020-10-15 18:57:26 +00:00
|
|
|
* As long as count reference is held, the kernfs_node itself is
|
2013-12-11 19:11:53 +00:00
|
|
|
* accessible. Dereferencing elem or any other outer entity requires
|
|
|
|
* active reference.
|
2013-11-29 22:18:32 +00:00
|
|
|
*/
|
2013-12-11 19:11:53 +00:00
|
|
|
struct kernfs_node {
|
2013-12-11 19:11:54 +00:00
|
|
|
atomic_t count;
|
|
|
|
atomic_t active;
|
2013-11-29 22:18:32 +00:00
|
|
|
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
|
|
|
struct lockdep_map dep_map;
|
|
|
|
#endif
|
2014-02-07 18:32:07 +00:00
|
|
|
/*
|
|
|
|
* Use kernfs_get_parent() and kernfs_name/path() instead of
|
|
|
|
* accessing the following two fields directly. If the node is
|
|
|
|
* never moved to a different parent, it is safe to access the
|
|
|
|
* parent directly.
|
|
|
|
*/
|
2013-12-11 19:11:54 +00:00
|
|
|
struct kernfs_node *parent;
|
|
|
|
const char *name;
|
2013-11-29 22:18:32 +00:00
|
|
|
|
2013-12-11 19:11:54 +00:00
|
|
|
struct rb_node rb;
|
2013-11-29 22:18:32 +00:00
|
|
|
|
2013-12-11 19:11:54 +00:00
|
|
|
const void *ns; /* namespace tag */
|
2014-01-13 22:09:38 +00:00
|
|
|
unsigned int hash; /* ns + name hash */
|
2024-01-10 18:28:16 +00:00
|
|
|
unsigned short flags;
|
|
|
|
umode_t mode;
|
|
|
|
|
2013-11-29 22:18:32 +00:00
|
|
|
union {
|
2013-12-11 19:11:54 +00:00
|
|
|
struct kernfs_elem_dir dir;
|
|
|
|
struct kernfs_elem_symlink symlink;
|
|
|
|
struct kernfs_elem_attr attr;
|
2013-11-29 22:18:32 +00:00
|
|
|
};
|
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
/*
|
2019-11-04 23:54:30 +00:00
|
|
|
* 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit,
|
|
|
|
* the low 32bits are ino and upper generation.
|
2019-11-04 23:54:30 +00:00
|
|
|
*/
|
|
|
|
u64 id;
|
|
|
|
|
2024-01-10 18:28:16 +00:00
|
|
|
void *priv;
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_iattrs *iattr;
|
kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id()
The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
which acquires kernfs_idr_lock, which is a non-raw non-IRQ-safe lock. This
can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF
programs including e.g. the ones that attach to functions which are holding
the scheduler rq lock.
Consider the following BPF program:
SEC("fentry/__set_cpus_allowed_ptr_locked")
int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
{
struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);
if (cgrp) {
bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
bpf_cgroup_release(cgrp);
}
return 0;
}
__set_cpus_allowed_ptr_locked() is called with rq lock held and the above
BPF program calls bpf_cgroup_from_id() within leading to the following
lockdep warning:
=====================================================
WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
-----------------------------------------------------
repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70
and this task is already holding:
ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
which would create a new lock dependency:
(&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
...
Possible interrupt unsafe locking scenario:
CPU0 CPU1
---- ----
lock(kernfs_idr_lock);
local_irq_disable();
lock(&rq->__lock);
lock(kernfs_idr_lock);
<Interrupt>
lock(&rq->__lock);
*** DEADLOCK ***
...
Call Trace:
dump_stack_lvl+0x55/0x70
dump_stack+0x10/0x20
__lock_acquire+0x781/0x2a40
lock_acquire+0xbf/0x1f0
_raw_spin_lock+0x2f/0x40
kernfs_find_and_get_node_by_id+0x1e/0x70
cgroup_get_from_id+0x21/0x240
bpf_cgroup_from_id+0xe/0x20
bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
bpf_trampoline_6442545632+0x4f/0x1000
__set_cpus_allowed_ptr_locked+0x5/0x5a0
sched_setaffinity+0x1b3/0x290
__x64_sys_sched_setaffinity+0x4f/0x60
do_syscall_64+0x40/0xe0
entry_SYSCALL_64_after_hwframe+0x46/0x4e
Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of
kernfs_idr_lock.
This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
Combined with the preceding rearrange patch, the net increase is 8 bytes.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2024-01-09 21:48:04 +00:00
|
|
|
|
|
|
|
struct rcu_head rcu;
|
2013-11-29 22:18:32 +00:00
|
|
|
};
|
2013-11-24 14:54:58 +00:00
|
|
|
|
2013-12-11 21:03:00 +00:00
|
|
|
/*
|
2014-02-03 19:09:09 +00:00
|
|
|
* kernfs_syscall_ops may be specified on kernfs_create_root() to support
|
|
|
|
* syscalls. These optional callbacks are invoked on the matching syscalls
|
|
|
|
* and can perform any kernfs operations which don't necessarily have to be
|
|
|
|
* the exact operation requested. An active reference is held for each
|
|
|
|
* kernfs_node parameter.
|
2013-12-11 21:03:00 +00:00
|
|
|
*/
|
2014-02-03 19:09:09 +00:00
|
|
|
struct kernfs_syscall_ops {
|
2014-02-03 19:09:10 +00:00
|
|
|
int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
|
|
|
|
|
2013-12-11 21:03:00 +00:00
|
|
|
int (*mkdir)(struct kernfs_node *parent, const char *name,
|
|
|
|
umode_t mode);
|
|
|
|
int (*rmdir)(struct kernfs_node *kn);
|
|
|
|
int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
|
|
|
|
const char *new_name);
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
|
|
|
|
struct kernfs_root *root);
|
2013-12-11 21:03:00 +00:00
|
|
|
};
|
|
|
|
|
2022-02-22 07:07:13 +00:00
|
|
|
/*
 * Declared here, defined elsewhere. Takes a kernfs_root and returns a
 * kernfs_node pointer.
 * NOTE(review): presumably the hierarchy's top-level node - confirm
 * against the definition.
 */
struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root);
|
2013-11-28 19:54:40 +00:00
|
|
|
|
2013-12-11 19:11:55 +00:00
|
|
|
/*
 * Per-open-file state: there is one kernfs_open_file for each open file.
 * filp->private_data points to a seq_file whose ->private points back to
 * this structure, and instances are chained at kernfs_open_node->files.
 */
struct kernfs_open_file {
|
2013-11-28 19:54:20 +00:00
|
|
|
/* published fields */
|
2013-12-11 19:11:53 +00:00
|
|
|
struct kernfs_node *kn;
|
2013-11-28 19:54:20 +00:00
|
|
|
struct file *file;
|
2016-12-27 19:49:03 +00:00
|
|
|
struct seq_file *seq_file;
|
2014-02-03 19:09:14 +00:00
|
|
|
void *priv;
|
2013-11-28 19:54:20 +00:00
|
|
|
|
|
|
|
|
|
|
/* private fields, do not use outside kernfs proper */
|
|
|
|
struct mutex mutex;
|
2016-03-31 10:45:06 +00:00
|
|
|
/* NOTE(review): presumably serializes use of prealloc_buf - confirm. */
struct mutex prealloc_mutex;
|
2013-11-28 19:54:20 +00:00
|
|
|
int event;
|
|
|
|
/* NOTE(review): presumably the link into kernfs_open_node->files - confirm. */
struct list_head list;
|
2014-10-13 05:41:28 +00:00
|
|
|
char *prealloc_buf;
|
2013-11-28 19:54:20 +00:00
|
|
|
|
2014-03-04 20:38:46 +00:00
|
|
|
size_t atomic_write_len;
|
2016-12-27 19:49:02 +00:00
|
|
|
/* mmapped and released are single-bit bitfield flags. */
bool mmapped:1;
|
2016-12-27 19:49:03 +00:00
|
|
|
bool released:1;
|
2013-11-28 19:54:20 +00:00
|
|
|
const struct vm_operations_struct *vm_ops;
|
|
|
|
};
|
|
|
|
|
2013-11-28 19:54:21 +00:00
|
|
|
struct kernfs_ops {
|
2016-12-27 19:49:03 +00:00
|
|
|
/*
|
|
|
|
* Optional open/release methods. Both are called with
|
|
|
|
* @of->seq_file populated.
|
|
|
|
*/
|
|
|
|
int (*open)(struct kernfs_open_file *of);
|
|
|
|
void (*release)(struct kernfs_open_file *of);
|
|
|
|
|
2013-11-28 19:54:21 +00:00
|
|
|
/*
|
|
|
|
* Read is handled by either seq_file or raw_read().
|
|
|
|
*
|
2013-11-28 19:54:26 +00:00
|
|
|
* If seq_show() is present, seq_file path is active. Other seq
|
|
|
|
* operations are optional and if not implemented, the behavior is
|
|
|
|
* equivalent to single_open(). @sf->private points to the
|
2013-12-11 19:11:55 +00:00
|
|
|
* associated kernfs_open_file.
|
2013-11-28 19:54:21 +00:00
|
|
|
*
|
|
|
|
* read() is bounced through kernel buffer and a read larger than
|
|
|
|
* PAGE_SIZE results in partial operation of PAGE_SIZE.
|
|
|
|
*/
|
|
|
|
int (*seq_show)(struct seq_file *sf, void *v);
|
2013-11-28 19:54:26 +00:00
|
|
|
|
|
|
|
void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
|
|
|
|
void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
|
|
|
|
void (*seq_stop)(struct seq_file *sf, void *v);
|
2013-11-28 19:54:21 +00:00
|
|
|
|
2013-12-11 19:11:55 +00:00
|
|
|
ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes,
|
2013-11-28 19:54:21 +00:00
|
|
|
loff_t off);
|
|
|
|
|
|
|
|
/*
|
2014-02-03 19:09:13 +00:00
|
|
|
* write() is bounced through kernel buffer. If atomic_write_len
|
|
|
|
* is not set, a write larger than PAGE_SIZE results in partial
|
|
|
|
* operations of PAGE_SIZE chunks. If atomic_write_len is set,
|
|
|
|
* writes upto the specified size are executed atomically but
|
|
|
|
* larger ones are rejected with -E2BIG.
|
2013-11-28 19:54:21 +00:00
|
|
|
*/
|
2014-02-03 19:09:13 +00:00
|
|
|
size_t atomic_write_len;
|
2014-10-13 05:41:28 +00:00
|
|
|
/*
|
|
|
|
* "prealloc" causes a buffer to be allocated at open for
|
|
|
|
* all read/write requests. As ->seq_show uses seq_read()
|
|
|
|
* which does its own allocation, it is incompatible with
|
|
|
|
* ->prealloc. Provide ->read and ->write with ->prealloc.
|
|
|
|
*/
|
|
|
|
bool prealloc;
|
2013-12-11 19:11:55 +00:00
|
|
|
ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
|
2013-11-28 19:54:21 +00:00
|
|
|
loff_t off);
|
|
|
|
|
fs: kernfs: add poll file operation
Patch series "psi: pressure stall monitors", v3.
Android is adopting psi to detect and remedy memory pressure that
results in stuttering and decreased responsiveness on mobile devices.
Psi gives us the stall information, but because we're dealing with
latencies in the millisecond range, periodically reading the pressure
files to detect stalls in a timely fashion is not feasible. Psi also
doesn't aggregate its averages at a high enough frequency right now.
This patch series extends the psi interface such that users can
configure sensitive latency thresholds and use poll() and friends to be
notified when these are breached.
As high-frequency aggregation is costly, it implements an aggregation
method that is optimized for fast, short-interval averaging, and makes
the aggregation frequency adaptive, such that high-frequency updates
only happen while monitored stall events are actively occurring.
With these patches applied, Android can monitor for, and ward off,
mounting memory shortages before they cause problems for the user. For
example, using memory stall monitors in userspace low memory killer
daemon (lmkd) we can detect mounting pressure and kill less important
processes before device becomes visibly sluggish.
In our memory stress testing psi memory monitors produce roughly 10x
less false positives compared to vmpressure signals. Having ability to
specify multiple triggers for the same psi metric allows other parts of
Android framework to monitor memory state of the device and act
accordingly.
The new interface is straightforward. The user opens one of the
pressure files for writing and writes a trigger description into the
file descriptor that defines the stall state - some or full, and the
maximum stall time over a given window of time. E.g.:
/* Signal when stall time exceeds 100ms of a 1s window */
char trigger[] = "full 100000 1000000";
fd = open("/proc/pressure/memory");
write(fd, trigger, sizeof(trigger));
while (poll() >= 0) {
...
}
close(fd);
When the monitored stall state is entered, psi adapts its aggregation
frequency according to what the configured time window requires in order
to emit event signals in a timely fashion. Once the stalling subsides,
aggregation reverts back to normal.
The trigger is associated with the open file descriptor. To stop
monitoring, the user only needs to close the file descriptor and the
trigger is discarded.
Patches 1-4 prepare the psi code for polling support. Patch 5
implements the adaptive polling logic, the pressure growth detection
optimized for short intervals, and hooks up write() and poll() on the
pressure files.
The patches were developed in collaboration with Johannes Weiner.
This patch (of 5):
Kernfs has a standardized poll/notification mechanism for waking all
pollers on all fds when a filesystem node changes. To allow polling for
custom events, add a .poll callback that can override the default.
This is in preparation for pollable cgroup pressure files which have
per-fd trigger configurations.
Link: http://lkml.kernel.org/r/20190124211518.244221-2-surenb@google.com
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-05 23:45:45 +00:00
|
|
|
__poll_t (*poll)(struct kernfs_open_file *of,
|
|
|
|
struct poll_table_struct *pt);
|
|
|
|
|
2013-12-11 19:11:55 +00:00
|
|
|
int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
|
2023-09-25 08:40:12 +00:00
|
|
|
loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence);
|
2013-11-28 19:54:21 +00:00
|
|
|
};
|
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
/*
 * The kernfs superblock creation/mount parameter context.
 */
struct kernfs_fs_context {
	struct kernfs_root	*root;		/* Root of the hierarchy being mounted */
	void			*ns_tag;	/* Namespace tag of the mount (or NULL) */
	unsigned long		magic;		/* File system specific magic number */

	/* The following are set/used by kernfs_get_tree() (formerly kernfs_mount()) */
	bool			new_sb_created;	/* Set to T if we allocated a new sb */
};
|
|
|
|
|
2014-02-03 19:09:17 +00:00
|
|
|
#ifdef CONFIG_KERNFS
|
2013-11-23 22:21:49 +00:00
|
|
|
|
2013-12-11 19:11:56 +00:00
|
|
|
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
|
2013-11-29 22:18:32 +00:00
|
|
|
{
|
2013-12-11 19:11:56 +00:00
|
|
|
return kn->flags & KERNFS_TYPE_MASK;
|
2013-11-29 22:18:32 +00:00
|
|
|
}
|
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
/**
 * kernfs_id_ino - derive the inode number from a 64bit kernfs id
 * @id: kernfs node id
 *
 * Return: @id itself when ino_t is at least as wide as u64; otherwise only
 * the low 32 bits of @id.
 */
static inline ino_t kernfs_id_ino(u64 id)
{
	/* id is ino if ino_t is 64bit; otherwise, low 32bits */
	if (sizeof(ino_t) >= sizeof(u64))
		return id;
	else
		return (u32)id;
}
|
|
|
|
|
|
|
|
/**
 * kernfs_id_gen - derive the generation number from a 64bit kernfs id
 * @id: kernfs node id
 *
 * Return: the constant 1 when ino_t is at least as wide as u64; otherwise
 * the high 32 bits of @id.
 */
static inline u32 kernfs_id_gen(u64 id)
{
	/* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */
	if (sizeof(ino_t) >= sizeof(u64))
		return 1;
	else
		return id >> 32;
}
|
|
|
|
|
|
|
|
static inline ino_t kernfs_ino(struct kernfs_node *kn)
|
|
|
|
{
|
|
|
|
return kernfs_id_ino(kn->id);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline ino_t kernfs_gen(struct kernfs_node *kn)
|
|
|
|
{
|
|
|
|
return kernfs_id_gen(kn->id);
|
|
|
|
}
|
|
|
|
|
2013-11-29 22:18:32 +00:00
|
|
|
/**
|
|
|
|
* kernfs_enable_ns - enable namespace under a directory
|
2013-12-11 19:11:53 +00:00
|
|
|
* @kn: directory of interest, should be empty
|
2013-11-29 22:18:32 +00:00
|
|
|
*
|
2013-12-11 19:11:53 +00:00
|
|
|
* This is to be called right after @kn is created to enable namespace
|
|
|
|
* under it. All children of @kn must have non-NULL namespace tags and
|
2013-11-29 22:18:32 +00:00
|
|
|
* only the ones which match the super_block's tag will be visible.
|
|
|
|
*/
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline void kernfs_enable_ns(struct kernfs_node *kn)
|
2013-11-29 22:18:32 +00:00
|
|
|
{
|
2013-12-11 19:11:56 +00:00
|
|
|
WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
|
2013-12-11 19:11:54 +00:00
|
|
|
WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));
|
2013-12-11 19:11:56 +00:00
|
|
|
kn->flags |= KERNFS_NS;
|
2013-11-29 22:18:32 +00:00
|
|
|
}
|
|
|
|
|
2013-11-29 22:19:09 +00:00
|
|
|
/**
|
|
|
|
* kernfs_ns_enabled - test whether namespace is enabled
|
2013-12-11 19:11:53 +00:00
|
|
|
* @kn: the node to test
|
2013-11-29 22:19:09 +00:00
|
|
|
*
|
|
|
|
* Test whether namespace filtering is enabled for the children of @ns.
|
|
|
|
*/
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
|
2013-11-29 22:19:09 +00:00
|
|
|
{
|
2013-12-11 19:11:56 +00:00
|
|
|
return kn->flags & KERNFS_NS;
|
2013-11-29 22:19:09 +00:00
|
|
|
}
|
|
|
|
|
2014-02-07 18:32:07 +00:00
|
|
|
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
|
2016-01-29 08:54:04 +00:00
|
|
|
int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn,
|
|
|
|
char *buf, size_t buflen);
|
2014-02-07 18:32:07 +00:00
|
|
|
void pr_cont_kernfs_name(struct kernfs_node *kn);
|
|
|
|
void pr_cont_kernfs_path(struct kernfs_node *kn);
|
|
|
|
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn);
|
2013-12-11 19:11:53 +00:00
|
|
|
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
|
|
|
|
const char *name, const void *ns);
|
2015-11-20 20:55:52 +00:00
|
|
|
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
|
|
|
|
const char *path, const void *ns);
|
2013-12-11 19:11:53 +00:00
|
|
|
void kernfs_get(struct kernfs_node *kn);
|
|
|
|
void kernfs_put(struct kernfs_node *kn);
|
2013-11-28 19:54:30 +00:00
|
|
|
|
2014-02-03 19:09:15 +00:00
|
|
|
struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
|
|
|
|
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
|
2015-06-18 20:54:28 +00:00
|
|
|
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
|
2014-02-03 19:09:15 +00:00
|
|
|
|
2016-01-29 08:54:08 +00:00
|
|
|
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
|
|
|
|
struct super_block *sb);
|
2014-02-03 19:09:09 +00:00
|
|
|
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
|
2014-02-03 19:09:12 +00:00
|
|
|
unsigned int flags, void *priv);
|
2013-11-28 19:54:40 +00:00
|
|
|
void kernfs_destroy_root(struct kernfs_root *root);
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
|
2013-12-11 21:02:55 +00:00
|
|
|
const char *name, umode_t mode,
|
2018-07-20 21:56:47 +00:00
|
|
|
kuid_t uid, kgid_t gid,
|
2013-12-11 21:02:55 +00:00
|
|
|
void *priv, const void *ns);
|
2015-05-13 21:09:29 +00:00
|
|
|
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
|
|
|
|
const char *name);
|
2013-12-11 21:02:57 +00:00
|
|
|
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
|
2018-07-20 21:56:47 +00:00
|
|
|
const char *name, umode_t mode,
|
|
|
|
kuid_t uid, kgid_t gid,
|
|
|
|
loff_t size,
|
2013-12-11 21:02:57 +00:00
|
|
|
const struct kernfs_ops *ops,
|
|
|
|
void *priv, const void *ns,
|
|
|
|
struct lock_class_key *key);
|
2013-12-11 19:11:53 +00:00
|
|
|
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
|
|
|
|
const char *name,
|
|
|
|
struct kernfs_node *target);
|
2014-02-03 19:09:12 +00:00
|
|
|
void kernfs_activate(struct kernfs_node *kn);
|
2022-08-28 05:04:39 +00:00
|
|
|
void kernfs_show(struct kernfs_node *kn, bool show);
|
2013-12-11 19:11:53 +00:00
|
|
|
void kernfs_remove(struct kernfs_node *kn);
|
kernfs, sysfs, driver-core: implement kernfs_remove_self() and its wrappers
Sometimes it's necessary to implement a node which wants to delete
nodes including itself. This isn't straightforward because of kernfs
active reference. While a file operation is in progress, an active
reference is held and kernfs_remove() waits for all such references to
drain before completing. For a self-deleting node, this is a deadlock
as kernfs_remove() ends up waiting for an active reference that itself
is sitting on top of.
This currently is worked around in the sysfs layer using
sysfs_schedule_callback() which makes such removals asynchronous.
While it works, it's rather cumbersome and inherently breaks
synchronicity of the operation - the file operation which triggered
the operation may complete before the removal is finished (or even
started) and the removal may fail asynchronously. If a removal
operation is immediately followed by another operation which expects
the specific name to be available (e.g. removal followed by rename
onto the same name), there's no way to make the latter operation
reliable.
The thing is there's no inherent reason for this to be asynchronous.
All that's necessary to do this synchronous is a dedicated operation
which drops its own active ref and deactivates self. This patch
implements kernfs_remove_self() and its wrappers in sysfs and driver
core. kernfs_remove_self() is to be called from one of the file
operations, drops the active ref the task is holding, removes the self
node, and restores active ref to the dead node so that the ref is
balanced afterwards. __kernfs_remove() is updated so that it takes an
early exit if the target node is already fully removed so that the
active ref restored by kernfs_remove_self() after removal doesn't
confuse the deactivation path.
This makes implementing self-deleting nodes very easy. The normal
removal path doesn't even need to be changed to use
kernfs_remove_self() for the self-deleting node. The method can
invoke kernfs_remove_self() on itself before proceeding the normal
removal path. kernfs_remove() invoked on the node by the normal
deletion path will simply be ignored.
This will replace sysfs_schedule_callback(). A subtle feature of
sysfs_schedule_callback() is that it collapses multiple invocations -
even if multiple removals are triggered, the removal callback is run
only once. An equivalent effect can be achieved by testing the return
value of kernfs_remove_self() - only the one which gets %true return
value should proceed with actual deletion. All other instances of
kernfs_remove_self() will wait till the enclosing kernfs operation
which invoked the winning instance of kernfs_remove_self() finishes
and then return %false. This trivially makes all users of
kernfs_remove_self() automatically show correct synchronous behavior
even when there are multiple concurrent operations - all "echo 1 >
delete" instances will finish only after the whole operation is
completed by one of the instances.
Note that manipulation of active ref is implemented in separate public
functions - kernfs_[un]break_active_protection().
kernfs_remove_self() is the only user at the moment but this will be
used to cater to more complex cases.
v2: For !CONFIG_SYSFS, dummy version kernfs_remove_self() was missing
and sysfs_remove_file_self() had incorrect return type. Fix it.
Reported by kbuild test bot.
v3: kernfs_[un]break_active_protection() separated out from
kernfs_remove_self() and exposed as public API.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2014-02-03 19:03:01 +00:00
|
|
|
/*
 * Active-reference manipulation.  kernfs_remove_self() is meant to be
 * called from one of the node's own file operations: it drops the active
 * ref the caller holds, removes the node, and restores the active ref
 * afterwards.  Only the caller that gets %true back should proceed with
 * the actual deletion.
 */
void kernfs_break_active_protection(struct kernfs_node *kn);
void kernfs_unbreak_active_protection(struct kernfs_node *kn);
bool kernfs_remove_self(struct kernfs_node *kn);
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
			     const void *ns);
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
		     const char *new_name, const void *new_ns);
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
|
fs: kernfs: add poll file operation
Patch series "psi: pressure stall monitors", v3.
Android is adopting psi to detect and remedy memory pressure that
results in stuttering and decreased responsiveness on mobile devices.
Psi gives us the stall information, but because we're dealing with
latencies in the millisecond range, periodically reading the pressure
files to detect stalls in a timely fashion is not feasible. Psi also
doesn't aggregate its averages at a high enough frequency right now.
This patch series extends the psi interface such that users can
configure sensitive latency thresholds and use poll() and friends to be
notified when these are breached.
As high-frequency aggregation is costly, it implements an aggregation
method that is optimized for fast, short-interval averaging, and makes
the aggregation frequency adaptive, such that high-frequency updates
only happen while monitored stall events are actively occurring.
With these patches applied, Android can monitor for, and ward off,
mounting memory shortages before they cause problems for the user. For
example, using memory stall monitors in userspace low memory killer
daemon (lmkd) we can detect mounting pressure and kill less important
processes before device becomes visibly sluggish.
In our memory stress testing psi memory monitors produce roughly 10x
less false positives compared to vmpressure signals. Having ability to
specify multiple triggers for the same psi metric allows other parts of
Android framework to monitor memory state of the device and act
accordingly.
The new interface is straightforward. The user opens one of the
pressure files for writing and writes a trigger description into the
file descriptor that defines the stall state - some or full, and the
maximum stall time over a given window of time. E.g.:
/* Signal when stall time exceeds 100ms of a 1s window */
char trigger[] = "full 100000 1000000";
fd = open("/proc/pressure/memory");
write(fd, trigger, sizeof(trigger));
while (poll() >= 0) {
...
}
close(fd);
When the monitored stall state is entered, psi adapts its aggregation
frequency according to what the configured time window requires in order
to emit event signals in a timely fashion. Once the stalling subsides,
aggregation reverts back to normal.
The trigger is associated with the open file descriptor. To stop
monitoring, the user only needs to close the file descriptor and the
trigger is discarded.
Patches 1-4 prepare the psi code for polling support. Patch 5
implements the adaptive polling logic, the pressure growth detection
optimized for short intervals, and hooks up write() and poll() on the
pressure files.
The patches were developed in collaboration with Johannes Weiner.
This patch (of 5):
Kernfs has a standardized poll/notification mechanism for waking all
pollers on all fds when a filesystem node changes. To allow polling for
custom events, add a .poll callback that can override the default.
This is in preparation for pollable cgroup pressure files which have
per-fd trigger configurations.
Link: http://lkml.kernel.org/r/20190124211518.244221-2-surenb@google.com
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-03-05 23:45:45 +00:00
|
|
|
/*
 * Poll/notification and xattr helpers.  kernfs_generic_poll() has the same
 * signature as the kernfs_ops ->poll callback.
 */
__poll_t kernfs_generic_poll(struct kernfs_open_file *of,
			     struct poll_table_struct *pt);
void kernfs_notify(struct kernfs_node *kn);

int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
		     void *value, size_t size);
int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
		     const void *value, size_t size, int flags);

const void *kernfs_super_ns(struct super_block *sb);
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
int kernfs_get_tree(struct fs_context *fc);
|
|
|
|
void kernfs_free_fs_context(struct fs_context *fc);
|
2013-11-28 19:54:43 +00:00
|
|
|
void kernfs_kill_sb(struct super_block *sb);
|
|
|
|
|
|
|
|
void kernfs_init(void);
|
|
|
|
|
2019-11-04 23:54:30 +00:00
|
|
|
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
|
|
|
|
u64 id);
|
2014-02-03 19:09:17 +00:00
|
|
|
#else /* CONFIG_KERNFS */
|
2013-11-23 22:21:49 +00:00
|
|
|
|
2013-12-11 19:11:56 +00:00
|
|
|
static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
|
2013-11-29 22:18:32 +00:00
|
|
|
{ return 0; } /* whatever */
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline void kernfs_enable_ns(struct kernfs_node *kn) { }
|
2013-11-29 22:18:32 +00:00
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
|
2013-11-29 22:19:09 +00:00
|
|
|
{ return false; }
|
|
|
|
|
2014-02-07 18:32:07 +00:00
|
|
|
static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
|
|
|
|
{ return -ENOSYS; }
|
|
|
|
|
2016-08-10 15:23:43 +00:00
|
|
|
static inline int kernfs_path_from_node(struct kernfs_node *root_kn,
|
|
|
|
struct kernfs_node *kn,
|
|
|
|
char *buf, size_t buflen)
|
|
|
|
{ return -ENOSYS; }
|
|
|
|
|
2014-02-07 18:32:07 +00:00
|
|
|
static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { }
|
|
|
|
static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { }
|
|
|
|
|
|
|
|
static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
|
|
|
|
{ return NULL; }
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline struct kernfs_node *
|
|
|
|
kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name,
|
2013-11-28 19:54:30 +00:00
|
|
|
const void *ns)
|
|
|
|
{ return NULL; }
|
2015-11-20 20:55:52 +00:00
|
|
|
static inline struct kernfs_node *
|
|
|
|
kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path,
|
|
|
|
const void *ns)
|
|
|
|
{ return NULL; }
|
2013-11-28 19:54:30 +00:00
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline void kernfs_get(struct kernfs_node *kn) { }
|
|
|
|
static inline void kernfs_put(struct kernfs_node *kn) { }
|
2013-11-28 19:54:30 +00:00
|
|
|
|
2014-02-03 19:09:15 +00:00
|
|
|
static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
|
|
|
|
{ return NULL; }
|
|
|
|
|
|
|
|
static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
|
|
|
|
{ return NULL; }
|
|
|
|
|
2015-06-18 20:54:28 +00:00
|
|
|
static inline struct inode *
|
|
|
|
kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
|
|
|
|
{ return NULL; }
|
|
|
|
|
2013-12-11 21:03:00 +00:00
|
|
|
static inline struct kernfs_root *
|
2014-02-03 19:09:12 +00:00
|
|
|
kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
|
|
|
|
void *priv)
|
2013-11-28 19:54:40 +00:00
|
|
|
{ return ERR_PTR(-ENOSYS); }
|
|
|
|
|
|
|
|
static inline void kernfs_destroy_root(struct kernfs_root *root) { }
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline struct kernfs_node *
|
2013-12-11 21:02:55 +00:00
|
|
|
kernfs_create_dir_ns(struct kernfs_node *parent, const char *name,
|
2018-07-20 21:56:47 +00:00
|
|
|
umode_t mode, kuid_t uid, kgid_t gid,
|
|
|
|
void *priv, const void *ns)
|
2013-11-28 19:54:15 +00:00
|
|
|
{ return ERR_PTR(-ENOSYS); }
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline struct kernfs_node *
|
2013-12-11 21:02:57 +00:00
|
|
|
__kernfs_create_file(struct kernfs_node *parent, const char *name,
|
2018-07-20 21:56:47 +00:00
|
|
|
umode_t mode, kuid_t uid, kgid_t gid,
|
|
|
|
loff_t size, const struct kernfs_ops *ops,
|
2015-02-13 22:36:31 +00:00
|
|
|
void *priv, const void *ns, struct lock_class_key *key)
|
2013-11-28 19:54:24 +00:00
|
|
|
{ return ERR_PTR(-ENOSYS); }
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline struct kernfs_node *
|
|
|
|
kernfs_create_link(struct kernfs_node *parent, const char *name,
|
|
|
|
struct kernfs_node *target)
|
2013-11-23 22:21:50 +00:00
|
|
|
{ return ERR_PTR(-ENOSYS); }
|
|
|
|
|
2014-02-03 19:09:12 +00:00
|
|
|
static inline void kernfs_activate(struct kernfs_node *kn) { }
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline void kernfs_remove(struct kernfs_node *kn) { }
|
2013-11-23 22:21:49 +00:00
|
|
|
|
kernfs, sysfs, driver-core: implement kernfs_remove_self() and its wrappers
Sometimes it's necessary to implement a node which wants to delete
nodes including itself. This isn't straightforward because of kernfs
active reference. While a file operation is in progress, an active
reference is held and kernfs_remove() waits for all such references to
drain before completing. For a self-deleting node, this is a deadlock
as kernfs_remove() ends up waiting for an active reference that itself
is sitting on top of.
This currently is worked around in the sysfs layer using
sysfs_schedule_callback() which makes such removals asynchronous.
While it works, it's rather cumbersome and inherently breaks
synchronicity of the operation - the file operation which triggered
the operation may complete before the removal is finished (or even
started) and the removal may fail asynchronously. If a removal
operation is immediately followed by another operation which expects
the specific name to be available (e.g. removal followed by rename
onto the same name), there's no way to make the latter operation
reliable.
The thing is there's no inherent reason for this to be asynchronous.
All that's necessary to do this synchronous is a dedicated operation
which drops its own active ref and deactivates self. This patch
implements kernfs_remove_self() and its wrappers in sysfs and driver
core. kernfs_remove_self() is to be called from one of the file
operations, drops the active ref the task is holding, removes the self
node, and restores active ref to the dead node so that the ref is
balanced afterwards. __kernfs_remove() is updated so that it takes an
early exit if the target node is already fully removed so that the
active ref restored by kernfs_remove_self() after removal doesn't
confuse the deactivation path.
This makes implementing self-deleting nodes very easy. The normal
removal path doesn't even need to be changed to use
kernfs_remove_self() for the self-deleting node. The method can
invoke kernfs_remove_self() on itself before proceeding the normal
removal path. kernfs_remove() invoked on the node by the normal
deletion path will simply be ignored.
This will replace sysfs_schedule_callback(). A subtle feature of
sysfs_schedule_callback() is that it collapses multiple invocations -
even if multiple removals are triggered, the removal callback is run
only once. An equivalent effect can be achieved by testing the return
value of kernfs_remove_self() - only the one which gets %true return
value should proceed with actual deletion. All other instances of
kernfs_remove_self() will wait till the enclosing kernfs operation
which invoked the winning instance of kernfs_remove_self() finishes
and then return %false. This trivially makes all users of
kernfs_remove_self() automatically show correct synchronous behavior
even when there are multiple concurrent operations - all "echo 1 >
delete" instances will finish only after the whole operation is
completed by one of the instances.
Note that manipulation of active ref is implemented in separate public
functions - kernfs_[un]break_active_protection().
kernfs_remove_self() is the only user at the moment but this will be
used to cater to more complex cases.
v2: For !CONFIG_SYSFS, dummy version kernfs_remove_self() was missing
and sysfs_remove_file_self() had incorrect return type. Fix it.
Reported by kbuild test bot.
v3: kernfs_[un]break_active_protection() separated out from
kernfs_remove_self() and exposed as public API.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2014-02-03 19:03:01 +00:00
|
|
|
static inline bool kernfs_remove_self(struct kernfs_node *kn)
|
|
|
|
{ return false; }
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/*
 * Dummy kernfs_remove_by_name_ns() (presumably the !CONFIG_KERNFS fallback).
 * All arguments are ignored; the call always fails with -ENOSYS.
 */
static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn,
					   const char *name, const void *ns)
{
	return -ENOSYS;
}
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/*
 * Dummy kernfs_rename_ns() (presumably the !CONFIG_KERNFS fallback).
 * All arguments are ignored; the call always fails with -ENOSYS.
 */
static inline int kernfs_rename_ns(struct kernfs_node *kn,
				   struct kernfs_node *new_parent,
				   const char *new_name, const void *new_ns)
{
	return -ENOSYS;
}
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/*
 * Dummy kernfs_setattr() (presumably the !CONFIG_KERNFS fallback).
 * Neither @kn nor @iattr is touched; the call always fails with -ENOSYS.
 */
static inline int kernfs_setattr(struct kernfs_node *kn,
				 const struct iattr *iattr)
{
	return -ENOSYS;
}
|
|
|
|
|
2023-07-24 12:18:16 +00:00
|
|
|
/*
 * Dummy kernfs_generic_poll() (presumably the !CONFIG_KERNFS fallback).
 * Both arguments are ignored and -ENOSYS is returned.
 *
 * NOTE(review): the return type is __poll_t, which is normally a poll event
 * bitmask rather than an errno carrier; returning a negated errno here sets
 * most mask bits and is likely to trip sparse's bitwise checking.  Confirm
 * whether a poll mask (e.g. an error event) was intended instead.
 */
static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of,
					   struct poll_table_struct *pt)
{
	return -ENOSYS;
}
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/* Dummy kernfs_notify() (presumably the !CONFIG_KERNFS fallback): a no-op. */
static inline void kernfs_notify(struct kernfs_node *kn)
{
}
|
2013-11-28 19:54:27 +00:00
|
|
|
|
2019-04-03 07:29:41 +00:00
|
|
|
/*
 * Dummy kernfs_xattr_get() (presumably the !CONFIG_KERNFS fallback).
 * @value is never written; the call always fails with -ENOSYS.
 */
static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
				   void *value, size_t size)
{
	return -ENOSYS;
}
|
|
|
|
|
2019-04-03 07:29:41 +00:00
|
|
|
/*
 * Dummy kernfs_xattr_set() (presumably the !CONFIG_KERNFS fallback).
 * All arguments are ignored; the call always fails with -ENOSYS.
 */
static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
				   const void *value, size_t size, int flags)
{
	return -ENOSYS;
}
|
|
|
|
|
2013-11-28 19:54:43 +00:00
|
|
|
/*
 * Dummy kernfs_super_ns() (presumably the !CONFIG_KERNFS fallback).
 * @sb is ignored and NULL is always returned.
 */
static inline const void *kernfs_super_ns(struct super_block *sb)
{
	return NULL;
}
|
|
|
|
|
kernfs, sysfs, cgroup, intel_rdt: Support fs_context
Make kernfs support superblock creation/mount/remount with fs_context.
This requires that sysfs, cgroup and intel_rdt, which are built on kernfs,
be made to support fs_context also.
Notes:
(1) A kernfs_fs_context struct is created to wrap fs_context and the
kernfs mount parameters are moved in here (or are in fs_context).
(2) kernfs_mount{,_ns}() are made into kernfs_get_tree(). The extra
namespace tag parameter is passed in the context if desired.
(3) kernfs_free_fs_context() is provided as a destructor for the
kernfs_fs_context struct, but for the moment it does nothing except
get called in the right places.
(4) sysfs doesn't wrap kernfs_fs_context since it has no parameters to
pass, but possibly this should be done anyway in case someone wants to
add a parameter in future.
(5) A cgroup_fs_context struct is created to wrap kernfs_fs_context and
the cgroup v1 and v2 mount parameters are all moved there.
(6) cgroup1 parameter parsing error messages are now handled by invalf(),
which allows userspace to collect them directly.
(7) cgroup1 parameter cleanup is now done in the context destructor rather
than in the mount/get_tree and remount functions.
Weirdies:
(*) cgroup_do_get_tree() calls cset_cgroup_from_root() with locks held,
but then uses the resulting pointer after dropping the locks. I'm
told this is okay and needs commenting.
(*) The cgroup refcount web. This really needs documenting.
(*) cgroup2 only has one root?
Add a suggestion from Thomas Gleixner in which the RDT enablement code is
placed into its own function.
[folded a leak fix from Andrey Vagin]
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
cc: Tejun Heo <tj@kernel.org>
cc: Li Zefan <lizefan@huawei.com>
cc: Johannes Weiner <hannes@cmpxchg.org>
cc: cgroups@vger.kernel.org
cc: fenghua.yu@intel.com
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:26 +00:00
|
|
|
/*
 * Dummy kernfs_get_tree() (presumably the !CONFIG_KERNFS fallback).
 * @fc is ignored; the call always fails with -ENOSYS.
 */
static inline int kernfs_get_tree(struct fs_context *fc)
{
	return -ENOSYS;
}
|
|
|
|
|
|
|
|
/* Dummy kernfs_free_fs_context() (presumably the !CONFIG_KERNFS fallback): a no-op. */
static inline void kernfs_free_fs_context(struct fs_context *fc)
{
}
|
2013-11-28 19:54:43 +00:00
|
|
|
|
|
|
|
/* Dummy kernfs_kill_sb() (presumably the !CONFIG_KERNFS fallback): a no-op. */
static inline void kernfs_kill_sb(struct super_block *sb)
{
}
|
|
|
|
|
|
|
|
/* Dummy kernfs_init() (presumably the !CONFIG_KERNFS fallback): nothing to set up. */
static inline void kernfs_init(void)
{
}
|
|
|
|
|
2014-02-03 19:09:17 +00:00
|
|
|
#endif /* CONFIG_KERNFS */
|
2013-11-23 22:21:49 +00:00
|
|
|
|
2016-08-10 15:23:44 +00:00
|
|
|
/**
 * kernfs_path - build the full path of a given node
 * @kn: kernfs_node of interest
 * @buf: buffer to copy @kn's full path into
 * @buflen: size of @buf
 *
 * Thin wrapper around kernfs_path_from_node() that passes NULL as its
 * second argument.
 *
 * If @kn is NULL the result will be "(null)".
 *
 * Returns the length of the full path.  If the full length is equal to or
 * greater than @buflen, @buf contains the truncated path with the trailing
 * '\0'.  On error, -errno is returned.
 */
static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
{
	int len = kernfs_path_from_node(kn, NULL, buf, buflen);

	return len;
}
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/*
 * Convenience wrapper: forwards @kn and @name to kernfs_find_and_get_ns()
 * with NULL as the last argument (presumably the namespace tag) and hands
 * back whatever that call returns.
 */
static inline struct kernfs_node *
kernfs_find_and_get(struct kernfs_node *kn, const char *name)
{
	struct kernfs_node *found = kernfs_find_and_get_ns(kn, name, NULL);

	return found;
}
|
|
|
|
|
2015-11-20 20:55:52 +00:00
|
|
|
/*
 * Convenience wrapper: forwards @kn and @path to kernfs_walk_and_get_ns()
 * with NULL as the last argument (presumably the namespace tag) and hands
 * back whatever that call returns.
 */
static inline struct kernfs_node *
kernfs_walk_and_get(struct kernfs_node *kn, const char *path)
{
	struct kernfs_node *found = kernfs_walk_and_get_ns(kn, path, NULL);

	return found;
}
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
static inline struct kernfs_node *
|
2013-12-11 21:02:55 +00:00
|
|
|
kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
|
|
|
|
void *priv)
|
2013-11-28 19:54:15 +00:00
|
|
|
{
|
2018-07-20 21:56:47 +00:00
|
|
|
return kernfs_create_dir_ns(parent, name, mode,
|
|
|
|
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
|
|
|
|
priv, NULL);
|
2013-11-28 19:54:15 +00:00
|
|
|
}
|
|
|
|
|
2013-12-11 19:11:53 +00:00
|
|
|
/*
 * Convenience wrapper: forwards @parent and @name to
 * kernfs_remove_by_name_ns() with NULL as the last argument (presumably the
 * namespace tag) and returns its result unchanged.
 */
static inline int kernfs_remove_by_name(struct kernfs_node *parent,
					const char *name)
{
	int ret = kernfs_remove_by_name_ns(parent, name, NULL);

	return ret;
}
|
|
|
|
|
2014-02-03 19:09:15 +00:00
|
|
|
/*
 * Convenience wrapper: forwards @kn, @new_parent and @new_name to
 * kernfs_rename_ns() with NULL as the last argument (presumably the new
 * namespace tag) and returns its result unchanged.
 */
static inline int kernfs_rename(struct kernfs_node *kn,
				struct kernfs_node *new_parent,
				const char *new_name)
{
	int ret = kernfs_rename_ns(kn, new_parent, new_name, NULL);

	return ret;
}
|
|
|
|
|
2013-11-24 14:54:58 +00:00
|
|
|
#endif /* __LINUX_KERNFS_H */
|