2013-11-24 14:54:58 +00:00
|
|
|
/*
|
|
|
|
* fs/kernfs/mount.c - kernfs mount implementation
|
|
|
|
*
|
|
|
|
* Copyright (c) 2001-3 Patrick Mochel
|
|
|
|
* Copyright (c) 2007 SUSE Linux Products GmbH
|
|
|
|
* Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
|
|
|
|
*
|
|
|
|
* This file is released under the GPLv2.
|
|
|
|
*/
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/magic.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/pagemap.h>
|
2016-01-29 08:54:08 +00:00
|
|
|
#include <linux/namei.h>
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
#include <linux/seq_file.h>
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
#include "kernfs-internal.h"
|
|
|
|
|
2013-12-11 19:11:57 +00:00
|
|
|
/*
 * Slab cache handle shared across kernfs.
 * NOTE(review): presumably backs kernfs_node allocations and is created
 * outside this chunk -- confirm against the init code.
 */
struct kmem_cache *kernfs_node_cache;
|
2013-11-28 19:54:44 +00:00
|
|
|
|
2014-02-03 19:09:10 +00:00
|
|
|
static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
|
|
|
|
{
|
|
|
|
struct kernfs_root *root = kernfs_info(sb)->root;
|
|
|
|
struct kernfs_syscall_ops *scops = root->syscall_ops;
|
|
|
|
|
|
|
|
if (scops && scops->remount_fs)
|
|
|
|
return scops->remount_fs(root, flags, data);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
|
|
|
|
{
|
2017-07-12 18:49:49 +00:00
|
|
|
struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry));
|
2014-02-03 19:09:10 +00:00
|
|
|
struct kernfs_syscall_ops *scops = root->syscall_ops;
|
|
|
|
|
|
|
|
if (scops && scops->show_options)
|
|
|
|
return scops->show_options(sf, root);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
|
|
|
|
{
|
2017-07-12 18:49:49 +00:00
|
|
|
struct kernfs_node *node = kernfs_dentry_node(dentry);
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
struct kernfs_root *root = kernfs_root(node);
|
|
|
|
struct kernfs_syscall_ops *scops = root->syscall_ops;
|
|
|
|
|
|
|
|
if (scops && scops->show_path)
|
|
|
|
return scops->show_path(sf, node, root);
|
|
|
|
|
2016-05-12 05:29:45 +00:00
|
|
|
seq_dentry(sf, dentry, " \t\n\\");
|
|
|
|
return 0;
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
}
|
|
|
|
|
2014-02-14 08:57:27 +00:00
|
|
|
const struct super_operations kernfs_sops = {
|
2013-11-28 19:54:44 +00:00
|
|
|
.statfs = simple_statfs,
|
|
|
|
.drop_inode = generic_delete_inode,
|
2013-12-11 19:11:58 +00:00
|
|
|
.evict_inode = kernfs_evict_inode,
|
2014-02-03 19:09:10 +00:00
|
|
|
|
|
|
|
.remount_fs = kernfs_sop_remount_fs,
|
|
|
|
.show_options = kernfs_sop_show_options,
|
cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces
Patch summary:
When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.
Short explanation (courtesy of mkerrisk):
If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point. Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.
Long version:
When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b. The root dentry field of the mountinfo
entry will show '/a/b'.
cat > /tmp/do1 << EOF
mount -t cgroup -o freezer freezer /mnt
grep freezer /proc/self/mountinfo
EOF
unshare -Gm bash /tmp/do1
> 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
> 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer
The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':
grep freezer /proc/self/cgroup
9:freezer:/
If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:
mount --bind /sys/fs/cgroup/freezer/a/b /mnt
130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer
In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.
Example (by mkerrisk):
First, a little script to save some typing and verbiage:
echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'
Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:
2653
2653 # Our shell
14254 # cat(1)
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):
Look at the state of the /proc files:
/proc/self/cgroup: 10:freezer:/
mountinfo: / /sys/fs/cgroup/freezer
The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.
However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:
# propagating to other mountns
/proc/self/cgroup: 7:freezer:/
mountinfo: /a/b /mnt/freezer
The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.
With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace. So the same steps as above:
/proc/self/cgroup: 10:freezer:/a/b
mountinfo: / /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: /../.. /sys/fs/cgroup/freezer
/proc/self/cgroup: 10:freezer:/
mountinfo: / /mnt/freezer
cgroup.clone_children freezer.parent_freezing freezer.state tasks
cgroup.procs freezer.self_freezing notify_on_release
3164
2653 # First shell that placed in this cgroup
3164 # Shell started by 'unshare'
14197 # cat(1)
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2016-05-09 14:59:55 +00:00
|
|
|
.show_path = kernfs_sop_show_path,
|
2013-11-28 19:54:44 +00:00
|
|
|
};
|
|
|
|
|
2014-02-03 19:09:15 +00:00
|
|
|
/**
|
|
|
|
* kernfs_root_from_sb - determine kernfs_root associated with a super_block
|
|
|
|
* @sb: the super_block in question
|
|
|
|
*
|
|
|
|
* Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
|
|
|
|
* %NULL is returned.
|
|
|
|
*/
|
|
|
|
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
|
|
|
|
{
|
|
|
|
if (sb->s_op == &kernfs_sops)
|
|
|
|
return kernfs_info(sb)->root;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-01-29 08:54:08 +00:00
|
|
|
/*
|
|
|
|
* find the next ancestor in the path down to @child, where @parent was the
|
|
|
|
* ancestor whose descendant we want to find.
|
|
|
|
*
|
|
|
|
* Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
|
|
|
|
* node. If @parent is b, then we return the node for c.
|
|
|
|
* Passing in d as @parent is not ok.
|
|
|
|
*/
|
|
|
|
static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
|
|
|
|
struct kernfs_node *parent)
|
|
|
|
{
|
|
|
|
if (child == parent) {
|
|
|
|
pr_crit_once("BUG in find_next_ancestor: called with parent == child");
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (child->parent != parent) {
|
|
|
|
if (!child->parent)
|
|
|
|
return NULL;
|
|
|
|
child = child->parent;
|
|
|
|
}
|
|
|
|
|
|
|
|
return child;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kernfs_node_dentry - get a dentry for the given kernfs_node
|
|
|
|
* @kn: kernfs_node for which a dentry is needed
|
|
|
|
* @sb: the kernfs super_block
|
|
|
|
*/
|
|
|
|
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
|
|
|
|
struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct dentry *dentry;
|
|
|
|
struct kernfs_node *knparent = NULL;
|
|
|
|
|
|
|
|
BUG_ON(sb->s_op != &kernfs_sops);
|
|
|
|
|
|
|
|
dentry = dget(sb->s_root);
|
|
|
|
|
|
|
|
/* Check if this is the root kernfs_node */
|
|
|
|
if (!kn->parent)
|
|
|
|
return dentry;
|
|
|
|
|
|
|
|
knparent = find_next_ancestor(kn, NULL);
|
|
|
|
if (WARN_ON(!knparent))
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
do {
|
|
|
|
struct dentry *dtmp;
|
|
|
|
struct kernfs_node *kntmp;
|
|
|
|
|
|
|
|
if (kn == knparent)
|
|
|
|
return dentry;
|
|
|
|
kntmp = find_next_ancestor(kn, knparent);
|
|
|
|
if (WARN_ON(!kntmp))
|
|
|
|
return ERR_PTR(-EINVAL);
|
2016-04-11 12:42:55 +00:00
|
|
|
dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
|
|
|
|
strlen(kntmp->name));
|
2016-01-29 08:54:08 +00:00
|
|
|
dput(dentry);
|
|
|
|
if (IS_ERR(dtmp))
|
|
|
|
return dtmp;
|
|
|
|
knparent = kntmp;
|
|
|
|
dentry = dtmp;
|
|
|
|
} while (true);
|
|
|
|
}
|
|
|
|
|
2014-04-26 07:40:28 +00:00
|
|
|
static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
2013-11-28 19:54:44 +00:00
|
|
|
struct inode *inode;
|
|
|
|
struct dentry *root;
|
|
|
|
|
2014-04-09 15:07:30 +00:00
|
|
|
info->sb = sb;
|
2016-06-09 20:34:02 +00:00
|
|
|
/* Userspace would break if executables or devices appear on sysfs */
|
|
|
|
sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 12:29:47 +00:00
|
|
|
sb->s_blocksize = PAGE_SIZE;
|
|
|
|
sb->s_blocksize_bits = PAGE_SHIFT;
|
2014-04-26 07:40:28 +00:00
|
|
|
sb->s_magic = magic;
|
2013-12-11 19:11:57 +00:00
|
|
|
sb->s_op = &kernfs_sops;
|
2016-09-29 15:48:33 +00:00
|
|
|
sb->s_xattr = kernfs_xattr_handlers;
|
2013-11-28 19:54:44 +00:00
|
|
|
sb->s_time_gran = 1;
|
|
|
|
|
|
|
|
/* get root inode, initialize and unlock it */
|
2013-12-11 19:11:57 +00:00
|
|
|
mutex_lock(&kernfs_mutex);
|
2013-12-11 19:11:58 +00:00
|
|
|
inode = kernfs_get_inode(sb, info->root->kn);
|
2013-12-11 19:11:57 +00:00
|
|
|
mutex_unlock(&kernfs_mutex);
|
2013-11-28 19:54:44 +00:00
|
|
|
if (!inode) {
|
2013-12-11 19:11:58 +00:00
|
|
|
pr_debug("kernfs: could not get root inode\n");
|
2013-11-28 19:54:44 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* instantiate and link root dentry */
|
|
|
|
root = d_make_root(inode);
|
|
|
|
if (!root) {
|
|
|
|
pr_debug("%s: could not get root dentry!\n", __func__);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
sb->s_root = root;
|
2013-12-11 19:11:57 +00:00
|
|
|
sb->s_d_op = &kernfs_dops;
|
2013-11-28 19:54:44 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-12-11 19:11:58 +00:00
|
|
|
static int kernfs_test_super(struct super_block *sb, void *data)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *sb_info = kernfs_info(sb);
|
|
|
|
struct kernfs_super_info *info = data;
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
return sb_info->root == info->root && sb_info->ns == info->ns;
|
|
|
|
}
|
|
|
|
|
2013-12-11 19:11:58 +00:00
|
|
|
static int kernfs_set_super(struct super_block *sb, void *data)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
error = set_anon_super(sb, data);
|
|
|
|
if (!error)
|
|
|
|
sb->s_fs_info = data;
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kernfs_super_ns - determine the namespace tag of a kernfs super_block
|
|
|
|
* @sb: super_block of interest
|
|
|
|
*
|
|
|
|
* Return the namespace tag associated with kernfs super_block @sb.
|
|
|
|
*/
|
|
|
|
const void *kernfs_super_ns(struct super_block *sb)
|
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
2013-11-28 19:54:44 +00:00
|
|
|
|
|
|
|
return info->ns;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kernfs_mount_ns - kernfs mount helper
|
|
|
|
* @fs_type: file_system_type of the fs being mounted
|
|
|
|
* @flags: mount flags specified for the mount
|
|
|
|
* @root: kernfs_root of the hierarchy being mounted
|
2014-04-26 07:40:28 +00:00
|
|
|
* @magic: file system specific magic number
|
2014-02-25 11:28:44 +00:00
|
|
|
* @new_sb_created: tell the caller if we allocated a new superblock
|
2013-11-28 19:54:44 +00:00
|
|
|
* @ns: optional namespace tag of the mount
|
|
|
|
*
|
|
|
|
* This is to be called from each kernfs user's file_system_type->mount()
|
|
|
|
* implementation, which should pass through the specified @fs_type and
|
|
|
|
* @flags, and specify the hierarchy and namespace tag to mount via @root
|
|
|
|
* and @ns, respectively.
|
|
|
|
*
|
|
|
|
* The return value can be passed to the vfs layer verbatim.
|
|
|
|
*/
|
|
|
|
struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
|
2014-04-26 07:40:28 +00:00
|
|
|
struct kernfs_root *root, unsigned long magic,
|
|
|
|
bool *new_sb_created, const void *ns)
|
2013-11-28 19:54:44 +00:00
|
|
|
{
|
|
|
|
struct super_block *sb;
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info;
|
2013-11-28 19:54:44 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
info = kzalloc(sizeof(*info), GFP_KERNEL);
|
|
|
|
if (!info)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
info->root = root;
|
|
|
|
info->ns = ns;
|
|
|
|
|
2016-05-24 14:29:01 +00:00
|
|
|
sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
|
|
|
|
&init_user_ns, info);
|
2013-11-28 19:54:44 +00:00
|
|
|
if (IS_ERR(sb) || sb->s_fs_info != info)
|
|
|
|
kfree(info);
|
|
|
|
if (IS_ERR(sb))
|
|
|
|
return ERR_CAST(sb);
|
2014-02-25 11:28:44 +00:00
|
|
|
|
|
|
|
if (new_sb_created)
|
|
|
|
*new_sb_created = !sb->s_root;
|
|
|
|
|
2013-11-28 19:54:44 +00:00
|
|
|
if (!sb->s_root) {
|
2014-04-09 15:07:30 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
|
|
|
|
2014-04-26 07:40:28 +00:00
|
|
|
error = kernfs_fill_super(sb, magic);
|
2013-11-28 19:54:44 +00:00
|
|
|
if (error) {
|
|
|
|
deactivate_locked_super(sb);
|
|
|
|
return ERR_PTR(error);
|
|
|
|
}
|
|
|
|
sb->s_flags |= MS_ACTIVE;
|
2014-04-09 15:07:30 +00:00
|
|
|
|
|
|
|
mutex_lock(&kernfs_mutex);
|
|
|
|
list_add(&info->node, &root->supers);
|
|
|
|
mutex_unlock(&kernfs_mutex);
|
2013-11-28 19:54:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return dget(sb->s_root);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* kernfs_kill_sb - kill_sb for kernfs
|
|
|
|
* @sb: super_block being killed
|
|
|
|
*
|
|
|
|
* This can be used directly for file_system_type->kill_sb(). If a kernfs
|
|
|
|
* user needs extra cleanup, it can implement its own kill_sb() and call
|
|
|
|
* this function at the end.
|
|
|
|
*/
|
|
|
|
void kernfs_kill_sb(struct super_block *sb)
|
|
|
|
{
|
2013-12-11 19:11:55 +00:00
|
|
|
struct kernfs_super_info *info = kernfs_info(sb);
|
2013-11-28 19:54:44 +00:00
|
|
|
|
2014-04-09 15:07:30 +00:00
|
|
|
mutex_lock(&kernfs_mutex);
|
|
|
|
list_del(&info->node);
|
|
|
|
mutex_unlock(&kernfs_mutex);
|
|
|
|
|
2013-11-28 19:54:44 +00:00
|
|
|
/*
|
|
|
|
* Remove the superblock from fs_supers/s_instances
|
2013-12-11 19:11:55 +00:00
|
|
|
* so we can't find it, before freeing kernfs_super_info.
|
2013-11-28 19:54:44 +00:00
|
|
|
*/
|
|
|
|
kill_anon_super(sb);
|
|
|
|
kfree(info);
|
|
|
|
}
|
|
|
|
|
2014-06-30 03:50:28 +00:00
|
|
|
/**
|
|
|
|
* kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
|
|
|
|
* @kernfs_root: the kernfs_root in question
|
|
|
|
* @ns: the namespace tag
|
|
|
|
*
|
|
|
|
* Pin the superblock so the superblock won't be destroyed in subsequent
|
|
|
|
* operations. This can be used to block ->kill_sb() which may be useful
|
|
|
|
* for kernfs users which dynamically manage superblocks.
|
|
|
|
*
|
|
|
|
* Returns NULL if there's no superblock associated to this kernfs_root, or
|
|
|
|
* -EINVAL if the superblock is being freed.
|
|
|
|
*/
|
|
|
|
struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
|
|
|
|
{
|
|
|
|
struct kernfs_super_info *info;
|
|
|
|
struct super_block *sb = NULL;
|
|
|
|
|
|
|
|
mutex_lock(&kernfs_mutex);
|
|
|
|
list_for_each_entry(info, &root->supers, node) {
|
|
|
|
if (info->ns == ns) {
|
|
|
|
sb = info->sb;
|
|
|
|
if (!atomic_inc_not_zero(&info->sb->s_active))
|
|
|
|
sb = ERR_PTR(-EINVAL);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&kernfs_mutex);
|
|
|
|
return sb;
|
|
|
|
}
|
|
|
|
|
2013-11-28 19:54:44 +00:00
|
|
|
/* Boot-time setup: create the slab cache that kernfs_node objects come from. */
void __init kernfs_init(void)
{
	/*
	 * The slab is freed in RCU context, which lets
	 * kernfs_find_and_get_node_by_ino walk it without taking a lock.
	 * The price is that stale nodes may be observed; see
	 * kernfs_find_and_get_node_by_ino for how those are filtered out.
	 */
	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
					      sizeof(struct kernfs_node), 0,
					      SLAB_PANIC | SLAB_TYPESAFE_BY_RCU,
					      NULL);
}
|