From dd7cb142f467c4660698bcaa4a48c688b443ab81 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 24 Jun 2024 11:49:44 -0400 Subject: [PATCH 1/8] fs: relax permissions for listmount() It is sufficient to have capabilities in the owning user namespace of the mount namespace to list all mounts regardless of whether they are reachable or not. Link: https://lore.kernel.org/r/8adc0d3f4f7495faacc6a7c63095961f7f1637c7.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 02a697287da5..22274f74beb0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5104,7 +5104,7 @@ static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, * mounts to show users. */ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; ret = security_sb_statfs(orig.dentry); From f3107df39df123328a9d3c8f40c006834b37287d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 24 Jun 2024 11:49:45 -0400 Subject: [PATCH 2/8] fs: relax permissions for statmount() It is sufficient to have capabilities in the owning user namespace of the mount namespace to stat a mount regardless of whether it's reachable or not. Link: https://lore.kernel.org/r/bf5961d71ec479ba85806766b0d8d96043e67bba.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 22274f74beb0..352e8c90b9f7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4930,6 +4930,7 @@ static int copy_statmount_to_user(struct kstatmount *s) static int do_statmount(struct kstatmount *s) { struct mount *m = real_mount(s->mnt); + struct mnt_namespace *ns = m->mnt_ns; int err; /* @@ -4937,7 +4938,7 @@ static int do_statmount(struct kstatmount *s) * mounts to show users. */ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && - !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN)) + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; err = security_sb_statfs(s->mnt->mnt_root); From 1901c92497bd90caf608a474f1bf4d8795b372a2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Jun 2024 11:49:46 -0400 Subject: [PATCH 3/8] fs: keep an index of current mount namespaces In order to allow for listmount() to be used on different namespaces we need a way to lookup a mount ns by its id. Keep a rbtree of the current !anonymous mount name spaces indexed by ID that we can use to look up the namespace. Co-developed-by: Christian Brauner Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/e5fdd78a90f5b00a75bd893962a70f52a2c015cd.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/mount.h | 2 + fs/namespace.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 2 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 4a42fc68f4cc..5b980c262c0c 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -16,6 +16,8 @@ struct mnt_namespace { u64 event; unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; + struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ + refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; struct mnt_pcp { diff --git a/fs/namespace.c b/fs/namespace.c index 352e8c90b9f7..eebe9d912a71 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -78,6 +78,8 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +static DEFINE_RWLOCK(mnt_ns_tree_lock); +static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ struct mount_kattr { unsigned int attr_set; @@ -103,6 +105,109 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); +static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) +{ + u64 seq_b = ns->seq; + + if (seq < seq_b) + return -1; + if (seq > seq_b) + return 1; + return 0; +} + +static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); +} + +static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) +{ + struct mnt_namespace *ns_a = node_to_mnt_ns(a); + struct mnt_namespace *ns_b = node_to_mnt_ns(b); + u64 seq_a = ns_a->seq; + + return mnt_ns_cmp(seq_a, ns_b) < 0; +} + +static void mnt_ns_tree_add(struct mnt_namespace *ns) +{ + guard(write_lock)(&mnt_ns_tree_lock); + rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); +} + +static void mnt_ns_release(struct mnt_namespace *ns) +{ + lockdep_assert_not_held(&mnt_ns_tree_lock); + + /* keep alive for {list,stat}mount() */ + if (refcount_dec_and_test(&ns->passive)) { + put_user_ns(ns->user_ns); + kfree(ns); + } +} +DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) + +static void mnt_ns_tree_remove(struct mnt_namespace *ns) +{ + /* remove from global mount namespace list */ + if (!is_anon_ns(ns)) { + guard(write_lock)(&mnt_ns_tree_lock); + rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + } + + mnt_ns_release(ns); +} + +/* + * Returns the mount namespace which either has the specified id, or has the + * next smallest id afer the specified one. + */ +static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) +{ + struct rb_node *node = mnt_ns_tree.rb_node; + struct mnt_namespace *ret = NULL; + + lockdep_assert_held(&mnt_ns_tree_lock); + + while (node) { + struct mnt_namespace *n = node_to_mnt_ns(node); + + if (mnt_ns_id <= n->seq) { + ret = node_to_mnt_ns(node); + if (mnt_ns_id == n->seq) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + return ret; +} + +/* + * Lookup a mount namespace by id and take a passive reference count. Taking a + * passive reference means the mount namespace can be emptied if e.g., the last + * task holding an active reference exits. To access the mounts of the + * namespace the @namespace_sem must first be acquired. If the namespace has + * already shut down before acquiring @namespace_sem, {list,stat}mount() will + * see that the mount rbtree of the namespace is empty. + */ +static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) +{ + struct mnt_namespace *ns; + + guard(read_lock)(&mnt_ns_tree_lock); + ns = mnt_ns_find_id_at(mnt_ns_id); + if (!ns || ns->seq != mnt_ns_id) + return NULL; + + refcount_inc(&ns->passive); + return ns; +} + static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); @@ -3733,8 +3838,7 @@ static void free_mnt_ns(struct mnt_namespace *ns) if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); - put_user_ns(ns->user_ns); - kfree(ns); + mnt_ns_tree_remove(ns); } /* @@ -3773,7 +3877,9 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); + refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; @@ -3850,6 +3956,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } + mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -5205,6 +5312,8 @@ static void __init init_mount_tree(void) set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); + + mnt_ns_tree_add(ns); } void __init mnt_init(void) From 09b31295f833031c88419550172703d45c5401e3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Jun 2024 11:49:47 -0400 Subject: [PATCH 4/8] fs: export the mount ns id via statmount In order to allow users to iterate through children mount namespaces via listmount we need a way for them to know what the ns id for the mount. Add a new field to statmount called mnt_ns_id which will carry the ns id for the given mount entry. Co-developed-by: Christian Brauner Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/6dabf437331fb7415d886f7c64b21cb2a50b1c66.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/namespace.c | 11 +++++++++++ include/uapi/linux/mount.h | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index eebe9d912a71..ed2d9353e4be 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4974,6 +4974,14 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_mnt_ns_id(struct kstatmount *s) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + + s->sm.mask |= STATMOUNT_MNT_NS_ID; + s->sm.mnt_ns_id = ns->seq; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret; @@ -5070,6 +5078,9 @@ static int do_statmount(struct kstatmount *s) if (!err && s->mask & STATMOUNT_MNT_POINT) err = statmount_string(s, STATMOUNT_MNT_POINT); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) + statmount_mnt_ns_id(s); + if (err) return err; diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 88d78de1519f..a07508aee518 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -172,7 +172,8 @@ struct statmount { __u64 propagate_from; /* Propagation from in current namespace */ __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ - __u64 __spare2[50]; + __u64 mnt_ns_id; /* ID of the mount namespace */ + __u64 __spare2[49]; char str[]; /* Variable size part containing strings */ }; @@ -202,6 +203,7 @@ struct mnt_id_req { #define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ #define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ /* * Special @mnt_id values that can be passed to listmount From 0a3deb11858ae8a0b3849b5fda45512ad383f0e1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 24 Jun 2024 11:49:48 -0400 Subject: [PATCH 5/8] fs: Allow listmount() in foreign mount namespace Expand struct mnt_id_req to add an optional mnt_ns_id field. When this field is populated, listmount() will be performed on the specified mount namespace, provided the currently application has CAP_SYS_ADMIN in its user namespace and the mount namespace is a child of the current namespace. Co-developed-by: Josef Bacik Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/49930bdce29a8367a213eb14c1e68e7e49284f86.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/namespace.c | 88 ++++++++++++++++++++++++++++++-------- include/uapi/linux/mount.h | 2 + 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ed2d9353e4be..a54d68f822a8 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5122,7 +5122,7 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, int ret; size_t usize; - BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0); + BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); ret = get_user(usize, &req->size); if (ret) @@ -5140,6 +5140,58 @@ static int copy_mnt_id_req(const struct mnt_id_req __user *req, return 0; } +static struct mount *listmnt_next(struct mount *curr, bool reverse) +{ + struct rb_node *node; + + if (reverse) + node = rb_prev(&curr->mnt_node); + else + node = rb_next(&curr->mnt_node); + + return node_to_mount(node); +} + +static int grab_requested_root(struct mnt_namespace *ns, struct path *root) +{ + struct mount *first; + + rwsem_assert_held(&namespace_sem); + + /* We're looking at our own ns, just use get_fs_root. */ + if (ns == current->nsproxy->mnt_ns) { + get_fs_root(current->fs, root); + return 0; + } + + /* + * We have to find the first mount in our ns and use that, however it + * may not exist, so handle that properly. + */ + if (RB_EMPTY_ROOT(&ns->mounts)) + return -ENOENT; + + first = listmnt_next(ns->root, false); + if (!first) + return -ENOENT; + root->mnt = mntget(&first->mnt); + root->dentry = dget(root->mnt->mnt_root); + return 0; +} + +/* + * If the user requested a specific mount namespace id, look that up and return + * that, or if not simply grab a passive reference on our mount namespace and + * return that. + */ +static struct mnt_namespace *grab_requested_mnt_ns(u64 mnt_ns_id) +{ + if (mnt_ns_id) + return lookup_mnt_ns(mnt_ns_id); + refcount_inc(¤t->nsproxy->mnt_ns->passive); + return current->nsproxy->mnt_ns; +} + SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) @@ -5185,30 +5237,21 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, return ret; } -static struct mount *listmnt_next(struct mount *curr, bool reverse) -{ - struct rb_node *node; - - if (reverse) - node = rb_prev(&curr->mnt_node); - else - node = rb_next(&curr->mnt_node); - - return node_to_mount(node); -} - -static ssize_t do_listmount(u64 mnt_parent_id, u64 last_mnt_id, u64 *mnt_ids, - size_t nr_mnt_ids, bool reverse) +static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, + u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, + bool reverse) { struct path root __free(path_put) = {}; - struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); - get_fs_root(current->fs, &root); + ret = grab_requested_root(ns, &root); + if (ret) + return ret; + if (mnt_parent_id == LSMT_ROOT) { orig = root; } else { @@ -5260,6 +5303,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, { u64 *kmnt_ids __free(kvfree) = NULL; const size_t maxcount = 1000000; + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; ssize_t ret; @@ -5286,8 +5330,16 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (!kmnt_ids) return -ENOMEM; + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + if (!ns) + return -ENOENT; + + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -ENOENT; + scoped_guard(rwsem_read, &namespace_sem) - ret = do_listmount(kreq.mnt_id, kreq.param, kmnt_ids, + ret = do_listmount(ns, kreq.mnt_id, kreq.param, kmnt_ids, nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index a07508aee518..ee1559cd6764 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -189,10 +189,12 @@ struct mnt_id_req { __u32 spare; __u64 mnt_id; __u64 param; + __u64 mnt_ns_id; }; /* List of all mnt_id_req versions. */ #define MNT_ID_REQ_SIZE_VER0 24 /* sizeof first published struct */ +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ /* * @mask bits for statmount(2) From 71aacb4c8c3d19da053363a5fe7538a8af082d56 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 24 Jun 2024 11:49:49 -0400 Subject: [PATCH 6/8] fs: Allow statmount() in foreign mount namespace This patch makes use of the new mnt_ns_id field in struct mnt_id_req to allow users to stat mount entries not in their mount namespace. The rules are the same as listmount(), the user must have CAP_SYS_ADMIN in their user namespace and the target mount namespace must be a child of the current namespace. Co-developed-by: Josef Bacik Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/52a2e17e50ba7aa420bc8bae1d9e88ff593395c1.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/namespace.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a54d68f822a8..e871f73c4c8c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4974,10 +4974,8 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } -static void statmount_mnt_ns_id(struct kstatmount *s) +static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { - struct mnt_namespace *ns = current->nsproxy->mnt_ns; - s->sm.mask |= STATMOUNT_MNT_NS_ID; s->sm.mnt_ns_id = ns->seq; } @@ -5079,7 +5077,7 @@ static int do_statmount(struct kstatmount *s) err = statmount_string(s, STATMOUNT_MNT_POINT); if (!err && s->mask & STATMOUNT_MNT_NS_ID) - statmount_mnt_ns_id(s); + statmount_mnt_ns_id(s, ns); if (err) return err; @@ -5196,6 +5194,7 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, struct statmount __user *, buf, size_t, bufsize, unsigned int, flags) { + struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct vfsmount *mnt; struct mnt_id_req kreq; struct kstatmount ks; @@ -5210,13 +5209,28 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, if (ret) return ret; + ns = grab_requested_mnt_ns(kreq.mnt_ns_id); + if (!ns) + return -ENOENT; + + if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + return -ENOENT; + retry: ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size); if (ret) return ret; down_read(&namespace_sem); - mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns); + /* Has the namespace already been emptied? */ + if (kreq.mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) { + up_read(&namespace_sem); + kvfree(ks.seq.buf); + return -ENOENT; + } + + mnt = lookup_mnt_in_ns(kreq.mnt_id, ns); if (!mnt) { up_read(&namespace_sem); kvfree(ks.seq.buf); @@ -5224,7 +5238,12 @@ SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, } ks.mnt = mnt; - get_fs_root(current->fs, &ks.root); + ret = grab_requested_root(ns, &ks.root); + if (ret) { + up_read(&namespace_sem); + kvfree(ks.seq.buf); + return ret; + } ret = do_statmount(&ks); path_put(&ks.root); up_read(&namespace_sem); From e8e43a1fcc5c07575f37e40f8a2cd78aee46f9a0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Jun 2024 11:49:50 -0400 Subject: [PATCH 7/8] fs: add an ioctl to get the mnt ns id from nsfs In order to utilize the listmount() and statmount() extensions that allow us to call them on different namespaces we need a way to get the mnt namespace id from user space. Add an ioctl to nsfs that will allow us to extract the mnt namespace id in order to make these new extensions usable. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/180449959d5a756af7306d6bda55f41b9d53e3cb.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- fs/nsfs.c | 14 ++++++++++++++ include/uapi/linux/nsfs.h | 2 ++ 2 files changed, 16 insertions(+) diff --git a/fs/nsfs.c b/fs/nsfs.c index 07e22a15ef02..af352dadffe1 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -12,6 +12,7 @@ #include #include +#include "mount.h" #include "internal.h" static struct vfsmount *nsfs_mnt; @@ -143,6 +144,19 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, argp = (uid_t __user *) arg; uid = from_kuid_munged(current_user_ns(), user_ns->owner); return put_user(uid, argp); + case NS_GET_MNTNS_ID: { + struct mnt_namespace *mnt_ns; + __u64 __user *idp; + __u64 id; + + if (ns->ops->type != CLONE_NEWNS) + return -EINVAL; + + mnt_ns = container_of(ns, struct mnt_namespace, ns); + idp = (__u64 __user *)arg; + id = mnt_ns->seq; + return put_user(id, idp); + } default: return -ENOTTY; } diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index a0c8552b64ee..56e8b1639b98 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -15,5 +15,7 @@ #define NS_GET_NSTYPE _IO(NSIO, 0x3) /* Get owner UID (in the caller's user namespace) for a user namespace */ #define NS_GET_OWNER_UID _IO(NSIO, 0x4) +/* Get the id for a mount namespace */ +#define NS_GET_MNTNS_ID _IO(NSIO, 0x5) #endif /* __LINUX_NSFS_H */ From d896f71ce1f2e73813dc6f639eb0cf6f4beefdaa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Jun 2024 11:49:51 -0400 Subject: [PATCH 8/8] selftests: add a test for the foreign mnt ns extensions This tests both statmount and listmount to make sure they work with the extensions that allow us to specify a mount ns to enter in order to find the mount entries. Signed-off-by: Josef Bacik Link: https://lore.kernel.org/r/2d1a35bc9ab94b4656c056c420f25e429e7eb0b1.1719243756.git.josef@toxicpanda.com Signed-off-by: Christian Brauner --- .../selftests/filesystems/statmount/Makefile | 2 +- .../filesystems/statmount/statmount.h | 46 +++ .../filesystems/statmount/statmount_test.c | 53 +-- .../filesystems/statmount/statmount_test_ns.c | 364 ++++++++++++++++++ 4 files changed, 424 insertions(+), 41 deletions(-) create mode 100644 tools/testing/selftests/filesystems/statmount/statmount.h create mode 100644 tools/testing/selftests/filesystems/statmount/statmount_test_ns.c diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile index 07a0d5b545ca..3af3136e35a4 100644 --- a/tools/testing/selftests/filesystems/statmount/Makefile +++ b/tools/testing/selftests/filesystems/statmount/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) -TEST_GEN_PROGS := statmount_test +TEST_GEN_PROGS := statmount_test statmount_test_ns include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/statmount/statmount.h b/tools/testing/selftests/filesystems/statmount/statmount.h new file mode 100644 index 000000000000..f4294bab9d73 --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/statmount.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __STATMOUNT_H +#define __STATMOUNT_H + +#include +#include +#include + +static inline int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask, + struct statmount *buf, size_t bufsize, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = mask, + }; + + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } + + return syscall(__NR_statmount, &req, buf, bufsize, flags); +} + +static ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id, + uint64_t last_mnt_id, uint64_t list[], size_t num, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER0, + .mnt_id = mnt_id, + .param = last_mnt_id, + }; + + if (mnt_ns_id) { + req.size = MNT_ID_REQ_SIZE_VER1; + req.mnt_ns_id = mnt_ns_id; + } + + return syscall(__NR_listmount, &req, list, num, flags); +} + +#endif /* __STATMOUNT_H */ diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test.c b/tools/testing/selftests/filesystems/statmount/statmount_test.c index e6d7c4f1c85b..4f7023c2de77 100644 --- a/tools/testing/selftests/filesystems/statmount/statmount_test.c +++ b/tools/testing/selftests/filesystems/statmount/statmount_test.c @@ -4,17 +4,15 @@ #include #include -#include #include #include #include #include #include #include -#include #include -#include +#include "statmount.h" #include "../../kselftest.h" static const char *const known_fs[] = { @@ -36,18 +34,6 @@ static const char *const known_fs[] = { "ufs", "v7", "vboxsf", "vfat", "virtiofs", "vxfs", "xenfs", "xfs", "zonefs", NULL }; -static int statmount(uint64_t mnt_id, uint64_t mask, struct statmount *buf, - size_t bufsize, unsigned int flags) -{ - struct mnt_id_req req = { - .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, - .param = mask, - }; - - return syscall(__NR_statmount, &req, buf, bufsize, flags); -} - static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigned int flags) { size_t bufsize = 1 << 15; @@ -56,7 +42,7 @@ static struct statmount *statmount_alloc(uint64_t mnt_id, uint64_t mask, unsigne int ret; for (;;) { - ret = statmount(mnt_id, mask, tmp, bufsize, flags); + ret = statmount(mnt_id, 0, mask, tmp, bufsize, flags); if (ret != -1) break; if (tofree) @@ -122,7 +108,6 @@ static int orig_root; static uint64_t root_id, parent_id; static uint32_t old_root_id, old_parent_id; - static void cleanup_namespace(void) { fchdir(orig_root); @@ -138,7 +123,7 @@ static void setup_namespace(void) uid_t uid = getuid(); gid_t gid = getgid(); - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); if (ret == -1) ksft_exit_fail_msg("unsharing mountns and userns: %s\n", strerror(errno)); @@ -208,25 +193,13 @@ static int setup_mount_tree(int log2_num) return 0; } -static ssize_t listmount(uint64_t mnt_id, uint64_t last_mnt_id, - uint64_t list[], size_t num, unsigned int flags) -{ - struct mnt_id_req req = { - .size = MNT_ID_REQ_SIZE_VER0, - .mnt_id = mnt_id, - .param = last_mnt_id, - }; - - return syscall(__NR_listmount, &req, list, num, flags); -} - static void test_listmount_empty_root(void) { ssize_t res; const unsigned int size = 32; uint64_t list[size]; - res = listmount(LSMT_ROOT, 0, list, size, 0); + res = listmount(LSMT_ROOT, 0, 0, list, size, 0); if (res == -1) { ksft_test_result_fail("listmount: %s\n", strerror(errno)); return; @@ -251,7 +224,7 @@ static void test_statmount_zero_mask(void) struct statmount sm; int ret; - ret = statmount(root_id, 0, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, 0, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount zero mask: %s\n", strerror(errno)); @@ -277,7 +250,7 @@ static void test_statmount_mnt_basic(void) int ret; uint64_t mask = STATMOUNT_MNT_BASIC; - ret = statmount(root_id, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount mnt basic: %s\n", strerror(errno)); @@ -337,7 +310,7 @@ static void test_statmount_sb_basic(void) struct statx sx; struct statfs sf; - ret = statmount(root_id, mask, &sm, sizeof(sm), 0); + ret = statmount(root_id, 0, mask, &sm, sizeof(sm), 0); if (ret == -1) { ksft_test_result_fail("statmount sb basic: %s\n", strerror(errno)); @@ -498,14 +471,14 @@ static void test_statmount_string(uint64_t mask, size_t off, const char *name) exactsize = sm->size; shortsize = sizeof(*sm) + i; - ret = statmount(root_id, mask, sm, exactsize, 0); + ret = statmount(root_id, 0, mask, sm, exactsize, 0); if (ret == -1) { ksft_test_result_fail("statmount exact size: %s\n", strerror(errno)); goto out; } errno = 0; - ret = statmount(root_id, mask, sm, shortsize, 0); + ret = statmount(root_id, 0, mask, sm, shortsize, 0); if (ret != -1 || errno != EOVERFLOW) { ksft_test_result_fail("should have failed with EOVERFLOW: %s\n", strerror(errno)); @@ -533,7 +506,7 @@ static void test_listmount_tree(void) if (res == -1) return; - num = res = listmount(LSMT_ROOT, 0, list, size, 0); + num = res = listmount(LSMT_ROOT, 0, 0, list, size, 0); if (res == -1) { ksft_test_result_fail("listmount: %s\n", strerror(errno)); return; @@ -545,7 +518,7 @@ static void test_listmount_tree(void) } for (i = 0; i < size - step;) { - res = listmount(LSMT_ROOT, i ? list2[i - 1] : 0, list2 + i, step, 0); + res = listmount(LSMT_ROOT, 0, i ? list2[i - 1] : 0, list2 + i, step, 0); if (res == -1) ksft_test_result_fail("short listmount: %s\n", strerror(errno)); @@ -577,11 +550,11 @@ int main(void) int ret; uint64_t all_mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC | STATMOUNT_PROPAGATE_FROM | STATMOUNT_MNT_ROOT | - STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE; + STATMOUNT_MNT_POINT | STATMOUNT_FS_TYPE | STATMOUNT_MNT_NS_ID; ksft_print_header(); - ret = statmount(0, 0, NULL, 0, 0); + ret = statmount(0, 0, 0, NULL, 0, 0); assert(ret == -1); if (errno == ENOSYS) ksft_exit_skip("statmount() syscall not supported\n"); diff --git a/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c new file mode 100644 index 000000000000..e044f5fc57fd --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/statmount_test_ns.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "statmount.h" +#include "../../kselftest.h" + +#define NSID_PASS 0 +#define NSID_FAIL 1 +#define NSID_SKIP 2 +#define NSID_ERROR 3 + +static void handle_result(int ret, const char *testname) +{ + if (ret == NSID_PASS) + ksft_test_result_pass("%s\n", testname); + else if (ret == NSID_FAIL) + ksft_test_result_fail("%s\n", testname); + else if (ret == NSID_ERROR) + ksft_exit_fail_msg("%s\n", testname); + else + ksft_test_result_skip("%s\n", testname); +} + +static inline int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + ksft_print_msg("waitpid returned -1, errno=%d\n", errno); + return -1; + } + + if (!WIFEXITED(status)) { + ksft_print_msg( + "waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n", + WIFSIGNALED(status), WTERMSIG(status)); + return -1; + } + + ret = WEXITSTATUS(status); + return ret; +} + +static int get_mnt_ns_id(const char *mnt_ns, uint64_t *mnt_ns_id) +{ + int fd = open(mnt_ns, O_RDONLY); + + if (fd < 0) { + ksft_print_msg("failed to open for ns %s: %s\n", + mnt_ns, strerror(errno)); + sleep(60); + return NSID_ERROR; + } + + if (ioctl(fd, NS_GET_MNTNS_ID, mnt_ns_id) < 0) { + ksft_print_msg("failed to get the nsid for ns %s: %s\n", + mnt_ns, strerror(errno)); + return NSID_ERROR; + } + close(fd); + return NSID_PASS; +} + +static int get_mnt_id(const char *path, uint64_t *mnt_id) +{ + struct statx sx; + int ret; + + ret = statx(AT_FDCWD, path, 0, STATX_MNT_ID_UNIQUE, &sx); + if (ret == -1) { + ksft_print_msg("retrieving unique mount ID for %s: %s\n", path, + strerror(errno)); + return NSID_ERROR; + } + + if (!(sx.stx_mask & STATX_MNT_ID_UNIQUE)) { + ksft_print_msg("no unique mount ID available for %s\n", path); + return NSID_ERROR; + } + + *mnt_id = sx.stx_mnt_id; + return NSID_PASS; +} + +static int write_file(const char *path, const char *val) +{ + int fd = open(path, O_WRONLY); + size_t len = strlen(val); + int ret; + + if (fd == -1) { + ksft_print_msg("opening %s for write: %s\n", path, strerror(errno)); + return NSID_ERROR; + } + + ret = write(fd, val, len); + if (ret == -1) { + ksft_print_msg("writing to %s: %s\n", path, strerror(errno)); + return NSID_ERROR; + } + if (ret != len) { + ksft_print_msg("short write to %s\n", path); + return NSID_ERROR; + } + + ret = close(fd); + if (ret == -1) { + ksft_print_msg("closing %s\n", path); + return NSID_ERROR; + } + + return NSID_PASS; +} + +static int setup_namespace(void) +{ + int ret; + char buf[32]; + uid_t uid = getuid(); + gid_t gid = getgid(); + + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); + if (ret == -1) + ksft_exit_fail_msg("unsharing mountns and userns: %s\n", + strerror(errno)); + + sprintf(buf, "0 %d 1", uid); + ret = write_file("/proc/self/uid_map", buf); + if (ret != NSID_PASS) + return ret; + ret = write_file("/proc/self/setgroups", "deny"); + if (ret != NSID_PASS) + return ret; + sprintf(buf, "0 %d 1", gid); + ret = write_file("/proc/self/gid_map", buf); + if (ret != NSID_PASS) + return ret; + + ret = mount("", "/", NULL, MS_REC|MS_PRIVATE, NULL); + if (ret == -1) { + ksft_print_msg("making mount tree private: %s\n", + strerror(errno)); + return NSID_ERROR; + } + + return NSID_PASS; +} + +static int _test_statmount_mnt_ns_id(void) +{ + struct statmount sm; + uint64_t mnt_ns_id; + uint64_t root_id; + int ret; + + ret = get_mnt_ns_id("/proc/self/ns/mnt", &mnt_ns_id); + if (ret != NSID_PASS) + return ret; + + ret = get_mnt_id("/", &root_id); + if (ret != NSID_PASS) + return ret; + + ret = statmount(root_id, 0, STATMOUNT_MNT_NS_ID, &sm, sizeof(sm), 0); + if (ret == -1) { + ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (sm.size != sizeof(sm)) { + ksft_print_msg("unexpected size: %u != %u\n", sm.size, + (uint32_t)sizeof(sm)); + return NSID_FAIL; + } + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id unavailable\n"); + return NSID_SKIP; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("unexpected mnt ns ID: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + return NSID_FAIL; + } + + return NSID_PASS; +} + +static void test_statmount_mnt_ns_id(void) +{ + pid_t pid; + int ret; + + pid = fork(); + if (pid < 0) + ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno)); + + /* We're the original pid, wait for the result. */ + if (pid != 0) { + ret = wait_for_pid(pid); + handle_result(ret, "test statmount ns id"); + return; + } + + ret = setup_namespace(); + if (ret != NSID_PASS) + exit(ret); + ret = _test_statmount_mnt_ns_id(); + exit(ret); +} + +static int validate_external_listmount(pid_t pid, uint64_t child_nr_mounts) +{ + uint64_t list[256]; + uint64_t mnt_ns_id; + uint64_t nr_mounts; + char buf[256]; + int ret; + + /* Get the mount ns id for our child. */ + snprintf(buf, sizeof(buf), "/proc/%lu/ns/mnt", (unsigned long)pid); + ret = get_mnt_ns_id(buf, &mnt_ns_id); + + nr_mounts = listmount(LSMT_ROOT, mnt_ns_id, 0, list, 256, 0); + if (nr_mounts == (uint64_t)-1) { + ksft_print_msg("listmount: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (nr_mounts != child_nr_mounts) { + ksft_print_msg("listmount results is %zi != %zi\n", nr_mounts, + child_nr_mounts); + return NSID_FAIL; + } + + /* Validate that all of our entries match our mnt_ns_id. */ + for (int i = 0; i < nr_mounts; i++) { + struct statmount sm; + + ret = statmount(list[i], mnt_ns_id, STATMOUNT_MNT_NS_ID, &sm, + sizeof(sm), 0); + if (ret < 0) { + ksft_print_msg("statmount mnt ns id: %s\n", strerror(errno)); + return NSID_ERROR; + } + + if (sm.mask != STATMOUNT_MNT_NS_ID) { + ksft_print_msg("statmount mnt ns id unavailable\n"); + return NSID_SKIP; + } + + if (sm.mnt_ns_id != mnt_ns_id) { + ksft_print_msg("listmount gave us the wrong ns id: 0x%llx != 0x%llx\n", + (unsigned long long)sm.mnt_ns_id, + (unsigned long long)mnt_ns_id); + return NSID_FAIL; + } + } + + return NSID_PASS; +} + +static void test_listmount_ns(void) +{ + uint64_t nr_mounts; + char pval; + int child_ready_pipe[2]; + int parent_ready_pipe[2]; + pid_t pid; + int ret, child_ret; + + if (pipe(child_ready_pipe) < 0) + ksft_exit_fail_msg("failed to create the child pipe: %s\n", + strerror(errno)); + if (pipe(parent_ready_pipe) < 0) + ksft_exit_fail_msg("failed to create the parent pipe: %s\n", + strerror(errno)); + + pid = fork(); + if (pid < 0) + ksft_exit_fail_msg("failed to fork: %s\n", strerror(errno)); + + if (pid == 0) { + char cval; + uint64_t list[256]; + + close(child_ready_pipe[0]); + close(parent_ready_pipe[1]); + + ret = setup_namespace(); + if (ret != NSID_PASS) + exit(ret); + + nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 256, 0); + if (nr_mounts == (uint64_t)-1) { + ksft_print_msg("listmount: %s\n", strerror(errno)); + exit(NSID_FAIL); + } + + /* + * Tell our parent how many mounts we have, and then wait for it + * to tell us we're done. + */ + write(child_ready_pipe[1], &nr_mounts, sizeof(nr_mounts)); + read(parent_ready_pipe[0], &cval, sizeof(cval)); + exit(NSID_PASS); + } + + close(child_ready_pipe[1]); + close(parent_ready_pipe[0]); + + /* Wait until the child has created everything. */ + if (read(child_ready_pipe[0], &nr_mounts, sizeof(nr_mounts)) != + sizeof(nr_mounts)) + ret = NSID_ERROR; + + ret = validate_external_listmount(pid, nr_mounts); + + if (write(parent_ready_pipe[1], &pval, sizeof(pval)) != sizeof(pval)) + ret = NSID_ERROR; + + child_ret = wait_for_pid(pid); + if (child_ret != NSID_PASS) + ret = child_ret; + handle_result(ret, "test listmount ns id"); +} + +int main(void) +{ + int ret; + + ksft_print_header(); + ret = statmount(0, 0, 0, NULL, 0, 0); + assert(ret == -1); + if (errno == ENOSYS) + ksft_exit_skip("statmount() syscall not supported\n"); + + ksft_set_plan(2); + test_statmount_mnt_ns_id(); + test_listmount_ns(); + + if (ksft_get_fail_cnt() + ksft_get_error_cnt() > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +}