From 2d8b01f8b5519ab1b2882c146e5289c95ff0b0ff Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:40 +0100 Subject: [PATCH 01/14] mount: remove inlude/nospec.h include It's not needed, so remove it. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-1-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 23e81c2a1e3f..c3dbe6a7ab6b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "pnode.h" #include "internal.h" From 7e2578cbec19c19ee35cc25f5cf4acc0d2e5fb89 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:41 +0100 Subject: [PATCH 02/14] fs: add mount namespace to rbtree late There's no point doing that under the namespace semaphore it just gives the false impression that it protects the mount namespace rbtree and it simply doesn't. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-2-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index c3dbe6a7ab6b..10fa18dd6601 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3983,7 +3983,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -3991,6 +3990,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); + mnt_ns_tree_add(new_ns); return new_ns; } From b31421b665332df1e517b40c7bef654ccdff5866 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:42 +0100 Subject: [PATCH 03/14] fs: lockless mntns rbtree lookup Currently we use a read-write lock but for the simple search case we can make this lockless. Creating a new mount namespace is a rather rare event compared with querying mounts in a foreign mount namespace. Once this is picked up by e.g., systemd to list mounts in another mount in it's isolated services or in containers this will be used a lot so this seems worthwhile doing. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-3-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/mount.h | 5 ++- fs/namespace.c | 116 +++++++++++++++++++++++++++++-------------------- 2 files changed, 74 insertions(+), 47 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 185fc56afc13..36ead0e45e8a 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -12,7 +12,10 @@ struct mnt_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ - wait_queue_head_t poll; + union { + wait_queue_head_t poll; + struct rcu_head mnt_ns_rcu; + }; u64 event; unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; diff --git a/fs/namespace.c b/fs/namespace.c index 10fa18dd6601..ce4e7399be72 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -79,6 +79,8 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_RWLOCK(mnt_ns_tree_lock); +static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock); + static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ struct mount_kattr { @@ -105,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); -static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns) -{ - u64 seq_b = ns->seq; - - if (seq < seq_b) - return -1; - if (seq > seq_b) - return 1; - return 0; -} - static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) { if (!node) @@ -123,19 +114,41 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node) return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node); } -static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b) +static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) { struct mnt_namespace *ns_a = node_to_mnt_ns(a); struct mnt_namespace *ns_b = node_to_mnt_ns(b); u64 seq_a = ns_a->seq; + u64 seq_b = ns_b->seq; - return mnt_ns_cmp(seq_a, ns_b) < 0; + if (seq_a < seq_b) + return -1; + if (seq_a > seq_b) + return 1; + return 0; +} + +static inline void mnt_ns_tree_write_lock(void) +{ + write_lock(&mnt_ns_tree_lock); + write_seqcount_begin(&mnt_ns_tree_seqcount); +} + +static inline void mnt_ns_tree_write_unlock(void) +{ + write_seqcount_end(&mnt_ns_tree_seqcount); + write_unlock(&mnt_ns_tree_lock); } static void mnt_ns_tree_add(struct mnt_namespace *ns) { - guard(write_lock)(&mnt_ns_tree_lock); - rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less); + struct rb_node *node; + + mnt_ns_tree_write_lock(); + node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); + mnt_ns_tree_write_unlock(); + + WARN_ON_ONCE(node); } static void mnt_ns_release(struct mnt_namespace *ns) @@ -150,41 +163,33 @@ static void mnt_ns_release(struct mnt_namespace *ns) } DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T)) +static void mnt_ns_release_rcu(struct rcu_head *rcu) +{ + mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu)); +} + static void mnt_ns_tree_remove(struct mnt_namespace *ns) { /* remove from global mount namespace list */ if (!is_anon_ns(ns)) { - guard(write_lock)(&mnt_ns_tree_lock); + mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + mnt_ns_tree_write_unlock(); } - mnt_ns_release(ns); + call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu); } -/* - * Returns the mount namespace which either has the specified id, or has the - * next smallest id afer the specified one. - */ -static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) +static int mnt_ns_find(const void *key, const struct rb_node *node) { - struct rb_node *node = mnt_ns_tree.rb_node; - struct mnt_namespace *ret = NULL; + const u64 mnt_ns_id = *(u64 *)key; + const struct mnt_namespace *ns = node_to_mnt_ns(node); - lockdep_assert_held(&mnt_ns_tree_lock); - - while (node) { - struct mnt_namespace *n = node_to_mnt_ns(node); - - if (mnt_ns_id <= n->seq) { - ret = node_to_mnt_ns(node); - if (mnt_ns_id == n->seq) - break; - node = node->rb_left; - } else { - node = node->rb_right; - } - } - return ret; + if (mnt_ns_id < ns->seq) + return -1; + if (mnt_ns_id > ns->seq) + return 1; + return 0; } /* @@ -194,18 +199,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id) * namespace the @namespace_sem must first be acquired. If the namespace has * already shut down before acquiring @namespace_sem, {list,stat}mount() will * see that the mount rbtree of the namespace is empty. + * + * Note the lookup is lockless protected by a sequence counter. We only + * need to guard against false negatives as false positives aren't + * possible. So if we didn't find a mount namespace and the sequence + * counter has changed we need to retry. If the sequence counter is + * still the same we know the search actually failed. */ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) { - struct mnt_namespace *ns; + struct mnt_namespace *ns; + struct rb_node *node; + unsigned int seq; - guard(read_lock)(&mnt_ns_tree_lock); - ns = mnt_ns_find_id_at(mnt_ns_id); - if (!ns || ns->seq != mnt_ns_id) - return NULL; + guard(rcu)(); + do { + seq = read_seqcount_begin(&mnt_ns_tree_seqcount); + node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); + if (node) + break; + } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq)); - refcount_inc(&ns->passive); - return ns; + if (!node) + return NULL; + + /* + * The last reference count is put with RCU delay so we can + * unconditonally acquire a reference here. + */ + ns = node_to_mnt_ns(node); + refcount_inc(&ns->passive); + return ns; } static inline void lock_mount_hash(void) From eb7489b34f4cd273928c982c1d12af37f9e0132c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:43 +0100 Subject: [PATCH 04/14] rculist: add list_bidir_{del,prev}_rcu() Currently there is no primitive for retrieving the previous list member. To do this we need a new deletion primitive that doesn't poison the prev pointer and a corresponding retrieval helper. Note that it is not valid to ues both list_del_rcu() and list_bidir_del_rcu() on the same list. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-4-6e3cdaf9b280@kernel.org Reviewed-by: Paul E. McKenney Reviewed-by: Jeff Layton Suggested-by: Paul E. McKenney Signed-off-by: Christian Brauner --- include/linux/rculist.h | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 14dfa6008467..1b11926ddd47 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) +/* + * Return the ->prev pointer of a list_head in an rcu safe way. Don't + * access it directly. + * + * Any list traversed with list_bidir_prev_rcu() must never use + * list_del_rcu(). Doing so will poison the ->prev pointer that + * list_bidir_prev_rcu() relies on, which will result in segfaults. + * To prevent these segfaults, use list_bidir_del_rcu() instead + * of list_del_rcu(). + */ +#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev))) /** * list_tail_rcu - returns the prev pointer of the head of the list @@ -158,6 +169,39 @@ static inline void list_del_rcu(struct list_head *entry) entry->prev = LIST_POISON2; } +/** + * list_bidir_del_rcu - deletes entry from list without re-initialization + * @entry: the element to delete from the list. + * + * In contrast to list_del_rcu() doesn't poison the prev pointer thus + * allowing backwards traversal via list_bidir_prev_rcu(). + * + * Note: list_empty() on entry does not return true after this because + * the entry is in a special undefined state that permits RCU-based + * lockfree reverse traversal. In particular this means that we can not + * poison the forward and backwards pointers that may still be used for + * walking the list. + * + * The caller must take whatever precautions are necessary (such as + * holding appropriate locks) to avoid racing with another list-mutation + * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on + * this same list. However, it is perfectly legal to run concurrently + * with the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + * + * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on + * the same list. + * + * Note that the caller is not permitted to immediately free + * the newly deleted entry. Instead, either synchronize_rcu() + * or call_rcu() must be used to defer freeing until an RCU + * grace period has elapsed. + */ +static inline void list_bidir_del_rcu(struct list_head *entry) +{ + __list_del_entry(entry); +} + /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. From 208fc3fb60e67aa6c68ce961efa92837c114667c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:44 +0100 Subject: [PATCH 05/14] fs: lockless mntns lookup for nsfs We already made the rbtree lookup lockless for the simple lookup case. However, walking the list of mount namespaces via nsfs still happens with taking the read lock blocking concurrent additions of new mount namespaces pointlessly. Plus, such additions are rare anyway so allow lockless lookup of the previous and next mount namespace by keeping a separate list. This also allows to make some things simpler in the code. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-5-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Suggested-by: Peter Zijlstra Signed-off-by: Christian Brauner --- fs/mount.h | 13 ++++--------- fs/namespace.c | 42 +++++++++++++++++++++++++++++------------- fs/nsfs.c | 5 +---- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 36ead0e45e8a..8cda387f47c5 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -20,6 +20,7 @@ struct mnt_namespace { unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ + struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ } __randomize_layout; @@ -157,15 +158,9 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) } bool has_locked_children(struct mount *mnt, struct dentry *dentry); -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous); -static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, false); -} -static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns) -{ - return __lookup_next_mnt_ns(mntns, true); -} +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns, + bool previous); + static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); diff --git a/fs/namespace.c b/fs/namespace.c index ce4e7399be72..751dc14debb3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -82,6 +82,7 @@ static DEFINE_RWLOCK(mnt_ns_tree_lock); static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock); static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ +static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ struct mount_kattr { unsigned int attr_set; @@ -142,10 +143,19 @@ static inline void mnt_ns_tree_write_unlock(void) static void mnt_ns_tree_add(struct mnt_namespace *ns) { - struct rb_node *node; + struct rb_node *node, *prev; mnt_ns_tree_write_lock(); node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp); + /* + * If there's no previous entry simply add it after the + * head and if there is add it after the previous entry. + */ + prev = rb_prev(&ns->mnt_ns_tree_node); + if (!prev) + list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list); + else + list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list); mnt_ns_tree_write_unlock(); WARN_ON_ONCE(node); @@ -174,6 +184,7 @@ static void mnt_ns_tree_remove(struct mnt_namespace *ns) if (!is_anon_ns(ns)) { mnt_ns_tree_write_lock(); rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + list_bidir_del_rcu(&ns->mnt_ns_list); mnt_ns_tree_write_unlock(); } @@ -2088,30 +2099,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) return &mnt->ns; } -struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous) +struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) { - guard(read_lock)(&mnt_ns_tree_lock); + guard(rcu)(); + for (;;) { - struct rb_node *node; + struct list_head *list; if (previous) - node = rb_prev(&mntns->mnt_ns_tree_node); + list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list)); else - node = rb_next(&mntns->mnt_ns_tree_node); - if (!node) + list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list)); + if (list_is_head(list, &mnt_ns_list)) return ERR_PTR(-ENOENT); - mntns = node_to_mnt_ns(node); - node = &mntns->mnt_ns_tree_node; + mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list); + /* + * The last passive reference count is put with RCU + * delay so accessing the mount namespace is not just + * safe but all relevant members are still valid. + */ if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) continue; /* - * Holding mnt_ns_tree_lock prevents the mount namespace from - * being freed but it may well be on it's deathbed. We want an - * active reference, not just a passive one here as we're - * persisting the mount namespace. + * We need an active reference count as we're persisting + * the mount namespace and it might already be on its + * deathbed. */ if (!refcount_inc_not_zero(&mntns->ns.count)) continue; @@ -3928,6 +3943,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + INIT_LIST_HEAD(&new_ns->mnt_ns_list); RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); diff --git a/fs/nsfs.c b/fs/nsfs.c index c675fc40ce2d..663f8656158d 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl, if (usize < MNT_NS_INFO_SIZE_VER0) return -EINVAL; - if (previous) - mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns)); - else - mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns)); + mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous); if (IS_ERR(mnt_ns)) return PTR_ERR(mnt_ns); From 63ef7e34943373aebaf19c9646a2be91323cb1e8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:45 +0100 Subject: [PATCH 06/14] fs: simplify rwlock to spinlock We're not taking the read_lock() anymore now that all lookup is lockless. Just use a simple spinlock. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-6-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 751dc14debb3..d0e6551b36e2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -78,8 +78,7 @@ static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ -static DEFINE_RWLOCK(mnt_ns_tree_lock); -static seqcount_rwlock_t mnt_ns_tree_seqcount = SEQCNT_RWLOCK_ZERO(mnt_ns_tree_seqcount, &mnt_ns_tree_lock); +static DEFINE_SEQLOCK(mnt_ns_tree_lock); static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ @@ -131,14 +130,12 @@ static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b) static inline void mnt_ns_tree_write_lock(void) { - write_lock(&mnt_ns_tree_lock); - write_seqcount_begin(&mnt_ns_tree_seqcount); + write_seqlock(&mnt_ns_tree_lock); } static inline void mnt_ns_tree_write_unlock(void) { - write_seqcount_end(&mnt_ns_tree_seqcount); - write_unlock(&mnt_ns_tree_lock); + write_sequnlock(&mnt_ns_tree_lock); } static void mnt_ns_tree_add(struct mnt_namespace *ns) @@ -163,7 +160,7 @@ static void mnt_ns_tree_add(struct mnt_namespace *ns) static void mnt_ns_release(struct mnt_namespace *ns) { - lockdep_assert_not_held(&mnt_ns_tree_lock); + lockdep_assert_not_held(&mnt_ns_tree_lock.lock); /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { @@ -225,11 +222,11 @@ static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id) guard(rcu)(); do { - seq = read_seqcount_begin(&mnt_ns_tree_seqcount); + seq = read_seqbegin(&mnt_ns_tree_lock); node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find); if (node) break; - } while (read_seqcount_retry(&mnt_ns_tree_seqcount, seq)); + } while (read_seqretry(&mnt_ns_tree_lock, seq)); if (!node) return NULL; From 2598fc06474e031fe26870493abc22e5dea857dc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:46 +0100 Subject: [PATCH 07/14] seltests: move nsfs into filesystems subfolder I'm going to be adding new tests for it and it belongs under filesystem selftests. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-7-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- tools/testing/selftests/{ => filesystems}/nsfs/.gitignore | 0 tools/testing/selftests/{ => filesystems}/nsfs/Makefile | 2 +- tools/testing/selftests/{ => filesystems}/nsfs/config | 0 tools/testing/selftests/{ => filesystems}/nsfs/owner.c | 0 tools/testing/selftests/{ => filesystems}/nsfs/pidns.c | 0 5 files changed, 1 insertion(+), 1 deletion(-) rename tools/testing/selftests/{ => filesystems}/nsfs/.gitignore (100%) rename tools/testing/selftests/{ => filesystems}/nsfs/Makefile (82%) rename tools/testing/selftests/{ => filesystems}/nsfs/config (100%) rename tools/testing/selftests/{ => filesystems}/nsfs/owner.c (100%) rename tools/testing/selftests/{ => filesystems}/nsfs/pidns.c (100%) diff --git a/tools/testing/selftests/nsfs/.gitignore b/tools/testing/selftests/filesystems/nsfs/.gitignore similarity index 100% rename from tools/testing/selftests/nsfs/.gitignore rename to tools/testing/selftests/filesystems/nsfs/.gitignore diff --git a/tools/testing/selftests/nsfs/Makefile b/tools/testing/selftests/filesystems/nsfs/Makefile similarity index 82% rename from tools/testing/selftests/nsfs/Makefile rename to tools/testing/selftests/filesystems/nsfs/Makefile index dd9bd50b7b93..c2f3ca6e488e 100644 --- a/tools/testing/selftests/nsfs/Makefile +++ b/tools/testing/selftests/filesystems/nsfs/Makefile @@ -3,4 +3,4 @@ TEST_GEN_PROGS := owner pidns CFLAGS := -Wall -Werror -include ../lib.mk +include ../../lib.mk diff --git a/tools/testing/selftests/nsfs/config b/tools/testing/selftests/filesystems/nsfs/config similarity index 100% rename from tools/testing/selftests/nsfs/config rename to tools/testing/selftests/filesystems/nsfs/config diff --git a/tools/testing/selftests/nsfs/owner.c b/tools/testing/selftests/filesystems/nsfs/owner.c similarity index 100% rename from tools/testing/selftests/nsfs/owner.c rename to tools/testing/selftests/filesystems/nsfs/owner.c diff --git a/tools/testing/selftests/nsfs/pidns.c b/tools/testing/selftests/filesystems/nsfs/pidns.c similarity index 100% rename from tools/testing/selftests/nsfs/pidns.c rename to tools/testing/selftests/filesystems/nsfs/pidns.c From 3e02f80793795822903a5c24de88fc9e4f08fa9e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:47 +0100 Subject: [PATCH 08/14] selftests: add tests for mntns iteration Test that forward and backward iteration works correctly. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-8-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- .../selftests/filesystems/nsfs/.gitignore | 1 + .../selftests/filesystems/nsfs/Makefile | 2 +- .../filesystems/nsfs/iterate_mntns.c | 149 ++++++++++++++++++ 3 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/filesystems/nsfs/iterate_mntns.c diff --git a/tools/testing/selftests/filesystems/nsfs/.gitignore b/tools/testing/selftests/filesystems/nsfs/.gitignore index ed79ebdf286e..92a8249006d1 100644 --- a/tools/testing/selftests/filesystems/nsfs/.gitignore +++ b/tools/testing/selftests/filesystems/nsfs/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only owner pidns +iterate_mntns diff --git a/tools/testing/selftests/filesystems/nsfs/Makefile b/tools/testing/selftests/filesystems/nsfs/Makefile index c2f3ca6e488e..231aaa7dfd95 100644 --- a/tools/testing/selftests/filesystems/nsfs/Makefile +++ b/tools/testing/selftests/filesystems/nsfs/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -TEST_GEN_PROGS := owner pidns +TEST_GEN_PROGS := owner pidns iterate_mntns CFLAGS := -Wall -Werror diff --git a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c new file mode 100644 index 000000000000..457cf76f3c5f --- /dev/null +++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "../../kselftest_harness.h" + +#define MNT_NS_COUNT 11 +#define MNT_NS_LAST_INDEX 10 + +struct mnt_ns_info { + __u32 size; + __u32 nr_mounts; + __u64 mnt_ns_id; +}; + +#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */ + +/* Get information about namespace. */ +#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info) +/* Get next namespace. */ +#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info) +/* Get previous namespace. */ +#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info) + +FIXTURE(iterate_mount_namespaces) { + int fd_mnt_ns[MNT_NS_COUNT]; + __u64 mnt_ns_id[MNT_NS_COUNT]; +}; + +FIXTURE_SETUP(iterate_mount_namespaces) +{ + for (int i = 0; i < MNT_NS_COUNT; i++) + self->fd_mnt_ns[i] = -EBADF; + + /* + * Creating a new user namespace let's us guarantee that we only see + * mount namespaces that we did actually create. + */ + ASSERT_EQ(unshare(CLONE_NEWUSER), 0); + + for (int i = 0; i < MNT_NS_COUNT; i++) { + struct mnt_ns_info info = {}; + + ASSERT_EQ(unshare(CLONE_NEWNS), 0); + self->fd_mnt_ns[i] = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); + ASSERT_GE(self->fd_mnt_ns[i], 0); + ASSERT_EQ(ioctl(self->fd_mnt_ns[i], NS_MNT_GET_INFO, &info), 0); + self->mnt_ns_id[i] = info.mnt_ns_id; + } +} + +FIXTURE_TEARDOWN(iterate_mount_namespaces) +{ + for (int i = 0; i < MNT_NS_COUNT; i++) { + if (self->fd_mnt_ns[i] < 0) + continue; + ASSERT_EQ(close(self->fd_mnt_ns[i]), 0); + } +} + +TEST_F(iterate_mount_namespaces, iterate_all_forward) +{ + int fd_mnt_ns_cur, count = 0; + + fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC); + ASSERT_GE(fd_mnt_ns_cur, 0); + + for (;; count++) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_next; + + fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info); + if (fd_mnt_ns_next < 0 && errno == ENOENT) + break; + ASSERT_GE(fd_mnt_ns_next, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_next; + } + ASSERT_EQ(count, MNT_NS_LAST_INDEX); +} + +TEST_F(iterate_mount_namespaces, iterate_all_backwards) +{ + int fd_mnt_ns_cur, count = 0; + + fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC); + ASSERT_GE(fd_mnt_ns_cur, 0); + + for (;; count++) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_prev; + + fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info); + if (fd_mnt_ns_prev < 0 && errno == ENOENT) + break; + ASSERT_GE(fd_mnt_ns_prev, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_prev; + } + ASSERT_EQ(count, MNT_NS_LAST_INDEX); +} + +TEST_F(iterate_mount_namespaces, iterate_forward) +{ + int fd_mnt_ns_cur; + + ASSERT_EQ(setns(self->fd_mnt_ns[0], CLONE_NEWNS), 0); + + fd_mnt_ns_cur = self->fd_mnt_ns[0]; + for (int i = 1; i < MNT_NS_COUNT; i++) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_next; + + fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info); + ASSERT_GE(fd_mnt_ns_next, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_next; + ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); + } +} + +TEST_F(iterate_mount_namespaces, iterate_backward) +{ + int fd_mnt_ns_cur; + + ASSERT_EQ(setns(self->fd_mnt_ns[MNT_NS_LAST_INDEX], CLONE_NEWNS), 0); + + fd_mnt_ns_cur = self->fd_mnt_ns[MNT_NS_LAST_INDEX]; + for (int i = MNT_NS_LAST_INDEX - 1; i >= 0; i--) { + struct mnt_ns_info info = {}; + int fd_mnt_ns_prev; + + fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info); + ASSERT_GE(fd_mnt_ns_prev, 0); + ASSERT_EQ(close(fd_mnt_ns_cur), 0); + fd_mnt_ns_cur = fd_mnt_ns_prev; + ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]); + } +} + +TEST_HARNESS_MAIN From 27765562f66aa43bceaba7006358a573435e2de4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:48 +0100 Subject: [PATCH 09/14] selftests: remove unneeded include The pidfd header will be included in a sample program and this pulls in all the mount definitions that would be causing problems. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-9-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/pidfd.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h index 88d6830ee004..3a96053e52e7 100644 --- a/tools/testing/selftests/pidfd/pidfd.h +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include From 6176522551a2d3c89788a0c47c3868bdb95e2431 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 13 Dec 2024 00:03:49 +0100 Subject: [PATCH 10/14] samples: add test-list-all-mounts Add a sample program illustrating how to list all mounts in all mount namespaces. Link: https://lore.kernel.org/r/20241213-work-mount-rbtree-lockless-v3-10-6e3cdaf9b280@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- samples/vfs/.gitignore | 1 + samples/vfs/Makefile | 2 +- samples/vfs/test-list-all-mounts.c | 235 +++++++++++++++++++++++++++++ 3 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 samples/vfs/test-list-all-mounts.c diff --git a/samples/vfs/.gitignore b/samples/vfs/.gitignore index 79212d91285b..8694dd17b318 100644 --- a/samples/vfs/.gitignore +++ b/samples/vfs/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only /test-fsmount +/test-list-all-mounts /test-statx diff --git a/samples/vfs/Makefile b/samples/vfs/Makefile index 6377a678134a..301be72a52a0 100644 --- a/samples/vfs/Makefile +++ b/samples/vfs/Makefile @@ -1,4 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -userprogs-always-y += test-fsmount test-statx +userprogs-always-y += test-fsmount test-statx test-list-all-mounts userccflags += -I usr/include diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c new file mode 100644 index 000000000000..f372d5aea471 --- /dev/null +++ b/samples/vfs/test-list-all-mounts.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include "../../tools/testing/selftests/pidfd/pidfd.h" + +#define die_errno(format, ...) \ + do { \ + fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \ + __LINE__, __func__, ##__VA_ARGS__); \ + exit(EXIT_FAILURE); \ + } while (0) + +/* Get the id for a mount namespace */ +#define NS_GET_MNTNS_ID _IO(0xb7, 0x5) +/* Get next mount namespace. */ + +struct mnt_ns_info { + __u32 size; + __u32 nr_mounts; + __u64 mnt_ns_id; +}; + +#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */ + +/* Get information about namespace. */ +#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info) +/* Get next namespace. */ +#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info) +/* Get previous namespace. */ +#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info) + +#define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3) + +#ifndef __NR_listmount +#define __NR_listmount 458 +#endif + +#ifndef __NR_statmount +#define __NR_statmount 457 +#endif + +/* @mask bits for statmount(2) */ +#define STATMOUNT_SB_BASIC 0x00000001U /* Want/got sb_... */ +#define STATMOUNT_MNT_BASIC 0x00000002U /* Want/got mnt_... */ +#define STATMOUNT_PROPAGATE_FROM 0x00000004U /* Want/got propagate_from */ +#define STATMOUNT_MNT_ROOT 0x00000008U /* Want/got mnt_root */ +#define STATMOUNT_MNT_POINT 0x00000010U /* Want/got mnt_point */ +#define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ +#define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ +#define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ + +#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */ + +struct statmount { + __u32 size; + __u32 mnt_opts; + __u64 mask; + __u32 sb_dev_major; + __u32 sb_dev_minor; + __u64 sb_magic; + __u32 sb_flags; + __u32 fs_type; + __u64 mnt_id; + __u64 mnt_parent_id; + __u32 mnt_id_old; + __u32 mnt_parent_id_old; + __u64 mnt_attr; + __u64 mnt_propagation; + __u64 mnt_peer_group; + __u64 mnt_master; + __u64 propagate_from; + __u32 mnt_root; + __u32 mnt_point; + __u64 mnt_ns_id; + __u64 __spare2[49]; + char str[]; +}; + +struct mnt_id_req { + __u32 size; + __u32 spare; + __u64 mnt_id; + __u64 param; + __u64 mnt_ns_id; +}; + +#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */ + +#define LSMT_ROOT 0xffffffffffffffff /* root mount */ + +static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask, + struct statmount *stmnt, size_t bufsize, + unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER1, + .mnt_id = mnt_id, + .param = mask, + .mnt_ns_id = mnt_ns_id, + }; + + return syscall(__NR_statmount, &req, stmnt, bufsize, flags); +} + +static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id, + __u64 mask, unsigned int flags) +{ + size_t bufsize = 1 << 15; + struct statmount *stmnt = NULL, *tmp = NULL; + int ret; + + for (;;) { + tmp = realloc(stmnt, bufsize); + if (!tmp) + goto out; + + stmnt = tmp; + ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt, bufsize, flags); + if (!ret) + return stmnt; + + if (errno != EOVERFLOW) + goto out; + + bufsize <<= 1; + if (bufsize >= UINT_MAX / 2) + goto out; + } + +out: + free(stmnt); + return NULL; +} + +static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id, + __u64 list[], size_t num, unsigned int flags) +{ + struct mnt_id_req req = { + .size = MNT_ID_REQ_SIZE_VER1, + .mnt_id = mnt_id, + .param = last_mnt_id, + .mnt_ns_id = mnt_ns_id, + }; + + return syscall(__NR_listmount, &req, list, num, flags); +} + +int main(int argc, char *argv[]) +{ +#define LISTMNT_BUFFER 10 + __u64 list[LISTMNT_BUFFER], last_mnt_id = 0; + int ret, pidfd, fd_mntns; + struct mnt_ns_info info = {}; + + pidfd = sys_pidfd_open(getpid(), 0); + if (pidfd < 0) + die_errno("pidfd_open failed"); + + fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0); + if (fd_mntns < 0) + die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed"); + + ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info); + if (ret < 0) + die_errno("ioctl(NS_GET_MNTNS_ID) failed"); + + printf("Listing %u mounts for mount namespace %llu\n", + info.nr_mounts, info.mnt_ns_id); + for (;;) { + ssize_t nr_mounts; +next: + nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id, + info.mnt_ns_id, list, LISTMNT_BUFFER, + 0); + if (nr_mounts <= 0) { + int fd_mntns_next; + + printf("Finished listing %u mounts for mount namespace %llu\n\n", + info.nr_mounts, info.mnt_ns_id); + fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info); + if (fd_mntns_next < 0) { + if (errno == ENOENT) { + printf("Finished listing all mount namespaces\n"); + exit(0); + } + die_errno("ioctl(NS_MNT_GET_NEXT) failed"); + } + close(fd_mntns); + fd_mntns = fd_mntns_next; + last_mnt_id = 0; + printf("Listing %u mounts for mount namespace %llu\n", + info.nr_mounts, info.mnt_ns_id); + goto next; + } + + for (size_t cur = 0; cur < nr_mounts; cur++) { + struct statmount *stmnt; + + last_mnt_id = list[cur]; + + stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id, + STATMOUNT_SB_BASIC | + STATMOUNT_MNT_BASIC | + STATMOUNT_MNT_ROOT | + STATMOUNT_MNT_POINT | + STATMOUNT_MNT_NS_ID | + STATMOUNT_MNT_OPTS | + STATMOUNT_FS_TYPE, 0); + if (!stmnt) { + printf("Failed to statmount(%llu) in mount namespace(%llu)\n", + last_mnt_id, info.mnt_ns_id); + continue; + } + + printf("mnt_id:\t\t%llu\nmnt_parent_id:\t%llu\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n", + stmnt->mnt_id, + stmnt->mnt_parent_id, + stmnt->str + stmnt->fs_type, + stmnt->str + stmnt->mnt_root, + stmnt->str + stmnt->mnt_point, + stmnt->str + stmnt->mnt_opts); + free(stmnt); + } + } + + exit(0); +} From 5a4e727d1ac1210c387cc32a126ef63cbb589702 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:05 +0100 Subject: [PATCH 11/14] fs: kill MNT_ONRB Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist instead of keeping it with mnt->mnt_list. This allows us to use RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB. Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org Signed-off-by: Christian Brauner --- fs/mount.h | 15 +++++++++------ fs/namespace.c | 14 ++++++-------- include/linux/mount.h | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index 8cda387f47c5..e9f48e563c0f 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -42,6 +42,7 @@ struct mount { struct dentry *mnt_mountpoint; struct vfsmount mnt; union { + struct rb_node mnt_node; /* node in the ns->mounts rbtree */ struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; @@ -55,10 +56,7 @@ struct mount { struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ - union { - struct rb_node mnt_node; /* Under ns->mounts */ - struct list_head mnt_list; - }; + struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ @@ -149,11 +147,16 @@ static inline bool is_anon_ns(struct mnt_namespace *ns) return ns->seq == 0; } +static inline bool mnt_ns_attached(const struct mount *mnt) +{ + return !RB_EMPTY_NODE(&mnt->mnt_node); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { - WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB)); - mnt->mnt.mnt_flags &= ~MNT_ONRB; + WARN_ON(!mnt_ns_attached(mnt)); rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } diff --git a/fs/namespace.c b/fs/namespace.c index d0e6551b36e2..e37927b76814 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -375,6 +375,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + RB_CLEAR_NODE(&mnt->mnt_node); mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -1155,7 +1156,7 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; - WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB); + WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; @@ -1166,7 +1167,6 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) } rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); - mnt->mnt.mnt_flags |= MNT_ONRB; } /* @@ -1336,7 +1336,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, } mnt->mnt.mnt_flags = old->mnt.mnt_flags; - mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB); + mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); @@ -1794,7 +1794,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; - if (p->mnt.mnt_flags & MNT_ONRB) + if (mnt_ns_attached(p)) move_from_ns(p, &tmp_list); else list_move(&p->mnt_list, &tmp_list); @@ -1943,16 +1943,14 @@ static int do_umount(struct mount *mnt, int flags) event++; if (flags & MNT_DETACH) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { - if (mnt->mnt.mnt_flags & MNT_ONRB || - !list_empty(&mnt->mnt_list)) + if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } diff --git a/include/linux/mount.h b/include/linux/mount.h index c34c18b4e8f3..04213d8ef837 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,7 @@ struct path; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 @@ -64,7 +64,6 @@ struct path; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 -#define MNT_ONRB 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ From b782c5efe79d670aaeb3576a6a673b983ffd23b5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:06 +0100 Subject: [PATCH 12/14] fs: cache first and last mount Speed up listmount() by caching the first and last node making retrieval of the first and last mount of each mount namespace O(1). Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-2-fd55922c4af8@kernel.org Signed-off-by: Christian Brauner --- fs/mount.h | 13 +++++++++++-- fs/namespace.c | 17 +++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index e9f48e563c0f..ffb613cdfeee 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -8,7 +8,11 @@ struct mnt_namespace { struct ns_common ns; struct mount * root; - struct rb_root mounts; /* Protected by namespace_sem */ + struct { + struct rb_root mounts; /* Protected by namespace_sem */ + struct rb_node *mnt_last_node; /* last (rightmost) mount in the rbtree */ + struct rb_node *mnt_first_node; /* first (leftmost) mount in the rbtree */ + }; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ @@ -154,8 +158,13 @@ static inline bool mnt_ns_attached(const struct mount *mnt) static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { + struct mnt_namespace *ns = mnt->mnt_ns; WARN_ON(!mnt_ns_attached(mnt)); - rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts); + if (ns->mnt_last_node == &mnt->mnt_node) + ns->mnt_last_node = rb_prev(&mnt->mnt_node); + if (ns->mnt_first_node == &mnt->mnt_node) + ns->mnt_first_node = rb_next(&mnt->mnt_node); + rb_erase(&mnt->mnt_node, &ns->mounts); RB_CLEAR_NODE(&mnt->mnt_node); list_add_tail(&mnt->mnt_list, dt_list); } diff --git a/fs/namespace.c b/fs/namespace.c index e37927b76814..4dd3137b37e2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1155,16 +1155,25 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) { struct rb_node **link = &ns->mounts.rb_node; struct rb_node *parent = NULL; + bool mnt_first_node = true, mnt_last_node = true; WARN_ON(mnt_ns_attached(mnt)); mnt->mnt_ns = ns; while (*link) { parent = *link; - if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) + if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) { link = &parent->rb_left; - else + mnt_last_node = false; + } else { link = &parent->rb_right; + mnt_first_node = false; + } } + + if (mnt_last_node) + ns->mnt_last_node = &mnt->mnt_node; + if (mnt_first_node) + ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); } @@ -5559,9 +5568,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, if (!last_mnt_id) { if (reverse) - first = node_to_mount(rb_last(&ns->mounts)); + first = node_to_mount(ns->mnt_last_node); else - first = node_to_mount(rb_first(&ns->mounts)); + first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); From f2ca2268a8bc97c5627e20b2611e99332d594a47 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 15 Dec 2024 21:17:07 +0100 Subject: [PATCH 13/14] selftests: add listmount() iteration tests Add a forward and backward iteration test for listmount(). Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-3-fd55922c4af8@kernel.org Signed-off-by: Christian Brauner --- .../selftests/filesystems/statmount/Makefile | 2 +- .../filesystems/statmount/listmount_test.c | 66 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/filesystems/statmount/listmount_test.c diff --git a/tools/testing/selftests/filesystems/statmount/Makefile b/tools/testing/selftests/filesystems/statmount/Makefile index 3af3136e35a4..14ee91a41650 100644 --- a/tools/testing/selftests/filesystems/statmount/Makefile +++ b/tools/testing/selftests/filesystems/statmount/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-or-later CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES) -TEST_GEN_PROGS := statmount_test statmount_test_ns +TEST_GEN_PROGS := statmount_test statmount_test_ns listmount_test include ../../lib.mk diff --git a/tools/testing/selftests/filesystems/statmount/listmount_test.c b/tools/testing/selftests/filesystems/statmount/listmount_test.c new file mode 100644 index 000000000000..15f0834f7557 --- /dev/null +++ b/tools/testing/selftests/filesystems/statmount/listmount_test.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +// Copyright (c) 2024 Christian Brauner + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include "statmount.h" +#include "../../kselftest_harness.h" + +#ifndef LISTMOUNT_REVERSE +#define LISTMOUNT_REVERSE (1 << 0) /* List later mounts first */ +#endif + +#define LISTMNT_BUFFER 10 + +/* Check that all mount ids are in increasing order. */ +TEST(listmount_forward) +{ + uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0; + + for (;;) { + ssize_t nr_mounts; + + nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id, + list, LISTMNT_BUFFER, 0); + ASSERT_GE(nr_mounts, 0); + if (nr_mounts == 0) + break; + + for (size_t cur = 0; cur < nr_mounts; cur++) { + if (cur < nr_mounts - 1) + ASSERT_LT(list[cur], list[cur + 1]); + last_mnt_id = list[cur]; + } + } +} + +/* Check that all mount ids are in decreasing order. */ +TEST(listmount_backward) +{ + uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0; + + for (;;) { + ssize_t nr_mounts; + + nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id, + list, LISTMNT_BUFFER, LISTMOUNT_REVERSE); + ASSERT_GE(nr_mounts, 0); + if (nr_mounts == 0) + break; + + for (size_t cur = 0; cur < nr_mounts; cur++) { + if (cur < nr_mounts - 1) + ASSERT_GT(list[cur], list[cur + 1]); + last_mnt_id = list[cur]; + } + } +} + +TEST_HARNESS_MAIN From 578eb3b6a9ad15f4a8269047b4bb36f758c0a236 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 17 Dec 2024 13:21:55 +0100 Subject: [PATCH 14/14] fs: use xarray for old mount id While the ida does use the xarray internally we can use it explicitly which allows us to increment the unique mount id under the xa lock. This allows us to remove the atomic as we're now allocating both ids in one go. Link: https://lore.kernel.org/r/20241217-erhielten-regung-44bb1604ca8f@brauner Signed-off-by: Christian Brauner --- fs/namespace.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 4dd3137b37e2..a80786e821c3 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -65,12 +65,12 @@ static int __init set_mphash_entries(char *str) __setup("mphash_entries=", set_mphash_entries); static u64 event; -static DEFINE_IDA(mnt_id_ida); +static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); /* Don't allow confusion with old 32bit mount ID */ #define MNT_UNIQUE_ID_OFFSET (1ULL << 31) -static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET); +static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET; static struct hlist_head *mount_hashtable __ro_after_init; static struct hlist_head *mountpoint_hashtable __ro_after_init; @@ -267,18 +267,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) static int mnt_alloc_id(struct mount *mnt) { - int res = ida_alloc(&mnt_id_ida, GFP_KERNEL); + int res; - if (res < 0) - return res; - mnt->mnt_id = res; - mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr); - return 0; + xa_lock(&mnt_id_xa); + res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL); + if (!res) + mnt->mnt_id_unique = ++mnt_id_ctr; + xa_unlock(&mnt_id_xa); + return res; } static void mnt_free_id(struct mount *mnt) { - ida_free(&mnt_id_ida, mnt->mnt_id); + xa_erase(&mnt_id_xa, mnt->mnt_id); } /*