Merge branch 'vfs-6.14.mount' into vfs.all

2024-12-28 00:32:00 +00:00 · 2024-12-17 21:42:07 +01:00 · 2024-12-17 21:42:07 +01:00 · ef5bbd2a28
commit ef5bbd2a28
parent cb511fa98e 578eb3b6a9
17 changed files with 657 additions and 112 deletions
--- a/fs/mount.h
+++ b/fs/mount.h
@ -8,15 +8,23 @@
 struct mnt_namespace {
 	struct ns_common	ns;
 	struct mount *	root;
-	struct rb_root		mounts; /* Protected by namespace_sem */
+	struct {
+		struct rb_root	mounts;		 /* Protected by namespace_sem */
+		struct rb_node	*mnt_last_node;	 /* last (rightmost) mount in the rbtree */
+		struct rb_node	*mnt_first_node; /* first (leftmost) mount in the rbtree */
+	};
 	struct user_namespace	*user_ns;
 	struct ucounts		*ucounts;
 	u64			seq;	/* Sequence number to prevent loops */
-	wait_queue_head_t poll;
+	union {
+		wait_queue_head_t	poll;
+		struct rcu_head		mnt_ns_rcu;
+	};
 	u64 event;
 	unsigned int		nr_mounts; /* # of mounts in the namespace */
 	unsigned int		pending_mounts;
 	struct rb_node		mnt_ns_tree_node; /* node in the mnt_ns_tree */
+	struct list_head	mnt_ns_list; /* entry in the sequential list of mounts namespace */
 	refcount_t		passive; /* number references not pinning @mounts */
 } __randomize_layout;

@ -38,6 +46,7 @@ struct mount {
 	struct dentry *mnt_mountpoint;
 	struct vfsmount mnt;
 	union {
+		struct rb_node mnt_node; /* node in the ns->mounts rbtree */
 		struct rcu_head mnt_rcu;
 		struct llist_node mnt_llist;
 	};
@ -51,10 +60,7 @@ struct mount {
 	struct list_head mnt_child;	/* and going through their mnt_child */
 	struct list_head mnt_instance;	/* mount instance on sb->s_mounts */
 	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
-	union {
-		struct rb_node mnt_node;	/* Under ns->mounts */
-		struct list_head mnt_list;
-	};
+	struct list_head mnt_list;
 	struct list_head mnt_expire;	/* link in fs-specific expiry list */
 	struct list_head mnt_share;	/* circular list of shared mounts */
 	struct list_head mnt_slave_list;/* list of slave mounts */
@ -145,24 +151,28 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
 	return ns->seq == 0;
 }

+static inline bool mnt_ns_attached(const struct mount *mnt)
+{
+	return !RB_EMPTY_NODE(&mnt->mnt_node);
+}
+
 static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
 {
-	WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
-	mnt->mnt.mnt_flags &= ~MNT_ONRB;
-	rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+	struct mnt_namespace *ns = mnt->mnt_ns;
+	WARN_ON(!mnt_ns_attached(mnt));
+	if (ns->mnt_last_node == &mnt->mnt_node)
+		ns->mnt_last_node = rb_prev(&mnt->mnt_node);
+	if (ns->mnt_first_node == &mnt->mnt_node)
+		ns->mnt_first_node = rb_next(&mnt->mnt_node);
+	rb_erase(&mnt->mnt_node, &ns->mounts);
+	RB_CLEAR_NODE(&mnt->mnt_node);
 	list_add_tail(&mnt->mnt_list, dt_list);
 }

 bool has_locked_children(struct mount *mnt, struct dentry *dentry);
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
-static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
-{
-	return __lookup_next_mnt_ns(mntns, false);
-}
-static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
-{
-	return __lookup_next_mnt_ns(mntns, true);
-}
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
+					    bool previous);
+
 static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
 {
 	return container_of(ns, struct mnt_namespace, ns);
--- a/fs/namespace.c
+++ b/fs/namespace.c
@ -32,7 +32,6 @@
 #include <linux/fs_context.h>
 #include <linux/shmem_fs.h>
 #include <linux/mnt_idmapping.h>
-#include <linux/nospec.h>

 #include "pnode.h"
 #include "internal.h"
@ -66,12 +65,12 @@ static int __init set_mphash_entries(char *str)
 __setup("mphash_entries=", set_mphash_entries);

 static u64 event;
-static DEFINE_IDA(mnt_id_ida);
+static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
 static DEFINE_IDA(mnt_group_ida);

 /* Don't allow confusion with old 32bit mount ID */
 #define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
+static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;

 static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
@ -79,8 +78,10 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
-static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
+static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */

 struct mount_kattr {
 	unsigned int attr_set;
@ -106,17 +107,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
 */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-	u64 seq_b = ns->seq;
-
-	if (seq < seq_b)
-		return -1;
-	if (seq > seq_b)
-		return 1;
-	return 0;
-}
-
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
 	if (!node)
@ -124,24 +114,53 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }

-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
 	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
 	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
 	u64 seq_a = ns_a->seq;
+	u64 seq_b = ns_b->seq;

-	return mnt_ns_cmp(seq_a, ns_b) < 0;
+	if (seq_a < seq_b)
+		return -1;
+	if (seq_a > seq_b)
+		return 1;
+	return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+	write_seqlock(&mnt_ns_tree_lock);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+	write_sequnlock(&mnt_ns_tree_lock);
 }

 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-	guard(write_lock)(&mnt_ns_tree_lock);
-	rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+	struct rb_node *node, *prev;
+
+	mnt_ns_tree_write_lock();
+	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+	/*
+	 * If there's no previous entry simply add it after the
+	 * head and if there is add it after the previous entry.
+	 */
+	prev = rb_prev(&ns->mnt_ns_tree_node);
+	if (!prev)
+		list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
+	else
+		list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
+	mnt_ns_tree_write_unlock();
+
+	WARN_ON_ONCE(node);
 }

 static void mnt_ns_release(struct mnt_namespace *ns)
 {
-	lockdep_assert_not_held(&mnt_ns_tree_lock);
+	lockdep_assert_not_held(&mnt_ns_tree_lock.lock);

 	/* keep alive for {list,stat}mount() */
 	if (refcount_dec_and_test(&ns->passive)) {
@ -151,41 +170,34 @@ static void mnt_ns_release(struct mnt_namespace *ns)
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
 	/* remove from global mount namespace list */
 	if (!is_anon_ns(ns)) {
-		guard(write_lock)(&mnt_ns_tree_lock);
+		mnt_ns_tree_write_lock();
 		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+		list_bidir_del_rcu(&ns->mnt_ns_list);
+		mnt_ns_tree_write_unlock();
 	}

-	mnt_ns_release(ns);
+	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }

-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-	struct rb_node *node = mnt_ns_tree.rb_node;
-	struct mnt_namespace *ret = NULL;
+	const u64 mnt_ns_id = *(u64 *)key;
+	const struct mnt_namespace *ns = node_to_mnt_ns(node);

-	lockdep_assert_held(&mnt_ns_tree_lock);
-
-	while (node) {
-		struct mnt_namespace *n = node_to_mnt_ns(node);
-
-		if (mnt_ns_id <= n->seq) {
-			ret = node_to_mnt_ns(node);
-			if (mnt_ns_id == n->seq)
-				break;
-			node = node->rb_left;
-		} else {
-			node = node->rb_right;
-		}
-	}
-	return ret;
+	if (mnt_ns_id < ns->seq)
+		return -1;
+	if (mnt_ns_id > ns->seq)
+		return 1;
+	return 0;
 }

 /*
@ -195,18 +207,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
 * namespace the @namespace_sem must first be acquired. If the namespace has
 * already shut down before acquiring @namespace_sem, {list,stat}mount() will
 * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
 */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-       struct mnt_namespace *ns;
+	struct mnt_namespace *ns;
+	struct rb_node *node;
+	unsigned int seq;

-       guard(read_lock)(&mnt_ns_tree_lock);
-       ns = mnt_ns_find_id_at(mnt_ns_id);
-       if (!ns || ns->seq != mnt_ns_id)
-               return NULL;
+	guard(rcu)();
+	do {
+		seq = read_seqbegin(&mnt_ns_tree_lock);
+		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+		if (node)
+			break;
+	} while (read_seqretry(&mnt_ns_tree_lock, seq));

-       refcount_inc(&ns->passive);
-       return ns;
+	if (!node)
+		return NULL;
+
+	/*
+	 * The last reference count is put with RCU delay so we can
+	 * unconditonally acquire a reference here.
+	 */
+	ns = node_to_mnt_ns(node);
+	refcount_inc(&ns->passive);
+	return ns;
 }

 static inline void lock_mount_hash(void)
@ -236,18 +267,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)

 static int mnt_alloc_id(struct mount *mnt)
 {
-	int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
+	int res;

-	if (res < 0)
-		return res;
-	mnt->mnt_id = res;
-	mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
-	return 0;
+	xa_lock(&mnt_id_xa);
+	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
+	if (!res)
+		mnt->mnt_id_unique = ++mnt_id_ctr;
+	xa_unlock(&mnt_id_xa);
+	return res;
 }

 static void mnt_free_id(struct mount *mnt)
 {
-	ida_free(&mnt_id_ida, mnt->mnt_id);
+	xa_erase(&mnt_id_xa, mnt->mnt_id);
 }

 /*
@ -344,6 +376,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
 		INIT_LIST_HEAD(&mnt->mnt_umounting);
 		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+		RB_CLEAR_NODE(&mnt->mnt_node);
 		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
 	}
 	return mnt;
@ -1123,19 +1156,27 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
 {
 	struct rb_node **link = &ns->mounts.rb_node;
 	struct rb_node *parent = NULL;
+	bool mnt_first_node = true, mnt_last_node = true;

-	WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+	WARN_ON(mnt_ns_attached(mnt));
 	mnt->mnt_ns = ns;
 	while (*link) {
 		parent = *link;
-		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
 			link = &parent->rb_left;
-		else
+			mnt_last_node = false;
+		} else {
 			link = &parent->rb_right;
+			mnt_first_node = false;
+		}
 	}
+
+	if (mnt_last_node)
+		ns->mnt_last_node = &mnt->mnt_node;
+	if (mnt_first_node)
+		ns->mnt_first_node = &mnt->mnt_node;
 	rb_link_node(&mnt->mnt_node, parent, link);
 	rb_insert_color(&mnt->mnt_node, &ns->mounts);
-	mnt->mnt.mnt_flags |= MNT_ONRB;
 }

 /*
@ -1305,7 +1346,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 	}

 	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
+	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);

 	atomic_inc(&sb->s_active);
 	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@ -1763,7 +1804,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 	/* Gather the mounts to umount */
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		p->mnt.mnt_flags |= MNT_UMOUNT;
-		if (p->mnt.mnt_flags & MNT_ONRB)
+		if (mnt_ns_attached(p))
 			move_from_ns(p, &tmp_list);
 		else
 			list_move(&p->mnt_list, &tmp_list);
@ -1912,16 +1953,14 @@ static int do_umount(struct mount *mnt, int flags)

 	event++;
 	if (flags & MNT_DETACH) {
-		if (mnt->mnt.mnt_flags & MNT_ONRB ||
-		    !list_empty(&mnt->mnt_list))
+		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
 			umount_tree(mnt, UMOUNT_PROPAGATE);
 		retval = 0;
 	} else {
 		shrink_submounts(mnt);
 		retval = -EBUSY;
 		if (!propagate_mount_busy(mnt, 2)) {
-			if (mnt->mnt.mnt_flags & MNT_ONRB ||
-			    !list_empty(&mnt->mnt_list))
+			if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
 				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
 			retval = 0;
 		}
@ -2071,30 +2110,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
 	return &mnt->ns;
 }

-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
 {
-	guard(read_lock)(&mnt_ns_tree_lock);
+	guard(rcu)();
+
 	for (;;) {
-		struct rb_node *node;
+		struct list_head *list;

 		if (previous)
-			node = rb_prev(&mntns->mnt_ns_tree_node);
+			list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
 		else
-			node = rb_next(&mntns->mnt_ns_tree_node);
-		if (!node)
+			list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
+		if (list_is_head(list, &mnt_ns_list))
 			return ERR_PTR(-ENOENT);

-		mntns = node_to_mnt_ns(node);
-		node = &mntns->mnt_ns_tree_node;
+		mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);

+		/*
+		 * The last passive reference count is put with RCU
+		 * delay so accessing the mount namespace is not just
+		 * safe but all relevant members are still valid.
+		 */
 		if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
 			continue;

 		/*
-		 * Holding mnt_ns_tree_lock prevents the mount namespace from
-		 * being freed but it may well be on it's deathbed. We want an
-		 * active reference, not just a passive one here as we're
-		 * persisting the mount namespace.
+		 * We need an active reference count as we're persisting
+		 * the mount namespace and it might already be on its
+		 * deathbed.
 		 */
 		if (!refcount_inc_not_zero(&mntns->ns.count))
 			continue;
@ -3911,6 +3954,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
 	refcount_set(&new_ns->ns.count, 1);
 	refcount_set(&new_ns->passive, 1);
 	new_ns->mounts = RB_ROOT;
+	INIT_LIST_HEAD(&new_ns->mnt_ns_list);
 	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
 	init_waitqueue_head(&new_ns->poll);
 	new_ns->user_ns = get_user_ns(user_ns);
@ -3990,7 +4034,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 		while (p->mnt.mnt_root != q->mnt.mnt_root)
 			p = next_mnt(skip_mnt_tree(p), old);
 	}
-	mnt_ns_tree_add(new_ns);
 	namespace_unlock();

 	if (rootmnt)
@ -3998,6 +4041,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	if (pwdmnt)
 		mntput(pwdmnt);

+	mnt_ns_tree_add(new_ns);
 	return new_ns;
 }

@ -5535,9 +5579,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,

 	if (!last_mnt_id) {
 		if (reverse)
-			first = node_to_mount(rb_last(&ns->mounts));
+			first = node_to_mount(ns->mnt_last_node);
 		else
-			first = node_to_mount(rb_first(&ns->mounts));
+			first = node_to_mount(ns->mnt_first_node);
 	} else {
 		if (reverse)
 			first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@ -274,10 +274,7 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 		if (usize < MNT_NS_INFO_SIZE_VER0)
 			return -EINVAL;

-		if (previous)
-			mnt_ns = lookup_prev_mnt_ns(to_mnt_ns(ns));
-		else
-			mnt_ns = lookup_next_mnt_ns(to_mnt_ns(ns));
+		mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
 		if (IS_ERR(mnt_ns))
 			return PTR_ERR(mnt_ns);

--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@ -50,7 +50,7 @@ struct path;
 #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME )

 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
-			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | MNT_ONRB)
+			    MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED)

 #define MNT_INTERNAL	0x4000

@ -64,7 +64,6 @@ struct path;
 #define MNT_SYNC_UMOUNT		0x2000000
 #define MNT_MARKED		0x4000000
 #define MNT_UMOUNT		0x8000000
-#define MNT_ONRB		0x10000000

 struct vfsmount {
 	struct dentry *mnt_root;	/* root of the mounted tree */
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@ -30,6 +30,17 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
 * way, we must not access it directly
 */
 #define list_next_rcu(list)	(*((struct list_head __rcu **)(&(list)->next)))
+/*
+ * Return the ->prev pointer of a list_head in an rcu safe way. Don't
+ * access it directly.
+ *
+ * Any list traversed with list_bidir_prev_rcu() must never use
+ * list_del_rcu().  Doing so will poison the ->prev pointer that
+ * list_bidir_prev_rcu() relies on, which will result in segfaults.
+ * To prevent these segfaults, use list_bidir_del_rcu() instead
+ * of list_del_rcu().
+ */
+#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))

 /**
 * list_tail_rcu - returns the prev pointer of the head of the list
@ -158,6 +169,39 @@ static inline void list_del_rcu(struct list_head *entry)
 	entry->prev = LIST_POISON2;
 }

+/**
+ * list_bidir_del_rcu - deletes entry from list without re-initialization
+ * @entry: the element to delete from the list.
+ *
+ * In contrast to list_del_rcu() doesn't poison the prev pointer thus
+ * allowing backwards traversal via list_bidir_prev_rcu().
+ *
+ * Note: list_empty() on entry does not return true after this because
+ * the entry is in a special undefined state that permits RCU-based
+ * lockfree reverse traversal. In particular this means that we can not
+ * poison the forward and backwards pointers that may still be used for
+ * walking the list.
+ *
+ * The caller must take whatever precautions are necessary (such as
+ * holding appropriate locks) to avoid racing with another list-mutation
+ * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
+ * this same list. However, it is perfectly legal to run concurrently
+ * with the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ *
+ * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
+ * the same list.
+ *
+ * Note that the caller is not permitted to immediately free
+ * the newly deleted entry.  Instead, either synchronize_rcu()
+ * or call_rcu() must be used to defer freeing until an RCU
+ * grace period has elapsed.
+ */
+static inline void list_bidir_del_rcu(struct list_head *entry)
+{
+	__list_del_entry(entry);
+}
+
 /**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
--- a/samples/vfs/.gitignore
+++ b/samples/vfs/.gitignore
@ -1,4 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /test-fsmount
+/test-list-all-mounts
 /test-statx
 /mountinfo
--- a/samples/vfs/Makefile
+++ b/samples/vfs/Makefile
@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-userprogs-always-y += test-fsmount test-statx mountinfo
+userprogs-always-y += test-fsmount test-statx mountinfo test-list-all-mounts

 userccflags += -I usr/include
--- a/samples/vfs/test-list-all-mounts.c
+++ b/samples/vfs/test-list-all-mounts.c
@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+
+#include "../../tools/testing/selftests/pidfd/pidfd.h"
+
+#define die_errno(format, ...)                                             \
+	do {                                                               \
+		fprintf(stderr, "%m | %s: %d: %s: " format "\n", __FILE__, \
+			__LINE__, __func__, ##__VA_ARGS__);                \
+		exit(EXIT_FAILURE);                                        \
+	} while (0)
+
+/* Get the id for a mount namespace */
+#define NS_GET_MNTNS_ID _IO(0xb7, 0x5)
+/* Get next mount namespace. */
+
+struct mnt_ns_info {
+	__u32 size;
+	__u32 nr_mounts;
+	__u64 mnt_ns_id;
+};
+
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+
+/* Get information about namespace. */
+#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
+/* Get next namespace. */
+#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
+/* Get previous namespace. */
+#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
+
+#define PIDFD_GET_MNT_NAMESPACE _IO(0xFF, 3)
+
+#ifndef __NR_listmount
+#define __NR_listmount 458
+#endif
+
+#ifndef __NR_statmount
+#define __NR_statmount 457
+#endif
+
+/* @mask bits for statmount(2) */
+#define STATMOUNT_SB_BASIC		0x00000001U /* Want/got sb_... */
+#define STATMOUNT_MNT_BASIC		0x00000002U /* Want/got mnt_... */
+#define STATMOUNT_PROPAGATE_FROM	0x00000004U /* Want/got propagate_from */
+#define STATMOUNT_MNT_ROOT		0x00000008U /* Want/got mnt_root  */
+#define STATMOUNT_MNT_POINT		0x00000010U /* Want/got mnt_point */
+#define STATMOUNT_FS_TYPE		0x00000020U /* Want/got fs_type */
+#define STATMOUNT_MNT_NS_ID		0x00000040U /* Want/got mnt_ns_id */
+#define STATMOUNT_MNT_OPTS		0x00000080U /* Want/got mnt_opts */
+
+#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */
+
+struct statmount {
+	__u32 size;
+	__u32 mnt_opts;
+	__u64 mask;
+	__u32 sb_dev_major;
+	__u32 sb_dev_minor;
+	__u64 sb_magic;
+	__u32 sb_flags;
+	__u32 fs_type;
+	__u64 mnt_id;
+	__u64 mnt_parent_id;
+	__u32 mnt_id_old;
+	__u32 mnt_parent_id_old;
+	__u64 mnt_attr;
+	__u64 mnt_propagation;
+	__u64 mnt_peer_group;
+	__u64 mnt_master;
+	__u64 propagate_from;
+	__u32 mnt_root;
+	__u32 mnt_point;
+	__u64 mnt_ns_id;
+	__u64 __spare2[49];
+	char str[];
+};
+
+struct mnt_id_req {
+	__u32 size;
+	__u32 spare;
+	__u64 mnt_id;
+	__u64 param;
+	__u64 mnt_ns_id;
+};
+
+#define MNT_ID_REQ_SIZE_VER1 32 /* sizeof second published struct */
+
+#define LSMT_ROOT 0xffffffffffffffff /* root mount */
+
+static int __statmount(__u64 mnt_id, __u64 mnt_ns_id, __u64 mask,
+		       struct statmount *stmnt, size_t bufsize,
+		       unsigned int flags)
+{
+	struct mnt_id_req req = {
+		.size		= MNT_ID_REQ_SIZE_VER1,
+		.mnt_id		= mnt_id,
+		.param		= mask,
+		.mnt_ns_id	= mnt_ns_id,
+	};
+
+	return syscall(__NR_statmount, &req, stmnt, bufsize, flags);
+}
+
+static struct statmount *sys_statmount(__u64 mnt_id, __u64 mnt_ns_id,
+				       __u64 mask, unsigned int flags)
+{
+	size_t bufsize = 1 << 15;
+	struct statmount *stmnt = NULL, *tmp = NULL;
+	int ret;
+
+	for (;;) {
+		tmp = realloc(stmnt, bufsize);
+		if (!tmp)
+			goto out;
+
+		stmnt = tmp;
+		ret = __statmount(mnt_id, mnt_ns_id, mask, stmnt, bufsize, flags);
+		if (!ret)
+			return stmnt;
+
+		if (errno != EOVERFLOW)
+			goto out;
+
+		bufsize <<= 1;
+		if (bufsize >= UINT_MAX / 2)
+			goto out;
+	}
+
+out:
+	free(stmnt);
+	return NULL;
+}
+
+static ssize_t sys_listmount(__u64 mnt_id, __u64 last_mnt_id, __u64 mnt_ns_id,
+			     __u64 list[], size_t num, unsigned int flags)
+{
+	struct mnt_id_req req = {
+		.size		= MNT_ID_REQ_SIZE_VER1,
+		.mnt_id		= mnt_id,
+		.param		= last_mnt_id,
+		.mnt_ns_id	= mnt_ns_id,
+	};
+
+	return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+int main(int argc, char *argv[])
+{
+#define LISTMNT_BUFFER 10
+	__u64 list[LISTMNT_BUFFER], last_mnt_id = 0;
+	int ret, pidfd, fd_mntns;
+	struct mnt_ns_info info = {};
+
+	pidfd = sys_pidfd_open(getpid(), 0);
+	if (pidfd < 0)
+		die_errno("pidfd_open failed");
+
+	fd_mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, 0);
+	if (fd_mntns < 0)
+		die_errno("ioctl(PIDFD_GET_MNT_NAMESPACE) failed");
+
+	ret = ioctl(fd_mntns, NS_MNT_GET_INFO, &info);
+	if (ret < 0)
+		die_errno("ioctl(NS_GET_MNTNS_ID) failed");
+
+	printf("Listing %u mounts for mount namespace %llu\n",
+	       info.nr_mounts, info.mnt_ns_id);
+	for (;;) {
+		ssize_t nr_mounts;
+next:
+		nr_mounts = sys_listmount(LSMT_ROOT, last_mnt_id,
+					  info.mnt_ns_id, list, LISTMNT_BUFFER,
+					  0);
+		if (nr_mounts <= 0) {
+			int fd_mntns_next;
+
+			printf("Finished listing %u mounts for mount namespace %llu\n\n",
+			       info.nr_mounts, info.mnt_ns_id);
+			fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info);
+			if (fd_mntns_next < 0) {
+				if (errno == ENOENT) {
+					printf("Finished listing all mount namespaces\n");
+					exit(0);
+				}
+				die_errno("ioctl(NS_MNT_GET_NEXT) failed");
+			}
+			close(fd_mntns);
+			fd_mntns = fd_mntns_next;
+			last_mnt_id = 0;
+			printf("Listing %u mounts for mount namespace %llu\n",
+			       info.nr_mounts, info.mnt_ns_id);
+			goto next;
+		}
+
+		for (size_t cur = 0; cur < nr_mounts; cur++) {
+			struct statmount *stmnt;
+
+			last_mnt_id = list[cur];
+
+			stmnt = sys_statmount(last_mnt_id, info.mnt_ns_id,
+					      STATMOUNT_SB_BASIC |
+					      STATMOUNT_MNT_BASIC |
+					      STATMOUNT_MNT_ROOT |
+					      STATMOUNT_MNT_POINT |
+					      STATMOUNT_MNT_NS_ID |
+					      STATMOUNT_MNT_OPTS |
+					      STATMOUNT_FS_TYPE, 0);
+			if (!stmnt) {
+				printf("Failed to statmount(%llu) in mount namespace(%llu)\n",
+				       last_mnt_id, info.mnt_ns_id);
+				continue;
+			}
+
+			printf("mnt_id:\t\t%llu\nmnt_parent_id:\t%llu\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n",
+			       stmnt->mnt_id,
+			       stmnt->mnt_parent_id,
+			       stmnt->str + stmnt->fs_type,
+			       stmnt->str + stmnt->mnt_root,
+			       stmnt->str + stmnt->mnt_point,
+			       stmnt->str + stmnt->mnt_opts);
+			free(stmnt);
+		}
+	}
+
+	exit(0);
+}
--- a/tools/testing/selftests/filesystems/nsfs/.gitignore
+++ b/tools/testing/selftests/filesystems/nsfs/.gitignore
@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 owner
 pidns
+iterate_mntns
--- a/tools/testing/selftests/filesystems/nsfs/Makefile
+++ b/tools/testing/selftests/filesystems/nsfs/Makefile
@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
-TEST_GEN_PROGS := owner pidns
+TEST_GEN_PROGS := owner pidns iterate_mntns

 CFLAGS := -Wall -Werror

-include ../lib.mk
+include ../../lib.mk
--- a/tools/testing/selftests/filesystems/nsfs/config
+++ b/tools/testing/selftests/filesystems/nsfs/config
--- a/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c
+++ b/tools/testing/selftests/filesystems/nsfs/iterate_mntns.c
@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "../../kselftest_harness.h"
+
+#define MNT_NS_COUNT 11
+#define MNT_NS_LAST_INDEX 10
+
+struct mnt_ns_info {
+	__u32 size;
+	__u32 nr_mounts;
+	__u64 mnt_ns_id;
+};
+
+#define MNT_NS_INFO_SIZE_VER0 16 /* size of first published struct */
+
+/* Get information about namespace. */
+#define NS_MNT_GET_INFO _IOR(0xb7, 10, struct mnt_ns_info)
+/* Get next namespace. */
+#define NS_MNT_GET_NEXT _IOR(0xb7, 11, struct mnt_ns_info)
+/* Get previous namespace. */
+#define NS_MNT_GET_PREV _IOR(0xb7, 12, struct mnt_ns_info)
+
+FIXTURE(iterate_mount_namespaces) {
+	int fd_mnt_ns[MNT_NS_COUNT];
+	__u64 mnt_ns_id[MNT_NS_COUNT];
+};
+
+FIXTURE_SETUP(iterate_mount_namespaces)
+{
+	for (int i = 0; i < MNT_NS_COUNT; i++)
+		self->fd_mnt_ns[i] = -EBADF;
+
+	/*
+	 * Creating a new user namespace let's us guarantee that we only see
+	 * mount namespaces that we did actually create.
+	 */
+	ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
+
+	for (int i = 0; i < MNT_NS_COUNT; i++) {
+		struct mnt_ns_info info = {};
+
+		ASSERT_EQ(unshare(CLONE_NEWNS), 0);
+		self->fd_mnt_ns[i] = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(self->fd_mnt_ns[i], 0);
+		ASSERT_EQ(ioctl(self->fd_mnt_ns[i], NS_MNT_GET_INFO, &info), 0);
+		self->mnt_ns_id[i] = info.mnt_ns_id;
+	}
+}
+
+FIXTURE_TEARDOWN(iterate_mount_namespaces)
+{
+	for (int i = 0; i < MNT_NS_COUNT; i++) {
+		if (self->fd_mnt_ns[i] < 0)
+			continue;
+		ASSERT_EQ(close(self->fd_mnt_ns[i]), 0);
+	}
+}
+
+TEST_F(iterate_mount_namespaces, iterate_all_forward)
+{
+	int fd_mnt_ns_cur, count = 0;
+
+	fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[0], F_DUPFD_CLOEXEC);
+	ASSERT_GE(fd_mnt_ns_cur, 0);
+
+	for (;; count++) {
+		struct mnt_ns_info info = {};
+		int fd_mnt_ns_next;
+
+		fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
+		if (fd_mnt_ns_next < 0 && errno == ENOENT)
+			break;
+		ASSERT_GE(fd_mnt_ns_next, 0);
+		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+		fd_mnt_ns_cur = fd_mnt_ns_next;
+	}
+	ASSERT_EQ(count, MNT_NS_LAST_INDEX);
+}
+
+TEST_F(iterate_mount_namespaces, iterate_all_backwards)
+{
+	int fd_mnt_ns_cur, count = 0;
+
+	fd_mnt_ns_cur = fcntl(self->fd_mnt_ns[MNT_NS_LAST_INDEX], F_DUPFD_CLOEXEC);
+	ASSERT_GE(fd_mnt_ns_cur, 0);
+
+	for (;; count++) {
+		struct mnt_ns_info info = {};
+		int fd_mnt_ns_prev;
+
+		fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
+		if (fd_mnt_ns_prev < 0 && errno == ENOENT)
+			break;
+		ASSERT_GE(fd_mnt_ns_prev, 0);
+		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+		fd_mnt_ns_cur = fd_mnt_ns_prev;
+	}
+	ASSERT_EQ(count, MNT_NS_LAST_INDEX);
+}
+
+TEST_F(iterate_mount_namespaces, iterate_forward)
+{
+	int fd_mnt_ns_cur;
+
+	ASSERT_EQ(setns(self->fd_mnt_ns[0], CLONE_NEWNS), 0);
+
+	fd_mnt_ns_cur = self->fd_mnt_ns[0];
+	for (int i = 1; i < MNT_NS_COUNT; i++) {
+		struct mnt_ns_info info = {};
+		int fd_mnt_ns_next;
+
+		fd_mnt_ns_next = ioctl(fd_mnt_ns_cur, NS_MNT_GET_NEXT, &info);
+		ASSERT_GE(fd_mnt_ns_next, 0);
+		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+		fd_mnt_ns_cur = fd_mnt_ns_next;
+		ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
+	}
+}
+
+TEST_F(iterate_mount_namespaces, iterate_backward)
+{
+	int fd_mnt_ns_cur;
+
+	ASSERT_EQ(setns(self->fd_mnt_ns[MNT_NS_LAST_INDEX], CLONE_NEWNS), 0);
+
+	fd_mnt_ns_cur = self->fd_mnt_ns[MNT_NS_LAST_INDEX];
+	for (int i = MNT_NS_LAST_INDEX - 1; i >= 0; i--) {
+		struct mnt_ns_info info = {};
+		int fd_mnt_ns_prev;
+
+		fd_mnt_ns_prev = ioctl(fd_mnt_ns_cur, NS_MNT_GET_PREV, &info);
+		ASSERT_GE(fd_mnt_ns_prev, 0);
+		ASSERT_EQ(close(fd_mnt_ns_cur), 0);
+		fd_mnt_ns_cur = fd_mnt_ns_prev;
+		ASSERT_EQ(info.mnt_ns_id, self->mnt_ns_id[i]);
+	}
+}
+
+TEST_HARNESS_MAIN
--- a/tools/testing/selftests/filesystems/nsfs/owner.c
+++ b/tools/testing/selftests/filesystems/nsfs/owner.c
--- a/tools/testing/selftests/filesystems/nsfs/pidns.c
+++ b/tools/testing/selftests/filesystems/nsfs/pidns.c
--- a/tools/testing/selftests/filesystems/statmount/Makefile
+++ b/tools/testing/selftests/filesystems/statmount/Makefile
@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-or-later

 CFLAGS += -Wall -O2 -g $(KHDR_INCLUDES)
-TEST_GEN_PROGS := statmount_test statmount_test_ns
+TEST_GEN_PROGS := statmount_test statmount_test_ns listmount_test

 include ../../lib.mk
--- a/tools/testing/selftests/filesystems/statmount/listmount_test.c
+++ b/tools/testing/selftests/filesystems/statmount/listmount_test.c
@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+#include "statmount.h"
+#include "../../kselftest_harness.h"
+
+#ifndef LISTMOUNT_REVERSE
+#define LISTMOUNT_REVERSE    (1 << 0) /* List later mounts first */
+#endif
+
+#define LISTMNT_BUFFER 10
+
+/* Check that all mount ids are in increasing order. */
+TEST(listmount_forward)
+{
+	uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0;
+
+	for (;;) {
+		ssize_t nr_mounts;
+
+		nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id,
+				      list, LISTMNT_BUFFER, 0);
+		ASSERT_GE(nr_mounts, 0);
+		if (nr_mounts == 0)
+			break;
+
+		for (size_t cur = 0; cur < nr_mounts; cur++) {
+			if (cur < nr_mounts - 1)
+				ASSERT_LT(list[cur], list[cur + 1]);
+			last_mnt_id = list[cur];
+		}
+	}
+}
+
+/* Check that all mount ids are in decreasing order. */
+TEST(listmount_backward)
+{
+	uint64_t list[LISTMNT_BUFFER], last_mnt_id = 0;
+
+	for (;;) {
+		ssize_t nr_mounts;
+
+		nr_mounts = listmount(LSMT_ROOT, 0, last_mnt_id,
+				      list, LISTMNT_BUFFER, LISTMOUNT_REVERSE);
+		ASSERT_GE(nr_mounts, 0);
+		if (nr_mounts == 0)
+			break;
+
+		for (size_t cur = 0; cur < nr_mounts; cur++) {
+			if (cur < nr_mounts - 1)
+				ASSERT_GT(list[cur], list[cur + 1]);
+			last_mnt_id = list[cur];
+		}
+	}
+}
+
+TEST_HARNESS_MAIN
--- a/tools/testing/selftests/pidfd/pidfd.h
+++ b/tools/testing/selftests/pidfd/pidfd.h
@ -12,7 +12,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <syscall.h>
-#include <sys/mount.h>
 #include <sys/types.h>
 #include <sys/wait.h>