linux-next/fs/mount.h
Christian Brauner 344bac8f0d
fs: kill MNT_ONRB
Move mnt->mnt_node into the union with mnt->mnt_rcu and mnt->mnt_llist
instead of keeping it with mnt->mnt_list. This allows us to use
RB_CLEAR_NODE(&mnt->mnt_node) in umount_tree() as well as
list_empty(&mnt->mnt_node). That in turn allows us to remove MNT_ONRB.

This also fixes the bug reported in [1] where seemingly MNT_ONRB wasn't
set in @mnt->mnt_flags even though the mount was present in the mount
rbtree of the mount namespace.

The root cause is the following race. When a btrfs subvolume is mounted
a temporary mount is created:

btrfs_get_tree_subvol()
{
        mnt = fc_mount()
        // Register the newly allocated mount with sb->mounts:
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();
}

and registered on sb->s_mounts. Later it is added to an anonymous mount
namespace via mount_subvol():

-> mount_subvol()
   -> mount_subtree()
      -> alloc_mnt_ns()
         mnt_add_to_ns()
         vfs_path_lookup()
         put_mnt_ns()

The mnt_add_to_ns() call raises MNT_ONRB in @mnt->mnt_flags. If someone
concurrently does a ro remount:

reconfigure_super()
-> sb_prepare_remount_readonly()
   {
           list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
   }

all mounts registered in sb->s_mounts are visited and first
MNT_WRITE_HOLD is raised, then MNT_READONLY is raised, and finally
MNT_WRITE_HOLD is removed again.

The flag modification for MNT_WRITE_HOLD/MNT_READONLY and MNT_ONRB race
so MNT_ONRB might be lost.

Fixes: 2eea9ce4310d ("mounts: keep list of mounts in an rbtree")
Cc: <stable@kernel.org> # v6.8+
Link: https://lore.kernel.org/r/20241215-vfs-6-14-mount-work-v1-1-fd55922c4af8@kernel.org
Link: https://lore.kernel.org/r/ec6784ed-8722-4695-980a-4400d4e7bd1a@gmx.com [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
2025-01-09 16:58:50 +01:00

173 lines
4.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
struct rb_root mounts; /* Protected by namespace_sem */
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
refcount_t passive; /* number references not pinning @mounts */
} __randomize_layout;
struct mnt_pcp {
int mnt_count;
int mnt_writers;
};
struct mountpoint {
struct hlist_node m_hash;
struct dentry *m_dentry;
struct hlist_head m_list;
int m_count;
};
struct mount {
struct hlist_node mnt_hash;
struct mount *mnt_parent;
struct dentry *mnt_mountpoint;
struct vfsmount mnt;
union {
struct rb_node mnt_node; /* node in the ns->mounts rbtree */
struct rcu_head mnt_rcu;
struct llist_node mnt_llist;
};
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
#else
int mnt_count;
int mnt_writers;
#endif
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */
struct list_head mnt_slave; /* slave list entry */
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
union {
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
struct hlist_node mnt_umount;
};
struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
int mnt_id; /* mount identifier, reused */
u64 mnt_id_unique; /* mount ID unique until reboot */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct hlist_head mnt_stuck_children;
} __randomize_layout;
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
return container_of(mnt, struct mount, mnt);
}
static inline int mnt_has_parent(struct mount *mnt)
{
return mnt != mnt->mnt_parent;
}
static inline int is_mounted(struct vfsmount *mnt)
{
/* neither detached nor internal? */
return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
}
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
extern int __legitimize_mnt(struct vfsmount *, unsigned);
static inline bool __path_is_mountpoint(const struct path *path)
{
struct mount *m = __lookup_mnt(path->mnt, path->dentry);
return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}
extern void __detach_mounts(struct dentry *dentry);
static inline void detach_mounts(struct dentry *dentry)
{
if (!d_mountpoint(dentry))
return;
__detach_mounts(dentry);
}
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
refcount_inc(&ns->ns.count);
}
extern seqlock_t mount_lock;
struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
};
extern const struct seq_operations mounts_op;
extern bool __is_local_mountpoint(struct dentry *dentry);
static inline bool is_local_mountpoint(struct dentry *dentry)
{
if (!d_mountpoint(dentry))
return false;
return __is_local_mountpoint(dentry);
}
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
return ns->seq == 0;
}
static inline bool mnt_ns_attached(const struct mount *mnt)
{
return !RB_EMPTY_NODE(&mnt->mnt_node);
}
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
WARN_ON(!mnt_ns_attached(mnt));
rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
RB_CLEAR_NODE(&mnt->mnt_node);
list_add_tail(&mnt->mnt_list, dt_list);
}
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mnt_ns, bool previous);
static inline struct mnt_namespace *lookup_next_mnt_ns(struct mnt_namespace *mntns)
{
return __lookup_next_mnt_ns(mntns, false);
}
static inline struct mnt_namespace *lookup_prev_mnt_ns(struct mnt_namespace *mntns)
{
return __lookup_next_mnt_ns(mntns, true);
}
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
}