Merge branch 'vfs-6.14.misc' into vfs.all

2024-12-28 00:32:00 +00:00 · 2024-12-17 21:41:49 +01:00 · 2024-12-17 21:41:49 +01:00 · 4554288d75
commit 4554288d75
parent 6a6d921e15 846d0723d2
22 changed files with 480 additions and 184 deletions
--- a/Documentation/filesystems/fiemap.rst
+++ b/Documentation/filesystems/fiemap.rst
@ -12,21 +12,10 @@ returns a list of extents.
 Request Basics
 --------------

-A fiemap request is encoded within struct fiemap::
-
-  struct fiemap {
-	__u64	fm_start;	 /* logical offset (inclusive) at
-				  * which to start mapping (in) */
-	__u64	fm_length;	 /* logical length of mapping which
-				  * userspace cares about (in) */
-	__u32	fm_flags;	 /* FIEMAP_FLAG_* flags for request (in/out) */
-	__u32	fm_mapped_extents; /* number of extents that were
-				    * mapped (out) */
-	__u32	fm_extent_count; /* size of fm_extents array (in) */
-	__u32	fm_reserved;
-	struct fiemap_extent fm_extents[0]; /* array of mapped extents (out) */
-  };
+A fiemap request is encoded within struct fiemap:

+.. kernel-doc:: include/uapi/linux/fiemap.h
+   :identifiers: fiemap

 fm_start, and fm_length specify the logical range within the file
 which the process would like mappings for. Extents returned mirror
@ -60,6 +49,8 @@ FIEMAP_FLAG_XATTR
  If this flag is set, the extents returned will describe the inodes
  extended attribute lookup tree, instead of its data tree.

+FIEMAP_FLAG_CACHE
+  This flag requests caching of the extents.

 Extent Mapping
 --------------
@ -77,18 +68,10 @@ complete the requested range and will not have the FIEMAP_EXTENT_LAST
 flag set (see the next section on extent flags).

 Each extent is described by a single fiemap_extent structure as
-returned in fm_extents::
+returned in fm_extents:

-    struct fiemap_extent {
-	    __u64	fe_logical;  /* logical offset in bytes for the start of
-				* the extent */
-	    __u64	fe_physical; /* physical offset in bytes for the start
-				* of the extent */
-	    __u64	fe_length;   /* length in bytes for the extent */
-	    __u64	fe_reserved64[2];
-	    __u32	fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
-	    __u32	fe_reserved[3];
-    };
+.. kernel-doc:: include/uapi/linux/fiemap.h
+    :identifiers: fiemap_extent

 All offsets and lengths are in bytes and mirror those on disk.  It is valid
 for an extents logical offset to start before the request or its logical
@ -175,6 +158,8 @@ FIEMAP_EXTENT_MERGED
  userspace would be highly inefficient, the kernel will try to merge most
  adjacent blocks into 'extents'.

+FIEMAP_EXTENT_SHARED
+  This flag is set when the space of the extent is shared with other files.

 VFS -> File System Implementation
 ---------------------------------
@ -191,14 +176,10 @@ each discovered extent::
                     u64 len);

 ->fiemap is passed struct fiemap_extent_info which describes the
-fiemap request::
+fiemap request:

-  struct fiemap_extent_info {
-	unsigned int fi_flags;		/* Flags as passed from user */
-	unsigned int fi_extents_mapped;	/* Number of mapped extents */
-	unsigned int fi_extents_max;	/* Size of fiemap_extent array */
-	struct fiemap_extent *fi_extents_start;	/* Start of fiemap_extent array */
-  };
+.. kernel-doc:: include/linux/fiemap.h
+    :identifiers: fiemap_extent_info

 It is intended that the file system should not need to access any of this
 structure directly. Filesystem handlers should be tolerant to signals and return
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@ -5006,10 +5006,11 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 		if (IS_ENCRYPTED(inode)) {
 			inode->i_op = &ext4_encrypted_symlink_inode_operations;
 		} else if (ext4_inode_is_fast_symlink(inode)) {
-			inode->i_link = (char *)ei->i_data;
 			inode->i_op = &ext4_fast_symlink_inode_operations;
 			nd_terminate_link(ei->i_data, inode->i_size,
 				sizeof(ei->i_data) - 1);
+			inode_set_cached_link(inode, (char *)ei->i_data,
+					      inode->i_size);
 		} else {
 			inode->i_op = &ext4_symlink_inode_operations;
 		}
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@ -3418,7 +3418,6 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
 			inode->i_op = &ext4_symlink_inode_operations;
 		} else {
 			inode->i_op = &ext4_fast_symlink_inode_operations;
-			inode->i_link = (char *)&EXT4_I(inode)->i_data;
 		}
 	}

@ -3434,6 +3433,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		       disk_link.len);
 		inode->i_size = disk_link.len - 1;
 		EXT4_I(inode)->i_disksize = inode->i_size;
+		if (!IS_ENCRYPTED(inode))
+			inode_set_cached_link(inode, (char *)&EXT4_I(inode)->i_data,
+					      inode->i_size);
 	}
 	err = ext4_add_nondir(handle, dentry, &inode);
 	if (handle)
--- a/fs/file.c
+++ b/fs/file.c
@ -279,10 +279,6 @@ static int expand_files(struct files_struct *files, unsigned int nr)
 	if (nr < fdt->max_fds)
 		return 0;

-	/* Can we expand? */
-	if (nr >= sysctl_nr_open)
-		return -EMFILE;
-
 	if (unlikely(files->resize_in_progress)) {
 		spin_unlock(&files->file_lock);
 		wait_event(files->resize_wait, !files->resize_in_progress);
@ -290,6 +286,10 @@ static int expand_files(struct files_struct *files, unsigned int nr)
 		goto repeat;
 	}

+	/* Can we expand? */
+	if (unlikely(nr >= sysctl_nr_open))
+		return -EMFILE;
+
 	/* All good, so we try */
 	files->resize_in_progress = true;
 	error = expand_fdtable(files, nr);
@ -1231,17 +1231,9 @@ __releases(&files->file_lock)

 	/*
 	 * We need to detect attempts to do dup2() over allocated but still
-	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
-	 * extra work in their equivalent of fget() - they insert struct
-	 * file immediately after grabbing descriptor, mark it larval if
-	 * more work (e.g. actual opening) is needed and make sure that
-	 * fget() treats larval files as absent.  Potentially interesting,
-	 * but while extra work in fget() is trivial, locking implications
-	 * and amount of surgery on open()-related paths in VFS are not.
-	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
-	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
-	 * scope of POSIX or SUS, since neither considers shared descriptor
-	 * tables and this condition does not arise without those.
+	 * not finished descriptor.
+	 *
+	 * POSIX is silent on the issue, we return -EBUSY.
 	 */
 	fdt = files_fdtable(files);
 	fd = array_index_nospec(fd, fdt->max_fds);
--- a/fs/file_table.c
+++ b/fs/file_table.c
@ -128,7 +128,7 @@ static struct ctl_table fs_stat_sysctls[] = {
 		.data		= &sysctl_nr_open,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_douintvec_minmax,
 		.extra1		= &sysctl_nr_open_min,
 		.extra2		= &sysctl_nr_open_max,
 	},
@ -478,6 +478,8 @@ static void ____fput(struct callback_head *work)
 	__fput(container_of(work, struct file, f_task_work));
 }

+static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
+
 /*
 * If kernel thread really needs to have the final fput() it has done
 * to complete, call this.  The only user right now is the boot - we
@ -491,11 +493,10 @@ static void ____fput(struct callback_head *work)
 void flush_delayed_fput(void)
 {
 	delayed_fput(NULL);
+	flush_delayed_work(&delayed_fput_work);
 }
 EXPORT_SYMBOL_GPL(flush_delayed_fput);

-static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
-
 void fput(struct file *file)
 {
 	if (file_ref_put(&file->f_ref)) {
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@ -493,7 +493,7 @@ static void put_fc_log(struct fs_context *fc)
 	if (log) {
 		if (refcount_dec_and_test(&log->usage)) {
 			fc->log.log = NULL;
-			for (i = 0; i <= 7; i++)
+			for (i = 0; i < ARRAY_SIZE(log->buffer) ; i++)
 				if (log->need_free & (1 << i))
 					kfree(log->buffer[i]);
 			kfree(log);
--- a/fs/libfs.c
+++ b/fs/libfs.c
@ -245,9 +245,17 @@ const struct inode_operations simple_dir_inode_operations = {
 };
 EXPORT_SYMBOL(simple_dir_inode_operations);

-/* 0 is '.', 1 is '..', so always start with offset 2 or more */
+/* simple_offset_add() allocation range */
 enum {
-	DIR_OFFSET_MIN	= 2,
+	DIR_OFFSET_MIN		= 3,
+	DIR_OFFSET_MAX		= LONG_MAX - 1,
+};
+
+/* simple_offset_add() never assigns these to a dentry */
+enum {
+	DIR_OFFSET_FIRST	= 2,		/* Find first real entry */
+	DIR_OFFSET_EOD		= LONG_MAX,	/* Marks EOD */
+
 };

 static void offset_set(struct dentry *dentry, long offset)
@ -291,8 +299,11 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
 		return -EBUSY;

 	ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
-				 LONG_MAX, &octx->next_offset, GFP_KERNEL);
-	if (ret < 0)
+				 DIR_OFFSET_MAX, &octx->next_offset,
+				 GFP_KERNEL);
+	if (unlikely(ret == -EBUSY))
+		return -ENOSPC;
+	if (unlikely(ret < 0))
 		return ret;

 	offset_set(dentry, offset);
@ -329,38 +340,6 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
 	offset_set(dentry, 0);
 }

-/**
- * simple_offset_empty - Check if a dentry can be unlinked
- * @dentry: dentry to be tested
- *
- * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
- */
-int simple_offset_empty(struct dentry *dentry)
-{
-	struct inode *inode = d_inode(dentry);
-	struct offset_ctx *octx;
-	struct dentry *child;
-	unsigned long index;
-	int ret = 1;
-
-	if (!inode || !S_ISDIR(inode->i_mode))
-		return ret;
-
-	index = DIR_OFFSET_MIN;
-	octx = inode->i_op->get_offset_ctx(inode);
-	mt_for_each(&octx->mt, child, index, LONG_MAX) {
-		spin_lock(&child->d_lock);
-		if (simple_positive(child)) {
-			spin_unlock(&child->d_lock);
-			ret = 0;
-			break;
-		}
-		spin_unlock(&child->d_lock);
-	}
-
-	return ret;
-}
-
 /**
 * simple_offset_rename - handle directory offsets for rename
 * @old_dir: parent directory of source entry
@ -454,14 +433,6 @@ void simple_offset_destroy(struct offset_ctx *octx)
 	mtree_destroy(&octx->mt);
 }

-static int offset_dir_open(struct inode *inode, struct file *file)
-{
-	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
-
-	file->private_data = (void *)ctx->next_offset;
-	return 0;
-}
-
 /**
 * offset_dir_llseek - Advance the read position of a directory descriptor
 * @file: an open directory whose position is to be updated
@ -475,9 +446,6 @@ static int offset_dir_open(struct inode *inode, struct file *file)
 */
 static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
 {
-	struct inode *inode = file->f_inode;
-	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);
-
 	switch (whence) {
 	case SEEK_CUR:
 		offset += file->f_pos;
@ -490,25 +458,46 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
 		return -EINVAL;
 	}

-	/* In this case, ->private_data is protected by f_pos_lock */
-	if (!offset)
-		file->private_data = (void *)ctx->next_offset;
 	return vfs_setpos(file, offset, LONG_MAX);
 }

-static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
+/*
+ * Cf. find_next_child()
+ *
+ * Walk the d_sib sibling list starting at @dentry (inclusive) and return
+ * the first entry that is simple_positive(), with a reference taken via
+ * dget_dlock(). Returns NULL when the walk runs off the end of the list
+ * without finding one.
+ *
+ * Each candidate is tested once without its lock as a cheap filter and
+ * then re-tested under its own d_lock (taken with DENTRY_D_LOCK_NESTED)
+ * before the reference is taken.
+ *
+ * The callers in this file invoke this with @parent->d_lock held; @parent
+ * itself is not dereferenced in the body.
+ * NOTE(review): presumably the parent lock is what keeps the sibling list
+ * stable during the walk - confirm against find_next_child().
+ */
+static struct dentry *find_next_sibling_locked(struct dentry *parent,
+					       struct dentry *dentry)
 {
-	MA_STATE(mas, &octx->mt, offset, offset);
+	struct dentry *found = NULL;
+
+	hlist_for_each_entry_from(dentry, d_sib) {
+		if (!simple_positive(dentry))
+			continue;
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+		if (simple_positive(dentry))
+			found = dget_dlock(dentry);
+		spin_unlock(&dentry->d_lock);
+		if (likely(found))
+			break;
+	}
+	return found;
+}
+
+static noinline_for_stack struct dentry *
+offset_dir_lookup(struct file *file, loff_t offset)
+{
+	struct dentry *parent = file->f_path.dentry;
 	struct dentry *child, *found = NULL;
+	struct inode *inode = d_inode(parent);
+	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
+
+	MA_STATE(mas, &octx->mt, offset, offset);

 	rcu_read_lock();
-	child = mas_find(&mas, LONG_MAX);
+	child = mas_find(&mas, DIR_OFFSET_MAX);
 	if (!child)
 		goto out;
-	spin_lock(&child->d_lock);
-	if (simple_positive(child))
-		found = dget_dlock(child);
-	spin_unlock(&child->d_lock);
+
+	spin_lock(&parent->d_lock);
+	found = find_next_sibling_locked(parent, child);
+	spin_unlock(&parent->d_lock);
 out:
 	rcu_read_unlock();
 	return found;
@ -517,35 +506,46 @@ static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
 static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
-	long offset = dentry2offset(dentry);

-	return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
-			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
+	return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
+			inode->i_ino, fs_umode_to_dtype(inode->i_mode));
 }

-static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
+static void offset_iterate_dir(struct file *file, struct dir_context *ctx)
 {
-	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
+	struct dentry *dir = file->f_path.dentry;
 	struct dentry *dentry;

+	if (ctx->pos == DIR_OFFSET_FIRST) {
+		spin_lock(&dir->d_lock);
+		dentry = find_next_sibling_locked(dir, d_first_child(dir));
+		spin_unlock(&dir->d_lock);
+	} else
+		dentry = offset_dir_lookup(file, ctx->pos);
+	if (!dentry)
+		goto out_eod;
+
 	while (true) {
-		dentry = offset_find_next(octx, ctx->pos);
-		if (!dentry)
-			return;
+		struct dentry *next;

-		if (dentry2offset(dentry) >= last_index) {
-			dput(dentry);
-			return;
-		}
+		ctx->pos = dentry2offset(dentry);
+		if (!offset_dir_emit(ctx, dentry))
+			break;

-		if (!offset_dir_emit(ctx, dentry)) {
-			dput(dentry);
-			return;
-		}
-
-		ctx->pos = dentry2offset(dentry) + 1;
+		spin_lock(&dir->d_lock);
+		next = find_next_sibling_locked(dir, d_next_sibling(dentry));
+		spin_unlock(&dir->d_lock);
 		dput(dentry);
+
+		if (!next)
+			goto out_eod;
+		dentry = next;
 	}
+	dput(dentry);
+	return;
+
+out_eod:
+	ctx->pos = DIR_OFFSET_EOD;
 }

 /**
@ -565,6 +565,8 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon
 *
 * On return, @ctx->pos contains an offset that will read the next entry
 * in this directory when offset_readdir() is called again with @ctx.
+ * Caller places this value in the d_off field of the last entry in the
+ * user's buffer.
 *
 * Return values:
 *   %0 - Complete
@ -572,19 +574,17 @@ static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, lon
 static int offset_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct dentry *dir = file->f_path.dentry;
-	long last_index = (long)file->private_data;

 	lockdep_assert_held(&d_inode(dir)->i_rwsem);

 	if (!dir_emit_dots(file, ctx))
 		return 0;
-
-	offset_iterate_dir(d_inode(dir), ctx, last_index);
+	if (ctx->pos != DIR_OFFSET_EOD)
+		offset_iterate_dir(file, ctx);
 	return 0;
 }

 const struct file_operations simple_offset_dir_operations = {
-	.open		= offset_dir_open,
 	.llseek		= offset_dir_llseek,
 	.iterate_shared	= offset_readdir,
 	.read		= generic_read_dir,
--- a/fs/namei.c
+++ b/fs/namei.c
@ -5272,19 +5272,16 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
 				getname(newname), 0);
 }

-int readlink_copy(char __user *buffer, int buflen, const char *link)
+int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
 {
-	int len = PTR_ERR(link);
-	if (IS_ERR(link))
-		goto out;
+	int copylen;

-	len = strlen(link);
-	if (len > (unsigned) buflen)
-		len = buflen;
-	if (copy_to_user(buffer, link, len))
-		len = -EFAULT;
-out:
-	return len;
+	copylen = linklen;
+	if (unlikely(copylen > (unsigned) buflen))
+		copylen = buflen;
+	if (copy_to_user(buffer, link, copylen))
+		copylen = -EFAULT;
+	return copylen;
 }

 /**
@ -5304,6 +5301,9 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	const char *link;
 	int res;

+	if (inode->i_opflags & IOP_CACHED_LINK)
+		return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);
+
 	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
 		if (unlikely(inode->i_op->readlink))
 			return inode->i_op->readlink(dentry, buffer, buflen);
@ -5322,7 +5322,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 		if (IS_ERR(link))
 			return PTR_ERR(link);
 	}
-	res = readlink_copy(buffer, buflen, link);
+	res = readlink_copy(buffer, buflen, link, strlen(link));
 	do_delayed_call(&done);
 	return res;
 }
@ -5391,10 +5391,14 @@ EXPORT_SYMBOL(page_put_link);

 int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
+	const char *link;
+	int res;
+
 	DEFINE_DELAYED_CALL(done);
-	int res = readlink_copy(buffer, buflen,
-				page_get_link(dentry, d_inode(dentry),
-					      &done));
+	link = page_get_link(dentry, d_inode(dentry), &done);
+	res = PTR_ERR(link);
+	if (!IS_ERR(link))
+		res = readlink_copy(buffer, buflen, link, strlen(link));
 	do_delayed_call(&done);
 	return res;
 }
--- a/fs/namespace.c
+++ b/fs/namespace.c
@ -5044,6 +5044,10 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
 	if (sb->s_op->show_options) {
 		size_t start = seq->count;

+		err = security_sb_show_options(seq, sb);
+		if (err)
+			return err;
+
 		err = sb->s_op->show_options(seq, mnt->mnt_root);
 		if (err)
 			return err;
--- a/fs/pnode.c
+++ b/fs/pnode.c
@ -611,10 +611,10 @@ int propagate_umount(struct list_head *list)
 				continue;
 			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
 				/*
-				 * We have come accross an partially unmounted
-				 * mount in list that has not been visited yet.
-				 * Remember it has been visited and continue
-				 * about our merry way.
+				 * We have come across a partially unmounted
+				 * mount in a list that has not been visited
+				 * yet. Remember it has been visited and
+				 * continue about our merry way.
 				 */
 				list_add_tail(&child->mnt_umounting, &visited);
 				continue;
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@ -83,7 +83,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 	if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) {
 		res = ns_get_name(name, sizeof(name), task, ns_ops);
 		if (res >= 0)
-			res = readlink_copy(buffer, buflen, name);
+			res = readlink_copy(buffer, buflen, name, strlen(name));
 	}
 	put_task_struct(task);
 	return res;
--- a/include/linux/fiemap.h
+++ b/include/linux/fiemap.h
@ -5,12 +5,18 @@
 #include <uapi/linux/fiemap.h>
 #include <linux/fs.h>

+/**
+ * struct fiemap_extent_info - fiemap request to a filesystem
+ * @fi_flags:		Flags as passed from user
+ * @fi_extents_mapped:	Number of mapped extents
+ * @fi_extents_max:	Size of fiemap_extent array
+ * @fi_extents_start:	Start of fiemap_extent array
+ */
 struct fiemap_extent_info {
-	unsigned int fi_flags;		/* Flags as passed from user */
-	unsigned int fi_extents_mapped;	/* Number of mapped extents */
-	unsigned int fi_extents_max;	/* Size of fiemap_extent array */
-	struct fiemap_extent __user *fi_extents_start; /* Start of
-							fiemap_extent array */
+	unsigned int fi_flags;
+	unsigned int fi_extents_mapped;
+	unsigned int fi_extents_max;
+	struct fiemap_extent __user *fi_extents_start;
 };

 int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@ -626,6 +626,7 @@ is_uncached_acl(struct posix_acl *acl)
 #define IOP_XATTR	0x0008
 #define IOP_DEFAULT_READLINK	0x0010
 #define IOP_MGTIME	0x0020
+#define IOP_CACHED_LINK	0x0040

 /*
 * Keep mostly read-only and often accessed (especially for
@ -723,7 +724,10 @@ struct inode {
 	};
 	struct file_lock_context	*i_flctx;
 	struct address_space	i_data;
-	struct list_head	i_devices;
+	union {
+		struct list_head	i_devices;
+		int			i_linklen;
+	};
 	union {
 		struct pipe_inode_info	*i_pipe;
 		struct cdev		*i_cdev;
@ -749,6 +753,13 @@ struct inode {
 	void			*i_private; /* fs or device private pointer */
 } __randomize_layout;

+/**
+ * inode_set_cached_link - record an in-memory symlink target on an inode
+ * @inode:	symlink inode to update
+ * @link:	symlink target; the pointer is stored as-is, the string is
+ *		not copied
+ * @linklen:	length of @link in bytes; the callers in this patch pass
+ *		the length without the terminating NUL (i_size)
+ *
+ * Stores @link in ->i_link and @linklen in ->i_linklen, then sets
+ * IOP_CACHED_LINK in ->i_opflags. vfs_readlink() checks that flag and
+ * copies ->i_linklen bytes straight from ->i_link.
+ *
+ * NOTE(review): ->i_linklen shares a union with ->i_devices, so this is
+ * presumably only valid for inodes that never use ->i_devices - confirm.
+ */
+static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
+{
+	inode->i_link = link;
+	inode->i_linklen = linklen;
+	inode->i_opflags |= IOP_CACHED_LINK;
+}
+
 /*
 * Get bit address from inode->i_state to use with wait_var_event()
 * infrastructre.
@ -3351,7 +3362,7 @@ extern const struct file_operations generic_ro_fops;

 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))

-extern int readlink_copy(char __user *, int, const char *);
+extern int readlink_copy(char __user *, int, const char *, int);
 extern int page_readlink(struct dentry *, char __user *, int);
 extern const char *page_get_link(struct dentry *, struct inode *,
 				 struct delayed_call *);
@ -3468,7 +3479,6 @@ struct offset_ctx {
 void simple_offset_init(struct offset_ctx *octx);
 int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
-int simple_offset_empty(struct dentry *dentry);
 int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
 			 struct inode *new_dir, struct dentry *new_dentry);
 int simple_offset_rename_exchange(struct inode *old_dir,
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@ -76,7 +76,7 @@ struct vfsmount {
 static inline struct mnt_idmap *mnt_idmap(const struct vfsmount *mnt)
 {
 	/* Pairs with smp_store_release() in do_idmap_mount(). */
-	return smp_load_acquire(&mnt->mnt_idmap);
+	return READ_ONCE(mnt->mnt_idmap);
 }

 extern int mnt_want_write(struct vfsmount *mnt);
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@ -272,7 +272,7 @@ SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
 ({									\
 	unsigned __seq;							\
 									\
-	while ((__seq = seqprop_sequence(s)) & 1)			\
+	while (unlikely((__seq = seqprop_sequence(s)) & 1))		\
 		cpu_relax();						\
 									\
 	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);			\
--- a/include/uapi/linux/fiemap.h
+++ b/include/uapi/linux/fiemap.h
@ -14,37 +14,56 @@

 #include <linux/types.h>

+/**
+ * struct fiemap_extent - description of one fiemap extent
+ * @fe_logical: byte offset of the extent in the file
+ * @fe_physical: byte offset of extent on disk
+ * @fe_length: length in bytes for this extent
+ * @fe_flags: FIEMAP_EXTENT_* flags for this extent
+ */
 struct fiemap_extent {
-	__u64 fe_logical;  /* logical offset in bytes for the start of
-			    * the extent from the beginning of the file */
-	__u64 fe_physical; /* physical offset in bytes for the start
-			    * of the extent from the beginning of the disk */
-	__u64 fe_length;   /* length in bytes for this extent */
+	__u64 fe_logical;
+	__u64 fe_physical;
+	__u64 fe_length;
+	/* private: */
 	__u64 fe_reserved64[2];
-	__u32 fe_flags;    /* FIEMAP_EXTENT_* flags for this extent */
+	/* public: */
+	__u32 fe_flags;
+	/* private: */
 	__u32 fe_reserved[3];
 };

+/**
+ * struct fiemap - file extent mappings
+ * @fm_start: byte offset (inclusive) at which to start mapping (in)
+ * @fm_length: logical length of mapping which userspace wants (in)
+ * @fm_flags: FIEMAP_FLAG_* flags for request (in/out)
+ * @fm_mapped_extents: number of extents that were mapped (out)
+ * @fm_extent_count: size of fm_extents array (in)
+ * @fm_extents: array of mapped extents (out)
+ */
 struct fiemap {
-	__u64 fm_start;		/* logical offset (inclusive) at
-				 * which to start mapping (in) */
-	__u64 fm_length;	/* logical length of mapping which
-				 * userspace wants (in) */
-	__u32 fm_flags;		/* FIEMAP_FLAG_* flags for request (in/out) */
-	__u32 fm_mapped_extents;/* number of extents that were mapped (out) */
-	__u32 fm_extent_count;  /* size of fm_extents array (in) */
+	__u64 fm_start;
+	__u64 fm_length;
+	__u32 fm_flags;
+	__u32 fm_mapped_extents;
+	__u32 fm_extent_count;
+	/* private: */
 	__u32 fm_reserved;
-	struct fiemap_extent fm_extents[]; /* array of mapped extents (out) */
+	/* public: */
+	struct fiemap_extent fm_extents[];
 };

 #define FIEMAP_MAX_OFFSET	(~0ULL)

+/* flags used in fm_flags: */
 #define FIEMAP_FLAG_SYNC	0x00000001 /* sync file data before map */
 #define FIEMAP_FLAG_XATTR	0x00000002 /* map extended attribute tree */
 #define FIEMAP_FLAG_CACHE	0x00000004 /* request caching of the extents */

 #define FIEMAP_FLAGS_COMPAT	(FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)

+/* flags used in fe_flags: */
 #define FIEMAP_EXTENT_LAST		0x00000001 /* Last extent in file. */
 #define FIEMAP_EXTENT_UNKNOWN		0x00000002 /* Data location unknown. */
 #define FIEMAP_EXTENT_DELALLOC		0x00000004 /* Location still pending.
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
 	bit /= WATCH_QUEUE_NOTE_SIZE;

 	page = buf->page;
-	bit += page->index;
+	bit += page->private;

 	set_bit(bit, wqueue->notes_bitmap);
 	generic_pipe_buf_release(pipe, buf);
@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
 		pages[i] = alloc_page(GFP_KERNEL);
 		if (!pages[i])
 			goto error_p;
-		pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
+		pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE;
 	}

 	bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
--- a/mm/shmem.c
+++ b/mm/shmem.c
@ -3818,7 +3818,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)

 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	if (!simple_offset_empty(dentry))
+	if (!simple_empty(dentry))
 		return -ENOTEMPTY;

 	drop_nlink(d_inode(dentry));
@ -3875,7 +3875,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
 		return simple_offset_rename_exchange(old_dir, old_dentry,
 						     new_dir, new_dentry);

-	if (!simple_offset_empty(new_dentry))
+	if (!simple_empty(new_dentry))
 		return -ENOTEMPTY;

 	if (flags & RENAME_WHITEOUT) {
@ -3914,6 +3914,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	int len;
 	struct inode *inode;
 	struct folio *folio;
+	char *link;

 	len = strlen(symname) + 1;
 	if (len > PAGE_SIZE)
@ -3935,12 +3936,13 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,

 	inode->i_size = len-1;
 	if (len <= SHORT_SYMLINK_LEN) {
-		inode->i_link = kmemdup(symname, len, GFP_KERNEL);
-		if (!inode->i_link) {
+		link = kmemdup(symname, len, GFP_KERNEL);
+		if (!link) {
 			error = -ENOMEM;
 			goto out_remove_offset;
 		}
 		inode->i_op = &shmem_short_symlink_operations;
+		inode_set_cached_link(inode, link, len - 1);
 	} else {
 		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &shmem_aops;
--- a/samples/vfs/.gitignore
+++ b/samples/vfs/.gitignore
@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 /test-fsmount
 /test-statx
+/mountinfo
--- a/samples/vfs/Makefile
+++ b/samples/vfs/Makefile
@ -1,4 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-userprogs-always-y += test-fsmount test-statx
+userprogs-always-y += test-fsmount test-statx mountinfo

 userccflags += -I usr/include
--- a/samples/vfs/mountinfo.c
+++ b/samples/vfs/mountinfo.c
@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Use pidfds, nsfds, listmount() and statmount() to mimic the
+ * contents of /proc/self/mountinfo.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <linux/pidfd.h>
+#include <linux/mount.h>
+#include <linux/nsfs.h>
+#include <unistd.h>
+#include <alloca.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <errno.h>
+
+/* max mounts per listmount call */
+#define MAXMOUNTS		1024
+
+/* size of struct statmount (including trailing string buffer) */
+#define STATMOUNT_BUFSIZE	4096
+
+static bool ext_format;
+
+/*
+ * There are no bindings in glibc for listmount() and statmount() (yet),
+ * make our own here.
+ */
+static int statmount(uint64_t mnt_id, uint64_t mnt_ns_id, uint64_t mask,
+			    struct statmount *buf, size_t bufsize,
+			    unsigned int flags)
+{
+	/*
+	 * Raw statmount(2) wrapper. The VER1 request size is only used when
+	 * a mount namespace id is supplied; with a zero id the mnt_ns_id
+	 * field stays zero, just as a default initializer would leave it.
+	 */
+	struct mnt_id_req req = {
+		.size = mnt_ns_id ? MNT_ID_REQ_SIZE_VER1 : MNT_ID_REQ_SIZE_VER0,
+		.mnt_id = mnt_id,
+		.param = mask,
+		.mnt_ns_id = mnt_ns_id,
+	};
+
+	return syscall(__NR_statmount, &req, buf, bufsize, flags);
+}
+
+static ssize_t listmount(uint64_t mnt_id, uint64_t mnt_ns_id,
+			 uint64_t last_mnt_id, uint64_t list[], size_t num,
+			 unsigned int flags)
+{
+	/*
+	 * Raw listmount(2) wrapper. @last_mnt_id travels in the request's
+	 * param field. The VER1 request size is only used when a mount
+	 * namespace id is supplied; otherwise mnt_ns_id stays zero.
+	 */
+	struct mnt_id_req req = {
+		.size = mnt_ns_id ? MNT_ID_REQ_SIZE_VER1 : MNT_ID_REQ_SIZE_VER0,
+		.mnt_id = mnt_id,
+		.param = last_mnt_id,
+		.mnt_ns_id = mnt_ns_id,
+	};
+
+	return syscall(__NR_listmount, &req, list, num, flags);
+}
+
+/* Print the per-mount attribute bits in @flags as a comma-separated list. */
+static void show_mnt_attrs(uint64_t flags)
+{
+	const uint64_t atime = flags & MOUNT_ATTR__ATIME;
+
+	/* The list always starts with the read-only/read-write state. */
+	fputs(flags & MOUNT_ATTR_RDONLY ? "ro" : "rw", stdout);
+
+	if (flags & MOUNT_ATTR_NOSUID)
+		fputs(",nosuid", stdout);
+	if (flags & MOUNT_ATTR_NODEV)
+		fputs(",nodev", stdout);
+	if (flags & MOUNT_ATTR_NOEXEC)
+		fputs(",noexec", stdout);
+
+	/* MOUNT_ATTR_STRICTATIME (and anything else) prints nothing. */
+	if (atime == MOUNT_ATTR_RELATIME)
+		fputs(",relatime", stdout);
+	else if (atime == MOUNT_ATTR_NOATIME)
+		fputs(",noatime", stdout);
+
+	if (flags & MOUNT_ATTR_NODIRATIME)
+		fputs(",nodiratime", stdout);
+	if (flags & MOUNT_ATTR_NOSYMFOLLOW)
+		fputs(",nosymfollow", stdout);
+	if (flags & MOUNT_ATTR_IDMAP)
+		fputs(",idmapped", stdout);
+}
+
+/* Print the propagation fields of @sm: shared, master and unbindable. */
+static void show_propagation(struct statmount *sm)
+{
+	const bool shared = sm->mnt_propagation & MS_SHARED;
+	const bool slave = sm->mnt_propagation & MS_SLAVE;
+	const bool unbindable = sm->mnt_propagation & MS_UNBINDABLE;
+
+	if (shared)
+		printf(" shared:%llu", sm->mnt_peer_group);
+
+	if (slave)
+		printf(" master:%llu", sm->mnt_master);
+	/* propagate_from is only shown when set and distinct from the master */
+	if (slave && sm->propagate_from && sm->propagate_from != sm->mnt_master)
+		printf(" propagate_from:%llu", sm->propagate_from);
+
+	if (unbindable)
+		printf(" unbindable");
+}
+
+/* Print the superblock flag bits in @flags as a comma-separated list. */
+static void show_sb_flags(uint64_t flags)
+{
+	/* Optional flags, in the order they are printed. */
+	static const struct {
+		uint64_t bit;
+		const char *name;
+	} opts[] = {
+		{ MS_SYNCHRONOUS,	",sync" },
+		{ MS_DIRSYNC,		",dirsync" },
+		{ MS_MANDLOCK,		",mand" },
+		{ MS_LAZYTIME,		",lazytime" },
+	};
+	size_t i;
+
+	printf("%s", flags & MS_RDONLY ? "ro" : "rw");
+	for (i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
+		if (flags & opts[i].bit)
+			printf("%s", opts[i].name);
+	}
+}
+
+/*
+ * Query one mount with statmount() and print it as a single
+ * /proc/self/mountinfo-style line. With ext_format set, the line is
+ * prefixed with the mount namespace id, the mount id and the parent
+ * mount id in hex.
+ *
+ * Returns 0 on success, 1 if statmount() fails (after perror()).
+ */
+static int dump_mountinfo(uint64_t mnt_id, uint64_t mnt_ns_id)
+{
+	int ret;
+	struct statmount *buf = alloca(STATMOUNT_BUFSIZE);
+	const uint64_t mask = STATMOUNT_SB_BASIC | STATMOUNT_MNT_BASIC |
+				STATMOUNT_PROPAGATE_FROM | STATMOUNT_FS_TYPE |
+				STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT |
+				STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE |
+				STATMOUNT_SB_SOURCE;
+
+	ret = statmount(mnt_id, mnt_ns_id, mask, buf, STATMOUNT_BUFSIZE, 0);
+	if (ret < 0) {
+		perror("statmount");
+		return 1;
+	}
+
+	/*
+	 * uint64_t is not "unsigned long" on every ABI, so cast to
+	 * unsigned long long and print with %llx to keep the format
+	 * string and the arguments in agreement.
+	 */
+	if (ext_format)
+		printf("0x%llx 0x%llx 0x%llx ",
+		       (unsigned long long)mnt_ns_id,
+		       (unsigned long long)mnt_id,
+		       (unsigned long long)buf->mnt_parent_id);
+
+	/* String fields are offsets into the trailing buf->str[] area. */
+	printf("%u %u %u:%u %s %s ", buf->mnt_id_old, buf->mnt_parent_id_old,
+				   buf->sb_dev_major, buf->sb_dev_minor,
+				   &buf->str[buf->mnt_root],
+				   &buf->str[buf->mnt_point]);
+	show_mnt_attrs(buf->mnt_attr);
+	show_propagation(buf);
+
+	printf(" - %s", &buf->str[buf->fs_type]);
+	/* Optional strings are only valid when the result mask reports them. */
+	if (buf->mask & STATMOUNT_FS_SUBTYPE)
+		printf(".%s", &buf->str[buf->fs_subtype]);
+	if (buf->mask & STATMOUNT_SB_SOURCE)
+		printf(" %s ", &buf->str[buf->sb_source]);
+	else
+		printf(" :none ");
+
+	show_sb_flags(buf->sb_flags);
+	if (buf->mask & STATMOUNT_MNT_OPTS)
+		printf(",%s", &buf->str[buf->mnt_opts]);
+	printf("\n");
+	return 0;
+}
+
+/*
+ * Print every mount in the mount namespace @mnt_ns_id.
+ *
+ * The mount ids are fetched in batches of at most MAXMOUNTS; a full
+ * batch means there may be more, so the walk resumes after the last id
+ * seen. Returns 0 on success, non-zero if listmount() fails or a mount
+ * cannot be dumped.
+ */
+static int dump_mounts(uint64_t mnt_ns_id)
+{
+	uint64_t mntid[MAXMOUNTS];
+	uint64_t last_mnt_id = 0;
+	ssize_t count;
+	ssize_t i;
+
+	/*
+	 * Get a list of all mntids in mnt_ns_id. If it returns MAXMOUNTS
+	 * mounts, then go again until we get everything.
+	 */
+	do {
+		count = listmount(LSMT_ROOT, mnt_ns_id, last_mnt_id, mntid, MAXMOUNTS, 0);
+		if (count < 0) {
+			perror("listmount");
+			return 1;
+		}
+		if (count > MAXMOUNTS) {
+			/* More ids than the buffer holds: report it as an overflow */
+			errno = EOVERFLOW;
+			perror("listmount");
+			return 1;
+		}
+		/* Nothing returned: do not index mntid[count - 1] below */
+		if (count == 0)
+			break;
+
+		/* Walk the returned mntids and print info about each */
+		for (i = 0; i < count; ++i) {
+			int ret = dump_mountinfo(mntid[i], mnt_ns_id);
+
+			if (ret != 0)
+				return ret;
+		}
+		/* Set up last_mnt_id to pick up where we left off */
+		last_mnt_id = mntid[count - 1];
+	} while (count == MAXMOUNTS);
+	return 0;
+}
+
+/* Print the command-line help for @prog on stdout. */
+static void usage(const char * const prog)
+{
+	fputs("Usage:\n", stdout);
+	printf("%s [-e] [-p pid] [-r] [-h]\n", prog);
+	fputs("    -e: extended format\n", stdout);
+	fputs("    -h: print usage message\n", stdout);
+	fputs("    -p: get mount namespace from given pid\n", stdout);
+	fputs("    -r: recursively print all mounts in all child namespaces\n", stdout);
+}
+
+/*
+ * Resolve the mount namespace of the target pid (the current process
+ * unless -p is given) and dump its mounts. With -r, keep stepping to
+ * the next mount namespace with NS_MNT_GET_NEXT until the ioctl fails.
+ *
+ * Returns 0 on success and non-zero on bad arguments or any failure.
+ */
+int main(int argc, char * const *argv)
+{
+	struct mnt_ns_info mni = { .size = MNT_NS_INFO_SIZE_VER0 };
+	int pidfd, mntns, ret, opt;
+	pid_t pid = getpid();
+	bool recursive = false;
+
+	while ((opt = getopt(argc, argv, "ehp:r")) != -1) {
+		switch (opt) {
+		case 'e':
+			ext_format = true;
+			break;
+		case 'h':
+			usage(argv[0]);
+			return 0;
+		case 'p': {
+			char *end;
+			long val;
+
+			/* strtol() lets us reject garbage instead of using pid 0 */
+			errno = 0;
+			val = strtol(optarg, &end, 10);
+			if (errno || end == optarg || *end != '\0' ||
+			    val <= 0 || (long)(pid_t)val != val) {
+				fprintf(stderr, "invalid pid: %s\n", optarg);
+				return 1;
+			}
+			pid = (pid_t)val;
+			break;
+		}
+		case 'r':
+			recursive = true;
+			break;
+		default:
+			/* getopt() already printed a diagnostic */
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	/* Get a pidfd for pid */
+	pidfd = syscall(SYS_pidfd_open, pid, 0);
+	if (pidfd < 0) {
+		perror("pidfd_open");
+		return 1;
+	}
+
+	/* Get the mnt namespace for pidfd */
+	mntns = ioctl(pidfd, PIDFD_GET_MNT_NAMESPACE, NULL);
+	if (mntns < 0) {
+		perror("PIDFD_GET_MNT_NAMESPACE");
+		close(pidfd);
+		return 1;
+	}
+	close(pidfd);
+
+	/* get info about mntns. In particular, the mnt_ns_id */
+	ret = ioctl(mntns, NS_MNT_GET_INFO, &mni);
+	if (ret < 0) {
+		perror("NS_MNT_GET_INFO");
+		close(mntns);
+		return 1;
+	}
+
+	do {
+		ret = dump_mounts(mni.mnt_ns_id);
+		if (ret) {
+			close(mntns);
+			return ret;
+		}
+
+		if (!recursive)
+			break;
+
+		/* get the next mntns (and overwrite the old mount ns info) */
+		ret = ioctl(mntns, NS_MNT_GET_NEXT, &mni);
+		close(mntns);
+		mntns = ret;
+	} while (mntns >= 0);
+
+	/* Still open when the loop was left through the !recursive break */
+	if (mntns >= 0)
+		close(mntns);
+
+	return 0;
+}
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@ -2612,7 +2612,7 @@ static int policy_readlink(struct dentry *dentry, char __user *buffer,
 	res = snprintf(name, sizeof(name), "%s:[%lu]", AAFS_NAME,
 		       d_inode(dentry)->i_ino);
 	if (res > 0 && res < sizeof(name))
-		res = readlink_copy(buffer, buflen, name);
+		res = readlink_copy(buffer, buflen, name, strlen(name));
 	else
 		res = -ENOENT;