From 1934b212615dc617ac84fc306333ab2b9fc3b04f Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Fri, 9 Aug 2024 18:00:01 +0200
Subject: [PATCH 01/26] file: reclaim 24 bytes from f_owner

We do embed struct fown_struct into struct file letting it take up 32 bytes
in total. We could tweak struct fown_struct to be more compact but really it
shouldn't even be embedded in struct file in the first place. Instead, actual
users of struct fown_struct should allocate the struct on demand. This frees
up 24 bytes in struct file.

That will have some potentially user-visible changes for the ownership
fcntl()s. Some of them can now fail due to allocation failures. Practically,
that probably will almost never happen as the allocations are small and they
only happen once per file.

The fown_struct is used during kill_fasync() which is used by, e.g., pipes to
generate a SIGIO signal. Sending of such signals is conditional on userspace
having set an owner for the file using one of the F_OWNER fcntl()s. Such
users will be unaffected if struct fown_struct is allocated during the
fcntl() call.

There are a few subsystems that call __f_setown() expecting file->f_owner to
be allocated:

(1) tun devices
    file->f_op->fasync::tun_chr_fasync()
    -> __f_setown()

    There are no other callers of tun_chr_fasync().

(2) tty devices
    file->f_op->fasync::tty_fasync()
    -> __tty_fasync()
    -> __f_setown()

    tty_fasync() has no additional callers but __tty_fasync() has. Note that
    __tty_fasync() only calls __f_setown() if the @on argument is true. It's
    called from:

    file->f_op->release::tty_release()
    -> tty_release()
    -> __tty_fasync()
    -> __f_setown()

    tty_release() calls __tty_fasync() with @on false
    => __f_setown() is never called from tty_release().
    => All callers of tty_release() are safe as well.

    file->f_op->release::tty_open()
    -> tty_release()
    -> __tty_fasync()
    -> __f_setown()

    __tty_hangup() calls __tty_fasync() with @on false
    => __f_setown() is never called from __tty_hangup().
    => All callers of __tty_hangup() are safe as well.

From the callchains it's obvious that (1) and (2) end up getting called via
file->f_op->fasync(). That can happen either through the F_SETFL fcntl() with
the FASYNC flag raised or via the FIOASYNC ioctl(). If FASYNC is requested
and the file isn't already FASYNC then file->f_op->fasync() is called with
@on true which ends up causing both (1) and (2) to call __f_setown().

(1) and (2) are the only subsystems that call __f_setown() from the
file->f_op->fasync() handler. So both (1) and (2) have been updated to
allocate a struct fown_struct prior to calling fasync_helper() to register
with the fasync infrastructure. That's safe as they both call fasync_helper()
which also does allocations if @on is true.

The other interesting case is file leases:

(3) file leases
    lease_manager_ops->lm_setup::lease_setup()
    -> __f_setown()

    Which in turn is called from:

    generic_add_lease()
    -> lease_manager_ops->lm_setup::lease_setup()
    -> __f_setown()

So here again we can simply make generic_add_lease() allocate struct
fown_struct prior to the lease_manager_ops->lm_setup::lease_setup() which
happens under a spinlock.

With that the two remaining subsystems that call __f_setown() are:

(4) dnotify
(5) sockets

Both have their own custom ioctls to set struct fown_struct and both have
been converted to allocate a struct fown_struct on demand from their
respective ioctls.

Interactions with O_PATH are fine as well, e.g., when opening a /dev/tty as
O_PATH then no file->f_op->open() happens thus no file->f_owner is
allocated.
That's fine as no file operation will be set for those and the device has never been opened. fcntl()s called on such things will just allocate a ->f_owner on demand. Although I have zero idea why'd you care about f_owner on an O_PATH fd. Link: https://lore.kernel.org/r/20240813-work-f_owner-v2-1-4e9343a79f9f@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- drivers/net/tun.c | 6 ++ drivers/tty/tty_io.c | 6 ++ fs/fcntl.c | 166 ++++++++++++++++++++++++++++-------- fs/file_table.c | 3 +- fs/internal.h | 1 + fs/locks.c | 6 +- fs/notify/dnotify/dnotify.c | 6 +- include/linux/fs.h | 11 ++- net/core/sock.c | 2 +- security/selinux/hooks.c | 2 +- security/smack/smack_lsm.c | 2 +- 11 files changed, 168 insertions(+), 43 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1d06c560c5e6..6fe5e8f7017c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -3451,6 +3451,12 @@ static int tun_chr_fasync(int fd, struct file *file, int on) struct tun_file *tfile = file->private_data; int ret; + if (on) { + ret = file_f_owner_allocate(file); + if (ret) + goto out; + } + if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) goto out; diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 407b0d87b7c1..7ae0c8934f42 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2225,6 +2225,12 @@ static int __tty_fasync(int fd, struct file *filp, int on) if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) goto out; + if (on) { + retval = file_f_owner_allocate(filp); + if (retval) + goto out; + } + retval = fasync_helper(fd, filp, on, &tty->fasync); if (retval <= 0) goto out; diff --git a/fs/fcntl.c b/fs/fcntl.c index 300e5d9ad913..0587a0e221a6 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -33,6 +33,8 @@ #include #include +#include "internal.h" + #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) static int setfl(int fd, struct file * filp, unsigned int arg) @@ -87,22 +89,64 @@ static int setfl(int fd, struct file * filp, unsigned int arg) return error; } +/* + * Allocate an file->f_owner struct if it doesn't exist, handling racing + * allocations correctly. + */ +int file_f_owner_allocate(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) + return 0; + + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); + if (!f_owner) + return -ENOMEM; + + rwlock_init(&f_owner->lock); + f_owner->file = file; + /* If someone else raced us, drop our allocation. 
*/ + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) + kfree(f_owner); + return 0; +} +EXPORT_SYMBOL(file_f_owner_allocate); + +void file_f_owner_release(struct file *file) +{ + struct fown_struct *f_owner; + + f_owner = file_f_owner(file); + if (f_owner) { + put_pid(f_owner->pid); + kfree(f_owner); + } +} + static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - write_lock_irq(&filp->f_owner.lock); - if (force || !filp->f_owner.pid) { - put_pid(filp->f_owner.pid); - filp->f_owner.pid = get_pid(pid); - filp->f_owner.pid_type = type; + struct fown_struct *f_owner; + + f_owner = file_f_owner(filp); + if (WARN_ON_ONCE(!f_owner)) + return; + + write_lock_irq(&f_owner->lock); + if (force || !f_owner->pid) { + put_pid(f_owner->pid); + f_owner->pid = get_pid(pid); + f_owner->pid_type = type; if (pid) { const struct cred *cred = current_cred(); - filp->f_owner.uid = cred->uid; - filp->f_owner.euid = cred->euid; + f_owner->uid = cred->uid; + f_owner->euid = cred->euid; } } - write_unlock_irq(&filp->f_owner.lock); + write_unlock_irq(&f_owner->lock); } void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, @@ -119,6 +163,8 @@ int f_setown(struct file *filp, int who, int force) struct pid *pid = NULL; int ret = 0; + might_sleep(); + type = PIDTYPE_TGID; if (who < 0) { /* avoid overflow below */ @@ -129,6 +175,10 @@ int f_setown(struct file *filp, int who, int force) who = -who; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); if (who) { pid = find_vpid(who); @@ -152,16 +202,21 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; + struct fown_struct *f_owner; - read_lock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (!f_owner) + return pid; + + read_lock_irq(&f_owner->lock); rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { - pid = pid_vnr(filp->f_owner.pid); - if (filp->f_owner.pid_type == PIDTYPE_PGID) + if (pid_task(f_owner->pid, f_owner->pid_type)) { + pid = pid_vnr(f_owner->pid); + if (f_owner->pid_type == PIDTYPE_PGID) pid = -pid; } rcu_read_unlock(); - read_unlock_irq(&filp->f_owner.lock); + read_unlock_irq(&f_owner->lock); return pid; } @@ -194,6 +249,10 @@ static int f_setown_ex(struct file *filp, unsigned long arg) return -EINVAL; } + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + rcu_read_lock(); pid = find_vpid(owner.pid); if (owner.pid && !pid) @@ -210,13 +269,20 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex __user *owner_p = (void __user *)arg; struct f_owner_ex owner = {}; int ret = 0; + struct fown_struct *f_owner; + enum pid_type pid_type = PIDTYPE_PID; - read_lock_irq(&filp->f_owner.lock); - rcu_read_lock(); - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) - owner.pid = pid_vnr(filp->f_owner.pid); - rcu_read_unlock(); - switch (filp->f_owner.pid_type) { + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + rcu_read_lock(); + if (pid_task(f_owner->pid, f_owner->pid_type)) + owner.pid = pid_vnr(f_owner->pid); + rcu_read_unlock(); + pid_type = f_owner->pid_type; + } + + switch (pid_type) { case PIDTYPE_PID: owner.type = F_OWNER_TID; break; @@ -234,7 +300,8 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock_irq(&filp->f_owner.lock); + if (f_owner) + read_unlock_irq(&f_owner->lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -248,14 +315,18 @@ static int 
f_getown_ex(struct file *filp, unsigned long arg) static int f_getowner_uids(struct file *filp, unsigned long arg) { struct user_namespace *user_ns = current_user_ns(); + struct fown_struct *f_owner; uid_t __user *dst = (void __user *)arg; - uid_t src[2]; + uid_t src[2] = {0, 0}; int err; - read_lock_irq(&filp->f_owner.lock); - src[0] = from_kuid(user_ns, filp->f_owner.uid); - src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock_irq(&filp->f_owner.lock); + f_owner = file_f_owner(filp); + if (f_owner) { + read_lock_irq(&f_owner->lock); + src[0] = from_kuid(user_ns, f_owner->uid); + src[1] = from_kuid(user_ns, f_owner->euid); + read_unlock_irq(&f_owner->lock); + } err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -343,6 +414,30 @@ static long f_dupfd_query(int fd, struct file *filp) return f.file == filp; } +static int f_owner_sig(struct file *filp, int signum, bool setsig) +{ + int ret = 0; + struct fown_struct *f_owner; + + might_sleep(); + + if (setsig) { + if (!valid_signal(signum)) + return -EINVAL; + + ret = file_f_owner_allocate(filp); + if (ret) + return ret; + } + + f_owner = file_f_owner(filp); + if (setsig) + f_owner->signum = signum; + else if (f_owner) + ret = f_owner->signum; + return ret; +} + static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { @@ -421,15 +516,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = f_getowner_uids(filp, arg); break; case F_GETSIG: - err = filp->f_owner.signum; + err = f_owner_sig(filp, 0, false); break; case F_SETSIG: - /* arg == 0 restores default behaviour. */ - if (!valid_signal(argi)) { - break; - } - err = 0; - filp->f_owner.signum = argi; + err = f_owner_sig(filp, argi, true); break; case F_GETLEASE: err = fcntl_getlease(filp); @@ -844,14 +934,19 @@ static void send_sigurg_to_task(struct task_struct *p, do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); } -int send_sigurg(struct fown_struct *fown) +int send_sigurg(struct file *file) { + struct fown_struct *fown; struct task_struct *p; enum pid_type type; struct pid *pid; unsigned long flags; int ret = 0; + fown = file_f_owner(file); + if (!fown) + return 0; + read_lock_irqsave(&fown->lock, flags); type = fown->pid_type; @@ -1027,13 +1122,16 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) } read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { - fown = &fa->fa_file->f_owner; + fown = file_f_owner(fa->fa_file); + if (!fown) + goto next; /* Don't send SIGURG to processes which have not set a queued signum: SIGURG has its own default signalling mechanism. 
*/ if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } +next: read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } diff --git a/fs/file_table.c b/fs/file_table.c index ca7843dde56d..fdf98709dde2 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -155,7 +155,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); f->f_flags = flags; @@ -425,7 +424,7 @@ static void __fput(struct file *file) cdev_put(inode->i_cdev); } fops_put(file->f_op); - put_pid(file->f_owner.pid); + file_f_owner_release(file); put_file_access(file); dput(dentry); if (unlikely(mode & FMODE_NEED_UNMOUNT)) diff --git a/fs/internal.h b/fs/internal.h index cdd73209eecb..8c1b7acbbe8f 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -337,3 +337,4 @@ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } +void file_f_owner_release(struct file *file); diff --git a/fs/locks.c b/fs/locks.c index e45cad40f8b6..b51b1c395ce6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1451,7 +1451,7 @@ int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose) struct file *filp = fl->c.flc_file; f_delown(filp); - filp->f_owner.signum = 0; + file_f_owner(filp)->signum = 0; fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); if (fl->fl_fasync != NULL) { printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); @@ -1783,6 +1783,10 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); + error = file_f_owner_allocate(filp); + if (error) + return error; + /* Note that arg is never F_UNLCK here */ ctx = locks_get_lock_context(inode, arg); if (!ctx) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index f3669403fabf..46440fbb8662 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -110,7 +110,7 @@ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, prev = &dn->dn_next; continue; } - fown = &dn->dn_filp->f_owner; + fown = file_f_owner(dn->dn_filp); send_sigio(fown, dn->dn_fd, POLL_MSG); if (dn->dn_mask & FS_DN_MULTISHOT) prev = &dn->dn_next; @@ -316,6 +316,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) goto out_err; } + error = file_f_owner_allocate(filp); + if (error) + goto out_err; + /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; fsnotify_init_mark(new_fsn_mark, dnotify_group); diff --git a/include/linux/fs.h b/include/linux/fs.h index fb0426f349fc..7af239ca87e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -947,6 +947,7 @@ static inline unsigned imajor(const struct inode *inode) } struct fown_struct { + struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ @@ -1011,7 +1012,7 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; - struct fown_struct f_owner; + struct fown_struct *f_owner; const struct cred *f_cred; struct file_ra_state f_ra; struct path f_path; @@ -1076,6 +1077,12 @@ struct file_lease; #define OFFT_OFFSET_MAX type_max(off_t) #endif +int file_f_owner_allocate(struct file *file); +static inline struct fown_struct *file_f_owner(const 
struct file *file) +{ + return READ_ONCE(file->f_owner); +} + extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) @@ -1124,7 +1131,7 @@ extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); -extern int send_sigurg(struct fown_struct *fown); +extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where diff --git a/net/core/sock.c b/net/core/sock.c index 9abc4fe25953..bbe4c58470c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3429,7 +3429,7 @@ static void sock_def_destruct(struct sock *sk) void sk_send_sigurg(struct sock *sk) { if (sk->sk_socket && sk->sk_socket->file) - if (send_sigurg(&sk->sk_socket->file->f_owner)) + if (send_sigurg(sk->sk_socket->file)) sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); } EXPORT_SYMBOL(sk_send_sigurg); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index bfa61e005aac..577e1fcbf6df 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3950,7 +3950,7 @@ static int selinux_file_send_sigiotask(struct task_struct *tsk, struct file_security_struct *fsec; /* struct fown_struct is never outside the context of a struct file */ - file = container_of(fown, struct file, f_owner); + file = fown->file; fsec = selinux_file(file); diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 4164699cd4f6..cb33920ab67c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1950,7 +1950,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, /* * struct fown_struct is never outside the context of a struct file */ - file = container_of(fown, struct file, f_owner); + file = fown->file; /* we don't log here as rc can be overriden */ blob = smack_file(file); From a55d1cbd1720679cfe9837bce250e397ec513989 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 22 Aug 2024 16:14:46 +0200 Subject: [PATCH 02/26] fs: switch f_iocb_flags and f_ra Now that we shrank struct file by 24 bytes we still have a 4 byte hole. If we move struct file_ra_state into the union and f_iocb_flags out of the union we close that whole and bring down struct file to 192 bytes. Which means struct file is 3 cachelines and we managed to shrink it by 40 bytes this cycle. I've tried to audit all codepaths that use f_ra and none of them seem to rely on it in file->f_op->release() and never have since commit 1da177e4c3f4 ("Linux-2.6.12-rc2"). Link: https://lore.kernel.org/r/20240823-luftdicht-berappen-d69a2166a0db@brauner Reviewed-by: Jeff Layton Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7af239ca87e2..095a956aeb29 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -999,9 +999,9 @@ struct file { struct callback_head f_task_work; /* fput() must use workqueue (most kernel threads). */ struct llist_node f_llist; - unsigned int f_iocb_flags; + /* Invalid after last fput(). */ + struct file_ra_state f_ra; }; - /* * Protects f_ep, f_flags. * Must not be taken from IRQ context. 
@@ -1012,9 +1012,9 @@ struct file { struct mutex f_pos_lock; loff_t f_pos; unsigned int f_flags; + unsigned int f_iocb_flags; struct fown_struct *f_owner; const struct cred *f_cred; - struct file_ra_state f_ra; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; From c0390d541128e8820af8177a572d9d87ff68a3bb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 23 Aug 2024 21:06:58 +0200 Subject: [PATCH 03/26] fs: pack struct file Now that we shrunk struct file to 192 bytes aka 3 cachelines reorder struct file to not leave any holes or have members cross cachelines. Add a short comment to each of the fields and mark the cachelines. It's possible that we may have to tweak this based on profiling in the future. So far I had Jens test this comparing io_uring with non-fixed and fixed files and it improved performance. The layout is a combination of Jens' and my changes. Link: https: //lore.kernel.org/r/20240824-peinigen-hocken-7384b977c643@brauner Signed-off-by: Christian Brauner --- include/linux/fs.h | 91 ++++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 40 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 095a956aeb29..af8bbd4eeb3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -987,52 +987,63 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } -/* - * f_{lock,count,pos_lock} members can be highly contended and share - * the same cacheline. f_{lock,mode} are very frequently used together - * and so share the same cacheline as well. The read-mostly - * f_{path,inode,op} are kept on a separate cacheline. +/** + * struct file - Represents a file + * @f_count: reference count + * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. + * @f_mode: FMODE_* flags often used in hotpaths + * @f_op: file operations + * @f_mapping: Contents of a cacheable, mappable object. + * @private_data: filesystem or driver specific data + * @f_inode: cached inode + * @f_flags: file flags + * @f_iocb_flags: iocb flags + * @f_cred: stashed credentials of creator/opener + * @f_path: path of the file + * @f_pos_lock: lock protecting file position + * @f_pos: file position + * @f_version: file version + * @f_security: LSM security context of this file + * @f_owner: file owner + * @f_wb_err: writeback error + * @f_sb_err: per sb writeback errors + * @f_ep: link of all epoll hooks for this file + * @f_task_work: task work entry point + * @f_llist: work queue entrypoint + * @f_ra: file's readahead state */ struct file { + atomic_long_t f_count; + spinlock_t f_lock; + fmode_t f_mode; + const struct file_operations *f_op; + struct address_space *f_mapping; + void *private_data; + struct inode *f_inode; + unsigned int f_flags; + unsigned int f_iocb_flags; + const struct cred *f_cred; + /* --- cacheline 1 boundary (64 bytes) --- */ + struct path f_path; + struct mutex f_pos_lock; + loff_t f_pos; + u64 f_version; + /* --- cacheline 2 boundary (128 bytes) --- */ +#ifdef CONFIG_SECURITY + void *f_security; +#endif + struct fown_struct *f_owner; + errseq_t f_wb_err; + errseq_t f_sb_err; +#ifdef CONFIG_EPOLL + struct hlist_head *f_ep; +#endif union { - /* fput() uses task work when closing and freeing file (default). */ - struct callback_head f_task_work; - /* fput() must use workqueue (most kernel threads). */ + struct callback_head f_task_work; struct llist_node f_llist; - /* Invalid after last fput(). 
*/ struct file_ra_state f_ra; }; - /* - * Protects f_ep, f_flags. - * Must not be taken from IRQ context. - */ - spinlock_t f_lock; - fmode_t f_mode; - atomic_long_t f_count; - struct mutex f_pos_lock; - loff_t f_pos; - unsigned int f_flags; - unsigned int f_iocb_flags; - struct fown_struct *f_owner; - const struct cred *f_cred; - struct path f_path; - struct inode *f_inode; /* cached value */ - const struct file_operations *f_op; - - u64 f_version; -#ifdef CONFIG_SECURITY - void *f_security; -#endif - /* needed for tty driver, and maybe others */ - void *private_data; - -#ifdef CONFIG_EPOLL - /* Used by fs/eventpoll.c to link all the hooks to this file */ - struct hlist_head *f_ep; -#endif /* #ifdef CONFIG_EPOLL */ - struct address_space *f_mapping; - errseq_t f_wb_err; - errseq_t f_sb_err; /* for syncfs */ + /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ From e446f18e98e89fb7de2b320620ce983929bb2486 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:23 +0200 Subject: [PATCH 04/26] mm: remove unused argument from create_cache() That root_cache argument is unused so remove it. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-1-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- mm/slab_common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 40b582a014b8..c8dd7e08c5f6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -204,8 +204,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, static struct kmem_cache *create_cache(const char *name, unsigned int object_size, unsigned int align, slab_flags_t flags, unsigned int useroffset, - unsigned int usersize, void (*ctor)(void *), - struct kmem_cache *root_cache) + unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache *s; int err; @@ -334,7 +333,7 @@ kmem_cache_create_usercopy(const char *name, s = create_cache(cache_name, size, calculate_alignment(flags, align, size), - flags, useroffset, usersize, ctor, NULL); + flags, useroffset, usersize, ctor); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); From d345bd2e9834e2da505977e154a1c179c793b7b2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:24 +0200 Subject: [PATCH 05/26] mm: add kmem_cache_create_rcu() When a kmem cache is created with SLAB_TYPESAFE_BY_RCU the free pointer must be located outside of the object because we don't know what part of the memory can safely be overwritten as it may be needed to prevent object recycling. That has the consequence that SLAB_TYPESAFE_BY_RCU may end up adding a new cacheline. This is the case for e.g., struct file. After having it shrunk down by 40 bytes and having it fit in three cachelines we still have SLAB_TYPESAFE_BY_RCU adding a fourth cacheline because it needs to accommodate the free pointer. Add a new kmem_cache_create_rcu() function that allows the caller to specify an offset where the free pointer is supposed to be placed. 
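As a rough usage sketch (not part of this patch; struct foo, its fields, and foo_cachep are made-up names), a SLAB_TYPESAFE_BY_RCU user reserves a freeptr_t inside its object, typically in a union with state that is only meaningful while the object is live, and passes its offset to the new helper:

    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <linux/list.h>

    struct foo {
            spinlock_t lock;
            unsigned long state;
            union {
                    struct list_head gc_list;  /* only valid while the object is live */
                    freeptr_t freeptr;         /* reused by SLUB once the object is freed */
            };
    };

    static struct kmem_cache *foo_cachep;

    static int foo_cache_init(void)
    {
            /* SLAB_TYPESAFE_BY_RCU is added implicitly by kmem_cache_create_rcu(). */
            foo_cachep = kmem_cache_create_rcu("foo", sizeof(struct foo),
                                               offsetof(struct foo, freeptr),
                                               SLAB_ACCOUNT);
            return foo_cachep ? 0 : -ENOMEM;
    }

With that the cache no longer needs the external free pointer that SLAB_TYPESAFE_BY_RCU would otherwise append, which is how struct file avoids growing a fourth cacheline in the follow-up patch.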
Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-2-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- include/linux/slab.h | 9 +++ mm/slab.h | 2 + mm/slab_common.c | 136 ++++++++++++++++++++++++++++++------------- mm/slub.c | 20 ++++--- 4 files changed, 121 insertions(+), 46 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..5b2da2cf31a8 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -212,6 +212,12 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * @@ -242,6 +248,9 @@ struct kmem_cache *kmem_cache_create_usercopy(const char *name, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, + unsigned int freeptr_offset, + slab_flags_t flags); void kmem_cache_destroy(struct kmem_cache *s); int kmem_cache_shrink(struct kmem_cache *s); diff --git a/mm/slab.h b/mm/slab.h index dcdb56b8e7f5..a6051385186e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -261,6 +261,8 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ + /* Specific free pointer requested (if not UINT_MAX) */ + unsigned int rcu_freeptr_offset; #ifdef CONFIG_SLUB_CPU_PARTIAL /* Number of per cpu partial objects to keep around */ unsigned int cpu_partial; diff --git a/mm/slab_common.c b/mm/slab_common.c index c8dd7e08c5f6..887f6b9855dd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -202,9 +202,10 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, } static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int align, - slab_flags_t flags, unsigned int useroffset, - unsigned int usersize, void (*ctor)(void *)) + unsigned int object_size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) { struct kmem_cache *s; int err; @@ -212,6 +213,13 @@ static struct kmem_cache *create_cache(const char *name, if (WARN_ON(useroffset + usersize > object_size)) useroffset = usersize = 0; + /* If a custom freelist pointer is requested make sure it's sane. 
*/ + err = -EINVAL; + if (freeptr_offset != UINT_MAX && + (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || + !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)))) + goto out; + err = -ENOMEM; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); if (!s) @@ -219,13 +227,13 @@ static struct kmem_cache *create_cache(const char *name, s->name = name; s->size = s->object_size = object_size; + s->rcu_freeptr_offset = freeptr_offset; s->align = align; s->ctor = ctor; #ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; #endif - err = __kmem_cache_create(s, flags); if (err) goto out_free_cache; @@ -240,38 +248,10 @@ static struct kmem_cache *create_cache(const char *name, return ERR_PTR(err); } -/** - * kmem_cache_create_usercopy - Create a cache with a region suitable - * for copying to userspace - * @name: A string which is used in /proc/slabinfo to identify this cache. - * @size: The size of objects to be created in this cache. - * @align: The required alignment for the objects. - * @flags: SLAB flags - * @useroffset: Usercopy region offset - * @usersize: Usercopy region size - * @ctor: A constructor for the objects. - * - * Cannot be called within a interrupt, but can be interrupted. - * The @ctor is run when new pages are allocated by the cache. - * - * The flags are - * - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) - * to catch references to uninitialised memory. - * - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check - * for buffer overruns. - * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. - * - * Return: a pointer to the cache on success, NULL on failure. - */ -struct kmem_cache * -kmem_cache_create_usercopy(const char *name, - unsigned int size, unsigned int align, - slab_flags_t flags, +static struct kmem_cache * +do_kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int freeptr_offset, + unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { @@ -331,7 +311,7 @@ kmem_cache_create_usercopy(const char *name, goto out_unlock; } - s = create_cache(cache_name, size, + s = create_cache(cache_name, size, freeptr_offset, calculate_alignment(flags, align, size), flags, useroffset, usersize, ctor); if (IS_ERR(s)) { @@ -355,6 +335,45 @@ kmem_cache_create_usercopy(const char *name, } return s; } + +/** + * kmem_cache_create_usercopy - Create a cache with a region suitable + * for copying to userspace + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @freeptr_offset: Custom offset for the free pointer in RCU caches + * @align: The required alignment for the objects. + * @flags: SLAB flags + * @useroffset: Usercopy region offset + * @usersize: Usercopy region size + * @ctor: A constructor for the objects. + * + * Cannot be called within a interrupt, but can be interrupted. + * The @ctor is run when new pages are allocated by the cache. + * + * The flags are + * + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) + * to catch references to uninitialised memory. + * + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check + * for buffer overruns. + * + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware + * cacheline. 
This can be beneficial if you're counting cycles as closely + * as davem. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache * +kmem_cache_create_usercopy(const char *name, unsigned int size, + unsigned int align, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, + void (*ctor)(void *)) +{ + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, + useroffset, usersize, ctor); +} EXPORT_SYMBOL(kmem_cache_create_usercopy); /** @@ -386,11 +405,50 @@ struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { - return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, - ctor); + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, + 0, 0, ctor); } EXPORT_SYMBOL(kmem_cache_create); +/** + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. + * @name: A string which is used in /proc/slabinfo to identify this cache. + * @size: The size of objects to be created in this cache. + * @freeptr_offset: The offset into the memory to the free pointer + * @flags: SLAB flags + * + * Cannot be called within an interrupt, but can be interrupted. + * + * See kmem_cache_create() for an explanation of possible @flags. + * + * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside + * of the object. This might cause the object to grow in size. Callers + * that have a reason to avoid this can specify a custom free pointer + * offset in their struct where the free pointer will be placed. + * + * Note that placing the free pointer inside the object requires the + * caller to ensure that no fields are invalidated that are required to + * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for + * details.). + * + * Using zero as a value for @freeptr_offset is valid. To request no + * offset UINT_MAX must be specified. + * + * Note that @ctor isn't supported with custom free pointers as a @ctor + * requires an external free pointer. + * + * Return: a pointer to the cache on success, NULL on failure. + */ +struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, + unsigned int freeptr_offset, + slab_flags_t flags) +{ + return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0, + flags | SLAB_TYPESAFE_BY_RCU, 0, 0, + NULL); +} +EXPORT_SYMBOL(kmem_cache_create_rcu); + static struct kmem_cache *kmem_buckets_cache __ro_after_init; /** diff --git a/mm/slub.c b/mm/slub.c index c9d8a2497fd6..9aa5da1e8e27 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -465,12 +465,6 @@ static struct workqueue_struct *flushwq; * Core slab cache functions *******************************************************************/ -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * Returns freelist pointer (ptr). With hardening, this is obfuscated * with an XOR of the address where the pointer is held and a per-cache @@ -3921,6 +3915,9 @@ static void *__slab_alloc_node(struct kmem_cache *s, /* * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers specified via + * s->rcu_freeptr_offset. 
*/ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) @@ -5144,6 +5141,12 @@ static void set_cpu_partial(struct kmem_cache *s) #endif } +/* Was a valid freeptr offset requested? */ +static inline bool has_freeptr_offset(const struct kmem_cache *s) +{ + return s->rcu_freeptr_offset != UINT_MAX; +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -5189,7 +5192,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor || + if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || + (flags & SLAB_POISON) || s->ctor || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* @@ -5210,6 +5214,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { + s->offset = s->rcu_freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep From ea566e18b4deea6e998088de4f1a76d1f39c8d3f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 28 Aug 2024 12:56:25 +0200 Subject: [PATCH 06/26] fs: use kmem_cache_create_rcu() Switch to the new kmem_cache_create_rcu() helper which allows us to use a custom free pointer offset avoiding the need to have an external free pointer which would grow struct file behind our backs. Link: https://lore.kernel.org/r/20240828-work-kmem_cache-rcu-v3-3-5460bc1f09f6@kernel.org Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vlastimil Babka Signed-off-by: Christian Brauner --- fs/file_table.c | 6 +++--- include/linux/fs.h | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index fdf98709dde2..3ef558f27a1c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -511,9 +511,9 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | - SLAB_PANIC | SLAB_ACCOUNT, NULL); + filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file), + offsetof(struct file, f_freeptr), + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/include/linux/fs.h b/include/linux/fs.h index af8bbd4eeb3a..58c91a52cad1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1011,6 +1011,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state + * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) */ struct file { atomic_long_t f_count; @@ -1042,6 +1043,7 @@ struct file { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; + freeptr_t f_freeptr; }; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout From 0f389adb4b80eef29920db2e2c99392aa5d06811 Mon Sep 17 00:00:00 2001 From: R Sundar Date: Mon, 2 Sep 2024 07:35:55 +0530 Subject: [PATCH 07/26] mm: Removed @freeptr_offset to prevent doc warning Removed @freeptr_offset to fix below doc warning. 
./mm/slab_common.c:385: warning: Excess function parameter 'freeptr_offset' description in 'kmem_cache_create_usercopy' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202408292249.5oUpnCbS-lkp@intel.com/ Signed-off-by: R Sundar Link: https://lore.kernel.org/r/20240902020555.11506-1-prosunofficial@gmail.com Signed-off-by: Christian Brauner --- mm/slab_common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 887f6b9855dd..95db3702f8d6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -341,7 +341,6 @@ do_kmem_cache_create_usercopy(const char *name, * for copying to userspace * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. - * @freeptr_offset: Custom offset for the free pointer in RCU caches * @align: The required alignment for the objects. * @flags: SLAB flags * @useroffset: Usercopy region offset From db1faacb3a27863647845eb5bd17521e77ce430d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:43 +0200 Subject: [PATCH 08/26] adi: remove unused f_version It's not used for adi so don't bother with it at all. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-2-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- drivers/char/adi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/adi.c b/drivers/char/adi.c index 751d7cc0da1b..c091a0282ad0 100644 --- a/drivers/char/adi.c +++ b/drivers/char/adi.c @@ -196,7 +196,6 @@ static loff_t adi_llseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; ret = offset; } From 387b499b789ce60c195db510b53f54ea728e10e3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:44 +0200 Subject: [PATCH 09/26] ceph: remove unused f_version It's not used for ceph so don't bother with it at all. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-3-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ceph/dir.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 18c72b305858..ddec8c9244ee 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -707,7 +707,6 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) if (offset != file->f_pos) { file->f_pos = offset; - file->f_version = 0; dfi->file_info.flags &= ~CEPH_F_ATEND; } retval = offset; From 1c5a54af4717ff54b640986768b901aa6c294297 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:45 +0200 Subject: [PATCH 10/26] s390: remove unused f_version It's not used so don't bother with it at all. 
Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-4-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- drivers/s390/char/hmcdrv_dev.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/s390/char/hmcdrv_dev.c b/drivers/s390/char/hmcdrv_dev.c index 8d50c894711f..e069dd685899 100644 --- a/drivers/s390/char/hmcdrv_dev.c +++ b/drivers/s390/char/hmcdrv_dev.c @@ -186,9 +186,6 @@ static loff_t hmcdrv_dev_seek(struct file *fp, loff_t pos, int whence) if (pos < 0) return -EINVAL; - if (fp->f_pos != pos) - ++fp->f_version; - fp->f_pos = pos; return pos; } From d095a5be7533aa3e72f7358ddbcd595d67d56ff1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:46 +0200 Subject: [PATCH 11/26] fs: add vfs_setpos_cookie() Add a new helper and make vfs_setpos() call it. We will use it in follow-up patches. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-5-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/read_write.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 90e283b31ca1..66ff52860496 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -39,6 +39,34 @@ static inline bool unsigned_offsets(struct file *file) return file->f_mode & FMODE_UNSIGNED_OFFSET; } +/** + * vfs_setpos_cookie - update the file offset for lseek and reset cookie + * @file: file structure in question + * @offset: file offset to seek to + * @maxsize: maximum file size + * @cookie: cookie to reset + * + * Update the file offset to the value specified by @offset if the given + * offset is valid and it is not equal to the current file offset and + * reset the specified cookie to indicate that a seek happened. + * + * Return the specified offset on success and -EINVAL on invalid offset. + */ +static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, + loff_t maxsize, u64 *cookie) +{ + if (offset < 0 && !unsigned_offsets(file)) + return -EINVAL; + if (offset > maxsize) + return -EINVAL; + + if (offset != file->f_pos) { + file->f_pos = offset; + *cookie = 0; + } + return offset; +} + /** * vfs_setpos - update the file offset for lseek * @file: file structure in question @@ -53,16 +81,7 @@ static inline bool unsigned_offsets(struct file *file) */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { - if (offset < 0 && !unsigned_offsets(file)) - return -EINVAL; - if (offset > maxsize) - return -EINVAL; - - if (offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - return offset; + return vfs_setpos_cookie(file, offset, maxsize, &file->f_version); } EXPORT_SYMBOL(vfs_setpos); From b8c7451928ab2698653966a54ab5fff2fce484ff Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:47 +0200 Subject: [PATCH 12/26] fs: add must_set_pos() Add a new must_set_pos() helper. We will use it in follow-up patches. Temporarily mark it as unused. This is only done to keep the diff small and reviewable. 
Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-6-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/read_write.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/fs/read_write.c b/fs/read_write.c index 66ff52860496..5434467f5c05 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -85,6 +85,60 @@ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) } EXPORT_SYMBOL(vfs_setpos); +/** + * must_set_pos - check whether f_pos has to be updated + * @file: file to seek on + * @offset: offset to use + * @whence: type of seek operation + * @eof: end of file + * + * Check whether f_pos needs to be updated and update @offset according + * to @whence. + * + * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be + * updated, and negative error code on failure. + */ +static __maybe_unused int must_set_pos(struct file *file, loff_t *offset, + int whence, loff_t eof) +{ + switch (whence) { + case SEEK_END: + *offset += eof; + break; + case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (*offset == 0) { + *offset = file->f_pos; + return 0; + } + break; + case SEEK_DATA: + /* + * In the generic case the entire file is data, so as long as + * offset isn't at the end of the file then the offset is data. + */ + if ((unsigned long long)*offset >= eof) + return -ENXIO; + break; + case SEEK_HOLE: + /* + * There is a virtual hole at the end of the file, so as long as + * offset isn't i_size or larger, return i_size. + */ + if ((unsigned long long)*offset >= eof) + return -ENXIO; + *offset = eof; + break; + } + + return 1; +} + /** * generic_file_llseek_size - generic llseek implementation for regular files * @file: file structure to seek on From ed904935c3992ec4178a38f61eca8ce7303ae1a0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:48 +0200 Subject: [PATCH 13/26] fs: use must_set_pos() Make generic_file_llseek_size() use must_set_pos(). We'll use must_set_pos() in another helper in a minutes. Remove __maybe_unused from must_set_pos() now that it's used. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-7-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/read_write.c | 52 +++++++++++++------------------------------------ 1 file changed, 14 insertions(+), 38 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 5434467f5c05..b07e48cd81d1 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -98,8 +98,7 @@ EXPORT_SYMBOL(vfs_setpos); * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be * updated, and negative error code on failure. */ -static __maybe_unused int must_set_pos(struct file *file, loff_t *offset, - int whence, loff_t eof) +static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) { switch (whence) { case SEEK_END: @@ -159,45 +158,22 @@ loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof) { - switch (whence) { - case SEEK_END: - offset += eof; - break; - case SEEK_CUR: - /* - * Here we special-case the lseek(fd, 0, SEEK_CUR) - * position-querying operation. 
Avoid rewriting the "same" - * f_pos value back to the file because a concurrent read(), - * write() or lseek() might have altered it - */ - if (offset == 0) - return file->f_pos; - /* - * f_lock protects against read/modify/write race with other - * SEEK_CURs. Note that parallel writes and reads behave - * like SEEK_SET. - */ - spin_lock(&file->f_lock); - offset = vfs_setpos(file, file->f_pos + offset, maxsize); - spin_unlock(&file->f_lock); + int ret; + + ret = must_set_pos(file, &offset, whence, eof); + if (ret < 0) + return ret; + if (ret == 0) return offset; - case SEEK_DATA: + + if (whence == SEEK_CUR) { /* - * In the generic case the entire file is data, so as long as - * offset isn't at the end of the file then the offset is data. + * f_lock protects against read/modify/write race with + * other SEEK_CURs. Note that parallel writes and reads + * behave like SEEK_SET. */ - if ((unsigned long long)offset >= eof) - return -ENXIO; - break; - case SEEK_HOLE: - /* - * There is a virtual hole at the end of the file, so as long as - * offset isn't i_size or larger, return i_size. - */ - if ((unsigned long long)offset >= eof) - return -ENXIO; - offset = eof; - break; + guard(spinlock)(&file->f_lock); + return vfs_setpos(file, file->f_pos + offset, maxsize); } return vfs_setpos(file, offset, maxsize); From d688d65a847ff910146eff51e70f4b7049895eb5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:49 +0200 Subject: [PATCH 14/26] fs: add generic_llseek_cookie() This is similar to generic_file_llseek() but allows the caller to specify a cookie that will be updated to indicate that a seek happened. Caller's requiring that information in their readdir implementations can use that. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-8-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/read_write.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 2 ++ 2 files changed, 47 insertions(+) diff --git a/fs/read_write.c b/fs/read_write.c index b07e48cd81d1..fb519e55c8e8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -180,6 +180,51 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, } EXPORT_SYMBOL(generic_file_llseek_size); +/** + * generic_llseek_cookie - versioned llseek implementation + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @cookie: cookie to update + * + * See generic_file_llseek for a general description and locking assumptions. + * + * In contrast to generic_file_llseek, this function also resets a + * specified cookie to indicate a seek took place. + */ +loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, + u64 *cookie) +{ + struct inode *inode = file->f_mapping->host; + loff_t maxsize = inode->i_sb->s_maxbytes; + loff_t eof = i_size_read(inode); + int ret; + + if (WARN_ON_ONCE(!cookie)) + return -EINVAL; + + /* + * Require that this is only used for directories that guarantee + * synchronization between readdir and seek so that an update to + * @cookie is correctly synchronized with concurrent readdir. + */ + if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) + return -EINVAL; + + ret = must_set_pos(file, &offset, whence, eof); + if (ret < 0) + return ret; + if (ret == 0) + return offset; + + /* No need to hold f_lock because we know that f_pos_lock is held. 
*/ + if (whence == SEEK_CUR) + return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); + + return vfs_setpos_cookie(file, offset, maxsize, cookie); +} +EXPORT_SYMBOL(generic_llseek_cookie); + /** * generic_file_llseek - generic llseek implementation for regular files * @file: file structure to seek on diff --git a/include/linux/fs.h b/include/linux/fs.h index 58c91a52cad1..3e6b3c1afb31 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3202,6 +3202,8 @@ extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); +loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, + u64 *cookie); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); From bad74142a04bf9b161237f52667473a111181b83 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:50 +0200 Subject: [PATCH 15/26] affs: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-9-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/affs/dir.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/fs/affs/dir.c b/fs/affs/dir.c index b2bf7016e1b3..bd40d5f08810 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -17,13 +17,44 @@ #include #include "affs.h" +struct affs_dir_data { + unsigned long ino; + u64 cookie; +}; + static int affs_readdir(struct file *, struct dir_context *); +static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct affs_dir_data *data = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &data->cookie); +} + +static int affs_dir_open(struct inode *inode, struct file *file) +{ + struct affs_dir_data *data; + + data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + file->private_data = data; + return 0; +} + +static int affs_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + const struct file_operations affs_dir_operations = { + .open = affs_dir_open, .read = generic_read_dir, - .llseek = generic_file_llseek, + .llseek = affs_dir_llseek, .iterate_shared = affs_readdir, .fsync = affs_file_fsync, + .release = affs_dir_release, }; /* @@ -45,6 +76,7 @@ static int affs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + struct affs_dir_data *data = file->private_data; struct super_block *sb = inode->i_sb; struct buffer_head *dir_bh = NULL; struct buffer_head *fh_bh = NULL; @@ -59,7 +91,7 @@ affs_readdir(struct file *file, struct dir_context *ctx) pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); if (ctx->pos < 2) { - file->private_data = (void *)0; + data->ino = 0; if (!dir_emit_dots(file, ctx)) return 0; } @@ -80,8 +112,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) /* If the directory hasn't changed since the last call to readdir(), * we can jump directly to where we left off. 
*/ - ino = (u32)(long)file->private_data; - if (ino && inode_eq_iversion(inode, file->f_version)) { + ino = data->ino; + if (ino && inode_eq_iversion(inode, data->cookie)) { pr_debug("readdir() left off=%d\n", ino); goto inside; } @@ -131,8 +163,8 @@ affs_readdir(struct file *file, struct dir_context *ctx) } while (ino); } done: - file->f_version = inode_query_iversion(inode); - file->private_data = (void *)(long)ino; + data->cookie = inode_query_iversion(inode); + data->ino = ino; affs_brelse(fh_bh); out_brelse_dir: From 794576e07585bfb31f810e25c58ee0bcfc5e19d9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:51 +0200 Subject: [PATCH 16/26] ext2: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-10-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ext2/dir.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 087457061c6e..6622c582f550 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -263,7 +263,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data); bool has_filetype; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) @@ -290,7 +290,7 @@ ext2_readdir(struct file *file, struct dir_context *ctx) offset = ext2_validate_entry(kaddr, offset, chunk_mask); ctx->pos = (n<f_version = inode_query_iversion(inode); + *(u64 *)file->private_data = inode_query_iversion(inode); need_revalidate = false; } de = (ext2_dirent *)(kaddr+offset); @@ -703,8 +703,30 @@ int ext2_empty_dir(struct inode *inode) return 0; } +static int ext2_dir_open(struct inode *inode, struct file *file) +{ + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int ext2_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static loff_t ext2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_llseek_cookie(file, offset, whence, + (u64 *)file->private_data); +} + const struct file_operations ext2_dir_operations = { - .llseek = generic_file_llseek, + .open = ext2_dir_open, + .release = ext2_dir_release, + .llseek = ext2_dir_llseek, .read = generic_read_dir, .iterate_shared = ext2_readdir, .unlocked_ioctl = ext2_ioctl, From 4f05ee2f82b470c20f7ff260bb0d866425b09d05 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:52 +0200 Subject: [PATCH 17/26] ext4: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. 
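Schematically, the conversion pattern shared by these directory patches looks as follows (the foo_* names are illustrative only, not taken from any of the converted filesystems): ->open() allocates the cookie, ->llseek() hands it to generic_llseek_cookie(), and readdir compares it against the inode's i_version instead of file->f_version:

    #include <linux/fs.h>
    #include <linux/iversion.h>
    #include <linux/slab.h>

    struct foo_dir_data {
            u64 cookie;     /* replaces the old file->f_version usage */
    };

    static int foo_dir_open(struct inode *inode, struct file *file)
    {
            file->private_data = kzalloc(sizeof(struct foo_dir_data), GFP_KERNEL);
            return file->private_data ? 0 : -ENOMEM;
    }

    static int foo_dir_release(struct inode *inode, struct file *file)
    {
            kfree(file->private_data);
            return 0;
    }

    static loff_t foo_dir_llseek(struct file *file, loff_t offset, int whence)
    {
            struct foo_dir_data *data = file->private_data;

            /* Zeroes data->cookie whenever the position actually changes. */
            return generic_llseek_cookie(file, offset, whence, &data->cookie);
    }

    static int foo_readdir(struct file *file, struct dir_context *ctx)
    {
            struct foo_dir_data *data = file->private_data;
            struct inode *inode = file_inode(file);

            if (!inode_eq_iversion(inode, data->cookie)) {
                    /* A seek or directory modification happened: revalidate ctx->pos. */
                    data->cookie = inode_query_iversion(inode);
            }
            /* ... dir_emit() entries starting from the (re)validated position ... */
            return 0;
    }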
Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-11-6d3e4816aa7b@kernel.org Acked-by: Theodore Ts'o Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ext4/dir.c | 48 +++++++++++++++++++++++++++--------------------- fs/ext4/ext4.h | 2 ++ fs/ext4/inline.c | 7 ++++--- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ff4514e4626b..13196afe55ce 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -133,6 +133,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; struct fscrypt_str fstr = FSTR_INIT(NULL, 0); + struct dir_private_info *info = file->private_data; err = fscrypt_prepare_readdir(inode); if (err) @@ -229,7 +230,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the block * to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < sb->s_blocksize && i < offset; ) { de = (struct ext4_dir_entry_2 *) (bh->b_data + i); @@ -249,7 +250,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset = i; ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) | offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < inode->i_size @@ -384,6 +385,7 @@ static inline loff_t ext4_get_htree_eof(struct file *filp) static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + struct dir_private_info *info = file->private_data; int dx_dir = is_dx_dir(inode); loff_t ret, htree_max = ext4_get_htree_eof(file); @@ -392,7 +394,7 @@ static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) htree_max, htree_max); else ret = ext4_llseek(file, offset, whence); - file->f_version = inode_peek_iversion(inode) - 1; + info->cookie = inode_peek_iversion(inode) - 1; return ret; } @@ -429,18 +431,15 @@ static void free_rb_tree_fname(struct rb_root *root) *root = RB_ROOT; } - -static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, - loff_t pos) +static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) { - struct dir_private_info *p; + struct dir_private_info *p = filp->private_data; - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return NULL; - p->curr_hash = pos2maj_hash(filp, pos); - p->curr_minor_hash = pos2min_hash(filp, pos); - return p; + if (is_dx_dir(file_inode(filp)) && !p->initialized) { + p->curr_hash = pos2maj_hash(filp, pos); + p->curr_minor_hash = pos2min_hash(filp, pos); + p->initialized = true; + } } void ext4_htree_free_dir_info(struct dir_private_info *p) @@ -552,12 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct fname *fname; int ret = 0; - if (!info) { - info = ext4_htree_create_dir_info(file, ctx->pos); - if (!info) - return -ENOMEM; - file->private_data = info; - } + ext4_htree_init_dir_info(file, ctx->pos); if (ctx->pos == ext4_get_htree_eof(file)) return 0; /* EOF */ @@ -590,10 +584,10 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) * cached entries. 
*/ if ((!info->curr_node) || - !inode_eq_iversion(inode, file->f_version)) { + !inode_eq_iversion(inode, info->cookie)) { info->curr_node = NULL; free_rb_tree_fname(&info->root); - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); ret = ext4_htree_fill_tree(file, info->curr_hash, info->curr_minor_hash, &info->next_hash); @@ -664,7 +658,19 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, return 0; } +static int ext4_dir_open(struct inode *inode, struct file *file) +{ + struct dir_private_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + file->private_data = info; + return 0; +} + const struct file_operations ext4_dir_operations = { + .open = ext4_dir_open, .llseek = ext4_dir_llseek, .read = generic_read_dir, .iterate_shared = ext4_readdir,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 08acd152261e..d62a4b9b26ce 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2553,6 +2553,8 @@ struct dir_private_info { __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; + u64 cookie; + bool initialized; }; /* calculate the first block number of the group */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e7a09a99837b..4282e12dc405 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1460,6 +1460,7 @@ int ext4_read_inline_dir(struct file *file, struct ext4_iloc iloc; void *dir_buf = NULL; int dotdot_offset, dotdot_size, extra_offset, extra_size; + struct dir_private_info *info = file->private_data; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1503,12 +1504,12 @@ int ext4_read_inline_dir(struct file *file, extra_size = extra_offset + inline_size; /* - * If the version has changed since the last call to + * If the cookie has changed since the last call to * readdir(2), then we might be pointing to an invalid * dirent right now. Scan from the start of the inline * dir to make sure. */ - if (!inode_eq_iversion(inode, file->f_version)) { + if (!inode_eq_iversion(inode, info->cookie)) { for (i = 0; i < extra_size && i < offset;) { /* * "." is with offset 0 and @@ -1540,7 +1541,7 @@ int ext4_read_inline_dir(struct file *file, } offset = i; ctx->pos = offset; - file->f_version = inode_query_iversion(inode); + info->cookie = inode_query_iversion(inode); } while (ctx->pos < extra_size) {
From 7a7ce8b3ba66754f5d275a71630b4ee8b507d266 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:53 +0200 Subject: [PATCH 18/26] input: remove f_version abuse f_version is about to be removed from struct file. Make input stop abusing f_version for stashing information for poll. Move the input state counter into input_seq_state and allocate it via seq_open_private() and free it via seq_release_private().
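For reference, the seq_file side of this works roughly as sketched below (placeholder foo_* names, not the actual input.c code; foo_seq_ops stands in for an existing seq_operations table): seq_open_private() allocates a zeroed blob of the requested size and hangs it off seq->private, and seq_release_private() frees that blob again when the file is closed.

static const struct seq_operations foo_seq_ops;	/* placeholder table */

struct foo_seq_state {
	int events_seen;	/* last event counter observed by poll() */
};

static int foo_proc_open(struct inode *inode, struct file *file)
{
	/* allocates a zeroed struct foo_seq_state and stores it in seq->private */
	return seq_open_private(file, &foo_seq_ops, sizeof(struct foo_seq_state));
}

static const struct proc_ops foo_proc_ops = {
	.proc_open	= foo_proc_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release_private,	/* frees seq->private before releasing the seq_file */
};

Any poll or show handler can then reach the per-open state through the seq_file's ->private pointer instead of stashing it in file->f_version.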
Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-12-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- drivers/input/input.c | 47 ++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/drivers/input/input.c b/drivers/input/input.c index 54c57b267b25..19ea1888da9f 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -1079,33 +1079,31 @@ static inline void input_wakeup_procfs_readers(void) wake_up(&input_devices_poll_wait); } +struct input_seq_state { + unsigned short pos; + bool mutex_acquired; + int input_devices_state; +}; + static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait) { + struct seq_file *seq = file->private_data; + struct input_seq_state *state = seq->private; + poll_wait(file, &input_devices_poll_wait, wait); - if (file->f_version != input_devices_state) { - file->f_version = input_devices_state; + if (state->input_devices_state != input_devices_state) { + state->input_devices_state = input_devices_state; return EPOLLIN | EPOLLRDNORM; } return 0; } -union input_seq_state { - struct { - unsigned short pos; - bool mutex_acquired; - }; - void *p; -}; - static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; int error; - /* We need to fit into seq->private pointer */ - BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private)); - error = mutex_lock_interruptible(&input_mutex); if (error) { state->mutex_acquired = false; @@ -1124,7 +1122,7 @@ static void *input_devices_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void input_seq_stop(struct seq_file *seq, void *v) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; if (state->mutex_acquired) mutex_unlock(&input_mutex); @@ -1210,7 +1208,8 @@ static const struct seq_operations input_devices_seq_ops = { static int input_proc_devices_open(struct inode *inode, struct file *file) { - return seq_open(file, &input_devices_seq_ops); + return seq_open_private(file, &input_devices_seq_ops, + sizeof(struct input_seq_state)); } static const struct proc_ops input_devices_proc_ops = { @@ -1218,17 +1217,14 @@ static const struct proc_ops input_devices_proc_ops = { .proc_poll = input_proc_devices_poll, .proc_read = seq_read, .proc_lseek = seq_lseek, - .proc_release = seq_release, + .proc_release = seq_release_private, }; static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; int error; - /* We need to fit into seq->private pointer */ - BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private)); - error = mutex_lock_interruptible(&input_mutex); if (error) { state->mutex_acquired = false; @@ -1243,7 +1239,7 @@ static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos) static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; state->pos = *pos + 1; return seq_list_next(v, &input_handler_list, pos); @@ -1252,7 +1248,7 @@ static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos) static int input_handlers_seq_show(struct seq_file *seq, void 
*v) { struct input_handler *handler = container_of(v, struct input_handler, node); - union input_seq_state *state = (union input_seq_state *)&seq->private; + struct input_seq_state *state = seq->private; seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name); if (handler->filter) @@ -1273,14 +1269,15 @@ static const struct seq_operations input_handlers_seq_ops = { static int input_proc_handlers_open(struct inode *inode, struct file *file) { - return seq_open(file, &input_handlers_seq_ops); + return seq_open_private(file, &input_handlers_seq_ops, + sizeof(struct input_seq_state)); } static const struct proc_ops input_handlers_proc_ops = { .proc_open = input_proc_handlers_open, .proc_read = seq_read, .proc_lseek = seq_lseek, - .proc_release = seq_release, + .proc_release = seq_release_private, }; static int __init input_proc_init(void) From ceaa5e80db7c73321c89fda39c3c8c78817aecd2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:54 +0200 Subject: [PATCH 19/26] ocfs2: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-13-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ocfs2/dir.c | 3 ++- fs/ocfs2/file.c | 11 +++++++++-- fs/ocfs2/file.h | 1 + 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index f0beb173dbba..ccef3f42b333 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1932,6 +1932,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) { int error = 0; struct inode *inode = file_inode(file); + struct ocfs2_file_private *fp = file->private_data; int lock_level = 0; trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -1952,7 +1953,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) goto bail_nolock; } - error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); + error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false); ocfs2_inode_unlock(inode, lock_level); if (error) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ccc57038a977..115ab2172820 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2750,6 +2750,13 @@ static loff_t ocfs2_remap_file_range(struct file *file_in, loff_t pos_in, return remapped > 0 ? remapped : ret; } +static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ocfs2_file_private *fp = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &fp->cookie); +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2797,7 +2804,7 @@ const struct file_operations ocfs2_fops = { WRAP_DIR_ITER(ocfs2_readdir) // FIXME! 
const struct file_operations ocfs2_dops = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, @@ -2843,7 +2850,7 @@ const struct file_operations ocfs2_fops_no_plocks = { }; const struct file_operations ocfs2_dops_no_plocks = { - .llseek = generic_file_llseek, + .llseek = ocfs2_dir_llseek, .read = generic_read_dir, .iterate_shared = shared_ocfs2_readdir, .fsync = ocfs2_sync_file, diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 8e53e4ac1120..41e65e45a9f3 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -20,6 +20,7 @@ struct ocfs2_alloc_context; enum ocfs2_alloc_restarted; struct ocfs2_file_private { + u64 cookie; struct file *fp_file; struct mutex fp_mutex; struct ocfs2_lock_res fp_flock; From b4dba2efa8106002076b070fdff24ed6bf1ea87b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:55 +0200 Subject: [PATCH 20/26] proc: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-14-6d3e4816aa7b@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/proc/base.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 72a1acd03675..988f6bbfc4bd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3870,12 +3870,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. + /* We cache the tgid value that the last readdir call couldn't + * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); - tid = (int)file->f_version; - file->f_version = 0; + tid = (int)(intptr_t)file->private_data; + file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { @@ -3890,7 +3890,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - file->f_version = (u64)tid; + file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } @@ -3915,6 +3915,24 @@ static int proc_task_getattr(struct mnt_idmap *idmap, return 0; } +/* + * proc_task_readdir() set @file->private_data to a positive integer + * value, so casting that to u64 is safe. generic_llseek_cookie() will + * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is + * here to catch any unexpected change in behavior either in + * proc_task_readdir() or generic_llseek_cookie(). 
+ */ +static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) +{ + u64 cookie = (u64)(intptr_t)file->private_data; + loff_t off; + + off = generic_llseek_cookie(file, offset, whence, &cookie); + WARN_ON_ONCE(cookie > INT_MAX); + file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ + return off; +} + static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, @@ -3925,7 +3943,7 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, - .llseek = generic_file_llseek, + .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) From 3dd4624ffcd2e6e5d2cee5a6c234774ce27e1f04 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:56 +0200 Subject: [PATCH 21/26] udf: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-15-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/udf/dir.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/fs/udf/dir.c b/fs/udf/dir.c index f94f45fe2c91..5023dfe191e8 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -60,7 +60,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) * identifying beginning of dir entry (names are under user control), * we need to scan the directory from the beginning. */ - if (!inode_eq_iversion(dir, file->f_version)) { + if (!inode_eq_iversion(dir, *(u64 *)file->private_data)) { emit_pos = nf_pos; nf_pos = 0; } else { @@ -122,15 +122,37 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) udf_fiiter_release(&iter); out: if (pos_valid) - file->f_version = inode_query_iversion(dir); + *(u64 *)file->private_data = inode_query_iversion(dir); kfree(fname); return ret; } +static int udf_dir_open(struct inode *inode, struct file *file) +{ + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int udf_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static loff_t udf_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_llseek_cookie(file, offset, whence, + (u64 *)file->private_data); +} + /* readdir and lookup functions */ const struct file_operations udf_dir_operations = { - .llseek = generic_file_llseek, + .open = udf_dir_open, + .release = udf_dir_release, + .llseek = udf_dir_llseek, .read = generic_read_dir, .iterate_shared = udf_readdir, .unlocked_ioctl = udf_ioctl, From 0bea8287df6c86cacaf34eef167f23dcc5dbcede Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:57 +0200 Subject: [PATCH 22/26] ufs: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. 
Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-16-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ufs/dir.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-)
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 61f25d3cf3f7..335f0ae529b4 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -435,7 +435,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); + bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data); unsigned flags = UFS_SB(sb)->s_flags; UFSD("BEGIN\n"); @@ -462,7 +462,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); ctx->pos = (n<<PAGE_SHIFT) + offset; - file->f_version = inode_query_iversion(inode); + *(u64 *)file->private_data = inode_query_iversion(inode); need_revalidate = false; } de = (struct ufs_dir_entry *)(kaddr+offset); @@ -646,9 +646,31 @@ int ufs_empty_dir(struct inode * inode) return 0; } +static int ufs_dir_open(struct inode *inode, struct file *file) +{ + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int ufs_dir_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static loff_t ufs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_llseek_cookie(file, offset, whence, + (u64 *)file->private_data); +} + const struct file_operations ufs_dir_operations = { + .open = ufs_dir_open, + .release = ufs_dir_release, .read = generic_read_dir, .iterate_shared = ufs_readdir, .fsync = generic_file_fsync, - .llseek = generic_file_llseek, + .llseek = ufs_dir_llseek, };
From 1146e5a69efca76501378f748388fd7742ad09cf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:58 +0200 Subject: [PATCH 23/26] ubifs: store cookie in private data Store the cookie to detect concurrent seeks on directories in file->private_data. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-17-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/ubifs/dir.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 18 deletions(-)
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index c77ea57fe696..fda82f3e16e8 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -555,6 +555,11 @@ static unsigned int vfs_dent_type(uint8_t type) return 0; } +struct ubifs_dir_data { + struct ubifs_dent_node *dent; + u64 cookie; +}; + /* * The classical Unix view for directory is that it is a linear array of * (name, inode number) entries. Linux/VFS assumes this model as well.
@@ -582,6 +587,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; bool encrypted = IS_ENCRYPTED(dir); + struct ubifs_dir_data *data = file->private_data; dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); @@ -604,27 +610,27 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) fstr_real_len = fstr.len; } - if (file->f_version == 0) { + if (data->cookie == 0) { /* - * The file was seek'ed, which means that @file->private_data + * The file was seek'ed, which means that @data->dent * is now invalid. This may also be just the first * 'ubifs_readdir()' invocation, in which case - * @file->private_data is NULL, and the below code is + * @data->dent is NULL, and the below code is * basically a no-op. */ - kfree(file->private_data); - file->private_data = NULL; + kfree(data->dent); + data->dent = NULL; } /* - * 'generic_file_llseek()' unconditionally sets @file->f_version to - * zero, and we use this for detecting whether the file was seek'ed. + * 'ubifs_dir_llseek()' sets @data->cookie to zero, and we use this + * for detecting whether the file was seek'ed. */ - file->f_version = 1; + data->cookie = 1; /* File positions 0 and 1 correspond to "." and ".." */ if (ctx->pos < 2) { - ubifs_assert(c, !file->private_data); + ubifs_assert(c, !data->dent); if (!dir_emit_dots(file, ctx)) { if (encrypted) fscrypt_fname_free_buffer(&fstr); @@ -641,10 +647,10 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) } ctx->pos = key_hash_flash(c, &dent->key); - file->private_data = dent; + data->dent = dent; } - dent = file->private_data; + dent = data->dent; if (!dent) { /* * The directory was seek'ed to and is now readdir'ed. 
@@ -658,7 +664,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) goto out; } ctx->pos = key_hash_flash(c, &dent->key); - file->private_data = dent; + data->dent = dent; } while (1) { @@ -701,15 +707,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) goto out; } - kfree(file->private_data); + kfree(data->dent); ctx->pos = key_hash_flash(c, &dent->key); - file->private_data = dent; + data->dent = dent; cond_resched(); } out: - kfree(file->private_data); - file->private_data = NULL; + kfree(data->dent); + data->dent = NULL; if (encrypted) fscrypt_fname_free_buffer(&fstr); @@ -733,7 +739,10 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) /* Free saved readdir() state when the directory is closed */ static int ubifs_dir_release(struct inode *dir, struct file *file) { - kfree(file->private_data); + struct ubifs_dir_data *data = file->private_data; + + kfree(data->dent); + kfree(data); file->private_data = NULL; return 0; } @@ -1712,6 +1721,24 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, return 0; } +static int ubifs_dir_open(struct inode *inode, struct file *file) +{ + struct ubifs_dir_data *data; + + data = kzalloc(sizeof(struct ubifs_dir_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + file->private_data = data; + return 0; +} + +static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct ubifs_dir_data *data = file->private_data; + + return generic_llseek_cookie(file, offset, whence, &data->cookie); +} + const struct inode_operations ubifs_dir_inode_operations = { .lookup = ubifs_lookup, .create = ubifs_create, @@ -1732,7 +1759,8 @@ const struct inode_operations ubifs_dir_inode_operations = { }; const struct file_operations ubifs_dir_operations = { - .llseek = generic_file_llseek, + .open = ubifs_dir_open, + .llseek = ubifs_dir_llseek, .release = ubifs_dir_release, .read = generic_read_dir, .iterate_shared = ubifs_readdir, From 5e9b50dea970ae6d3e1309d4254157099734a2af Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:04:59 +0200 Subject: [PATCH 24/26] fs: add f_pipe Only regular files with FMODE_ATOMIC_POS and directories need f_pos_lock. Place a new f_pipe member in a union with f_pos_lock that they can use and make them stop abusing f_version in follow-up patches. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-18-6d3e4816aa7b@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/file_table.c | 7 +++++++ include/linux/fs.h | 8 +++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index 3ef558f27a1c..7ce4d5dac080 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -156,6 +156,13 @@ static int init_file(struct file *f, int flags, const struct cred *cred) } spin_lock_init(&f->f_lock); + /* + * Note that f_pos_lock is only used for files raising + * FMODE_ATOMIC_POS and directories. Other files such as pipes + * don't need it and since f_pos_lock is in a union may reuse + * the space for other purposes. They are expected to initialize + * the respective member when opening the file. 
+ */ mutex_init(&f->f_pos_lock); f->f_flags = flags; f->f_mode = OPEN_FMODE(flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e6b3c1afb31..ca4925008244 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1001,6 +1001,7 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_cred: stashed credentials of creator/opener * @f_path: path of the file * @f_pos_lock: lock protecting file position + * @f_pipe: specific to pipes * @f_pos: file position * @f_version: file version * @f_security: LSM security context of this file @@ -1026,7 +1027,12 @@ struct file { const struct cred *f_cred; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; - struct mutex f_pos_lock; + union { + /* regular files (with FMODE_ATOMIC_POS) and directories */ + struct mutex f_pos_lock; + /* pipes */ + u64 f_pipe; + }; loff_t f_pos; u64 f_version; /* --- cacheline 2 boundary (128 bytes) --- */
From 5a957bbac3ab9808a8df711a269e4d18f84e9e4a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:05:00 +0200 Subject: [PATCH 25/26] pipe: use f_pipe Pipes use f_version to defer poll notifications until a write has been observed. Since multiple files refer to the same struct pipe_inode_info via their ->private_data, moving it into ->private_data isn't feasible since we would need to introduce an additional pointer indirection. However, since pipes don't require f_pos_lock, we placed a new f_pipe member into a union with f_pos_lock that pipes can use. This is similar to what we already do for struct inode where we have additional fields per file type. This will allow us to fully remove f_version in the next step. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-19-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/pipe.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/pipe.c b/fs/pipe.c index 7dff2aa50a6d..d3735f1d6f00 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -686,7 +686,7 @@ pipe_poll(struct file *filp, poll_table *wait) if (filp->f_mode & FMODE_READ) { if (!pipe_empty(head, tail)) mask |= EPOLLIN | EPOLLRDNORM; - if (!pipe->writers && filp->f_version != pipe->w_counter) + if (!pipe->writers && filp->f_pipe != pipe->w_counter) mask |= EPOLLHUP; } @@ -945,6 +945,7 @@ int create_pipe_files(struct file **res, int flags) } f->private_data = inode->i_pipe; + f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), &pipefifo_fops); @@ -954,6 +955,7 @@ return PTR_ERR(res[0]); } res[0]->private_data = inode->i_pipe; + res[0]->f_pipe = 0; res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); @@ -1108,7 +1110,7 @@ static int fifo_open(struct inode *inode, struct file *filp) bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; - filp->f_version = 0; + filp->f_pipe = 0; spin_lock(&inode->i_lock); if (inode->i_pipe) { @@ -1155,7 +1157,7 @@ static int fifo_open(struct inode *inode, struct file *filp) if ((filp->f_flags & O_NONBLOCK)) { /* suppress EPOLLHUP until we have * seen a writer */ - filp->f_version = pipe->w_counter; + filp->f_pipe = pipe->w_counter; } else { if (wait_for_partner(pipe, &pipe->w_counter)) goto err_rd;
From 11068e0b64cbb540b96e577fcca0926242ecaf58 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 30 Aug 2024 15:05:01 +0200 Subject: [PATCH 26/26] fs: remove f_version Now that detecting concurrent seeks is done by the filesystems that require
it we can remove f_version and free up 8 bytes for future extensions. Link: https://lore.kernel.org/r/20240830-vfs-file-f_version-v1-20-6d3e4816aa7b@kernel.org Reviewed-by: Jan Kara Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/read_write.c | 9 ++++----- include/linux/fs.h | 4 +--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index fb519e55c8e8..b19cce8e55b9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -62,7 +62,8 @@ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, if (offset != file->f_pos) { file->f_pos = offset; - *cookie = 0; + if (cookie) + *cookie = 0; } return offset; } @@ -81,7 +82,7 @@ static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, */ loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) { - return vfs_setpos_cookie(file, offset, maxsize, &file->f_version); + return vfs_setpos_cookie(file, offset, maxsize, NULL); } EXPORT_SYMBOL(vfs_setpos); @@ -364,10 +365,8 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) } retval = -EINVAL; if (offset >= 0 || unsigned_offsets(file)) { - if (offset != file->f_pos) { + if (offset != file->f_pos) file->f_pos = offset; - file->f_version = 0; - } retval = offset; } out: diff --git a/include/linux/fs.h b/include/linux/fs.h index ca4925008244..7e11ce172140 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1003,7 +1003,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position - * @f_version: file version * @f_security: LSM security context of this file * @f_owner: file owner * @f_wb_err: writeback error @@ -1034,11 +1033,10 @@ struct file { u64 f_pipe; }; loff_t f_pos; - u64 f_version; - /* --- cacheline 2 boundary (128 bytes) --- */ #ifdef CONFIG_SECURITY void *f_security; #endif + /* --- cacheline 2 boundary (128 bytes) --- */ struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err;
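For completeness, the consumer side of such a cookie looks roughly like the sketch below; the foofs_* names are hypothetical and the code is modelled on, not quoted from, the directory conversions above (it assumes <linux/fs.h> and <linux/iversion.h>). A zeroed cookie means a seek happened, a stale cookie means the directory changed; either way cached readdir state has to be dropped.

struct foofs_dir_data {
	u64 cookie;		/* i_version snapshot */
	loff_t resume;		/* cached position to continue from */
};

static int foofs_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct foofs_dir_data *data = file->private_data;

	/*
	 * Either generic_llseek_cookie() zeroed the cookie because f_pos
	 * changed, or the directory was modified and i_version was bumped;
	 * in both cases the cached resume point can no longer be trusted.
	 */
	if (!inode_eq_iversion(inode, data->cookie))
		data->resume = 0;

	if (!dir_emit_dots(file, ctx))
		return 0;

	/* ... walk entries starting from data->resume and dir_emit() them ... */

	data->cookie = inode_query_iversion(inode);
	return 0;
}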