From 7d286849a8de41c1aee2957e224c5802d3488c8d Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sat, 15 Jun 2024 17:55:00 -0700
Subject: [PATCH 1/5] vfs: link_path_walk: simplify name hash flow

This is one of those hot functions in path walking, and it's doing
things in just the wrong order, causing slightly unnecessary extra work.

Move the name pointer update and the setting of 'nd->last' up a bit, so
that the (unlikely) filesystem-specific hashing can run on them in
place, instead of having to set up a copy on the stack and copy things
back and forth.  Even when the hashing is not run, the unnecessary
temporary copy makes the stack frame of the function bigger.

This also means that we then never reference the full "hashlen" field
after calculating it, and can clarify the code by just using the length
part.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 37fb0a8aa09a..f3c91d8ae140 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2266,7 +2266,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	for(;;) {
 		struct mnt_idmap *idmap;
 		const char *link;
-		u64 hash_len;
+		unsigned int len;
 		int type;
 
 		idmap = mnt_idmap(nd->path.mnt);
@@ -2274,37 +2274,34 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (err)
 			return err;
 
-		hash_len = hash_name(nd->path.dentry, name);
+		nd->last.name = name;
+		nd->last.hash_len = hash_name(nd->path.dentry, name);
+		len = hashlen_len(nd->last.hash_len);
+		name += len;
 
 		type = LAST_NORM;
-		if (name[0] == '.') switch (hashlen_len(hash_len)) {
-			case 2:
-				if (name[1] == '.') {
+		/* We know len is at least 1, so compare against 2 */
+		if (len <= 2 && name[-1] == '.') {
+			if (len == 2) {
+				if (name[-2] == '.') {
 					type = LAST_DOTDOT;
 					nd->state |= ND_JUMPED;
 				}
-				break;
-			case 1:
+			} else {
 				type = LAST_DOT;
+			}
 		}
+		nd->last_type = type;
 		if (likely(type == LAST_NORM)) {
 			struct dentry *parent = nd->path.dentry;
 			nd->state &= ~ND_JUMPED;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-				struct qstr this = { { .hash_len = hash_len }, .name = name };
-				err = parent->d_op->d_hash(parent, &this);
+				err = parent->d_op->d_hash(parent, &nd->last);
 				if (err < 0)
 					return err;
-				hash_len = this.hash_len;
-				name = this.name;
 			}
 		}
 
-		nd->last.hash_len = hash_len;
-		nd->last.name = name;
-		nd->last_type = type;
-
-		name += hashlen_len(hash_len);
 		if (!*name)
 			goto OK;
 		/*
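
For context on what the DCACHE_OP_HASH branch above is doing: filesystems
that provide ->d_hash() (typically case-insensitive ones) get to recompute
the component's hash before the lookup, and after this patch the hook
operates directly on nd->last rather than on an on-stack struct qstr copy.
A hypothetical hook might look like the sketch below; it is illustrative
only, not taken from any in-tree filesystem, and simply reuses the same
init_name_hash()/partial_name_hash()/end_name_hash() helpers that the
byte-at-a-time hash_name() in fs/namei.c uses.

	/*
	 * Hypothetical ->d_hash() sketch: fold the name to lower case so
	 * "FOO" and "foo" hash to the same bucket.  With the patch above,
	 * the VFS hands this &nd->last directly instead of a stack copy.
	 */
	static int example_ci_d_hash(const struct dentry *dentry, struct qstr *q)
	{
		unsigned long hash = init_name_hash(dentry);
		unsigned int i;

		for (i = 0; i < q->len; i++)
			hash = partial_name_hash(tolower(q->name[i]), hash);
		q->hash = end_name_hash(hash);
		return 0;
	}
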
From 631e1a710c0489e083d4f1276f6de787a5cf08fb Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 16 Jun 2024 10:38:35 -0700
Subject: [PATCH 2/5] vfs: link_path_walk: clarify and improve name hashing interface

Now that we clearly only care about the length of the name we just
parsed, we can simplify and clarify the interface to "hash_name()", and
move the actual setting of the nd->last fields in there.

That makes everything simpler, and this way we don't mix the hash and
the length together only to then immediately unmix them again.

We still eventually want the combined mixed "hashlen" for when we look
things up in the dentry cache, but inside link_path_walk() it's simpler
and clearer to just deal with the path component length.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index f3c91d8ae140..40929d3cdf76 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2165,9 +2165,9 @@ EXPORT_SYMBOL(hashlen_string);
  * Calculate the length and hash of the path component, and
  * return the "hash_len" as the result.
  */
-static inline u64 hash_name(const void *salt, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 {
-	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+	unsigned long a = 0, b, x = 0, y = (unsigned long)nd->path.dentry;
 	unsigned long adata, bdata, mask, len;
 	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 
@@ -2186,8 +2186,11 @@ static inline u64 hash_name(const void *salt, const char *name)
 	bdata = prep_zero_mask(b, bdata, &constants);
 	mask = create_zero_mask(adata | bdata);
 	x ^= a & zero_bytemask(mask);
+	len += find_zero(mask);
 
-	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+	nd->last.hash = fold_hash(x, y);
+	nd->last.len = len;
+	return len;
 }
 
 #else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
@@ -2222,9 +2225,9 @@ EXPORT_SYMBOL(hashlen_string);
  * We know there's a real path component here of at least
  * one character.
  */
-static inline u64 hash_name(const void *salt, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 {
-	unsigned long hash = init_name_hash(salt);
+	unsigned long hash = init_name_hash(nd->path.dentry);
 	unsigned long len = 0, c;
 
 	c = (unsigned char)*name;
@@ -2233,7 +2236,9 @@ static inline u64 hash_name(const void *salt, const char *name)
 		hash = partial_name_hash(c, hash);
 		c = (unsigned char)name[len];
 	} while (c && c != '/');
-	return hashlen_create(end_name_hash(hash), len);
+	nd->last.hash = end_name_hash(hash);
+	nd->last.len = len;
+	return len;
 }
 
 #endif
@@ -2275,8 +2280,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			return err;
 
 		nd->last.name = name;
-		nd->last.hash_len = hash_name(nd->path.dentry, name);
-		len = hashlen_len(nd->last.hash_len);
+		len = hash_name(nd, name);
 		name += len;
 
 		type = LAST_NORM;
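
The "mix the hash and the length together only to then immediately unmix
them" above refers to the 64-bit hashlen packing, where the component
length sits in the upper 32 bits and the hash in the lower 32.  A minimal
stand-alone sketch of that round trip, using illustrative helper names
rather than quoting the kernel's stringhash macros:

	/* Sketch of the pack/unpack step the patch above removes from the
	 * link_path_walk() loop: hash and length packed into one 64-bit value. */
	typedef unsigned long long u64;
	typedef unsigned int u32;

	static inline u64 pack_hash_len(u32 hash, u32 len)
	{
		return (u64)len << 32 | hash;	/* the hashlen_create() layout */
	}

	static inline u32 unpack_len(u64 hash_len)
	{
		return hash_len >> 32;		/* what hashlen_len() extracts */
	}

	static inline u32 unpack_hash(u64 hash_len)
	{
		return (u32)hash_len;
	}
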
From ba848a77c90800cb686a5c8cf725e9bdfdcccfc2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 16 Jun 2024 13:06:56 -0700
Subject: [PATCH 3/5] vfs: link_path_walk: do '.' and '..' detection while hashing

Instead of loading the name again to detect '.' and '..', just use the
fact that we already had the masked last word available as we created
the name hash, which is exactly what we'd then test for.

Dealing with big-endian word ordering needs a bit of care, particularly
since we have the byte-at-a-time loop as a fallback that doesn't do BE
word loads.  But it's not a big deal.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 68 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 40929d3cdf76..928cbc856d5b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2163,9 +2163,11 @@ EXPORT_SYMBOL(hashlen_string);
 
 /*
  * Calculate the length and hash of the path component, and
- * return the "hash_len" as the result.
+ * return the length as the result.
  */
-static inline unsigned long hash_name(struct nameidata *nd, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd,
+				      const char *name,
+				      unsigned long *lastword)
 {
 	unsigned long a = 0, b, x = 0, y = (unsigned long)nd->path.dentry;
 	unsigned long adata, bdata, mask, len;
@@ -2185,7 +2187,9 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 	adata = prep_zero_mask(a, adata, &constants);
 	bdata = prep_zero_mask(b, bdata, &constants);
 	mask = create_zero_mask(adata | bdata);
-	x ^= a & zero_bytemask(mask);
+	a &= zero_bytemask(mask);
+	*lastword = a;
+	x ^= a;
 	len += find_zero(mask);
 
 	nd->last.hash = fold_hash(x, y);
@@ -2193,6 +2197,15 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 	return len;
 }
 
+/*
+ * Note that the 'last' word is always zero-masked, but
+ * was loaded as a possibly big-endian word.
+ */
+#ifdef __BIG_ENDIAN
+  #define LAST_WORD_IS_DOT	(0x2eul << (BITS_PER_LONG-8))
+  #define LAST_WORD_IS_DOTDOT	(0x2e2eul << (BITS_PER_LONG-16))
+#endif
+
 #else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
 
 /* Return the hash of a string of known length */
@@ -2225,17 +2238,19 @@ EXPORT_SYMBOL(hashlen_string);
  * We know there's a real path component here of at least
  * one character.
  */
-static inline unsigned long hash_name(struct nameidata *nd, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
 {
 	unsigned long hash = init_name_hash(nd->path.dentry);
-	unsigned long len = 0, c;
+	unsigned long len = 0, c, last = 0;
 
 	c = (unsigned char)*name;
 	do {
+		last = (last << 8) + c;
 		len++;
 		hash = partial_name_hash(c, hash);
 		c = (unsigned char)name[len];
 	} while (c && c != '/');
+	*lastword = last;
 	nd->last.hash = end_name_hash(hash);
 	nd->last.len = len;
 	return len;
@@ -2243,6 +2258,11 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 
 #endif
 
+#ifndef LAST_WORD_IS_DOT
+  #define LAST_WORD_IS_DOT	0x2e
+  #define LAST_WORD_IS_DOTDOT	0x2e2e
+#endif
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -2271,8 +2291,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	for(;;) {
 		struct mnt_idmap *idmap;
 		const char *link;
+		unsigned long lastword;
 		unsigned int len;
-		int type;
 
 		idmap = mnt_idmap(nd->path.mnt);
 		err = may_lookup(idmap, nd);
@@ -2280,25 +2300,29 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			return err;
 
 		nd->last.name = name;
-		len = hash_name(nd, name);
+		len = hash_name(nd, name, &lastword);
 		name += len;
 
-		type = LAST_NORM;
-		/* We know len is at least 1, so compare against 2 */
-		if (len <= 2 && name[-1] == '.') {
-			if (len == 2) {
-				if (name[-2] == '.') {
-					type = LAST_DOTDOT;
-					nd->state |= ND_JUMPED;
-				}
-			} else {
-				type = LAST_DOT;
-			}
-		}
-		nd->last_type = type;
-		if (likely(type == LAST_NORM)) {
-			struct dentry *parent = nd->path.dentry;
+		switch(lastword) {
+		case LAST_WORD_IS_DOTDOT:
+			if (len != 2)
+				goto normal;
+			nd->last_type = LAST_DOTDOT;
+			nd->state |= ND_JUMPED;
+			break;
+
+		case LAST_WORD_IS_DOT:
+			if (len != 1)
+				goto normal;
+			nd->last_type = LAST_DOT;
+			break;
+
+		default:
+normal:
+			nd->last_type = LAST_NORM;
 			nd->state &= ~ND_JUMPED;
+
+			struct dentry *parent = nd->path.dentry;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
 				err = parent->d_op->d_hash(parent, &nd->last);
 				if (err < 0)
 					return err;
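
To see why the LAST_WORD_IS_DOT and LAST_WORD_IS_DOTDOT values work, here
is a small user-space sketch, assuming a little-endian machine (on
big-endian the same bytes land at the top of the word, which is what the
shifted constants in the __BIG_ENDIAN block account for).  The memcpy()
merely stands in for the kernel's zero-masked word load:

	#include <assert.h>
	#include <string.h>

	/* Return the first word of a path component with everything past
	 * 'len' bytes zeroed, mimicking load_unaligned_zeropad() + masking. */
	static unsigned long masked_first_word(const char *name, unsigned int len)
	{
		unsigned long word = 0;

		memcpy(&word, name, len < sizeof(word) ? len : sizeof(word));
		return word;
	}

	int main(void)
	{
		assert(masked_first_word(".", 1) == 0x2eul);	/* LAST_WORD_IS_DOT */
		assert(masked_first_word("..", 2) == 0x2e2eul);	/* LAST_WORD_IS_DOTDOT */
		assert(masked_first_word(".git", 4) != 0x2eul);	/* other components differ */
		return 0;
	}
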
From 58b0afa038bb404ff82b358b68db5ffa8f53c404 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 16 Jun 2024 15:00:35 -0700
Subject: [PATCH 4/5] vfs: link_path_walk: improve may_lookup() code generation

Instead of having separate calls to 'inode_permission()' depending on
whether we're in RCU lookup or not, just share the first call.

Note that the initial "conditional" on LOOKUP_RCU really turns into just
"convert the LOOKUP_RCU bit in the nameidata into the MAY_NOT_BLOCK bit
in the argument", which is just a trivial bitwise 'and' plus a shift.

So the initial conditional goes away entirely, the likely case is that
the call succeeds regardless of whether we are in RCU lookup or not, and
the possible "we may need to fall out of RCU and redo it all" fixups
that are needed afterwards all go in the unlikely path.

[ This also marks 'nd' restrict, so the compiler knows there is no other
  alias and can cache the LOOKUP_RCU value over the call to
  inode_permission(). ]

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 928cbc856d5b..6dcb1be1aca9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1712,17 +1712,26 @@ static struct dentry *lookup_slow(const struct qstr *name,
 }
 
 static inline int may_lookup(struct mnt_idmap *idmap,
-			     struct nameidata *nd)
+			     struct nameidata *restrict nd)
 {
-	if (nd->flags & LOOKUP_RCU) {
-		int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
-		if (!err)		// success, keep going
-			return 0;
-		if (!try_to_unlazy(nd))
-			return -ECHILD;	// redo it all non-lazy
-		if (err != -ECHILD)	// hard error
-			return err;
-	}
+	int err, mask;
+
+	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
+	err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+	if (likely(!err))
+		return 0;
+
+	// If we failed, and we weren't in LOOKUP_RCU, it's final
+	if (!(nd->flags & LOOKUP_RCU))
+		return err;
+
+	// Drop out of RCU mode to make sure it wasn't transient
+	if (!try_to_unlazy(nd))
+		return -ECHILD;	// redo it all non-lazy
+
+	if (err != -ECHILD)	// hard error
+		return err;
+
 	return inode_permission(idmap, nd->inode, MAY_EXEC);
 }
 
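
The "bitwise 'and' plus a shift" claim is easy to check with stand-in
flag values (the real LOOKUP_RCU and MAY_NOT_BLOCK definitions live in
the kernel headers and are not quoted here): as long as both are single
bits, converting one into the other needs no branch, which is what keeps
the shared inode_permission() call on the fast path.

	#include <assert.h>

	/* Stand-in single-bit flags, chosen so DEMO_MAY_NOT_BLOCK is
	 * DEMO_LOOKUP_RCU shifted left by one; the values are made up. */
	#define DEMO_LOOKUP_RCU		0x0040u
	#define DEMO_MAY_NOT_BLOCK	0x0080u

	int main(void)
	{
		for (unsigned int flags = 0; flags < 0x1000; flags++) {
			unsigned int branchy = flags & DEMO_LOOKUP_RCU ? DEMO_MAY_NOT_BLOCK : 0;
			unsigned int shifty  = (flags & DEMO_LOOKUP_RCU) << 1;

			/* single-bit to single-bit conversion: mask + shift */
			assert(branchy == shifty);
		}
		return 0;
	}
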
From 13694f0dbc077f28fcb613e7e37018d87997abb9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 17 Jun 2024 09:05:52 -0700
Subject: [PATCH 5/5] vfs: link_path_walk: move more of the name hashing into hash_name()

This avoids having to return the length of the component at all, by
just doing all of the name processing in hash_name().

We can just return the end of the path component, and a flag for the
DOT and DOTDOT cases.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 51 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6dcb1be1aca9..4601c7ab7b77 100644
--- a/fs/namei.c
+++ b/fs/namei.c
*/ -static inline unsigned long hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) +static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; @@ -2259,10 +2275,14 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name, un hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); + + // This is reliable for DOT or DOTDOT, since the component + // cannot contain NUL characters - top bits being zero means + // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; - return len; + return name + len; } #endif @@ -2301,7 +2321,6 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct mnt_idmap *idmap; const char *link; unsigned long lastword; - unsigned int len; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); @@ -2309,25 +2328,19 @@ static int link_path_walk(const char *name, struct nameidata *nd) return err; nd->last.name = name; - len = hash_name(nd, name, &lastword); - name += len; + name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: - if (len != 2) - goto normal; nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: - if (len != 1) - goto normal; nd->last_type = LAST_DOT; break; default: - normal: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED;