From 7d286849a8de41c1aee2957e224c5802d3488c8d Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sat, 15 Jun 2024 17:55:00 -0700
Subject: [PATCH 1/5] vfs: link_path_walk: simplify name hash flow

This is one of those hot functions in path walking, and it's doing
things in just the wrong order, causing slightly unnecessary extra work.

Move the name pointer update and the setting of 'nd->last' up a bit, so
that the (unlikely) filesystem-specific hashing can run on them in
place, instead of having to set up a copy on the stack and copy things
back and forth.  Even when the hashing is not run, the unnecessary
temporary copy makes the stack frame of the function bigger.

This also means that we then never reference the full "hashlen" field
after calculating it, and can clarify the code by just using the length
part.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 37fb0a8aa09a..f3c91d8ae140 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2266,7 +2266,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	for(;;) {
 		struct mnt_idmap *idmap;
 		const char *link;
-		u64 hash_len;
+		unsigned int len;
 		int type;
 
 		idmap = mnt_idmap(nd->path.mnt);
@@ -2274,37 +2274,34 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (err)
 			return err;
 
-		hash_len = hash_name(nd->path.dentry, name);
+		nd->last.name = name;
+		nd->last.hash_len = hash_name(nd->path.dentry, name);
+		len = hashlen_len(nd->last.hash_len);
+		name += len;
 
 		type = LAST_NORM;
-		if (name[0] == '.') switch (hashlen_len(hash_len)) {
-			case 2:
-				if (name[1] == '.') {
+		/* We know len is at least 1, so compare against 2 */
+		if (len <= 2 && name[-1] == '.') {
+			if (len == 2) {
+				if (name[-2] == '.') {
 					type = LAST_DOTDOT;
 					nd->state |= ND_JUMPED;
 				}
-				break;
-			case 1:
+			} else {
 				type = LAST_DOT;
+			}
 		}
+		nd->last_type = type;
 		if (likely(type == LAST_NORM)) {
 			struct dentry *parent = nd->path.dentry;
 			nd->state &= ~ND_JUMPED;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-				struct qstr this = { { .hash_len = hash_len }, .name = name };
-				err = parent->d_op->d_hash(parent, &this);
+				err = parent->d_op->d_hash(parent, &nd->last);
 				if (err < 0)
 					return err;
-				hash_len = this.hash_len;
-				name = this.name;
 			}
 		}
 
-		nd->last.hash_len = hash_len;
-		nd->last.name = name;
-		nd->last_type = type;
-
-		name += hashlen_len(hash_len);
 		if (!*name)
 			goto OK;
 		/*
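
For context on what the DCACHE_OP_HASH branch above is doing: filesystems
that provide ->d_hash() (typically case-insensitive ones) get to recompute
the component's hash before the lookup, and after this patch the hook
operates directly on nd->last rather than on an on-stack struct qstr copy.
A hypothetical hook might look like the sketch below; it is illustrative
only, not taken from any in-tree filesystem, and simply reuses the same
init_name_hash()/partial_name_hash()/end_name_hash() helpers that the
byte-at-a-time hash_name() in fs/namei.c uses.

	/*
	 * Hypothetical ->d_hash() sketch: fold the name to lower case so
	 * "FOO" and "foo" hash to the same bucket.  With the patch above,
	 * the VFS hands this &nd->last directly instead of a stack copy.
	 */
	static int example_ci_d_hash(const struct dentry *dentry, struct qstr *q)
	{
		unsigned long hash = init_name_hash(dentry);
		unsigned int i;

		for (i = 0; i < q->len; i++)
			hash = partial_name_hash(tolower(q->name[i]), hash);
		q->hash = end_name_hash(hash);
		return 0;
	}
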
From 631e1a710c0489e083d4f1276f6de787a5cf08fb Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 16 Jun 2024 10:38:35 -0700
Subject: [PATCH 2/5] vfs: link_path_walk: clarify and improve name hashing interface

Now that we clearly only care about the length of the name we just
parsed, we can simplify and clarify the interface to "hash_name()", and
move the actual setting of the nd->last fields in there.

That makes everything simpler, and this way we don't mix the hash and
the length together only to then immediately unmix them again.

We still eventually want the combined mixed "hashlen" for when we look
things up in the dentry cache, but inside link_path_walk() it's simpler
and clearer to just deal with the path component length.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index f3c91d8ae140..40929d3cdf76 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2165,9 +2165,9 @@ EXPORT_SYMBOL(hashlen_string);
  * Calculate the length and hash of the path component, and
  * return the "hash_len" as the result.
  */
-static inline u64 hash_name(const void *salt, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 {
-	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+	unsigned long a = 0, b, x = 0, y = (unsigned long)nd->path.dentry;
 	unsigned long adata, bdata, mask, len;
 	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 
@@ -2186,8 +2186,11 @@ static inline u64 hash_name(const void *salt, const char *name)
 	bdata = prep_zero_mask(b, bdata, &constants);
 	mask = create_zero_mask(adata | bdata);
 	x ^= a & zero_bytemask(mask);
+	len += find_zero(mask);
 
-	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
+	nd->last.hash = fold_hash(x, y);
+	nd->last.len = len;
+	return len;
 }
 
 #else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
@@ -2222,9 +2225,9 @@ EXPORT_SYMBOL(hashlen_string);
  * We know there's a real path component here of at least
  * one character.
  */
-static inline u64 hash_name(const void *salt, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 {
-	unsigned long hash = init_name_hash(salt);
+	unsigned long hash = init_name_hash(nd->path.dentry);
 	unsigned long len = 0, c;
 
 	c = (unsigned char)*name;
@@ -2233,7 +2236,9 @@ static inline u64 hash_name(const void *salt, const char *name)
 		hash = partial_name_hash(c, hash);
 		c = (unsigned char)name[len];
 	} while (c && c != '/');
-	return hashlen_create(end_name_hash(hash), len);
+	nd->last.hash = end_name_hash(hash);
+	nd->last.len = len;
+	return len;
 }
 
 #endif
@@ -2275,8 +2280,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			return err;
 
 		nd->last.name = name;
-		nd->last.hash_len = hash_name(nd->path.dentry, name);
-		len = hashlen_len(nd->last.hash_len);
+		len = hash_name(nd, name);
 		name += len;
 
 		type = LAST_NORM;
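
The "mix the hash and the length together only to then immediately unmix
them" above refers to the 64-bit hashlen packing, where the component
length sits in the upper 32 bits and the hash in the lower 32.  A minimal
stand-alone sketch of that round trip, using illustrative helper names
rather than quoting the kernel's stringhash macros:

	/* Sketch of the pack/unpack step the patch above removes from the
	 * link_path_walk() loop: hash and length packed into one 64-bit value. */
	typedef unsigned long long u64;
	typedef unsigned int u32;

	static inline u64 pack_hash_len(u32 hash, u32 len)
	{
		return (u64)len << 32 | hash;	/* the hashlen_create() layout */
	}

	static inline u32 unpack_len(u64 hash_len)
	{
		return hash_len >> 32;		/* what hashlen_len() extracts */
	}

	static inline u32 unpack_hash(u64 hash_len)
	{
		return (u32)hash_len;
	}
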
From ba848a77c90800cb686a5c8cf725e9bdfdcccfc2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 16 Jun 2024 13:06:56 -0700
Subject: [PATCH 3/5] vfs: link_path_walk: do '.' and '..' detection while hashing

Instead of loading the name again to detect '.' and '..', just use the
fact that we already had the masked last word available as we created
the name hash, which is exactly what we'd then test for.

Dealing with big-endian word ordering needs a bit of care, particularly
since we have the byte-at-a-time loop as a fallback that doesn't do BE
word loads.  But it's not a big deal.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 68 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 40929d3cdf76..928cbc856d5b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2163,9 +2163,11 @@ EXPORT_SYMBOL(hashlen_string);
 
 /*
  * Calculate the length and hash of the path component, and
- * return the "hash_len" as the result.
+ * return the length as the result.
  */
-static inline unsigned long hash_name(struct nameidata *nd, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd,
+				      const char *name,
+				      unsigned long *lastword)
 {
 	unsigned long a = 0, b, x = 0, y = (unsigned long)nd->path.dentry;
 	unsigned long adata, bdata, mask, len;
@@ -2185,7 +2187,9 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 	adata = prep_zero_mask(a, adata, &constants);
 	bdata = prep_zero_mask(b, bdata, &constants);
 	mask = create_zero_mask(adata | bdata);
-	x ^= a & zero_bytemask(mask);
+	a &= zero_bytemask(mask);
+	*lastword = a;
+	x ^= a;
 	len += find_zero(mask);
 
 	nd->last.hash = fold_hash(x, y);
@@ -2193,6 +2197,15 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 	return len;
 }
 
+/*
+ * Note that the 'last' word is always zero-masked, but
+ * was loaded as a possibly big-endian word.
+ */
+#ifdef __BIG_ENDIAN
+  #define LAST_WORD_IS_DOT	(0x2eul << (BITS_PER_LONG-8))
+  #define LAST_WORD_IS_DOTDOT	(0x2e2eul << (BITS_PER_LONG-16))
+#endif
+
 #else	/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
 
 /* Return the hash of a string of known length */
@@ -2225,17 +2238,19 @@ EXPORT_SYMBOL(hashlen_string);
  * We know there's a real path component here of at least
  * one character.
  */
-static inline unsigned long hash_name(struct nameidata *nd, const char *name)
+static inline unsigned long hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
 {
 	unsigned long hash = init_name_hash(nd->path.dentry);
-	unsigned long len = 0, c;
+	unsigned long len = 0, c, last = 0;
 
 	c = (unsigned char)*name;
 	do {
+		last = (last << 8) + c;
 		len++;
 		hash = partial_name_hash(c, hash);
 		c = (unsigned char)name[len];
 	} while (c && c != '/');
+	*lastword = last;
 	nd->last.hash = end_name_hash(hash);
 	nd->last.len = len;
 	return len;
@@ -2243,6 +2258,11 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name)
 
 #endif
 
+#ifndef LAST_WORD_IS_DOT
+  #define LAST_WORD_IS_DOT	0x2e
+  #define LAST_WORD_IS_DOTDOT	0x2e2e
+#endif
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -2271,8 +2291,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	for(;;) {
 		struct mnt_idmap *idmap;
 		const char *link;
+		unsigned long lastword;
 		unsigned int len;
-		int type;
 
 		idmap = mnt_idmap(nd->path.mnt);
 		err = may_lookup(idmap, nd);
@@ -2280,25 +2300,29 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			return err;
 
 		nd->last.name = name;
-		len = hash_name(nd, name);
+		len = hash_name(nd, name, &lastword);
 		name += len;
 
-		type = LAST_NORM;
-		/* We know len is at least 1, so compare against 2 */
-		if (len <= 2 && name[-1] == '.') {
-			if (len == 2) {
-				if (name[-2] == '.') {
-					type = LAST_DOTDOT;
-					nd->state |= ND_JUMPED;
-				}
-			} else {
-				type = LAST_DOT;
-			}
-		}
-		nd->last_type = type;
-		if (likely(type == LAST_NORM)) {
-			struct dentry *parent = nd->path.dentry;
+		switch(lastword) {
+		case LAST_WORD_IS_DOTDOT:
+			if (len != 2)
+				goto normal;
+			nd->last_type = LAST_DOTDOT;
+			nd->state |= ND_JUMPED;
+			break;
+
+		case LAST_WORD_IS_DOT:
+			if (len != 1)
+				goto normal;
+			nd->last_type = LAST_DOT;
+			break;
+
+		default:
+normal:
+			nd->last_type = LAST_NORM;
 			nd->state &= ~ND_JUMPED;
+
+			struct dentry *parent = nd->path.dentry;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
 				err = parent->d_op->d_hash(parent, &nd->last);
 				if (err < 0)
 					return err;
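
To see why the LAST_WORD_IS_DOT and LAST_WORD_IS_DOTDOT values work, here
is a small user-space sketch, assuming a little-endian machine (on
big-endian the same bytes land at the top of the word, which is what the
shifted constants in the __BIG_ENDIAN block account for).  The memcpy()
merely stands in for the kernel's zero-masked word load:

	#include <assert.h>
	#include <string.h>

	/* Return the first word of a path component with everything past
	 * 'len' bytes zeroed, mimicking load_unaligned_zeropad() + masking. */
	static unsigned long masked_first_word(const char *name, unsigned int len)
	{
		unsigned long word = 0;

		memcpy(&word, name, len < sizeof(word) ? len : sizeof(word));
		return word;
	}

	int main(void)
	{
		assert(masked_first_word(".", 1) == 0x2eul);	/* LAST_WORD_IS_DOT */
		assert(masked_first_word("..", 2) == 0x2e2eul);	/* LAST_WORD_IS_DOTDOT */
		assert(masked_first_word(".git", 4) != 0x2eul);	/* other components differ */
		return 0;
	}
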
From 58b0afa038bb404ff82b358b68db5ffa8f53c404 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 16 Jun 2024 15:00:35 -0700
Subject: [PATCH 4/5] vfs: link_path_walk: improve may_lookup() code generation

Instead of having separate calls to 'inode_permission()' depending on
whether we're in RCU lookup or not, just share the first call.

Note that the initial "conditional" on LOOKUP_RCU really turns into just
"convert the LOOKUP_RCU bit in the nameidata into the MAY_NOT_BLOCK bit
in the argument", which is just a trivial bitwise 'and' plus a shift.

So the initial conditional goes away entirely, the likely case is that
the call succeeds regardless of whether we are in RCU lookup or not, and
the possible "we may need to fall out of RCU and redo it all" fixups
that are needed afterwards all go in the unlikely path.

[ This also marks 'nd' restrict, so the compiler knows there is no other
  alias and can cache the LOOKUP_RCU value over the call to
  inode_permission(). ]

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 928cbc856d5b..6dcb1be1aca9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1712,17 +1712,26 @@ static struct dentry *lookup_slow(const struct qstr *name,
 }
 
 static inline int may_lookup(struct mnt_idmap *idmap,
-			     struct nameidata *nd)
+			     struct nameidata *restrict nd)
 {
-	if (nd->flags & LOOKUP_RCU) {
-		int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
-		if (!err)		// success, keep going
-			return 0;
-		if (!try_to_unlazy(nd))
-			return -ECHILD;	// redo it all non-lazy
-		if (err != -ECHILD)	// hard error
-			return err;
-	}
+	int err, mask;
+
+	mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0;
+	err = inode_permission(idmap, nd->inode, mask | MAY_EXEC);
+	if (likely(!err))
+		return 0;
+
+	// If we failed, and we weren't in LOOKUP_RCU, it's final
+	if (!(nd->flags & LOOKUP_RCU))
+		return err;
+
+	// Drop out of RCU mode to make sure it wasn't transient
+	if (!try_to_unlazy(nd))
+		return -ECHILD;	// redo it all non-lazy
+
+	if (err != -ECHILD)	// hard error
+		return err;
+
 	return inode_permission(idmap, nd->inode, MAY_EXEC);
 }
 
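
The "bitwise 'and' plus a shift" claim is easy to check with stand-in
flag values (the real LOOKUP_RCU and MAY_NOT_BLOCK definitions live in
the kernel headers and are not quoted here): as long as both are single
bits, converting one into the other needs no branch, which is what keeps
the shared inode_permission() call on the fast path.

	#include <assert.h>

	/* Stand-in single-bit flags, chosen so DEMO_MAY_NOT_BLOCK is
	 * DEMO_LOOKUP_RCU shifted left by one; the values are made up. */
	#define DEMO_LOOKUP_RCU		0x0040u
	#define DEMO_MAY_NOT_BLOCK	0x0080u

	int main(void)
	{
		for (unsigned int flags = 0; flags < 0x1000; flags++) {
			unsigned int branchy = flags & DEMO_LOOKUP_RCU ? DEMO_MAY_NOT_BLOCK : 0;
			unsigned int shifty  = (flags & DEMO_LOOKUP_RCU) << 1;

			/* single-bit to single-bit conversion: mask + shift */
			assert(branchy == shifty);
		}
		return 0;
	}
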
From 13694f0dbc077f28fcb613e7e37018d87997abb9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 17 Jun 2024 09:05:52 -0700
Subject: [PATCH 5/5] vfs: link_path_walk: move more of the name hashing into hash_name()

This avoids having to return the length of the component at all, by
just doing all of the name processing in hash_name().

We can just return the end of the path component, and a flag for the
DOT and DOTDOT cases.

Signed-off-by: Linus Torvalds
---
 fs/namei.c | 51 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 6dcb1be1aca9..4601c7ab7b77 100644
--- a/fs/namei.c
+++ b/fs/namei.c
*/ -static inline unsigned long hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) +static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; @@ -2259,10 +2275,14 @@ static inline unsigned long hash_name(struct nameidata *nd, const char *name, un hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); + + // This is reliable for DOT or DOTDOT, since the component + // cannot contain NUL characters - top bits being zero means + // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; - return len; + return name + len; } #endif @@ -2301,7 +2321,6 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct mnt_idmap *idmap; const char *link; unsigned long lastword; - unsigned int len; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); @@ -2309,25 +2328,19 @@ static int link_path_walk(const char *name, struct nameidata *nd) return err; nd->last.name = name; - len = hash_name(nd, name, &lastword); - name += len; + name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: - if (len != 2) - goto normal; nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: - if (len != 1) - goto normal; nd->last_type = LAST_DOT; break; default: - normal: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED;