From 8cf9a01edc216b16b5839eb793ac544d2c97ce97 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 11 Sep 2024 15:42:57 -0400 Subject: [PATCH 01/35] fs: Introduce FOP_ASYNC_LOCK Some lock managers (NLM, kNFSD) fastidiously avoid blocking their kernel threads while servicing blocking locks. If a filesystem supports asynchronous lock requests those lock managers can use notifications to quickly inform clients they have acquired a file lock. Historically, only posix_lock_file() was capable of supporting asynchronous locks so the check for support was simply file_operations->lock(), but with recent changes in DLM, both GFS2 and OCFS2 also support asynchronous locks and have started signalling their support with EXPORT_OP_ASYNC_LOCK. We recently noticed that those changes dropped the checks for whether a filesystem simply defaults to posix_lock_file(), so async lock notifications have not been attempted for NLM and NFSv4.1+ for most filesystems. While trying to fix this it has become clear that testing both the export flag combined with testing ->lock() creates quite a layering mess. It seems appropriate to signal support with a fop_flag. Add FOP_ASYNC_LOCK so that filesystems with ->lock() can signal their capability to handle lock requests asynchronously. Add a helper for lock managers to properly test that support. Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/3330d5a324abe2ce9c1dafe89cacdc6db41945d1.1726083391.git.bcodding@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/filelock.h | 5 +++++ include/linux/fs.h | 2 ++ 2 files changed, 7 insertions(+) diff --git a/include/linux/filelock.h b/include/linux/filelock.h index daee999d05f3..58c1120a8253 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -180,6 +180,11 @@ static inline void locks_wake_up(struct file_lock *fl) wake_up(&fl->c.flc_wait); } +static inline bool locks_can_async_lock(const struct file_operations *fops) +{ + return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK; +} + /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); diff --git a/include/linux/fs.h b/include/linux/fs.h index 6ca11e241a24..78221ae589d9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2074,6 +2074,8 @@ struct file_operations { #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) +/* Supports asynchronous lock callbacks */ +#define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 5)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, From 2253ab99f2e978d94693d6f63c83aa5b5d4c7839 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 11 Sep 2024 15:42:58 -0400 Subject: [PATCH 02/35] gfs2/ocfs2: set FOP_ASYNC_LOCK Both GFS2 and OCFS2 use DLM locking, which will allow async lock requests. Signal this support by setting FOP_ASYNC_LOCK. 
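As a rough sketch of what opting in looks like for a filesystem that has its own ->lock() implementation ("myfs" and myfs_lock below are made-up names, not code from this series), with the manager-side check shown only as a comment since the real call sites are converted in a later patch:

static const struct file_operations myfs_file_fops = {
	.owner		= THIS_MODULE,
	.lock		= myfs_lock,		/* filesystem-specific ->lock() */
	.fop_flags	= FOP_ASYNC_LOCK,	/* that ->lock() handles async requests */
};

/*
 * A lock manager (lockd, nfsd, ...) only attempts blocking-lock
 * notifications when the helper says the file can handle them:
 *
 *	if (wait && locks_can_async_lock(file->f_op))
 *		fl->c.flc_flags |= FL_SLEEP;
 *
 * Files without a ->lock() op fall back to posix_lock_file(), which has
 * always supported asynchronous requests, so the helper returns true
 * for them as well.
 */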
Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/fc4163dbbf33c58e5a8b8ee8cb8c57e555f53ce5.1726083391.git.bcodding@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/gfs2/file.c | 2 ++ fs/ocfs2/file.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 08982937b5df..b9ed2602287d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = { .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, + .fop_flags = FOP_ASYNC_LOCK, }; const struct file_operations gfs2_dir_fops = { @@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .llseek = default_llseek, + .fop_flags = FOP_ASYNC_LOCK, }; #endif /* CONFIG_GFS2_FS_LOCKING_DLM */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ccc57038a977..a642f1adee6a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2793,6 +2793,7 @@ const struct file_operations ocfs2_fops = { .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, .remap_file_range = ocfs2_remap_file_range, + .fop_flags = FOP_ASYNC_LOCK, }; WRAP_DIR_ITER(ocfs2_readdir) // FIXME! @@ -2809,6 +2810,7 @@ const struct file_operations ocfs2_dops = { #endif .lock = ocfs2_lock, .flock = ocfs2_flock, + .fop_flags = FOP_ASYNC_LOCK, }; /* From 318580ad7f2828f6269e0b8819e943ddedda3375 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 29 Aug 2024 14:41:09 +0800 Subject: [PATCH 03/35] hugetlbfs: support tracepoint Add basic tracepoints for {alloc, evict, free}_inode, setattr and fallocate. These can help users to debug hugetlbfs more conveniently. Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240829064110.67884-2-lihongbo22@huawei.com Reviewed-by: Steven Rostedt (Google) Signed-off-by: Christian Brauner --- MAINTAINERS | 1 + include/trace/events/hugetlbfs.h | 156 +++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 include/trace/events/hugetlbfs.h diff --git a/MAINTAINERS b/MAINTAINERS index 0c94ec0ca478..5e628017cbaa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10367,6 +10367,7 @@ F: Documentation/mm/hugetlbfs_reserv.rst F: Documentation/mm/vmemmap_dedup.rst F: fs/hugetlbfs/ F: include/linux/hugetlb.h +F: include/trace/events/hugetlbfs.h F: mm/hugetlb.c F: mm/hugetlb_vmemmap.c F: mm/hugetlb_vmemmap.h diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h new file mode 100644 index 000000000000..8331c904a9ba --- /dev/null +++ b/include/trace/events/hugetlbfs.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hugetlbfs + +#if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HUGETLBFS_H + +#include + +TRACE_EVENT(hugetlbfs_alloc_inode, + + TP_PROTO(struct inode *inode, struct inode *dir, int mode), + + TP_ARGS(inode, dir, mode), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(ino_t, dir) + __field(__u16, mode) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->dir = dir->i_ino; + __entry->mode = mode; + ), + + TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + (unsigned long) __entry->dir, __entry->mode) +); + +DECLARE_EVENT_CLASS(hugetlbfs__inode, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field(dev_t, 
dev) + __field(ino_t, ino) + __field(__u16, mode) + __field(loff_t, size) + __field(unsigned int, nlink) + __field(unsigned int, seals) + __field(blkcnt_t, blocks) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = inode->i_mode; + __entry->size = inode->i_size; + __entry->nlink = inode->i_nlink; + __entry->seals = HUGETLBFS_I(inode)->seals; + __entry->blocks = inode->i_blocks; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o size %lld nlink %u seals %u blocks %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, + __entry->mode, __entry->size, __entry->nlink, __entry->seals, + (unsigned long long)__entry->blocks) +); + +DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode, + + TP_PROTO(struct inode *inode), + + TP_ARGS(inode) +); + +TRACE_EVENT(hugetlbfs_setattr, + + TP_PROTO(struct inode *inode, struct dentry *dentry, + struct iattr *attr), + + TP_ARGS(inode, dentry, attr), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(unsigned int, d_len) + __string(d_name, dentry->d_name.name) + __field(unsigned int, ia_valid) + __field(unsigned int, ia_mode) + __field(loff_t, old_size) + __field(loff_t, ia_size) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->d_len = dentry->d_name.len; + __assign_str(d_name); + __entry->ia_valid = attr->ia_valid; + __entry->ia_mode = attr->ia_mode; + __entry->old_size = inode->i_size; + __entry->ia_size = attr->ia_size; + ), + + TP_printk("dev %d,%d ino %lu name %.*s valid %#x mode 0%o old_size %lld size %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, + __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode, + __entry->old_size, __entry->ia_size) +); + +TRACE_EVENT(hugetlbfs_fallocate, + + TP_PROTO(struct inode *inode, int mode, + loff_t offset, loff_t len, int ret), + + TP_ARGS(inode, mode, offset, len, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(int, mode) + __field(loff_t, offset) + __field(loff_t, len) + __field(loff_t, size) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->mode = mode; + __entry->offset = offset; + __entry->len = len; + __entry->size = inode->i_size; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d ino %lu mode 0%o offset %lld len %lld size %lld ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long)__entry->ino, __entry->mode, + (unsigned long long)__entry->offset, + (unsigned long long)__entry->len, + (unsigned long long)__entry->size, + __entry->ret) +); + +#endif /* _TRACE_HUGETLBFS_H */ + + /* This part must be outside protection */ +#include From 014ad7c42a69d41aa670df96e41e8796d8645d37 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Thu, 29 Aug 2024 14:41:10 +0800 Subject: [PATCH 04/35] hugetlbfs: use tracepoints in hugetlbfs functions. Here we use the hugetlbfs tracepoint to track the call stack. And the output in trace is as follows: ``` touch-5265 [005] ..... 43.246550: hugetlbfs_alloc_inode: dev 0,51 ino 24621 dir 21959 mode 0100644 touch-5265 [005] ..... 43.246638: hugetlbfs_setattr: dev 0,51 ino 24621 name testfile valid 0x20070 mode 0177777 old_size 0 size -51622648042749952 truncate-5266 [005] ..... 
45.590890: hugetlbfs_setattr: dev 0,51 ino 24621 name testfile valid 0x2068 mode 00 old_size 0 size 2097152 rm-5273 [007] ..... 110.052783: hugetlbfs_evict_inode: dev 0,51 ino 24621 mode 0100644 size 2097152 nlink 0 seals 1 blocks 0 -0 [007] ..s1. 110.059441: hugetlbfs_free_inode: dev 0,51 ino 24621 mode 0100644 size 2097152 nlink 0 seals 1 blocks 0 ``` Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240829064110.67884-3-lihongbo22@huawei.com Signed-off-by: Christian Brauner --- fs/hugetlbfs/inode.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 9f6cff356796..1689c01a11a0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -39,6 +39,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + static const struct address_space_operations hugetlbfs_aops; static const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; @@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) { struct resv_map *resv_map; + trace_hugetlbfs_evict_inode(inode); remove_inode_hugepages(inode, 0, LLONG_MAX); /* @@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (mode & FALLOC_FL_PUNCH_HOLE) - return hugetlbfs_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = hugetlbfs_punch_hole(inode, offset, len); + goto out_nolock; + } /* * Default preallocate case. @@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, inode_set_ctime_current(inode); out: inode_unlock(inode); + +out_nolock: + trace_hugetlbfs_fallocate(inode, mode, offset, len, error); return error; } @@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, if (error) return error; + trace_hugetlbfs_setattr(inode, dentry, attr); + if (ia_valid & ATTR_SIZE) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; @@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; } lockdep_annotate_inode_mutex_key(inode); + trace_hugetlbfs_alloc_inode(inode, dir, mode); } else { if (resv_map) kref_put(&resv_map->refs, resv_map_release); @@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_free_inode(struct inode *inode) { + trace_hugetlbfs_free_inode(inode); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } From 7e64c5bc497cf17872b38003307f320e8f077880 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 11 Sep 2024 15:42:59 -0400 Subject: [PATCH 05/35] NLM/NFSD: Fix lock notifications for async-capable filesystems Instead of checking just the exportfs flag, use the new locks_can_async_lock() helper which allows NLM and NFSD to once again support lock notifications for all filesystems which use posix_lock_file(). 
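For context, the notification itself relies on the deferred-lock convention of vfs_lock_file(): a ->lock() implementation that cannot grant the lock immediately returns FILE_LOCK_DEFERRED and completes the request later through the lock manager's lm_grant() callback. A loose sketch, with the myfs_* helpers as hypothetical placeholders:

static int myfs_lock(struct file *file, int cmd, struct file_lock *fl)
{
	if (!(fl->c.flc_flags & FL_SLEEP))
		return myfs_trylock(file, fl);	/* non-blocking attempt */

	/* Hand the blocking request off to the (cluster) lock manager. */
	myfs_queue_blocking_request(file, fl);

	/*
	 * The caller (lockd/nfsd) parks the request instead of sleeping;
	 * when the lock is eventually granted the filesystem completes it,
	 * roughly via fl->fl_lmops->lm_grant(fl, 0), which is what drives
	 * the asynchronous NLM GRANTED / CB_NOTIFY_LOCK notification back
	 * to the client.
	 */
	return FILE_LOCK_DEFERRED;
}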
Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/865c40da44af67939e8eb560d17a26c9c50f23e0.1726083391.git.bcodding@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/lockd/svclock.c | 7 +++---- fs/nfsd/nfs4state.c | 19 ++++--------------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 1f2149db10f2..2359347c9fbd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -30,7 +30,6 @@ #include #include #include -#include #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { - struct inode *inode = nlmsvc_file_inode(file); + struct inode *inode __maybe_unused = nlmsvc_file_inode(file); struct nlm_block *block = NULL; int error; int mode; @@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) { + if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) { async_block = wait; wait = 0; } @@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, * requests on the underlaying ->lock() implementation but * only one nlm_block to being granted by lm_grant(). */ - if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) && + if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) && !list_empty(&block->b_list)) { spin_unlock(&nlm_blocked_lock); ret = nlm_lck_blocked; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a366fb1c1b9b..a061987abee3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -7953,9 +7953,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: spin_lock(&fp->fi_lock); @@ -7966,9 +7963,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate) || - exportfs_lock_op_is_async(sb->s_export_op)) - flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); @@ -7988,15 +7982,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - /* - * Most filesystems with their own ->lock operations will block - * the nfsd thread waiting to acquire the lock. That leads to - * deadlocks (we don't want every nfsd thread tied up waiting - * for file locks), so don't attempt blocking lock notifications - * on those filesystems: - */ - if (!exportfs_lock_op_is_async(sb->s_export_op)) - flags &= ~FL_SLEEP; + if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) && + nfsd4_has_session(cstate) && + locks_can_async_lock(nf->nf_file->f_op)) + flags |= FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); if (!nbl) { From b875bd5b381e114115922944f7a01e31f8b07c2a Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 11 Sep 2024 15:43:00 -0400 Subject: [PATCH 06/35] exportfs: Remove EXPORT_OP_ASYNC_LOCK Now that GFS2 and OCFS2 are signalling async ->lock() support with FOP_ASYNC_LOCK and checks for support are converted, we can remove EXPORT_OP_ASYNC_LOCK. 
Signed-off-by: Benjamin Coddington Link: https://lore.kernel.org/r/0a114db814fec3086f937ae3d44a086f13b8de26.1726083391.git.bcodding@redhat.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- Documentation/filesystems/nfs/exporting.rst | 7 ------- fs/gfs2/export.c | 1 - fs/ocfs2/export.c | 1 - include/linux/exportfs.h | 13 ------------- 4 files changed, 22 deletions(-) diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst index f04ce1215a03..de64d2d002a2 100644 --- a/Documentation/filesystems/nfs/exporting.rst +++ b/Documentation/filesystems/nfs/exporting.rst @@ -238,10 +238,3 @@ following flags are defined: all of an inode's dirty data on last close. Exports that behave this way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip waiting for writeback when closing such files. - - EXPORT_OP_ASYNC_LOCK - Indicates a capable filesystem to do async lock - requests from lockd. Only set EXPORT_OP_ASYNC_LOCK if the filesystem has - it's own ->lock() functionality as core posix_lock_file() implementation - has no async lock request handling yet. For more information about how to - indicate an async lock request from a ->lock() file_operations struct, see - fs/locks.c and comment for the function vfs_lock_file(). diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index d418d8b5367f..3334c394ce9c 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = { .fh_to_parent = gfs2_fh_to_parent, .get_name = gfs2_get_name, .get_parent = gfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 96b684763b39..b95724b767e1 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = { .fh_to_dentry = ocfs2_fh_to_dentry, .fh_to_parent = ocfs2_fh_to_parent, .get_parent = ocfs2_get_parent, - .flags = EXPORT_OP_ASYNC_LOCK, }; diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 893a1d21dc1c..1ab165c2939f 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -250,19 +250,6 @@ struct export_operations { unsigned long flags; }; -/** - * exportfs_lock_op_is_async() - export op supports async lock operation - * @export_ops: the nfs export operations to check - * - * Returns true if the nfs export_operations structure has - * EXPORT_OP_ASYNC_LOCK in their flags set - */ -static inline bool -exportfs_lock_op_is_async(const struct export_operations *export_ops) -{ - return export_ops->flags & EXPORT_OP_ASYNC_LOCK; -} - extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, int *max_len, struct inode *parent, int flags); From 9d926f10b7ff4300a5dc36ecc52d061911d027d8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Sep 2024 16:04:18 -0400 Subject: [PATCH 07/35] filemap: filemap_read() should check that the offset is positive or zero We do check that the read offset is less than the filesystem limit, however for good measure we should also check that it is positive or zero, and return EINVAL if that is not the case. 
Signed-off-by: Trond Myklebust Link: https://lore.kernel.org/r/482ee0b8a30b62324adb9f7c551a99926f037393.1726257832.git.trond.myklebust@hammerspace.com Signed-off-by: Christian Brauner --- mm/filemap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 36d22968be9a..82aa94d2b709 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2620,6 +2620,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; + if (unlikely(iocb->ki_pos < 0)) + return -EINVAL; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; if (unlikely(!iov_iter_count(iter))) From 05fba0a11557dfdc1b6895f4a3fb59165669e643 Mon Sep 17 00:00:00 2001 From: Hongbo Li Date: Wed, 25 Sep 2024 09:56:24 +0800 Subject: [PATCH 08/35] fs: support relative paths with FSCONFIG_SET_STRING The fs_lookup_param did not consider the relative path for block device. When we mount ext4 with journal_path option using relative path, param->dirfd was not set which will cause mounting error. This can be reproduced easily like this: mke2fs -F -O journal_dev $JOURNAL_DEV -b 4096 100M mkfs.ext4 -F -J device=$JOURNAL_DEV -b 4096 $FS_DEV cd /dev; mount -t ext4 -o journal_path=`basename $JOURNAL_DEV` $FS_DEV $MNT Fixes: 461c3af045d3 ("ext4: Change handle_mount_opt() to use fs_parameter") Suggested-by: Christian Brauner Signed-off-by: Hongbo Li Link: https://lore.kernel.org/r/20240925015624.3817878-1-lihongbo22@huawei.com Signed-off-by: Christian Brauner --- fs/fs_parser.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 24727ec34e5a..698464f3e26a 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc, f = getname_kernel(param->string); if (IS_ERR(f)) return PTR_ERR(f); + param->dirfd = AT_FDCWD; put_f = true; break; case fs_value_is_filename: From 6474353a5e3d0b2cf610153cea0c61f576a36d0a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 25 Sep 2024 11:05:16 +0200 Subject: [PATCH 09/35] epoll: annotate racy check Epoll relies on a racy fastpath check during __fput() in eventpoll_release() to avoid the hit of pointlessly acquiring a semaphore. Annotate that race by using WRITE_ONCE() and READ_ONCE(). Link: https://lore.kernel.org/r/66edfb3c.050a0220.3195df.001a.GAE@google.com Link: https://lore.kernel.org/r/20240925-fungieren-anbauen-79b334b00542@brauner Reviewed-by: Jan Kara Reported-by: syzbot+3b6b32dc50537a49bb4a@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- fs/eventpoll.c | 6 ++++-- include/linux/eventpoll.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1ae4542f0bd8..90fbab6b6f03 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { - file->f_ep = NULL; + /* See eventpoll_release() for details. */ + WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); @@ -1603,7 +1604,8 @@ static int attach_epitem(struct file *file, struct epitem *epi) spin_unlock(&file->f_lock); goto allocate; } - file->f_ep = head; + /* See eventpoll_release() for details. 
*/ + WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 3337745d81bd..0c0d00fcd131 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -42,7 +42,7 @@ static inline void eventpoll_release(struct file *file) * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ - if (likely(!file->f_ep)) + if (likely(!READ_ONCE(file->f_ep))) return; /* From e6957c99dca5fd919756e6721e798cbadd23445a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 29 Sep 2024 20:28:31 +0800 Subject: [PATCH 10/35] vfs: Add a sysctl for automated deletion of dentry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 681ce8623567 ("vfs: Delete the associated dentry when deleting a file") introduced an unconditional deletion of the associated dentry when a file is removed. However, this led to performance regressions in specific benchmarks, such as ilebench.sum_operations/s [0], prompting a revert in commit 4a4be1ad3a6e ("Revert "vfs: Delete the associated dentry when deleting a file""). This patch seeks to reintroduce the concept conditionally, where the associated dentry is deleted only when the user explicitly opts for it during file removal. A new sysctl fs.automated_deletion_of_dentry is added for this purpose. Its default value is set to 0. There are practical use cases for this proactive dentry reclamation. Besides the Elasticsearch use case mentioned in commit 681ce8623567, additional examples have surfaced in our production environment. For instance, in video rendering services that continuously generate temporary files, upload them to persistent storage servers, and then delete them, a large number of negative dentries—serving no useful purpose—accumulate. Users in such cases would benefit from proactively reclaiming these negative dentries. Link: https://lore.kernel.org/linux-fsdevel/202405291318.4dfbb352-oliver.sang@intel.com [0] Link: https://lore.kernel.org/all/20240912-programm-umgibt-a1145fa73bb6@brauner/ Suggested-by: Christian Brauner Acked-by: Linus Torvalds Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20240929122831.92515-1-laoar.shao@gmail.com Cc: Linus Torvalds Cc: Al Viro Cc: Christian Brauner Cc: Jan Kara Cc: Mateusz Guzik Signed-off-by: Christian Brauner --- Documentation/admin-guide/sysctl/fs.rst | 5 +++++ fs/dcache.c | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/Documentation/admin-guide/sysctl/fs.rst b/Documentation/admin-guide/sysctl/fs.rst index 47499a1742bd..30c61474dec5 100644 --- a/Documentation/admin-guide/sysctl/fs.rst +++ b/Documentation/admin-guide/sysctl/fs.rst @@ -38,6 +38,11 @@ requests. ``aio-max-nr`` allows you to change the maximum value ``aio-max-nr`` does not result in the pre-allocation or re-sizing of any kernel data structures. +dentry-negative +---------------------------- + +Policy for negative dentries. Set to 1 to to always delete the dentry when a +file is removed, and 0 to disable it. By default, this behavior is disabled. 
dentry-state ------------ diff --git a/fs/dcache.c b/fs/dcache.c index 0f6b16ba30d0..0d7d1bfa23bd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -135,6 +135,7 @@ struct dentry_stat_t { static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); +static int dentry_negative_policy; #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* Statistics gathering. */ @@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, + { + .procname = "dentry-negative", + .data = &dentry_negative_policy, + .maxlen = sizeof(dentry_negative_policy), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __init init_fs_dcache_sysctls(void) @@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry) * Are we the only user? */ if (dentry->d_lockref.count == 1) { + if (dentry_negative_policy) + __d_drop(dentry); dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { From 1e756248be2aa03188a9700da5feb3d3f3c91eed Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 30 Sep 2024 13:20:54 +0200 Subject: [PATCH 11/35] fs: Reorganize kerneldoc parameter names Reorganize kerneldoc parameter names to match the parameter order in the function header. Problems identified using Coccinelle. Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20240930112121.95324-9-Julia.Lawall@inria.fr Signed-off-by: Christian Brauner --- fs/char_dev.c | 2 +- fs/dcache.c | 4 ++-- fs/seq_file.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/char_dev.c b/fs/char_dev.c index 57cc096c498a..c2ddb998f3c9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev) /** * cdev_device_del() - inverse of cdev_device_add - * @dev: the device structure * @cdev: the cdev structure + * @dev: the device structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. diff --git a/fs/dcache.c b/fs/dcache.c index 0d7d1bfa23bd..0099077a2982 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2049,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name - * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func + * @inode: the inode case-insensitive lookup has found * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the @@ -2103,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci); /** * d_same_name - compare dentry name with case-exact name - * @parent: parent dentry * @dentry: the negative dentry that was passed to the parent's lookup func + * @parent: parent dentry * @name: the case-exact name to be associated with the returned dentry * * Return: true if names are same, or false diff --git a/fs/seq_file.c b/fs/seq_file.c index e676c8b0cf5d..8bbb1ad46335 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek); /** * seq_release - free the structures associated with sequential file. - * @file: file in question * @inode: its inode + * @file: file in question * * Frees the structures associated with sequential file; can be used * as ->f_op->release() if you don't have private data to destroy. 
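Returning to the fs.dentry-negative sysctl added two patches up: a minimal usage sketch for a bulk-delete workload that wants unlinked files' dentries reclaimed eagerly (the proc path follows from the sysctl table registered under "fs"; error handling is trimmed):

#include <fcntl.h>
#include <unistd.h>

/* Opt in to proactive reclaim of negative dentries on file removal. */
static int enable_negative_dentry_reclaim(void)
{
	int fd = open("/proc/sys/fs/dentry-negative", O_WRONLY);
	int ret = -1;

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) == 1)
		ret = 0;
	close(fd);
	return ret;
}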
From 0cb9c994e71c15555cf8c3d12fda10fa8f5dcdea Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 7 Oct 2024 10:52:37 +0200 Subject: [PATCH 12/35] namespace: Use atomic64_inc_return() in alloc_mnt_ns() Use atomic64_inc_return(&ref) instead of atomic64_add_return(1, &ref) to use optimized implementation and ease register pressure around the primitive for targets that implement optimized variant. Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20241007085303.48312-1-ubizjak@gmail.com Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 93c377816d75..9a3c251d033d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a } new_ns->ns.ops = &mntns_operations; if (!anon) - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + new_ns->seq = atomic64_inc_return(&mnt_ns_seq); refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; From c2986387430ae6a98e46094bbe669454656f873a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 4 Oct 2024 13:51:51 +0200 Subject: [PATCH 13/35] vfs: inode insertion kdoc corrections Some minor corrections to the inode_insert5 and iget5_locked kernel documentation. Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20241004115151.44834-1-agruenba@redhat.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 471ae4a31549..6b3ff38df7f7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1239,16 +1239,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a variant of iget5_locked() for callers that don't want to fail on memory - * allocation of inode. + * and if present return it with an increased reference count. This is a + * variant of iget5_locked() that doesn't allocate an inode. * - * If the inode is not in cache, insert the pre-allocated inode to cache and + * If the inode is not present in the cache, insert the pre-allocated inode and * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), @@ -1312,16 +1311,16 @@ EXPORT_SYMBOL(inode_insert5); * @data: opaque data pointer to pass to @test and @set * * Search for the inode specified by @hashval and @data in the inode cache, - * and if present it is return it with an increased reference count. This is - * a generalized version of iget_locked() for file systems where the inode + * and if present return it with an increased reference count. This is a + * generalized version of iget_locked() for file systems where the inode * number is not sufficient for unique identification of an inode. 
* - * If the inode is not in cache, allocate a new inode and return it locked, - * hashed, and with the I_NEW flag set. The file system gets to fill it in - * before unlocking it via unlock_new_inode(). + * If the inode is not present in the cache, allocate and insert a new inode + * and return it locked, hashed, and with the I_NEW flag set. The file system + * gets to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note that both @test and @set are called with the inode_hash_lock held, so + * they can't sleep. */ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), From 2714b0d1f36999dbd99a3474a24e7301acbd74f1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 8 Oct 2024 13:30:49 +0200 Subject: [PATCH 14/35] fcntl: make F_DUPFD_QUERY associative Currently when passing a closed file descriptor to fcntl(fd, F_DUPFD_QUERY, fd_dup) the order matters: fd = open("/dev/null"); fd_dup = dup(fd); When we now close one of the file descriptors we get: (1) fcntl(fd, fd_dup) // -EBADF (2) fcntl(fd_dup, fd) // 0 aka not equal depending on which file descriptor is passed first. That's not a huge deal but it gives the api I slightly weird feel. Make it so that the order doesn't matter by requiring that both file descriptors are valid: (1') fcntl(fd, fd_dup) // -EBADF (2') fcntl(fd_dup, fd) // -EBADF Link: https://lore.kernel.org/r/20241008-duften-formel-251f967602d5@brauner Fixes: c62b758bae6a ("fcntl: add F_DUPFD_QUERY fcntl()") Reviewed-by: Jeff Layton Reviewed-By: Lennart Poettering Cc: stable@vger.kernel.org Reported-by: Lennart Poettering Signed-off-by: Christian Brauner --- fs/fcntl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fcntl.c b/fs/fcntl.c index 22dd9dcce7ec..3d89de31066a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -397,6 +397,9 @@ static long f_dupfd_query(int fd, struct file *filp) { CLASS(fd_raw, f)(fd); + if (fd_empty(f)) + return -EBADF; + /* * We can do the 'fdput()' immediately, as the only thing that * matters is the pointer value which isn't changed by the fdput. From 80d3ab22277e6dea72c0715a6698d12f2682ec38 Mon Sep 17 00:00:00 2001 From: Andrew Kreimer Date: Tue, 8 Oct 2024 15:16:02 +0300 Subject: [PATCH 15/35] fs/inode: Fix a typo Fix a typo in comments: wether v-> whether. Signed-off-by: Andrew Kreimer Link: https://lore.kernel.org/r/20241008121602.16778-1-algonell@gmail.com Signed-off-by: Christian Brauner --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 6b3ff38df7f7..d7a0c36e1ba6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2669,7 +2669,7 @@ EXPORT_SYMBOL(inode_set_ctime_current); * @inode: inode to check * @vfsgid: the new/current vfsgid of @inode * - * Check wether @vfsgid is in the caller's group list or if the caller is + * Check whether @vfsgid is in the caller's group list or if the caller is * privileged with CAP_FSETID over @inode. This can be used to determine * whether the setgid bit can be kept or must be dropped. 
* From a54fc4932438cfc1f41626d78404237fd5f82e5b Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Wed, 9 Oct 2024 23:17:27 +0800 Subject: [PATCH 16/35] mm/page-writeback.c: Update comment for BANDWIDTH_INTERVAL The name of the BANDWIDTH_INTERVAL macro is misleading, as it is not only used in the bandwidth update functions wb_update_bandwidth() and __wb_update_bandwidth(), but also in the dirty limit update function domain_update_dirty_limit(). Currently, we haven't found an ideal name, so update the comment only. Signed-off-by: Tang Yizhou Link: https://lore.kernel.org/r/20241009151728.300477-2-yizhou.tang@shopee.com Signed-off-by: Christian Brauner --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fcd4c1439cb9..c7c6b58a8461 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -54,7 +54,7 @@ #define DIRTY_POLL_THRESH (128 >> (PAGE_SHIFT - 10)) /* - * Estimate write bandwidth at 200ms intervals. + * Estimate write bandwidth or update dirty limit at 200ms intervals. */ #define BANDWIDTH_INTERVAL max(HZ/5, 1) From 98f3ac9ba0ec35ff276e6c64ac9f173efa27df78 Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Wed, 9 Oct 2024 23:17:28 +0800 Subject: [PATCH 17/35] mm/page-writeback.c: Fix comment of wb_domain_writeout_add() __bdi_writeout_inc() has undergone multiple renamings, but the comment within the function body have not been updated accordingly. Update it to reflect the latest wb_domain_writeout_add(). Signed-off-by: Tang Yizhou Link: https://lore.kernel.org/r/20241009151728.300477-3-yizhou.tang@shopee.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c7c6b58a8461..72a5d8836425 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -586,7 +586,7 @@ static void wb_domain_writeout_add(struct wb_domain *dom, /* First event after period switching was turned off? */ if (unlikely(!dom->period_time)) { /* - * We can race with other __bdi_writeout_inc calls here but + * We can race with other wb_domain_writeout_add calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. From 0dfcb72d33c767bbe63f4a6872108515594154d9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 10 Oct 2024 11:36:51 -0400 Subject: [PATCH 18/35] coredump: add cond_resched() to dump_user_range The loop between elf_core_dump() and dump_user_range() can run for so long that the system shows softlockup messages, with side effects like workqueues and RCU getting stuck on the core dumping CPU. Add a cond_resched() in dump_user_range() to avoid that softlockup. 
Signed-off-by: Rik van Riel Link: https://lore.kernel.org/r/20241010113651.50cb0366@imladris.surriel.com Signed-off-by: Christian Brauner --- fs/coredump.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/coredump.c b/fs/coredump.c index 45737b43dda5..d48edb37bc35 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, } else { dump_skip(cprm, PAGE_SIZE); } + cond_resched(); } dump_page_free(dump_page); return 1; From 900bbaae67e980945dec74d36f8afe0de7556d5a Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Fri, 26 Apr 2024 16:05:48 +0800 Subject: [PATCH 19/35] epoll: Add synchronous wakeup support for ep_poll_callback Now, the epoll only use wake_up() interface to wake up task. However, sometimes, there are epoll users which want to use the synchronous wakeup flag to hint the scheduler, such as Android binder driver. So add a wake_up_sync() define, and use the wake_up_sync() when the sync is true in ep_poll_callback(). Co-developed-by: Jing Xia Signed-off-by: Jing Xia Signed-off-by: Xuewen Yan Link: https://lore.kernel.org/r/20240426080548.8203-1-xuewen.yan@unisoc.com Tested-by: Brian Geffon Reviewed-by: Brian Geffon Reported-by: Benoit Lize Signed-off-by: Christian Brauner --- fs/eventpoll.c | 5 ++++- include/linux/wait.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 90fbab6b6f03..1a06e462b6ef 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1373,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up(&ep->wq); + if (sync) + wake_up_sync(&ep->wq); + else + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; diff --git a/include/linux/wait.h b/include/linux/wait.h index 8aa3372f21a0..2b322a9b88a2 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -221,6 +221,7 @@ void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) #define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) #define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) +#define wake_up_sync(x) __wake_up_sync(x, TASK_NORMAL) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) From 99bdadbde9c418f29b78b7241732268dbc0a05cc Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 15 Oct 2024 22:21:54 +0200 Subject: [PATCH 20/35] acl: Realign struct posix_acl to save 8 bytes Reduce posix_acl's struct size by 8 bytes by realigning its members. 
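The 8 bytes are alignment padding: on 64-bit builds struct rcu_head is 16 bytes with 8-byte alignment, so sandwiching it between the two 4-byte members leaves a 4-byte hole after a_refcount plus 4 bytes of tail padding. Pahole-style annotation of the two layouts (offsets are illustrative, assuming a typical 64-bit configuration):

/* before */
struct posix_acl {
	refcount_t		a_refcount;	/* offset  0, size  4 */
						/* 4-byte hole        */
	struct rcu_head		a_rcu;		/* offset  8, size 16 */
	unsigned int		a_count;	/* offset 24, size  4 */
	struct posix_acl_entry	a_entries[];	/* offset 28          */
};						/* sizeof == 32       */

/* after */
struct posix_acl {
	refcount_t		a_refcount;	/* offset  0, size  4 */
	unsigned int		a_count;	/* offset  4, size  4 */
	struct rcu_head		a_rcu;		/* offset  8, size 16 */
	struct posix_acl_entry	a_entries[];	/* offset 24          */
};						/* sizeof == 24       */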
Cc: Christian Brauner Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241015202158.2376-1-thorsten.blum@linux.dev Signed-off-by: Christian Brauner --- include/linux/posix_acl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 0e65b3d634d9..2d6a4badd306 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -28,8 +28,8 @@ struct posix_acl_entry { struct posix_acl { refcount_t a_refcount; - struct rcu_head a_rcu; unsigned int a_count; + struct rcu_head a_rcu; struct posix_acl_entry a_entries[]; }; From 8c6e03ffedc5463d1aa1ba89f6ceb082518a3520 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 18 Oct 2024 14:14:21 +0200 Subject: [PATCH 21/35] acl: Annotate struct posix_acl with __counted_by() Add the __counted_by compiler attribute to the flexible array member a_entries to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Use struct_size() to calculate the number of bytes to allocate for new and cloned acls and remove the local size variables. Change the posix_acl_alloc() function parameter count from int to unsigned int to match posix_acl's a_count data type. Add identifier names to the function definition to silence two checkpatch warnings. Reviewed-by: Jan Kara Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241018121426.155247-2-thorsten.blum@linux.dev Cc: Nathan Chancellor Signed-off-by: Christian Brauner --- fs/posix_acl.c | 13 ++++++------- include/linux/posix_acl.h | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 6c66a37522d0..4050942ab52f 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init); * Allocate a new ACL with the specified number of entries. 
*/ struct posix_acl * -posix_acl_alloc(int count, gfp_t flags) +posix_acl_alloc(unsigned int count, gfp_t flags) { - const size_t size = sizeof(struct posix_acl) + - count * sizeof(struct posix_acl_entry); - struct posix_acl *acl = kmalloc(size, flags); + struct posix_acl *acl; + + acl = kmalloc(struct_size(acl, a_entries, count), flags); if (acl) posix_acl_init(acl, count); return acl; @@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags) struct posix_acl *clone = NULL; if (acl) { - int size = sizeof(struct posix_acl) + acl->a_count * - sizeof(struct posix_acl_entry); - clone = kmemdup(acl, size, flags); + clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), + flags); if (clone) refcount_set(&clone->a_refcount, 1); } diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 2d6a4badd306..e2d47eb1a7f3 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -30,7 +30,7 @@ struct posix_acl { refcount_t a_refcount; unsigned int a_count; struct rcu_head a_rcu; - struct posix_acl_entry a_entries[]; + struct posix_acl_entry a_entries[] __counted_by(a_count); }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ @@ -62,7 +62,7 @@ posix_acl_release(struct posix_acl *acl) /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); -extern struct posix_acl *posix_acl_alloc(int, gfp_t); +extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); From 30dac24e14b52e1787572d1d4e06eeabe8a63630 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 26 Sep 2024 16:01:21 +0200 Subject: [PATCH 22/35] fs/writeback: convert wbc_account_cgroup_owner to take a folio Most of the callers of wbc_account_cgroup_owner() are converting a folio to page before calling the function. wbc_account_cgroup_owner() is converting the page back to a folio to call mem_cgroup_css_from_folio(). Convert wbc_account_cgroup_owner() to take a folio instead of a page, and convert all callers to pass a folio directly except f2fs. Convert the page to folio for all the callers from f2fs as they were the only callers calling wbc_account_cgroup_owner() with a page. As f2fs is already in the process of converting to folios, these call sites might also soon be calling wbc_account_cgroup_owner() with a folio directly in the future. No functional changes. Only compile tested. Signed-off-by: Pankaj Raghav Link: https://lore.kernel.org/r/20240926140121.203821-1-kernel@pankajraghav.com Acked-by: David Sterba Acked-by: Tejun Heo Signed-off-by: Christian Brauner --- Documentation/admin-guide/cgroup-v2.rst | 2 +- fs/btrfs/extent_io.c | 7 +++---- fs/btrfs/inode.c | 2 +- fs/buffer.c | 4 ++-- fs/ext4/page-io.c | 2 +- fs/f2fs/data.c | 9 ++++++--- fs/fs-writeback.c | 8 +++----- fs/iomap/buffered-io.c | 2 +- fs/mpage.c | 2 +- include/linux/writeback.h | 4 ++-- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 69af2173555f..064012ea6f36 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2945,7 +2945,7 @@ following two functions. a queue (device) has been associated with the bio and before submission. 
- wbc_account_cgroup_owner(@wbc, @page, @bytes) + wbc_account_cgroup_owner(@wbc, @folio, @bytes) Should be called for each data segment being written out. While this function doesn't care exactly when it's called during the writeback session, it's the easiest and most diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 39c9677c47d5..4667d1e034e0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -785,7 +785,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, } if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page, + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; @@ -1707,7 +1707,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, ret = bio_add_folio(&bbio->bio, folio, eb->len, eb->start - folio_pos(folio)); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + wbc_account_cgroup_owner(wbc, folio, eb->len); folio_unlock(folio); } else { int num_folios = num_extent_folios(eb); @@ -1721,8 +1721,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, folio_start_writeback(folio); ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio_page(folio, 0), - eb->folio_size); + wbc_account_cgroup_owner(wbc, folio, eb->folio_size); wbc->nr_to_write -= folio_nr_pages(folio); folio_unlock(folio); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index edac499fd83d..eb64f04755c2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1729,7 +1729,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, * need full accuracy. Just account the whole thing * against the first page. */ - wbc_account_cgroup_owner(wbc, &locked_folio->page, + wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; diff --git a/fs/buffer.c b/fs/buffer.c index 1fc9a50def0b..32bd0f4c4223 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2803,7 +2803,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_write_hint = write_hint; - __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh)); bio->bi_end_io = end_bio_bh_io_sync; bio->bi_private = bh; @@ -2813,7 +2813,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size); } submit_bio(bio); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ad5543866d21..b7b9261fec3b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -421,7 +421,7 @@ static void io_submit_add_bh(struct ext4_io_submit *io, io_submit_init_bio(io, bh); if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size); io->io_next_block++; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 94f7b084f601..e3ce763cce18 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, is_read_io(fio->op) ? 
__read_io_type(page) : WB_DATA_TYPE(fio->page, false)); @@ -911,7 +912,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1011,7 +1013,8 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio) } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), + PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..2391b09f4ced 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -890,17 +890,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress - * @page: page being written out + * @folio: folio being written out * @bytes: number of bytes being written out * - * @bytes from @page are about to written out during the writeback + * @bytes from @folio are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes) { - struct folio *folio; struct cgroup_subsys_state *css; int id; @@ -913,7 +912,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - folio = page_folio(page); css = mem_cgroup_css_from_folio(folio); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 11ea747228ae..3d1fae7d3a64 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1833,7 +1833,7 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; - wbc_account_cgroup_owner(wbc, &folio->page, len); + wbc_account_cgroup_owner(wbc, folio, len); return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index b5b5ddf9d513..82aecf372743 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -606,7 +606,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. 
it will not write anything) */ - wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio)); + wbc_account_cgroup_owner(wbc, folio, folio_size(folio)); length = first_unmapped << blkbits; if (!bio_add_folio(bio, folio, length, 0)) { bio = mpage_bio_submit_write(bio); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d6db822e4bb3..641a057e0413 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -217,7 +217,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); -void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done); @@ -324,7 +324,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, - struct page *page, size_t bytes) + struct folio *folio, size_t bytes) { } From e017671f534dd3f568db9e47b0583e853d2da9b5 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Wed, 30 Oct 2024 03:55:10 +0000 Subject: [PATCH 23/35] initramfs: avoid filename buffer overrun The initramfs filename field is defined in Documentation/driver-api/early-userspace/buffer-format.rst as: 37 cpio_file := ALGN(4) + cpio_header + filename + "\0" + ALGN(4) + data ... 55 ============= ================== ========================= 56 Field name Field size Meaning 57 ============= ================== ========================= ... 70 c_namesize 8 bytes Length of filename, including final \0 When extracting an initramfs cpio archive, the kernel's do_name() path handler assumes a zero-terminated path at @collected, passing it directly to filp_open() / init_mkdir() / init_mknod(). If a specially crafted cpio entry carries a non-zero-terminated filename and is followed by uninitialized memory, then a file may be created with trailing characters that represent the uninitialized memory. The ability to create an initramfs entry would imply already having full control of the system, so the buffer overrun shouldn't be considered a security vulnerability. Append the output of the following bash script to an existing initramfs and observe any created /initramfs_test_fname_overrunAA* path. E.g. ./reproducer.sh | gzip >> /myinitramfs It's easiest to observe non-zero uninitialized memory when the output is gzipped, as it'll overflow the heap allocated @out_buf in __gunzip(), rather than the initrd_start+initrd_size block. ---- reproducer.sh ---- nilchar="A" # change to "\0" to properly zero terminate / pad magic="070701" ino=1 mode=$(( 0100777 )) uid=0 gid=0 nlink=1 mtime=1 filesize=0 devmajor=0 devminor=1 rdevmajor=0 rdevminor=0 csum=0 fname="initramfs_test_fname_overrun" namelen=$(( ${#fname} + 1 )) # plus one to account for terminator printf "%s%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x%s" \ $magic $ino $mode $uid $gid $nlink $mtime $filesize \ $devmajor $devminor $rdevmajor $rdevminor $namelen $csum $fname termpadlen=$(( 1 + ((4 - ((110 + $namelen) & 3)) % 4) )) printf "%.s${nilchar}" $(seq 1 $termpadlen) ---- reproducer.sh ---- Symlink filename fields handled in do_symlink() won't overrun past the data segment, due to the explicit zero-termination of the symlink target. 
Fix filename buffer overrun by aborting the initramfs FSM if any cpio entry doesn't carry a zero-terminator at the expected (name_len - 1) offset. Fixes: 1da177e4c3f41 ("Linux-2.6.12-rc2") Signed-off-by: David Disseldorp Link: https://lore.kernel.org/r/20241030035509.20194-2-ddiss@suse.de Signed-off-by: Christian Brauner --- init/initramfs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/init/initramfs.c b/init/initramfs.c index bc911e466d5b..b2f7583bb1f5 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -360,6 +360,15 @@ static int __init do_name(void) { state = SkipIt; next_state = Reset; + + /* name_len > 0 && name_len <= PATH_MAX checked in do_header */ + if (collected[name_len - 1] != '\0') { + pr_err("initramfs name without nulterm: %.*s\n", + (int)name_len, collected); + error("malformed archive"); + return 1; + } + if (strcmp(collected, "TRAILER!!!") == 0) { free_hash(); return 0; @@ -424,6 +433,12 @@ static int __init do_copy(void) static int __init do_symlink(void) { + if (collected[name_len - 1] != '\0') { + pr_err("initramfs symlink without nulterm: %.*s\n", + (int)name_len, collected); + error("malformed archive"); + return 1; + } collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); init_symlink(collected + N_ALIGN(name_len), collected); From cb80d9074f2a56c8226657b01f19656584fc3ab5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 1 Nov 2024 13:52:25 +0100 Subject: [PATCH 24/35] fs: optimize acl_permission_check() generic_permission() turned out to be costlier than expected. The reason is that acl_permission_check() performs owner checks that involve costly pointer dereferences. There's already code that skips expensive group checks if the group and other permission bits are the same wrt to the mask that is checked. This logic can be extended to also shortcut permission checking in acl_permission_check(). If the inode doesn't have POSIX ACLs than ownership doesn't matter. If there are no missing UGO permissions the permission check can be shortcut. Acked-by: Al Viro Link: https://lore.kernel.org/all/CAHk-=wgXEoAOFRkDg+grxs+p1U+QjWXLixRGmYEfd=vG+OBuFw@mail.gmail.com Signed-off-by: Linus Torvalds Signed-off-by: Christian Brauner --- fs/namei.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 4a4a22a08ac2..05a8d544fb35 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -326,6 +326,25 @@ static int check_acl(struct mnt_idmap *idmap, return -EAGAIN; } +/* + * Very quick optimistic "we know we have no ACL's" check. + * + * Note that this is purely for ACL_TYPE_ACCESS, and purely + * for the "we have cached that there are no ACLs" case. + * + * If this returns true, we know there are no ACLs. But if + * it returns false, we might still not have ACLs (it could + * be the is_uncached_acl() case). + */ +static inline bool no_acl_inode(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + return likely(!READ_ONCE(inode->i_acl)); +#else + return true; +#endif +} + /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from @@ -348,6 +367,28 @@ static int acl_permission_check(struct mnt_idmap *idmap, unsigned int mode = inode->i_mode; vfsuid_t vfsuid; + /* + * Common cheap case: everybody has the requested + * rights, and there are no ACLs to check. No need + * to do any owner/group checks in that case. 
+ * + * - 'mask&7' is the requested permission bit set + * - multiplying by 0111 spreads them out to all of ugo + * - '& ~mode' looks for missing inode permission bits + * - the '!' is for "no missing permissions" + * + * After that, we just need to check that there are no + * ACL's on the inode - do the 'IS_POSIXACL()' check last + * because it will dereference the ->i_sb pointer and we + * want to avoid that if at all possible. + */ + if (!((mask & 7) * 0111 & ~mode)) { + if (no_acl_inode(inode)) + return 0; + if (!IS_POSIXACL(inode)) + return 0; + } + /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { From fdfa4c02e6dd6c67f5cef8d78c6204e1ff7e12ca Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 3 Nov 2024 13:17:09 +0100 Subject: [PATCH 25/35] freevxfs: Replace one-element array with flexible array member Replace the deprecated one-element array with a modern flexible array member in the struct vxfs_dirblk. Link: https://github.com/KSPP/linux/issues/79 Signed-off-by: Thorsten Blum Link: https://lore.kernel.org/r/20241103121707.102838-3-thorsten.blum@linux.dev Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- fs/freevxfs/vxfs_dir.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h index fbcd603365ad..8c67627f2a3d 100644 --- a/fs/freevxfs/vxfs_dir.h +++ b/fs/freevxfs/vxfs_dir.h @@ -25,7 +25,7 @@ struct vxfs_dirblk { __fs16 d_free; /* free space in dirblock */ __fs16 d_nhash; /* no of hash chains */ - __fs16 d_hash[1]; /* hash chain */ + __fs16 d_hash[]; /* hash chain */ }; /* From 1c82587cb57687de3f18ab4b98a8850c789bedcf Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Nov 2024 08:41:09 -0300 Subject: [PATCH 26/35] hfsplus: don't query the device logical block size multiple times Device block sizes may change. One such case is a loop device, via the LOOP_SET_BLOCK_SIZE ioctl. While this may cause other issues like IO being rejected, in the case of hfsplus, it will allocate a block by using that size and potentially write out-of-bounds when hfsplus_read_wrapper calls hfsplus_submit_bio and the latter function reads a different io_size. Using a new min_io_size initially set to sb_min_blocksize works for the purposes of the original fix, since it will be set to the max between HFSPLUS_SECTOR_SIZE and the first seen logical block size. We still use the max between HFSPLUS_SECTOR_SIZE and min_io_size in case the latter is not initialized. Tested by mounting an hfsplus filesystem with loop block sizes 512, 1024 and 4096. The produced KASAN report before the fix looks like this: [ 419.944641] ================================================================== [ 419.945655] BUG: KASAN: slab-use-after-free in hfsplus_read_wrapper+0x659/0xa0a [ 419.946703] Read of size 2 at addr ffff88800721fc00 by task repro/10678 [ 419.947612] [ 419.947846] CPU: 0 UID: 0 PID: 10678 Comm: repro Not tainted 6.12.0-rc5-00008-gdf56e0f2f3ca #84 [ 419.949007] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014 [ 419.950035] Call Trace: [ 419.950384] [ 419.950676] dump_stack_lvl+0x57/0x78 [ 419.951212] ? hfsplus_read_wrapper+0x659/0xa0a [ 419.951830] print_report+0x14c/0x49e [ 419.952361] ? __virt_addr_valid+0x267/0x278 [ 419.952979] ? kmem_cache_debug_flags+0xc/0x1d [ 419.953561] ? hfsplus_read_wrapper+0x659/0xa0a [ 419.954231] kasan_report+0x89/0xb0 [ 419.954748] ?
hfsplus_read_wrapper+0x659/0xa0a [ 419.955367] hfsplus_read_wrapper+0x659/0xa0a [ 419.955948] ? __pfx_hfsplus_read_wrapper+0x10/0x10 [ 419.956618] ? do_raw_spin_unlock+0x59/0x1a9 [ 419.957214] ? _raw_spin_unlock+0x1a/0x2e [ 419.957772] hfsplus_fill_super+0x348/0x1590 [ 419.958355] ? hlock_class+0x4c/0x109 [ 419.958867] ? __pfx_hfsplus_fill_super+0x10/0x10 [ 419.959499] ? __pfx_string+0x10/0x10 [ 419.960006] ? lock_acquire+0x3e2/0x454 [ 419.960532] ? bdev_name.constprop.0+0xce/0x243 [ 419.961129] ? __pfx_bdev_name.constprop.0+0x10/0x10 [ 419.961799] ? pointer+0x3f0/0x62f [ 419.962277] ? __pfx_pointer+0x10/0x10 [ 419.962761] ? vsnprintf+0x6c4/0xfba [ 419.963178] ? __pfx_vsnprintf+0x10/0x10 [ 419.963621] ? setup_bdev_super+0x376/0x3b3 [ 419.964029] ? snprintf+0x9d/0xd2 [ 419.964344] ? __pfx_snprintf+0x10/0x10 [ 419.964675] ? lock_acquired+0x45c/0x5e9 [ 419.965016] ? set_blocksize+0x139/0x1c1 [ 419.965381] ? sb_set_blocksize+0x6d/0xae [ 419.965742] ? __pfx_hfsplus_fill_super+0x10/0x10 [ 419.966179] mount_bdev+0x12f/0x1bf [ 419.966512] ? __pfx_mount_bdev+0x10/0x10 [ 419.966886] ? vfs_parse_fs_string+0xce/0x111 [ 419.967293] ? __pfx_vfs_parse_fs_string+0x10/0x10 [ 419.967702] ? __pfx_hfsplus_mount+0x10/0x10 [ 419.968073] legacy_get_tree+0x104/0x178 [ 419.968414] vfs_get_tree+0x86/0x296 [ 419.968751] path_mount+0xba3/0xd0b [ 419.969157] ? __pfx_path_mount+0x10/0x10 [ 419.969594] ? kmem_cache_free+0x1e2/0x260 [ 419.970311] do_mount+0x99/0xe0 [ 419.970630] ? __pfx_do_mount+0x10/0x10 [ 419.971008] __do_sys_mount+0x199/0x1c9 [ 419.971397] do_syscall_64+0xd0/0x135 [ 419.971761] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 419.972233] RIP: 0033:0x7c3cb812972e [ 419.972564] Code: 48 8b 0d f5 46 0d 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d c2 46 0d 00 f7 d8 64 89 01 48 [ 419.974371] RSP: 002b:00007ffe30632548 EFLAGS: 00000286 ORIG_RAX: 00000000000000a5 [ 419.975048] RAX: ffffffffffffffda RBX: 00007ffe306328d8 RCX: 00007c3cb812972e [ 419.975701] RDX: 0000000020000000 RSI: 0000000020000c80 RDI: 00007ffe306325d0 [ 419.976363] RBP: 00007ffe30632720 R08: 00007ffe30632610 R09: 0000000000000000 [ 419.977034] R10: 0000000000200008 R11: 0000000000000286 R12: 0000000000000000 [ 419.977713] R13: 00007ffe306328e8 R14: 00005a0eb298bc68 R15: 00007c3cb8356000 [ 419.978375] [ 419.978589] Fixes: 6596528e391a ("hfsplus: ensure bio requests are not smaller than the hardware sectors") Signed-off-by: Thadeu Lima de Souza Cascardo Link: https://lore.kernel.org/r/20241107114109.839253-1-cascardo@igalia.com Signed-off-by: Christian Brauner --- fs/hfsplus/hfsplus_fs.h | 3 ++- fs/hfsplus/wrapper.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 59ce81dca73f..5389918bbf29 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -156,6 +156,7 @@ struct hfsplus_sb_info { /* Runtime variables */ u32 blockoffset; + u32 min_io_size; sector_t part_start; sector_t sect_count; int fs_shift; @@ -307,7 +308,7 @@ struct hfsplus_readdir_data { */ static inline unsigned short hfsplus_min_io_size(struct super_block *sb) { - return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size, HFSPLUS_SECTOR_SIZE); } diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index ce9346099c72..4b0fdc49d1fe 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -172,6 +172,8 @@ int 
hfsplus_read_wrapper(struct super_block *sb) if (!blocksize) goto out; + sbi->min_io_size = blocksize; + if (hfsplus_get_last_session(sb, &part_start, &part_size)) goto out; From c4d7d90747f4e8b528c8cd0a2d9ac01dc4a9339e Mon Sep 17 00:00:00 2001 From: Mohammed Anees Date: Tue, 12 Nov 2024 17:08:34 +0530 Subject: [PATCH 27/35] fs:aio: Remove TODO comment suggesting hash or array usage in io_cancel() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment suggests a hash or array approach to store the active requests. Currently it iterates through all the active requests in the linked list and deletes the requested request when it is found. However, io_cancel() isn’t a frequently used operation, optimizing it wouldn’t bring a substantial benefit to real users, and the increased complexity of maintaining a hashtable for this would be significant and would slow down other operations. Therefore remove this TODO to avoid people spending time improving this. Signed-off-by: Mohammed Anees Link: https://lore.kernel.org/r/20241112113906.15825-1-pvmohammedanees2003@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/aio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index e8920178b50f..72e3970f4225 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return -EINVAL; spin_lock_irq(&ctx->ctx_lock); - /* TODO: use a hash or array, this sucks. */ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { if (kiocb->ki_res.obj == obj) { ret = kiocb->ki_cancel(&kiocb->rw); From 75ead69a717332efa70303fba85e1876793c74a9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Nov 2024 10:09:55 -0500 Subject: [PATCH 28/35] fs: don't let statmount return empty strings When one of the statmount_string() handlers doesn't emit anything to seq, the kernel currently sets the corresponding flag and emits an empty string. Given that statmount() returns a mask of accessible fields, just leave the bit unset in this case, and skip any NULL termination. If nothing was emitted to the seq, then the EOVERFLOW and EAGAIN cases aren't applicable and the function can just return immediately. Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241111-statmount-v4-1-2eaf35d07a80@kernel.org Acked-by: Miklos Szeredi Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/namespace.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 9a3c251d033d..23187a414754 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5044,22 +5044,23 @@ static int statmount_string(struct kstatmount *s, u64 flag) size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; + u32 start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = seq->count; + sm->fs_type = start; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = seq->count; + sm->mnt_root = start; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = seq->count; + sm->mnt_point = start; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = seq->count; + sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; default: @@ -5067,6 +5068,12 @@ static int statmount_string(struct kstatmount *s, u64 flag) return -EINVAL; } + /* + * If nothing was emitted, return to avoid setting the flag + * and terminating the buffer.
+ */ + if (seq->count == start) + return ret; if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) return -EOVERFLOW; if (kbufsize >= s->bufsize) From ed9d95f691c29748f21bc019de9566b698fdfab7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Nov 2024 10:09:56 -0500 Subject: [PATCH 29/35] fs: add the ability for statmount() to report the fs_subtype /proc/self/mountinfo prints out the sb->s_subtype after the type. This is particularly useful for disambiguating FUSE mounts (at least when the userland driver bothers to set it). Add STATMOUNT_FS_SUBTYPE and claim one of the __spare2 fields to point to the offset into the str[] array. Reviewed-by: Jan Kara Reviewed-by: Ian Kent Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241111-statmount-v4-2-2eaf35d07a80@kernel.org Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner --- fs/namespace.c | 19 +++++++++++++++++-- include/uapi/linux/mount.h | 5 ++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 23187a414754..dbd89fffd919 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5004,6 +5004,14 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) return 0; } +static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + + if (sb->s_subtype) + seq_puts(seq, sb->s_subtype); +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5040,7 +5048,7 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) static int statmount_string(struct kstatmount *s, u64 flag) { - int ret; + int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; @@ -5063,6 +5071,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_FS_SUBTYPE: + sm->fs_subtype = start; + statmount_fs_subtype(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5208,6 +5220,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) + err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5229,7 +5244,7 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 225bc366ffcb..2e939dddf9cb 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -173,7 +173,9 @@ struct statmount { __u32 mnt_root; /* [str] Root of mount relative to root of fs */ __u32 mnt_point; /* [str] Mountpoint relative to current root */ __u64 mnt_ns_id; /* ID of the mount namespace */ - __u64 __spare2[49]; + __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ + __u32 __spare1[1]; + __u64 __spare2[48]; char str[]; /* Variable size part containing strings */ }; @@ -207,6 +209,7 @@ struct mnt_id_req { #define STATMOUNT_FS_TYPE 0x00000020U /* Want/got fs_type */ #define STATMOUNT_MNT_NS_ID 
0x00000040U /* Want/got mnt_ns_id */ #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ +#define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ /* * Special @mnt_id values that can be passed to listmount From 4d7485cff59951c83aa2b6891b24d68b76d86f6f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 06:43:54 +0100 Subject: [PATCH 30/35] writeback: add a __releases annotation to wbc_attach_and_unlock_inode This shuts up a sparse lock context tracking warning. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241112054403.1470586-2-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d8bec3c1bb1f..3fb115ae44b1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -733,6 +733,7 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) */ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) + __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); From 8182a8b39aa227f5b99b8d4d18f296b82ce4b94c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Nov 2024 06:43:55 +0100 Subject: [PATCH 31/35] writeback: wbc_attach_fdatawrite_inode out of line This allows exporting only this high-level interface while keeping wbc_attach_and_unlock_inode private in fs-writeback.c and unexporting __inode_attach_wb. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241112054403.1470586-3-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 31 +++++++++++++++++++++++++++---- include/linux/writeback.h | 28 ++-------------------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3fb115ae44b1..77db1f10023e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } -EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list @@ -731,8 +730,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) +static void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) __releases(&inode->i_lock) { if (!inode_cgwb_enabled(inode)) { @@ -763,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } -EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); + +/** + * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite + * @wbc: writeback_control of interest + * @inode: target inode + * + * This function is to be used by __filemap_fdatawrite_range(), which is an + * alternative entry point into writeback code, and first ensures @inode is + * associated with a bdi_writeback and attaches it to @wbc.
+ */ +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode) +{ + spin_lock(&inode->i_lock); + inode_attach_wb(inode, NULL); + wbc_attach_and_unlock_inode(wbc, inode); +} +EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -1228,6 +1244,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, } } +static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) + __releases(&inode->i_lock) +{ + spin_unlock(&inode->i_lock); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d6db822e4bb3..aee3e1b4c50f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -213,9 +213,6 @@ static inline void wait_on_inode(struct inode *inode) #include void __inode_attach_wb(struct inode *inode, struct folio *folio); -void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); @@ -254,22 +251,8 @@ static inline void inode_detach_wb(struct inode *inode) } } -/** - * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite - * @wbc: writeback_control of interest - * @inode: target inode - * - * This function is to be used by __filemap_fdatawrite_range(), which is an - * alternative entry point into writeback code, and first ensures @inode is - * associated with a bdi_writeback and attaches it to @wbc. - */ -static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, - struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode_attach_wb(inode, NULL); - wbc_attach_and_unlock_inode(wbc, inode); -} +void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, + struct inode *inode); /** * wbc_init_bio - writeback specific initializtion of bio @@ -303,13 +286,6 @@ static inline void inode_detach_wb(struct inode *inode) { } -static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, - struct inode *inode) - __releases(&inode->i_lock) -{ - spin_unlock(&inode->i_lock); -} - static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { From 44010543fc8bedad172aa5b6c43480e5d2124497 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Nov 2024 10:09:57 -0500 Subject: [PATCH 32/35] fs: add the ability for statmount() to report the sb_source /proc/self/mountinfo displays the source for the mount, but statmount() doesn't yet have a way to return it. Add a new STATMOUNT_SB_SOURCE flag, claim the 32-bit __spare1 field to hold the offset into the str[] array. 
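A rough userspace consumer of the new field might look like the sketch below (an illustration only: it assumes uapi headers carrying the statmount bits from this series, uses statx() to obtain the unique mount ID, and falls back to the common syscall number when libc headers don't define it).

---- statmount_source.c (illustrative sketch) ----
/* Sketch only: minimal error handling, assumes headers with the statmount uapi. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mount.h>

#ifndef __NR_statmount
#define __NR_statmount 457		/* asm-generic value; adjust if needed */
#endif
#ifndef STATX_MNT_ID_UNIQUE
#define STATX_MNT_ID_UNIQUE 0x00004000U
#endif

int main(void)
{
	static char buf[1 << 16];
	struct statmount *sm = (struct statmount *)buf;
	struct statx stx;
	struct mnt_id_req req = { .size = sizeof(req), .param = STATMOUNT_SB_SOURCE };

	/* look up the unique mount ID of "/" */
	if (statx(AT_FDCWD, "/", 0, STATX_MNT_ID_UNIQUE, &stx) < 0) {
		perror("statx");
		return 1;
	}
	req.mnt_id = stx.stx_mnt_id;

	if (syscall(__NR_statmount, &req, sm, sizeof(buf), 0) < 0) {
		perror("statmount");
		return 1;
	}
	if (sm->mask & STATMOUNT_SB_SOURCE)
		printf("sb_source: %s\n", sm->str + sm->sb_source);
	return 0;
}
---- statmount_source.c ----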
Reviewed-by: Jan Kara Signed-off-by: Jeff Layton Link: https://lore.kernel.org/r/20241111-statmount-v4-3-2eaf35d07a80@kernel.org Acked-by: Miklos Szeredi Signed-off-by: Christian Brauner --- fs/namespace.c | 36 +++++++++++++++++++++++++++++++++++- include/uapi/linux/mount.h | 3 ++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index dbd89fffd919..d32b5afa99dc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5012,6 +5012,32 @@ static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) seq_puts(seq, sb->s_subtype); } +static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) +{ + struct super_block *sb = s->mnt->mnt_sb; + struct mount *r = real_mount(s->mnt); + + if (sb->s_op->show_devname) { + size_t start = seq->count; + int ret; + + ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); + if (ret) + return ret; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + /* Unescape the result */ + seq->buf[seq->count] = '\0'; + seq->count = start; + seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); + } else if (r->mnt_devname) { + seq_puts(seq, r->mnt_devname); + } + return 0; +} + static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) { s->sm.mask |= STATMOUNT_MNT_NS_ID; @@ -5075,6 +5101,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->fs_subtype = start; statmount_fs_subtype(s, seq); break; + case STATMOUNT_SB_SOURCE: + sm->sb_source = start; + ret = statmount_sb_source(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5223,6 +5253,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); + if (!err && s->mask & STATMOUNT_SB_SOURCE) + err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); @@ -5244,7 +5277,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) } #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ - STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | STATMOUNT_FS_SUBTYPE) + STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 2e939dddf9cb..2b49e9131d77 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -174,7 +174,7 @@ struct statmount { __u32 mnt_point; /* [str] Mountpoint relative to current root */ __u64 mnt_ns_id; /* ID of the mount namespace */ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ - __u32 __spare1[1]; + __u32 sb_source; /* [str] Source string of the mount */ __u64 __spare2[48]; char str[]; /* Variable size part containing strings */ }; @@ -210,6 +210,7 @@ struct mnt_id_req { #define STATMOUNT_MNT_NS_ID 0x00000040U /* Want/got mnt_ns_id */ #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ #define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ +#define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ /* * Special @mnt_id values that can be passed to listmount From 2f4d4503e9e5ab765a7948f98bc5deef7850f607 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 12 Nov 2024 11:10:04 +0100 Subject: [PATCH 33/35] statmount: add flag to retrieve unescaped options Filesystem options 
can be retrieved with STATMOUNT_MNT_OPTS, which returns a string of comma-separated options, where some characters are escaped using the \OOO notation. Add a new flag, STATMOUNT_OPT_ARRAY, which instead returns the raw option values separated with '\0' characters. Since escaped characters are rare, this interface is preferable for non-libmount users, who likely don't want to deal with option de-escaping. Example code: if (st->mask & STATMOUNT_OPT_ARRAY) { const char *opt = st->str + st->opt_array; for (unsigned int i = 0; i < st->opt_num; i++) { printf("opt_array[%i]: <%s>\n", i, opt); opt += strlen(opt) + 1; } } Example output: (1) mnt_opts: (2) opt_array[0]: opt_array[1]: opt_array[2]: opt_array[3]: opt_array[4]: opt_array[5]: Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20241112101006.30715-1-mszeredi@redhat.com Acked-by: Jeff Layton [brauner: tweak variable naming and parsing, add example output] Signed-off-by: Christian Brauner --- fs/namespace.c | 47 +++++++++++++++++++++++++++++++++++++- include/uapi/linux/mount.h | 7 ++++-- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d32b5afa99dc..4f39c4aba85d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5072,6 +5072,43 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) return 0; } +static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start, *buf_end, *opt_start, *opt_end; + u32 count = 0; + int err; + + if (!sb->s_op->show_options) + return 0; + + buf_start = seq->buf + start; + err = sb->s_op->show_options(seq, mnt->mnt_root); + if (err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + buf_end = seq->buf + seq->count; + *buf_end = '\0'; + for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) { + opt_end = strchrnul(opt_start, ','); + *opt_end = '\0'; + buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1; + if (WARN_ON_ONCE(++count == 0)) + return -EOVERFLOW; + } + seq->count = buf_start - 1 - seq->buf; + s->sm.opt_num = count; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; @@ -5097,6 +5134,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->mnt_opts = start; ret = statmount_mnt_opts(s, seq); break; + case STATMOUNT_OPT_ARRAY: + sm->opt_array = start; + ret = statmount_opt_array(s, seq); + break; case STATMOUNT_FS_SUBTYPE: sm->fs_subtype = start; statmount_fs_subtype(s, seq); @@ -5250,6 +5291,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_MNT_OPTS) err = statmount_string(s, STATMOUNT_MNT_OPTS); + if (!err && s->mask & STATMOUNT_OPT_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_ARRAY); + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); @@ -5278,7 +5322,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ - STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE) + STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/uapi/linux/mount.h
b/include/uapi/linux/mount.h index 2b49e9131d77..c0fda4604187 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -154,7 +154,7 @@ struct mount_attr { */ struct statmount { __u32 size; /* Total size, including strings */ - __u32 mnt_opts; /* [str] Mount options of the mount */ + __u32 mnt_opts; /* [str] Options (comma separated, escaped) */ __u64 mask; /* What results were written */ __u32 sb_dev_major; /* Device ID */ __u32 sb_dev_minor; @@ -175,7 +175,9 @@ struct statmount { __u64 mnt_ns_id; /* ID of the mount namespace */ __u32 fs_subtype; /* [str] Subtype of fs_type (if any) */ __u32 sb_source; /* [str] Source string of the mount */ - __u64 __spare2[48]; + __u32 opt_num; /* Number of fs options */ + __u32 opt_array; /* [str] Array of nul terminated fs options */ + __u64 __spare2[47]; char str[]; /* Variable size part containing strings */ }; @@ -211,6 +213,7 @@ struct mnt_id_req { #define STATMOUNT_MNT_OPTS 0x00000080U /* Want/got mnt_opts */ #define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ #define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ +#define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ /* * Special @mnt_id values that can be passed to listmount From 45c9faf50665812a14fc9b40ab9d6cb893792ffd Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 13 Nov 2024 16:51:03 +0100 Subject: [PATCH 34/35] vfs: make evict() use smp_mb__after_spinlock instead of smp_mb It literally directly follows a spin_lock() call. This whacks an explicit barrier on x86-64. Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20241113155103.4194099-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index d7a0c36e1ba6..84ab7b209b38 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -746,7 +746,7 @@ static void evict(struct inode *inode) * ___wait_var_event() either sees the bit cleared or * waitqueue_active() check in wake_up_var() sees the waiter. */ - smp_mb(); + smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); spin_unlock(&inode->i_lock); From aefff51e1c2986e16f2780ca8e4c97b784800ab5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 14 Nov 2024 16:31:27 +0100 Subject: [PATCH 35/35] statmount: retrieve security mount options Add the ability to retrieve security mount options. Keep them separate from filesystem-specific mount options so it's easy to tell them apart. Also allow retrieving them separately from other mount options, as most of the time users won't be interested in security-specific mount options.
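By analogy with the STATMOUNT_OPT_ARRAY example code earlier in the series, a caller could walk the security options roughly like this (sketch only; it assumes the struct statmount fields added below):

Example code (sketch):

	if (st->mask & STATMOUNT_OPT_SEC_ARRAY) {
		/* opt_sec_array is an offset into str[]; entries are '\0'-separated */
		const char *opt = st->str + st->opt_sec_array;

		for (unsigned int i = 0; i < st->opt_sec_num; i++) {
			printf("opt_sec_array[%u]: <%s>\n", i, opt);
			opt += strlen(opt) + 1;
		}
	}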
Link: https://lore.kernel.org/r/20241114-radtour-ofenrohr-ff34b567b40a@brauner Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- fs/namespace.c | 74 ++++++++++++++++++++++++++++++-------- include/uapi/linux/mount.h | 5 ++- 2 files changed, 64 insertions(+), 15 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 4f39c4aba85d..a9065a9ab971 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5072,13 +5072,30 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start) +{ + char *buf_end, *opt_start, *opt_end; + int count = 0; + + buf_end = seq->buf + seq->count; + *buf_end = '\0'; + for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) { + opt_end = strchrnul(opt_start, ','); + *opt_end = '\0'; + buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1; + if (WARN_ON_ONCE(++count == INT_MAX)) + return -EOVERFLOW; + } + seq->count = buf_start - 1 - seq->buf; + return count; +} + static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; size_t start = seq->count; - char *buf_start, *buf_end, *opt_start, *opt_end; - u32 count = 0; + char *buf_start; int err; if (!sb->s_op->show_options) @@ -5095,17 +5112,39 @@ static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) if (seq->count == start) return 0; - buf_end = seq->buf + seq->count; - *buf_end = '\0'; - for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) { - opt_end = strchrnul(opt_start, ','); - *opt_end = '\0'; - buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1; - if (WARN_ON_ONCE(++count == 0)) - return -EOVERFLOW; - } - seq->count = buf_start - 1 - seq->buf; - s->sm.opt_num = count; + err = statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_num = err; + return 0; +} + +static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) +{ + struct vfsmount *mnt = s->mnt; + struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; + char *buf_start; + int err; + + buf_start = seq->buf + start; + + err = security_sb_show_options(seq, sb); + if (!err) + return err; + + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; + + if (seq->count == start) + return 0; + + err = statmount_opt_unescape(seq, buf_start); + if (err < 0) + return err; + + s->sm.opt_sec_num = err; return 0; } @@ -5138,6 +5177,10 @@ static int statmount_string(struct kstatmount *s, u64 flag) sm->opt_array = start; ret = statmount_opt_array(s, seq); break; + case STATMOUNT_OPT_SEC_ARRAY: + sm->opt_sec_array = start; + ret = statmount_opt_sec_array(s, seq); + break; case STATMOUNT_FS_SUBTYPE: sm->fs_subtype = start; statmount_fs_subtype(s, seq); @@ -5294,6 +5337,9 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_OPT_ARRAY) err = statmount_string(s, STATMOUNT_OPT_ARRAY); + if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) + err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); + if (!err && s->mask & STATMOUNT_FS_SUBTYPE) err = statmount_string(s, STATMOUNT_FS_SUBTYPE); @@ -5323,7 +5369,7 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ - STATMOUNT_OPT_ARRAY) 
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index c0fda4604187..c07008816aca 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -177,7 +177,9 @@ struct statmount { __u32 sb_source; /* [str] Source string of the mount */ __u32 opt_num; /* Number of fs options */ __u32 opt_array; /* [str] Array of nul terminated fs options */ - __u64 __spare2[47]; + __u32 opt_sec_num; /* Number of security options */ + __u32 opt_sec_array; /* [str] Array of nul terminated security options */ + __u64 __spare2[46]; char str[]; /* Variable size part containing strings */ }; @@ -214,6 +216,7 @@ struct mnt_id_req { #define STATMOUNT_FS_SUBTYPE 0x00000100U /* Want/got fs_subtype */ #define STATMOUNT_SB_SOURCE 0x00000200U /* Want/got sb_source */ #define STATMOUNT_OPT_ARRAY 0x00000400U /* Want/got opt_... */ +#define STATMOUNT_OPT_SEC_ARRAY 0x00000800U /* Want/got opt_sec... */ /* * Special @mnt_id values that can be passed to listmount