From c7e263ab45b95170fe7d4b818bab484268b7bba8 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 25 Jul 2023 16:45:04 +0200 Subject: [PATCH 01/20] shmem: make shmem_inode_acct_block() return error Make shmem_inode_acct_block() return proper error code instead of bool. This will be useful later when we introduce quota support. There should be no functional change. Signed-off-by: Lukas Czerner Signed-off-by: Carlos Maiolino Reviewed-by: Jan Kara Message-Id: <20230725144510.253763-2-cem@kernel.org> Signed-off-by: Christian Brauner --- mm/shmem.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f5af4b943e42..be71cdcaeb14 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -199,13 +199,14 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static inline bool shmem_inode_acct_block(struct inode *inode, long pages) +static inline int shmem_inode_acct_block(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + int err = -ENOSPC; if (shmem_acct_block(info->flags, pages)) - return false; + return err; if (sbinfo->max_blocks) { if (percpu_counter_compare(&sbinfo->used_blocks, @@ -214,11 +215,11 @@ static inline bool shmem_inode_acct_block(struct inode *inode, long pages) percpu_counter_add(&sbinfo->used_blocks, pages); } - return true; + return 0; unacct: shmem_unacct_blocks(info->flags, pages); - return false; + return err; } static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) @@ -370,7 +371,7 @@ bool shmem_charge(struct inode *inode, long pages) struct shmem_inode_info *info = SHMEM_I(inode); unsigned long flags; - if (!shmem_inode_acct_block(inode, pages)) + if (shmem_inode_acct_block(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ @@ -1588,13 +1589,14 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, struct shmem_inode_info *info = SHMEM_I(inode); struct folio *folio; int nr; - int err = -ENOSPC; + int err; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; nr = huge ? HPAGE_PMD_NR : 1; - if (!shmem_inode_acct_block(inode, nr)) + err = shmem_inode_acct_block(inode, nr); + if (err) goto failed; if (huge) @@ -2445,7 +2447,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, int ret; pgoff_t max_off; - if (!shmem_inode_acct_block(inode, 1)) { + if (shmem_inode_acct_block(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to From 71480663b751de1bdfafaa2f668ceabaea78b68b Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 25 Jul 2023 16:45:05 +0200 Subject: [PATCH 02/20] shmem: make shmem_get_inode() return ERR_PTR instead of NULL Make shmem_get_inode() return ERR_PTR instead of NULL on error. This will be useful later when we introduce quota support. There should be no functional change. 
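For illustration, callers move from a NULL check to the usual ERR_PTR convention (a minimal sketch of the new calling pattern; the real call sites are in the diff below):

    inode = shmem_get_inode(idmap, sb, dir, mode, dev, flags);
    if (IS_ERR(inode))
        return PTR_ERR(inode);    /* e.g. -ENOSPC or -ENOMEM */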
Signed-off-by: Lukas Czerner Signed-off-by: Carlos Maiolino Reviewed-by: Jan Kara Message-Id: <20230725144510.253763-3-cem@kernel.org> Signed-off-by: Christian Brauner --- mm/shmem.c | 211 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 119 insertions(+), 92 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index be71cdcaeb14..e89c35c65586 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2365,67 +2365,74 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); ino_t ino; + int err; + + err = shmem_reserve_inode(sb, &ino); + if (err) + return ERR_PTR(err); - if (shmem_reserve_inode(sb, &ino)) - return NULL; inode = new_inode(sb); - if (inode) { - inode->i_ino = ino; - inode_init_owner(idmap, inode, dir, mode); - inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = get_random_u32(); - info = SHMEM_I(inode); - memset(info, 0, (char *)inode - (char *)info); - spin_lock_init(&info->lock); - atomic_set(&info->stop_eviction, 0); - info->seals = F_SEAL_SEAL; - info->flags = flags & VM_NORESERVE; - info->i_crtime = inode->i_mtime; - info->fsflags = (dir == NULL) ? 0 : - SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; - if (info->fsflags) - shmem_set_inode_flags(inode, info->fsflags); - INIT_LIST_HEAD(&info->shrinklist); - INIT_LIST_HEAD(&info->swaplist); - if (sbinfo->noswap) - mapping_set_unevictable(inode->i_mapping); - simple_xattrs_init(&info->xattrs); - cache_no_acl(inode); - mapping_set_large_folios(inode->i_mapping); - switch (mode & S_IFMT) { - default: - inode->i_op = &shmem_special_inode_operations; - init_special_inode(inode, mode, dev); - break; - case S_IFREG: - inode->i_mapping->a_ops = &shmem_aops; - inode->i_op = &shmem_inode_operations; - inode->i_fop = &shmem_file_operations; - mpol_shared_policy_init(&info->policy, - shmem_get_sbmpol(sbinfo)); - break; - case S_IFDIR: - inc_nlink(inode); - /* Some things misbehave if size == 0 on a directory */ - inode->i_size = 2 * BOGO_DIRENT_SIZE; - inode->i_op = &shmem_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - break; - case S_IFLNK: - /* - * Must not load anything in the rbtree, - * mpol_free_shared_policy will not be called. - */ - mpol_shared_policy_init(&info->policy, NULL); - break; - } - - lockdep_annotate_inode_mutex_key(inode); - } else + if (!inode) { shmem_free_inode(sb); + return ERR_PTR(-ENOSPC); + } + + inode->i_ino = ino; + inode_init_owner(idmap, inode, dir, mode); + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_generation = get_random_u32(); + info = SHMEM_I(inode); + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); + info->seals = F_SEAL_SEAL; + info->flags = flags & VM_NORESERVE; + info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ? 
0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags); + INIT_LIST_HEAD(&info->shrinklist); + INIT_LIST_HEAD(&info->swaplist); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); + mapping_set_large_folios(inode->i_mapping); + + switch (mode & S_IFMT) { + default: + inode->i_op = &shmem_special_inode_operations; + init_special_inode(inode, mode, dev); + break; + case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; + inode->i_op = &shmem_inode_operations; + inode->i_fop = &shmem_file_operations; + mpol_shared_policy_init(&info->policy, + shmem_get_sbmpol(sbinfo)); + break; + case S_IFDIR: + inc_nlink(inode); + /* Some things misbehave if size == 0 on a directory */ + inode->i_size = 2 * BOGO_DIRENT_SIZE; + inode->i_op = &shmem_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + break; + case S_IFLNK: + /* + * Must not load anything in the rbtree, + * mpol_free_shared_policy will not be called. + */ + mpol_shared_policy_init(&info->policy, NULL); + break; + } + + lockdep_annotate_inode_mutex_key(inode); return inode; } @@ -3074,27 +3081,30 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; - int error = -ENOSPC; + int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE); - if (inode) { - error = simple_acl_create(dir, inode); - if (error) - goto out_iput; - error = security_inode_init_security(inode, dir, - &dentry->d_name, - shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) - goto out_iput; - error = 0; - dir->i_size += BOGO_DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = current_time(dir); - inode_inc_iversion(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - } + if (IS_ERR(inode)) + return PTR_ERR(inode); + + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + error = security_inode_init_security(inode, dir, - &dentry->d_name, - shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + + error = 0; + dir->i_size += BOGO_DIRENT_SIZE; + dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); + d_instantiate(dentry, inode); + dget(dentry); /* Extra count - pin the dentry in core */ return error; + out_iput: iput(inode); return error; @@ -3105,20 +3115,26 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct inode *inode; - int error = -ENOSPC; + int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (inode) { - error = security_inode_init_security(inode, dir, - NULL, - shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) - goto out_iput; - error = simple_acl_create(dir, inode); - if (error) - goto out_iput; - d_tmpfile(file, inode); + + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto err_out; } + + error = security_inode_init_security(inode, dir, + NULL, + shmem_initxattrs, NULL); + if (error && error != -EOPNOTSUPP) + goto out_iput; + error = simple_acl_create(dir, inode); + if (error) + goto out_iput; + d_tmpfile(file, inode); + +err_out: return finish_open_simple(file, error); out_iput: iput(inode); @@ -3293,8 +3309,9 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); - if (!inode) -
return -ENOSPC; + + if (IS_ERR(inode)) + return PTR_ERR(inode); error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); @@ -3932,12 +3949,13 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; + int error = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) - return -ENOMEM; + return error; sb->s_fs_info = sbinfo; @@ -4000,8 +4018,10 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); - if (!inode) + if (IS_ERR(inode)) { + error = PTR_ERR(inode); goto failed; + } inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; sb->s_root = d_make_root(inode); @@ -4011,7 +4031,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) failed: shmem_put_super(sb); - return -ENOMEM; + return error; } static int shmem_get_tree(struct fs_context *fc) @@ -4380,10 +4400,16 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops #define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(idmap, sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); + return inode ? inode : ERR_PTR(-ENOSPC); +} + #endif /* CONFIG_SHMEM */ /* common code */ @@ -4408,9 +4434,10 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); - if (unlikely(!inode)) { + + if (IS_ERR(inode)) { shmem_unacct_size(flags, size); - return ERR_PTR(-ENOSPC); + return ERR_CAST(inode); } inode->i_flags |= i_flags; inode->i_size = size; From 86be6b8bd834bece87e6adc5313cb41e1ba7fe81 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 25 Jul 2023 16:45:06 +0200 Subject: [PATCH 03/20] quota: Check presence of quota operation structures instead of ->quota_read and ->quota_write callbacks Currently we check whether the superblock has ->quota_read and ->quota_write operations to decide whether a filesystem supports quotas. However, a filesystem such as shmem will never read or write dquots, so instead check whether the quota operation structures (->dq_op and ->s_qcop) are set in the superblock.
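As a concrete illustration (a sketch based on the tmpfs wiring done later in this series, not code from this patch), such a filesystem sets the operation structures but leaves the file callbacks NULL:

    sb->dq_op = &shmem_quota_operations;         /* dquot management */
    sb->s_qcop = &dquot_quotactl_sysfile_ops;    /* quotactl interface */
    /* sb->s_op->quota_read and ->quota_write stay NULL: dquots are
       kept in memory and never read from or written to a quota file */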
Signed-off-by: Jan Kara Reviewed-by: Carlos Maiolino Message-Id: <20230725144510.253763-4-cem@kernel.org> Signed-off-by: Christian Brauner --- fs/quota/dquot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index e3e4f4047657..4d826c369da2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2367,7 +2367,7 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, if (!fmt) return -ESRCH; - if (!sb->s_op->quota_write || !sb->s_op->quota_read || + if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; From eafc474e202978ac735c551d5ee1eb8c02e2be54 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 25 Jul 2023 16:45:07 +0200 Subject: [PATCH 04/20] shmem: prepare shmem quota infrastructure Add a new shmem quota format with its quota_format_ops, together with the dquot_operations that tmpfs will use. Signed-off-by: Lukas Czerner Signed-off-by: Carlos Maiolino Reviewed-by: Jan Kara Message-Id: <20230725144510.253763-5-cem@kernel.org> Signed-off-by: Christian Brauner --- fs/Kconfig | 12 ++ include/linux/shmem_fs.h | 12 ++ include/uapi/linux/quota.h | 1 + mm/Makefile | 2 +- mm/shmem_quota.c | 318 +++++++++++++++++++++++++++++++++++++ 5 files changed, 344 insertions(+), 1 deletion(-) create mode 100644 mm/shmem_quota.c diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7953..8218a71933f9 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -233,6 +233,18 @@ config TMPFS_INODE64 If unsure, say N. +config TMPFS_QUOTA + bool "Tmpfs quota support" + depends on TMPFS + select QUOTA + help + Quota support allows setting per-user and per-group limits for + tmpfs usage. Say Y to enable quota support. Once enabled, you can + control user and group quota enforcement with the quota, usrquota + and grpquota mount options. + + If unsure, say N.
+ config ARCH_SUPPORTS_HUGETLBFS def_bool n diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 9029abd29b1c..7abfaf70b58a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -13,6 +13,10 @@ /* inode in-kernel data */ +#ifdef CONFIG_TMPFS_QUOTA +#define SHMEM_MAXQUOTAS 2 +#endif + struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ @@ -172,4 +176,12 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, #endif /* CONFIG_SHMEM */ #endif /* CONFIG_USERFAULTFD */ +/* + * Used space is stored as unsigned 64-bit value in bytes but + * quota core supports only signed 64-bit values so use that + * as a limit + */ +#define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ +#define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL + #endif diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index f17c9636a859..52090105b828 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -77,6 +77,7 @@ #define QFMT_VFS_V0 2 #define QFMT_OCFS2 3 #define QFMT_VFS_V1 4 +#define QFMT_SHMEM 5 /* Size of block in which space limits are passed through the quota * interface */ diff --git a/mm/Makefile b/mm/Makefile index 678530a07326..d4ee20988dd1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -51,7 +51,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o show_mem.o\ + compaction.o show_mem.o shmem_quota.o\ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c new file mode 100644 index 000000000000..e92b8ece9880 --- /dev/null +++ b/mm/shmem_quota.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The in-memory quota format relies on the quota infrastructure to store + * dquot information for us. While conventional quota formats for file + * systems with persistent storage can load quota information into a dquot + * from the storage on demand, and hence the dquot shrinker can free any + * dquot that is not currently being used, that must be avoided here. + * Otherwise we can lose valuable information, the user-provided limits, + * because there is no persistent storage to load the information from + * afterwards. + * + * One piece of information that the in-memory quota format needs to keep + * track of is a sorted list of ids for each quota type. This is done by + * utilizing an rb tree whose root is stored in mem_dqinfo->dqi_priv for + * each quota type. + * + * This format can be used to support quota on a file system without + * persistent storage, such as tmpfs. + * + * Author: Lukas Czerner + * Carlos Maiolino + * + * Copyright (C) 2023 Red Hat, Inc. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_TMPFS_QUOTA + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit; it is reset when they go below their soft limit.
+ */ +#define SHMEM_MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define SHMEM_MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +struct quota_id { + struct rb_node node; + qid_t id; + qsize_t bhardlimit; + qsize_t bsoftlimit; + qsize_t ihardlimit; + qsize_t isoftlimit; +}; + +static int shmem_check_quota_file(struct super_block *sb, int type) +{ + /* There is no real quota file, nothing to do */ + return 1; +} + +/* + * There is no real quota file. Just allocate rb_root for quota ids and + * set limits + */ +static int shmem_read_file_info(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct mem_dqinfo *info = &dqopt->info[type]; + + info->dqi_priv = kzalloc(sizeof(struct rb_root), GFP_NOFS); + if (!info->dqi_priv) + return -ENOMEM; + + info->dqi_max_spc_limit = SHMEM_QUOTA_MAX_SPC_LIMIT; + info->dqi_max_ino_limit = SHMEM_QUOTA_MAX_INO_LIMIT; + + info->dqi_bgrace = SHMEM_MAX_DQ_TIME; + info->dqi_igrace = SHMEM_MAX_IQ_TIME; + info->dqi_flags = 0; + + return 0; +} + +static int shmem_write_file_info(struct super_block *sb, int type) +{ + /* There is no real quota file, nothing to do */ + return 0; +} + +/* + * Free all the quota_id entries in the rb tree and rb_root. + */ +static int shmem_free_file_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = &sb_dqopt(sb)->info[type]; + struct rb_root *root = info->dqi_priv; + struct quota_id *entry; + struct rb_node *node; + + info->dqi_priv = NULL; + node = rb_first(root); + while (node) { + entry = rb_entry(node, struct quota_id, node); + node = rb_next(&entry->node); + + rb_erase(&entry->node, root); + kfree(entry); + } + + kfree(root); + return 0; +} + +static int shmem_get_next_id(struct super_block *sb, struct kqid *qid) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, qid->type); + struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + qid_t id = from_kqid(&init_user_ns, *qid); + struct quota_info *dqopt = sb_dqopt(sb); + struct quota_id *entry = NULL; + int ret = 0; + + if (!sb_has_quota_active(sb, qid->type)) + return -ESRCH; + + down_read(&dqopt->dqio_sem); + while (node) { + entry = rb_entry(node, struct quota_id, node); + + if (id < entry->id) + node = node->rb_left; + else if (id > entry->id) + node = node->rb_right; + else + goto got_next_id; + } + + if (!entry) { + ret = -ENOENT; + goto out_unlock; + } + + if (id > entry->id) { + node = rb_next(&entry->node); + if (!node) { + ret = -ENOENT; + goto out_unlock; + } + entry = rb_entry(node, struct quota_id, node); + } + +got_next_id: + *qid = make_kqid(&init_user_ns, qid->type, entry->id); +out_unlock: + up_read(&dqopt->dqio_sem); + return ret; +} + +/* + * Load dquot with limits from existing entry, or create the new entry if + * it does not exist. 
+ */ +static int shmem_acquire_dquot(struct dquot *dquot) +{ + struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); + struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node; + struct rb_node *parent = NULL, *new_node = NULL; + struct quota_id *new_entry, *entry; + qid_t id = from_kqid(&init_user_ns, dquot->dq_id); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret = 0; + + mutex_lock(&dquot->dq_lock); + + down_write(&dqopt->dqio_sem); + while (*n) { + parent = *n; + entry = rb_entry(parent, struct quota_id, node); + + if (id < entry->id) + n = &(*n)->rb_left; + else if (id > entry->id) + n = &(*n)->rb_right; + else + goto found; + } + + /* We don't have entry for this id yet, create it */ + new_entry = kzalloc(sizeof(struct quota_id), GFP_NOFS); + if (!new_entry) { + ret = -ENOMEM; + goto out_unlock; + } + + new_entry->id = id; + new_node = &new_entry->node; + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, (struct rb_root *)info->dqi_priv); + entry = new_entry; + +found: + /* Load the stored limits from the tree */ + spin_lock(&dquot->dq_dqb_lock); + dquot->dq_dqb.dqb_bhardlimit = entry->bhardlimit; + dquot->dq_dqb.dqb_bsoftlimit = entry->bsoftlimit; + dquot->dq_dqb.dqb_ihardlimit = entry->ihardlimit; + dquot->dq_dqb.dqb_isoftlimit = entry->isoftlimit; + + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && + !dquot->dq_dqb.dqb_isoftlimit) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dquot->dq_dqb_lock); + + /* Make sure flags update is visible after dquot has been filled */ + smp_mb__before_atomic(); + set_bit(DQ_ACTIVE_B, &dquot->dq_flags); +out_unlock: + up_write(&dqopt->dqio_sem); + mutex_unlock(&dquot->dq_lock); + return ret; +} + +/* + * Store limits from dquot in the tree unless it's fake. If it is fake + * remove the id from the tree since there is no useful information in + * there. 
+ */ +static int shmem_release_dquot(struct dquot *dquot) +{ + struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); + struct rb_node *node = ((struct rb_root *)info->dqi_priv)->rb_node; + qid_t id = from_kqid(&init_user_ns, dquot->dq_id); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + struct quota_id *entry = NULL; + + mutex_lock(&dquot->dq_lock); + /* Check whether we are not racing with some other dqget() */ + if (dquot_is_busy(dquot)) + goto out_dqlock; + + down_write(&dqopt->dqio_sem); + while (node) { + entry = rb_entry(node, struct quota_id, node); + + if (id < entry->id) + node = node->rb_left; + else if (id > entry->id) + node = node->rb_right; + else + goto found; + } + + /* We should always find the entry in the rb tree */ + WARN_ONCE(1, "quota id %u from dquot %p, not in rb tree!\n", id, dquot); + up_write(&dqopt->dqio_sem); + mutex_unlock(&dquot->dq_lock); + return -ENOENT; + +found: + if (test_bit(DQ_FAKE_B, &dquot->dq_flags)) { + /* Remove entry from the tree */ + rb_erase(&entry->node, info->dqi_priv); + kfree(entry); + } else { + /* Store the limits in the tree */ + spin_lock(&dquot->dq_dqb_lock); + entry->bhardlimit = dquot->dq_dqb.dqb_bhardlimit; + entry->bsoftlimit = dquot->dq_dqb.dqb_bsoftlimit; + entry->ihardlimit = dquot->dq_dqb.dqb_ihardlimit; + entry->isoftlimit = dquot->dq_dqb.dqb_isoftlimit; + spin_unlock(&dquot->dq_dqb_lock); + } + + clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); + up_write(&dqopt->dqio_sem); + +out_dqlock: + mutex_unlock(&dquot->dq_lock); + return 0; +} + +static int shmem_mark_dquot_dirty(struct dquot *dquot) +{ + return 0; +} + +static int shmem_dquot_write_info(struct super_block *sb, int type) +{ + return 0; +} + +static const struct quota_format_ops shmem_format_ops = { + .check_quota_file = shmem_check_quota_file, + .read_file_info = shmem_read_file_info, + .write_file_info = shmem_write_file_info, + .free_file_info = shmem_free_file_info, +}; + +struct quota_format_type shmem_quota_format = { + .qf_fmt_id = QFMT_SHMEM, + .qf_ops = &shmem_format_ops, + .qf_owner = THIS_MODULE +}; + +const struct dquot_operations shmem_quota_operations = { + .acquire_dquot = shmem_acquire_dquot, + .release_dquot = shmem_release_dquot, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, + .write_info = shmem_dquot_write_info, + .mark_dirty = shmem_mark_dquot_dirty, + .get_next_id = shmem_get_next_id, +}; +#endif /* CONFIG_TMPFS_QUOTA */ From e09764cff44b5d31c2ca5477444565e3080637d2 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 25 Jul 2023 16:45:08 +0200 Subject: [PATCH 05/20] shmem: quota support Now that the basic infrastructure is in place, enable quota support for tmpfs. This offers user and group quotas to tmpfs (project quotas will be added later). Also, as with other filesystems, tmpfs quota is not yet supported within user namespaces, so idmapping is not translated.
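For illustration only (not part of the patch, and assuming the standard quota-tools utilities), quotas on such a mount would be used like:

    mount -t tmpfs -o size=1G,usrquota,grpquota tmpfs /mnt
    setquota -u 1000 0 102400 0 1000 /mnt    # 100M block / 1000 inode hard limits for uid 1000
    repquota -ug /mnt                        # report usage and limits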
Signed-off-by: Lukas Czerner Signed-off-by: Carlos Maiolino Reviewed-by: Jan Kara Message-Id: <20230725144510.253763-6-cem@kernel.org> Signed-off-by: Christian Brauner --- Documentation/filesystems/tmpfs.rst | 15 +++ include/linux/shmem_fs.h | 8 ++ mm/shmem.c | 186 ++++++++++++++++++++++++++-- 3 files changed, 201 insertions(+), 8 deletions(-) diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst index 2cd8fa332feb..06d6195eaa2c 100644 --- a/Documentation/filesystems/tmpfs.rst +++ b/Documentation/filesystems/tmpfs.rst @@ -123,6 +123,21 @@ sysfs file /sys/kernel/mm/transparent_hugepage/shmem_enabled: which can be used to deny huge pages on all tmpfs mounts in an emergency, or to force huge pages on all tmpfs mounts for testing. +tmpfs also supports quota with the following mount options + +======== ============================================================= +quota User and group quota accounting and enforcement is enabled on + the mount. Tmpfs is using hidden system quota files that are + initialized on mount. +usrquota User quota accounting and enforcement is enabled on the + mount. +grpquota Group quota accounting and enforcement is enabled on the + mount. +======== ============================================================= + +Note that tmpfs quotas do not support user namespaces so no uid/gid +translation is done if quotas are enabled inside user namespaces. + tmpfs has a mount option to set the NUMA memory allocation policy for all files in that instance (if CONFIG_NUMA is enabled) - which can be adjusted on the fly via 'mount -o remount ...' diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 7abfaf70b58a..1a568a0f542f 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -31,6 +31,9 @@ struct shmem_inode_info { atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ +#ifdef CONFIG_TMPFS_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif struct inode vfs_inode; }; @@ -184,4 +187,9 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, #define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */ #define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL +#ifdef CONFIG_TMPFS_QUOTA +extern const struct dquot_operations shmem_quota_operations; +extern struct quota_format_type shmem_quota_format; +#endif /* CONFIG_TMPFS_QUOTA */ + #endif diff --git a/mm/shmem.c b/mm/shmem.c index e89c35c65586..ec76ce9aea5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -78,6 +78,7 @@ static struct vfsmount *shm_mnt; #include #include #include +#include #include @@ -116,11 +117,13 @@ struct shmem_options { int huge; int seen; bool noswap; + unsigned short quota_types; #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 #define SHMEM_SEEN_INUMS 8 #define SHMEM_SEEN_NOSWAP 16 +#define SHMEM_SEEN_QUOTA 32 }; #ifdef CONFIG_TMPFS @@ -212,7 +215,16 @@ static inline int shmem_inode_acct_block(struct inode *inode, long pages) if (percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks - pages) > 0) goto unacct; + + err = dquot_alloc_block_nodirty(inode, pages); + if (err) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, pages); + } else { + err = dquot_alloc_block_nodirty(inode, pages); + if (err) + goto unacct; } return 0; @@ -227,6 +239,8 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info 
*sbinfo = SHMEM_SB(inode->i_sb); + dquot_free_block_nodirty(inode, pages); + if (sbinfo->max_blocks) percpu_counter_sub(&sbinfo->used_blocks, pages); shmem_unacct_blocks(info->flags, pages); @@ -255,6 +269,47 @@ bool vma_is_shmem(struct vm_area_struct *vma) static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); +#ifdef CONFIG_TMPFS_QUOTA + +static int shmem_enable_quotas(struct super_block *sb, + unsigned short quota_types) +{ + int type, err = 0; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; + for (type = 0; type < SHMEM_MAXQUOTAS; type++) { + if (!(quota_types & (1 << type))) + continue; + err = dquot_load_quota_sb(sb, type, QFMT_SHMEM, + DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (err) + goto out_err; + } + return 0; + +out_err: + pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n", + type, err); + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + return err; +} + +static void shmem_disable_quotas(struct super_block *sb) +{ + int type; + + for (type = 0; type < SHMEM_MAXQUOTAS; type++) + dquot_quota_off(sb, type); +} + +static struct dquot **shmem_get_dquots(struct inode *inode) +{ + return SHMEM_I(inode)->i_dquot; +} +#endif /* CONFIG_TMPFS_QUOTA */ + /* * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and * produces a novel ino for the newly allocated inode. @@ -361,7 +416,6 @@ static void shmem_recalc_inode(struct inode *inode) freed = info->alloced - info->swapped - inode->i_mapping->nrpages; if (freed > 0) { info->alloced -= freed; - inode->i_blocks -= freed * BLOCKS_PER_PAGE; shmem_inode_unacct_blocks(inode, freed); } } @@ -379,7 +433,6 @@ bool shmem_charge(struct inode *inode, long pages) spin_lock_irqsave(&info->lock, flags); info->alloced += pages; - inode->i_blocks += pages * BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); @@ -395,7 +448,6 @@ void shmem_uncharge(struct inode *inode, long pages) spin_lock_irqsave(&info->lock, flags); info->alloced -= pages; - inode->i_blocks -= pages * BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); @@ -1141,6 +1193,21 @@ static int shmem_setattr(struct mnt_idmap *idmap, } } + if (is_quota_modification(idmap, inode, attr)) { + error = dquot_initialize(inode); + if (error) + return error; + } + + /* Transfer quota accounting */ + if (i_uid_needs_update(idmap, attr, inode) || + i_gid_needs_update(idmap, attr, inode)) { + error = dquot_transfer(idmap, inode, attr); + + if (error) + return error; + } + setattr_copy(idmap, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(idmap, dentry, inode->i_mode); @@ -1187,6 +1254,10 @@ static void shmem_evict_inode(struct inode *inode) WARN_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); clear_inode(inode); +#ifdef CONFIG_TMPFS_QUOTA + dquot_free_inode(inode); + dquot_drop(inode); +#endif } static int shmem_find_swap_entries(struct address_space *mapping, @@ -1986,7 +2057,6 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, spin_lock_irq(&info->lock); info->alloced += folio_nr_pages(folio); - inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); alloced = true; @@ -2357,9 +2427,10 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) #define shmem_initxattrs NULL #endif -static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, - struct inode *dir, umode_t mode, 
dev_t dev, - unsigned long flags) +static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, + struct inode *dir, umode_t mode, + dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -2436,6 +2507,43 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block return inode; } +#ifdef CONFIG_TMPFS_QUOTA +static struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + int err; + struct inode *inode; + + inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags); + if (IS_ERR(inode)) + return inode; + + err = dquot_initialize(inode); + if (err) + goto errout; + + err = dquot_alloc_inode(inode); + if (err) { + dquot_drop(inode); + goto errout; + } + return inode; + +errout: + inode->i_flags |= S_NOQUOTA; + iput(inode); + return ERR_PTR(err); +} +#else +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) +{ + return __shmem_get_inode(idmap, sb, dir, mode, dev, flags); +} +#endif /* CONFIG_TMPFS_QUOTA */ + #ifdef CONFIG_USERFAULTFD int shmem_mfill_atomic_pte(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -2538,7 +2646,6 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, spin_lock_irq(&info->lock); info->alloced++; - inode->i_blocks += BLOCKS_PER_PAGE; shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -3519,6 +3626,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) static const struct inode_operations shmem_short_symlink_operations = { .getattr = shmem_getattr, + .setattr = shmem_setattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3527,6 +3635,7 @@ static const struct inode_operations shmem_short_symlink_operations = { static const struct inode_operations shmem_symlink_inode_operations = { .getattr = shmem_getattr, + .setattr = shmem_setattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3626,6 +3735,9 @@ enum shmem_param { Opt_inode32, Opt_inode64, Opt_noswap, + Opt_quota, + Opt_usrquota, + Opt_grpquota, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -3648,6 +3760,11 @@ const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_flag ("inode32", Opt_inode32), fsparam_flag ("inode64", Opt_inode64), fsparam_flag ("noswap", Opt_noswap), +#ifdef CONFIG_TMPFS_QUOTA + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag ("grpquota", Opt_grpquota), +#endif {} }; @@ -3739,6 +3856,24 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->noswap = true; ctx->seen |= SHMEM_SEEN_NOSWAP; break; + case Opt_quota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP); + break; + case Opt_usrquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_USR; + break; + case Opt_grpquota: + if (fc->user_ns != &init_user_ns) + return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported"); + ctx->seen |= SHMEM_SEEN_QUOTA; + ctx->quota_types |= QTYPE_MASK_GRP; + break; } return 0; @@ -3846,6 +3981,12 @@ static int 
shmem_reconfigure(struct fs_context *fc) goto out; } + if (ctx->seen & SHMEM_SEEN_QUOTA && + !sb_any_quota_loaded(fc->root->d_sb)) { + err = "Cannot enable quota on remount"; + goto out; + } + if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) @@ -3937,6 +4078,9 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); +#ifdef CONFIG_TMPFS_QUOTA + shmem_disable_quotas(sb); +#endif free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); @@ -4016,6 +4160,17 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) #endif uuid_gen(&sb->s_uuid); +#ifdef CONFIG_TMPFS_QUOTA + if (ctx->seen & SHMEM_SEEN_QUOTA) { + sb->dq_op = &shmem_quota_operations; + sb->s_qcop = &dquot_quotactl_sysfile_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + + if (shmem_enable_quotas(sb, ctx->quota_types)) + goto failed; + } +#endif /* CONFIG_TMPFS_QUOTA */ + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { @@ -4191,6 +4346,9 @@ static const struct super_operations shmem_ops = { #ifdef CONFIG_TMPFS .statfs = shmem_statfs, .show_options = shmem_show_options, +#endif +#ifdef CONFIG_TMPFS_QUOTA + .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, @@ -4257,6 +4415,14 @@ void __init shmem_init(void) shmem_init_inodecache(); +#ifdef CONFIG_TMPFS_QUOTA + error = register_quota_format(&shmem_quota_format); + if (error < 0) { + pr_err("Could not register quota format\n"); + goto out3; + } +#endif + error = register_filesystem(&shmem_fs_type); if (error) { pr_err("Could not register tmpfs\n"); @@ -4281,6 +4447,10 @@ void __init shmem_init(void) out1: unregister_filesystem(&shmem_fs_type); out2: +#ifdef CONFIG_TMPFS_QUOTA + unregister_quota_format(&shmem_quota_format); +out3: +#endif shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); } From de4c0e7ca8b526a82ff7e5ee5533787bb6d01724 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Tue, 25 Jul 2023 16:45:09 +0200 Subject: [PATCH 06/20] shmem: Add default quota limit mount options Allow system administrator to set default global quota limits at tmpfs mount time. Signed-off-by: Lukas Czerner Signed-off-by: Carlos Maiolino Reviewed-by: Jan Kara Message-Id: <20230725144510.253763-7-cem@kernel.org> Signed-off-by: Christian Brauner --- Documentation/filesystems/tmpfs.rst | 34 +++++++++++----- include/linux/shmem_fs.h | 8 ++++ mm/shmem.c | 61 +++++++++++++++++++++++++++++ mm/shmem_quota.c | 34 +++++++++++++++- 4 files changed, 127 insertions(+), 10 deletions(-) diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst index 06d6195eaa2c..67422ee10e03 100644 --- a/Documentation/filesystems/tmpfs.rst +++ b/Documentation/filesystems/tmpfs.rst @@ -125,15 +125,31 @@ force huge pages on all tmpfs mounts for testing. tmpfs also supports quota with the following mount options -======== ============================================================= -quota User and group quota accounting and enforcement is enabled on - the mount. Tmpfs is using hidden system quota files that are - initialized on mount. -usrquota User quota accounting and enforcement is enabled on the - mount. -grpquota Group quota accounting and enforcement is enabled on the - mount. 
-======== ============================================================= +======================== ================================================= +quota User and group quota accounting and enforcement + is enabled on the mount. Tmpfs uses hidden + system quota files that are initialized on mount. +usrquota User quota accounting and enforcement is enabled + on the mount. +grpquota Group quota accounting and enforcement is enabled + on the mount. +usrquota_block_hardlimit Set global user quota block hard limit. +usrquota_inode_hardlimit Set global user quota inode hard limit. +grpquota_block_hardlimit Set global group quota block hard limit. +grpquota_inode_hardlimit Set global group quota inode hard limit. +======================== ================================================= + +None of the quota-related mount options can be set or changed on remount. + +Quota limit parameters accept a suffix k, m or g for kilo, mega and giga. Default global quota limits take effect for any and all user/group/project ids except root the first time the quota entry for an id is accessed - typically the first time an inode owned by that id is created after the mount. In other words, instead of the limits being initialized to zero, they are initialized with the value provided with these mount options. The limits can still be changed for any user/group id at any time, as they normally can be. Note that tmpfs quotas do not support user namespaces so no uid/gid translation is done if quotas are enabled inside user namespaces. diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1a568a0f542f..c0058f3bba70 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -42,6 +42,13 @@ struct shmem_inode_info { (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) #define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) +struct shmem_quota_limits { + qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */ + qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */ + qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */ + qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */ +}; + struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ @@ -60,6 +67,7 @@ struct shmem_sb_info { spinlock_t shrinklist_lock; /* Protects shrinklist */ struct list_head shrinklist; /* List of shinkable inodes */ unsigned long shrinklist_len; /* Length of shrinklist */ + struct shmem_quota_limits qlimits; /* Default quota limits */ }; static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) diff --git a/mm/shmem.c b/mm/shmem.c index ec76ce9aea5f..5f2c9e2961c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -118,6 +118,7 @@ struct shmem_options { int seen; bool noswap; unsigned short quota_types; + struct shmem_quota_limits qlimits; #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 @@ -3738,6 +3739,10 @@ enum shmem_param { Opt_quota, Opt_usrquota, Opt_grpquota, + Opt_usrquota_block_hardlimit, + Opt_usrquota_inode_hardlimit, + Opt_grpquota_block_hardlimit, + Opt_grpquota_inode_hardlimit, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -3764,6 +3769,10 @@ const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_flag ("quota", Opt_quota), fsparam_flag ("usrquota", Opt_usrquota), fsparam_flag
("grpquota", Opt_grpquota), + fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit), + fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit), + fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit), + fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit), #endif {} }; @@ -3874,6 +3883,42 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->seen |= SHMEM_SEEN_QUOTA; ctx->quota_types |= QTYPE_MASK_GRP; break; + case Opt_usrquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "User quota block hardlimit too large."); + ctx->qlimits.usrquota_bhardlimit = size; + break; + case Opt_grpquota_block_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_SPC_LIMIT) + return invalfc(fc, + "Group quota block hardlimit too large."); + ctx->qlimits.grpquota_bhardlimit = size; + break; + case Opt_usrquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "User quota inode hardlimit too large."); + ctx->qlimits.usrquota_ihardlimit = size; + break; + case Opt_grpquota_inode_hardlimit: + size = memparse(param->string, &rest); + if (*rest || !size) + goto bad_value; + if (size > SHMEM_QUOTA_MAX_INO_LIMIT) + return invalfc(fc, + "Group quota inode hardlimit too large."); + ctx->qlimits.grpquota_ihardlimit = size; + break; } return 0; @@ -3987,6 +4032,18 @@ static int shmem_reconfigure(struct fs_context *fc) goto out; } +#ifdef CONFIG_TMPFS_QUOTA +#define CHANGED_LIMIT(name) \ + (ctx->qlimits.name## hardlimit && \ + (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit)) + + if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) || + CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) { + err = "Cannot change global quota limit on remount"; + goto out; + } +#endif /* CONFIG_TMPFS_QUOTA */ + if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; if (ctx->seen & SHMEM_SEEN_INUMS) @@ -4166,6 +4223,10 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_qcop = &dquot_quotactl_sysfile_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + /* Copy the default limits from ctx into sbinfo */ + memcpy(&sbinfo->qlimits, &ctx->qlimits, + sizeof(struct shmem_quota_limits)); + if (shmem_enable_quotas(sb, ctx->quota_types)) goto failed; } diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c index e92b8ece9880..062d1c1097ae 100644 --- a/mm/shmem_quota.c +++ b/mm/shmem_quota.c @@ -166,6 +166,7 @@ static int shmem_acquire_dquot(struct dquot *dquot) { struct mem_dqinfo *info = sb_dqinfo(dquot->dq_sb, dquot->dq_id.type); struct rb_node **n = &((struct rb_root *)info->dqi_priv)->rb_node; + struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; struct rb_node *parent = NULL, *new_node = NULL; struct quota_id *new_entry, *entry; qid_t id = from_kqid(&init_user_ns, dquot->dq_id); @@ -195,6 +196,14 @@ static int shmem_acquire_dquot(struct dquot *dquot) } new_entry->id = id; + if (dquot->dq_id.type == USRQUOTA) { + new_entry->bhardlimit = sbinfo->qlimits.usrquota_bhardlimit; + new_entry->ihardlimit = sbinfo->qlimits.usrquota_ihardlimit; + } else if (dquot->dq_id.type == GRPQUOTA) { + new_entry->bhardlimit = sbinfo->qlimits.grpquota_bhardlimit; + new_entry->ihardlimit = 
sbinfo->qlimits.grpquota_ihardlimit; + } + new_node = &new_entry->node; rb_link_node(new_node, parent, n); rb_insert_color(new_node, (struct rb_root *)info->dqi_priv); @@ -224,6 +233,29 @@ static int shmem_acquire_dquot(struct dquot *dquot) return ret; } +static bool shmem_is_empty_dquot(struct dquot *dquot) +{ + struct shmem_sb_info *sbinfo = dquot->dq_sb->s_fs_info; + qsize_t bhardlimit; + qsize_t ihardlimit; + + if (dquot->dq_id.type == USRQUOTA) { + bhardlimit = sbinfo->qlimits.usrquota_bhardlimit; + ihardlimit = sbinfo->qlimits.usrquota_ihardlimit; + } else if (dquot->dq_id.type == GRPQUOTA) { + bhardlimit = sbinfo->qlimits.grpquota_bhardlimit; + ihardlimit = sbinfo->qlimits.grpquota_ihardlimit; + } + + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || + (dquot->dq_dqb.dqb_curspace == 0 && + dquot->dq_dqb.dqb_curinodes == 0 && + dquot->dq_dqb.dqb_bhardlimit == bhardlimit && + dquot->dq_dqb.dqb_ihardlimit == ihardlimit)) + return true; + + return false; +} /* * Store limits from dquot in the tree unless it's fake. If it is fake * remove the id from the tree since there is no useful information in @@ -261,7 +293,7 @@ static int shmem_release_dquot(struct dquot *dquot) return -ENOENT; found: - if (test_bit(DQ_FAKE_B, &dquot->dq_flags)) { + if (shmem_is_empty_dquot(dquot)) { /* Remove entry from the tree */ rb_erase(&entry->node, info->dqi_priv); kfree(entry); From 509f006932de7556d48eaa7afcd02dcf1ca9a3e9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 25 Jul 2023 16:45:10 +0200 Subject: [PATCH 07/20] shmem: fix quota lock nesting in huge hole handling i_pages lock nests inside i_lock, but shmem_charge() and shmem_uncharge() were being called from THP splitting or collapsing while i_pages lock was held, and now go on to call dquot_alloc_block_nodirty() which takes i_lock to update i_blocks. We may well want to take i_lock out of this path later, in the non-quota case even if it's left in the quota case (or perhaps use i_lock instead of shmem's info->lock throughout); but don't get into that at this time. Move the shmem_charge() and shmem_uncharge() calls out from under i_pages lock, accounting the full batch of holes in a single call. Still pass the pages argument to shmem_uncharge(), but it happens now to be unused: shmem_recalc_inode() is designed to account for clean pages freed behind shmem's back, so it gets the accounting right by itself; then the later call to shmem_inode_unacct_blocks() led to imbalance (that WARN_ON(inode->i_blocks) in shmem_evict_inode()). 
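To spell out the inversion (an illustrative call chain, not an actual lockdep report), the collapse path looked roughly like:

    xas_lock_irq(&xas);                     /* i_pages lock held */
      shmem_charge()
        shmem_inode_acct_block()
          dquot_alloc_block_nodirty()
            spin_lock(&inode->i_lock);      /* inverted: i_pages nests inside i_lock */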
Reported-by: syzbot+38ca19393fb3344f57e6@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/0000000000008e62f40600bfe080@google.com/ Reported-by: syzbot+440ff8cca06ee7a1d4db@syzkaller.appspotmail.com Closes: https://lore.kernel.org/lkml/00000000000076a7840600bfb6e8@google.com/ Signed-off-by: Hugh Dickins Tested-by: Carlos Maiolino Reviewed-by: Carlos Maiolino Message-Id: <20230725144510.253763-8-cem@kernel.org> Signed-off-by: Christian Brauner --- mm/huge_memory.c | 6 ++++-- mm/khugepaged.c | 13 +++++++------ mm/shmem.c | 19 +++++++++---------- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eb3678360b97..d301c323c69a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2521,7 +2521,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); - int i; + int i, nr_dropped = 0; /* complete memcg works before add pages to LRU */ split_page_memcg(head, nr); @@ -2546,7 +2546,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, struct folio *tail = page_folio(head + i); if (shmem_mapping(head->mapping)) - shmem_uncharge(head->mapping->host, 1); + nr_dropped++; else if (folio_test_clear_dirty(tail)) folio_account_cleaned(tail, inode_to_wb(folio->mapping->host)); @@ -2583,6 +2583,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, } local_irq_enable(); + if (nr_dropped) + shmem_uncharge(head->mapping->host, nr_dropped); remap_page(folio, nr); if (PageSwapCache(head)) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 78c8d5d8b628..47d1d32c734f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1955,10 +1955,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto xa_locked; } } - if (!shmem_charge(mapping->host, 1)) { - result = SCAN_FAIL; - goto xa_locked; - } nr_none++; continue; } @@ -2145,8 +2141,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, */ try_to_unmap_flush(); - if (result != SCAN_SUCCEED) + if (result == SCAN_SUCCEED && nr_none && + !shmem_charge(mapping->host, nr_none)) + result = SCAN_FAIL; + if (result != SCAN_SUCCEED) { + nr_none = 0; goto rollback; + } /* * The old pages are locked, so they won't change anymore. 
@@ -2283,8 +2284,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (nr_none) { xas_lock_irq(&xas); mapping->nrpages -= nr_none; - shmem_uncharge(mapping->host, nr_none); xas_unlock_irq(&xas); + shmem_uncharge(mapping->host, nr_none); } list_for_each_entry_safe(page, tmp, &pagelist, lru) { diff --git a/mm/shmem.c b/mm/shmem.c index 5f2c9e2961c2..72326ea74954 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -424,18 +424,20 @@ static void shmem_recalc_inode(struct inode *inode) bool shmem_charge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long flags; + struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_block(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ - inode->i_mapping->nrpages += pages; + xa_lock_irq(&mapping->i_pages); + mapping->nrpages += pages; + xa_unlock_irq(&mapping->i_pages); - spin_lock_irqsave(&info->lock, flags); + spin_lock_irq(&info->lock); info->alloced += pages; shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); + spin_unlock_irq(&info->lock); return true; } @@ -443,16 +445,13 @@ bool shmem_charge(struct inode *inode, long pages) void shmem_uncharge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long flags; /* nrpages adjustment done by __filemap_remove_folio() or caller */ - spin_lock_irqsave(&info->lock, flags); - info->alloced -= pages; + spin_lock_irq(&info->lock); shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); - - shmem_inode_unacct_blocks(inode, pages); + /* which has called shmem_inode_unacct_blocks() if necessary */ + spin_unlock_irq(&info->lock); } /* From 6faddda69f623d38bb097640689901a4b5ff881a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 30 Jun 2023 13:48:49 -0400 Subject: [PATCH 08/20] libfs: Add directory operations for stable offsets Create a vector of directory operations in fs/libfs.c that handles directory seeks and readdir via stable offsets instead of the current cursor-based mechanism. For the moment these are unused. 
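A sketch of how a filesystem would opt in (a hypothetical "foofs"; the names and the per-inode placement of the context are assumptions, not part of this patch):

    static struct offset_ctx *foo_get_offset_ctx(struct inode *inode)
    {
        return &FOO_I(inode)->doff_ctx;    /* assumed per-directory context */
    }

    static const struct inode_operations foo_dir_inode_operations = {
        .lookup         = simple_lookup,
        .get_offset_ctx = foo_get_offset_ctx,
    };

    /* at directory inode creation: */
    simple_offset_init(foo_get_offset_ctx(dir));
    dir->i_op = &foo_dir_inode_operations;
    dir->i_fop = &simple_offset_dir_operations;

The filesystem then calls simple_offset_add() when a child is created and simple_offset_remove() when it is unlinked, so each entry keeps a stable readdir offset for its whole lifetime.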
Signed-off-by: Chuck Lever Message-Id: <168814732984.530310.11190772066786107220.stgit@manet.1015granger.net> Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 8 +- Documentation/filesystems/vfs.rst | 6 +- fs/libfs.c | 248 ++++++++++++++++++++++++++ include/linux/fs.h | 18 ++ 4 files changed, 276 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index ed148919e11a..443091406e35 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -85,13 +85,14 @@ prototypes:: struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); + struct offset_ctx *(*get_offset_ctx)(struct inode *inode); locking rules: all may block -============== ============================================= +============== ================================================== ops i_rwsem(inode) -============== ============================================= +============== ================================================== lookup: shared create: exclusive link: exclusive (both) @@ -115,7 +116,8 @@ atomic_open: shared (exclusive if O_CREAT is set in open flags) tmpfile: no fileattr_get: no or exclusive fileattr_set: exclusive -============== ============================================= +get_offset_ctx no +============== ================================================== Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index cb2a97e49872..898d0b43109e 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -515,6 +515,7 @@ As of kernel 2.6.22, the following members are defined: int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); + struct offset_ctx *(*get_offset_ctx)(struct inode *inode); }; Again, all methods are called without any locks being held, unless @@ -675,7 +676,10 @@ otherwise noted. called on ioctl(FS_IOC_SETFLAGS) and ioctl(FS_IOC_FSSETXATTR) to change miscellaneous file flags and attributes. Callers hold i_rwsem exclusive. If unset, then fall back to f_op->ioctl(). - +``get_offset_ctx`` + called to get the offset context for a directory inode. A + filesystem must define this operation to use + simple_offset_dir_operations. The Address Space Object ======================== diff --git a/fs/libfs.c b/fs/libfs.c index 5b851315eeed..a7e56baf8bbd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -239,6 +239,254 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +static void offset_set(struct dentry *dentry, u32 offset) +{ + dentry->d_fsdata = (void *)((uintptr_t)(offset)); +} + +static u32 dentry2offset(struct dentry *dentry) +{ + return (u32)((uintptr_t)(dentry->d_fsdata)); +} + +/** + * simple_offset_init - initialize an offset_ctx + * @octx: directory offset map to be initialized + * + */ +void simple_offset_init(struct offset_ctx *octx) +{ + xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); + + /* 0 is '.', 1 is '..', so always start with offset 2 */ + octx->next_offset = 2; +} + +/** + * simple_offset_add - Add an entry to a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: new dentry being added + * + * Returns zero on success. 
@octx and the dentry offset are updated. + * Otherwise, a negative errno value is returned. + */ +int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) +{ + static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); + u32 offset; + int ret; + + if (dentry2offset(dentry) != 0) + return -EBUSY; + + ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, + &octx->next_offset, GFP_KERNEL); + if (ret < 0) + return ret; + + offset_set(dentry, offset); + return 0; +} + +/** + * simple_offset_remove - Remove an entry from a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: dentry being removed + * + */ +void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) +{ + u32 offset; + + offset = dentry2offset(dentry); + if (offset == 0) + return; + + xa_erase(&octx->xa, offset); + offset_set(dentry, 0); +} + +/** + * simple_offset_rename_exchange - exchange rename with directory offsets + * @old_dir: parent of dentry being moved + * @old_dentry: dentry being moved + * @new_dir: destination parent + * @new_dentry: destination dentry + * + * Returns zero on success. Otherwise a negative errno is returned and the + * rename is rolled back. + */ +int simple_offset_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); + struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); + u32 old_index = dentry2offset(old_dentry); + u32 new_index = dentry2offset(new_dentry); + int ret; + + simple_offset_remove(old_ctx, old_dentry); + simple_offset_remove(new_ctx, new_dentry); + + ret = simple_offset_add(new_ctx, old_dentry); + if (ret) + goto out_restore; + + ret = simple_offset_add(old_ctx, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + goto out_restore; + } + + ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + simple_offset_remove(old_ctx, new_dentry); + goto out_restore; + } + return 0; + +out_restore: + offset_set(old_dentry, old_index); + xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + offset_set(new_dentry, new_index); + xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + return ret; +} + +/** + * simple_offset_destroy - Release offset map + * @octx: directory offset ctx that is about to be destroyed + * + * During fs teardown (e.g. umount), a directory's offset map might still + * contain entries. xa_destroy() cleans out anything that remains. + */ +void simple_offset_destroy(struct offset_ctx *octx) +{ + xa_destroy(&octx->xa); +} + +/** + * offset_dir_llseek - Advance the read position of a directory descriptor + * @file: an open directory whose position is to be updated + * @offset: a byte offset + * @whence: enumerator describing the starting position for this update + * + * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. + * + * Returns the updated read position if successful; otherwise a + * negative errno is returned and the read position remains unchanged.
+ */ +static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_CUR: + offset += file->f_pos; + fallthrough; + case SEEK_SET: + if (offset >= 0) + break; + fallthrough; + default: + return -EINVAL; + } + + return vfs_setpos(file, offset, U32_MAX); +} + +static struct dentry *offset_find_next(struct xa_state *xas) +{ + struct dentry *child, *found = NULL; + + rcu_read_lock(); + child = xas_next_entry(xas, U32_MAX); + if (!child) + goto out; + spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(child)) + found = dget_dlock(child); + spin_unlock(&child->d_lock); +out: + rcu_read_unlock(); + return found; +} + +static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) +{ + u32 offset = dentry2offset(dentry); + struct inode *inode = d_inode(dentry); + + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, + inode->i_ino, fs_umode_to_dtype(inode->i_mode)); +} + +static void offset_iterate_dir(struct dentry *dir, struct dir_context *ctx) +{ + struct inode *inode = d_inode(dir); + struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); + XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct dentry *dentry; + + while (true) { + spin_lock(&dir->d_lock); + dentry = offset_find_next(&xas); + spin_unlock(&dir->d_lock); + if (!dentry) + break; + + if (!offset_dir_emit(ctx, dentry)) { + dput(dentry); + break; + } + + dput(dentry); + ctx->pos = xas.xa_index + 1; + } +} + +/** + * offset_readdir - Emit entries starting at offset @ctx->pos + * @file: an open directory to iterate over + * @ctx: directory iteration context + * + * Caller must hold @file's i_rwsem to prevent insertion or removal of + * entries during this call. + * + * On entry, @ctx->pos contains an offset that represents the first entry + * to be read from the directory. + * + * The operation continues until there are no more entries to read, or + * until the ctx->actor indicates there is no more space in the caller's + * output buffer. + * + * On return, @ctx->pos contains an offset that will read the next entry + * in this directory when shmem_readdir() is called again with @ctx. 
+ * + * Return values: + * %0 - Complete + */ +static int offset_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dir = file->f_path.dentry; + + lockdep_assert_held(&d_inode(dir)->i_rwsem); + + if (!dir_emit_dots(file, ctx)) + return 0; + + offset_iterate_dir(dir, ctx); + return 0; +} + +const struct file_operations simple_offset_dir_operations = { + .llseek = offset_dir_llseek, + .iterate_shared = offset_readdir, + .read = generic_read_dir, + .fsync = noop_fsync, +}; + static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6..59a4129ce14c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1770,6 +1770,7 @@ struct dir_context { struct iov_iter; struct io_uring_cmd; +struct offset_ctx; struct file_operations { struct module *owner; @@ -1857,6 +1858,7 @@ struct inode_operations { int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); + struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, @@ -2971,6 +2973,22 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); +struct offset_ctx { + struct xarray xa; + u32 next_offset; +}; + +void simple_offset_init(struct offset_ctx *octx); +int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); +void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); +int simple_offset_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry); +void simple_offset_destroy(struct offset_ctx *octx); + +extern const struct file_operations simple_offset_dir_operations; + extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_file_fsync(struct file *, loff_t, loff_t, int); From 23a31d87645c652734f89f477f69ddac9aa402cb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 30 Jun 2023 13:48:56 -0400 Subject: [PATCH 09/20] shmem: Refactor shmem_symlink() De-duplicate the error handling paths. No change in behavior is expected. 
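A minimal before/after sketch of the pattern being applied; step_one()
and step_two() are hypothetical stand-ins for the real setup calls, and
only iput() is actual kernel API:

    #include <linux/fs.h>

    int step_one(struct inode *inode);	/* hypothetical */
    int step_two(struct inode *inode);	/* hypothetical */

    /* Before: every failure site repeats the teardown. */
    static int create_before(struct inode *inode)
    {
    	int error;

    	error = step_one(inode);
    	if (error) {
    		iput(inode);
    		return error;
    	}
    	error = step_two(inode);
    	if (error) {
    		iput(inode);
    		return error;
    	}
    	return 0;
    }

    /* After: a single exit label owns the teardown, so later patches
     * can add failure paths without duplicating it again. */
    static int create_after(struct inode *inode)
    {
    	int error;

    	error = step_one(inode);
    	if (error)
    		goto out_iput;
    	error = step_two(inode);
    	if (error)
    		goto out_iput;
    	return 0;

    out_iput:
    	iput(inode);
    	return error;
    }
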
Suggested-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Chuck Lever Message-Id: <168814733654.530310.9958360833543413152.stgit@manet.1015granger.net> Signed-off-by: Christian Brauner --- mm/shmem.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 72326ea74954..f5006ea3b8e1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3422,26 +3422,22 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error && error != -EOPNOTSUPP) { - iput(inode); - return error; - } + if (error && error != -EOPNOTSUPP) + goto out_iput; inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { inode->i_link = kmemdup(symname, len, GFP_KERNEL); if (!inode->i_link) { - iput(inode); - return -ENOMEM; + error = -ENOMEM; + goto out_iput; } inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); - if (error) { - iput(inode); - return error; - } + if (error) + goto out_iput; inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); @@ -3456,6 +3452,9 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); dget(dentry); return 0; +out_iput: + iput(inode); + return error; } static void shmem_put_link(void *arg) From a2e459555c5f9da3e619b7e47a63f98574dc75f1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 30 Jun 2023 13:49:03 -0400 Subject: [PATCH 10/20] shmem: stable directory offsets The current cursor-based directory offset mechanism doesn't work when a tmpfs filesystem is exported via NFS. This is because NFS clients do not open directories. Each server-side READDIR operation has to open the directory, read it, then close it. The cursor state for that directory, being associated strictly with the opened struct file, is thus discarded after each NFS READDIR operation. Directory offsets are cached not only by NFS clients, but also by user space libraries on those clients. Essentially there is no way to invalidate those caches when directory offsets have changed on an NFS server after the offset-to-dentry mapping changes. Thus the whole application stack depends on unchanging directory offsets. The solution we've come up with is to make the directory offset for each file in a tmpfs filesystem stable for the life of the directory entry it represents. shmem_readdir() and shmem_dir_llseek() now use an xarray to map each directory offset (an loff_t integer) to the memory address of a struct dentry. 
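The core of the scheme, condensed from the fs/libfs.c code added
earlier in this series (an illustrative restatement, not new API; the
offset_set()/dentry2offset() helpers are as in the libfs patch):

    /* One map per directory: offset (u32) -> struct dentry *. */
    struct offset_ctx {
    	struct xarray xa;	/* XA_FLAGS_ALLOC1 */
    	u32 next_offset;	/* cyclic cursor, starts at 2 */
    };

    static int offset_add(struct offset_ctx *octx, struct dentry *dentry)
    {
    	/* offsets 0 and 1 are reserved for "." and ".." */
    	static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
    	u32 offset;
    	int ret;

    	/*
    	 * Cyclic allocation: a freed offset is not handed out again
    	 * until the cursor wraps at U32_MAX, so a stale offset cached
    	 * by an NFS client names nothing instead of the wrong entry.
    	 */
    	ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
    			      &octx->next_offset, GFP_KERNEL);
    	if (ret < 0)
    		return ret;

    	/* The offset lives in the dentry, not in any struct file. */
    	dentry->d_fsdata = (void *)(uintptr_t)offset;
    	return 0;
    }

Because the offset-to-dentry mapping is keyed by the dentry's lifetime
rather than by an open file description, each server-side READDIR may
open, read and close the directory and still resume at a stable cursor.
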
Signed-off-by: Chuck Lever Message-Id: <168814734331.530310.3911190551060453102.stgit@manet.1015granger.net> Signed-off-by: Christian Brauner --- include/linux/shmem_fs.h | 1 + mm/shmem.c | 47 ++++++++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index c0058f3bba70..9b2d2faff1d0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -34,6 +34,7 @@ struct shmem_inode_info { #ifdef CONFIG_TMPFS_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif + struct offset_ctx dir_offsets; /* stable entry offsets */ struct inode vfs_inode; }; diff --git a/mm/shmem.c b/mm/shmem.c index f5006ea3b8e1..81973914b294 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2427,6 +2427,11 @@ static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) #define shmem_initxattrs NULL #endif +static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode) +{ + return &SHMEM_I(inode)->dir_offsets; +} + static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, umode_t mode, @@ -2492,7 +2497,8 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + inode->i_fop = &simple_offset_dir_operations; + simple_offset_init(shmem_get_offset_ctx(inode)); break; case S_IFLNK: /* @@ -3204,7 +3210,10 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, if (error && error != -EOPNOTSUPP) goto out_iput; - error = 0; + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; + dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); inode_inc_iversion(dir); @@ -3287,6 +3296,13 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr goto out; } + ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (ret) { + if (inode->i_nlink) + shmem_free_inode(inode->i_sb); + goto out; + } + dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); @@ -3305,6 +3321,8 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) shmem_free_inode(inode->i_sb); + simple_offset_remove(shmem_get_offset_ctx(dir), dentry); + dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); inode_inc_iversion(dir); @@ -3363,24 +3381,29 @@ static int shmem_rename2(struct mnt_idmap *idmap, { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); + int error; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + return simple_offset_rename_exchange(old_dir, old_dentry, + new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { - int error; - error = shmem_whiteout(idmap, old_dir, old_dentry); if (error) return error; } + simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry); + error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry); + if (error) + return error; + if (d_really_is_positive(new_dentry)) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { @@ -3425,19 +3448,23 @@ static int 
shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, if (error && error != -EOPNOTSUPP) goto out_iput; + error = simple_offset_add(shmem_get_offset_ctx(dir), dentry); + if (error) + goto out_iput; + inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { inode->i_link = kmemdup(symname, len, GFP_KERNEL); if (!inode->i_link) { error = -ENOMEM; - goto out_iput; + goto out_remove_offset; } inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) - goto out_iput; + goto out_remove_offset; inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; memcpy(folio_address(folio), symname, len); @@ -3452,6 +3479,9 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); dget(dentry); return 0; + +out_remove_offset: + simple_offset_remove(shmem_get_offset_ctx(dir), dentry); out_iput: iput(inode); return error; @@ -4295,6 +4325,8 @@ static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); + if (S_ISDIR(inode->i_mode)) + simple_offset_destroy(shmem_get_offset_ctx(inode)); } static void shmem_init_inode(void *foo) @@ -4375,6 +4407,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .mknod = shmem_mknod, .rename = shmem_rename2, .tmpfile = shmem_tmpfile, + .get_offset_ctx = shmem_get_offset_ctx, #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, From bbaef7973dd017954419718ab2df526ae1f66e93 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 24 Jul 2023 10:43:57 -0400 Subject: [PATCH 11/20] libfs: Add a lock class for the offset map's xa_lock Tie the dynamically-allocated xarray locks into a single class so contention on the directory offset xarrays can be observed. Signed-off-by: Chuck Lever Message-Id: <169020933088.160441.9405180953116076087.stgit@manet.1015granger.net> Signed-off-by: Christian Brauner --- fs/libfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/libfs.c b/fs/libfs.c index a7e56baf8bbd..3495f071093e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -249,6 +249,8 @@ static u32 dentry2offset(struct dentry *dentry) return (u32)((uintptr_t)(dentry->d_fsdata)); } +static struct lock_class_key simple_offset_xa_lock; + /** * simple_offset_init - initialize an offset_ctx * @octx: directory offset map to be initialized @@ -257,6 +259,7 @@ static u32 dentry2offset(struct dentry *dentry) void simple_offset_init(struct offset_ctx *octx) { xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); + lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); /* 0 is '.', 1 is '..', so always start with offset 2 */ octx->next_offset = 2; From 2be4f05af71bb2a9958c5680c19e5a489636ff42 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Jul 2023 14:31:04 -0400 Subject: [PATCH 12/20] libfs: Remove parent dentry locking in offset_iterate_dir() Since offset_iterate_dir() does not walk the parent's d_subdir list nor does it manipulate the parent's d_child, there doesn't seem to be a reason to hold the parent's d_lock. The offset_ctx's xarray can be sufficiently protected with just the RCU read lock. Flame graph data captured during the git regression run shows a 20% reduction in CPU cycles consumed in offset_find_next(). 
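The resulting lookup path, annotated (editorial comments on
offset_find_next() as it stands after this patch):

    static struct dentry *offset_find_next(struct xa_state *xas)
    {
    	struct dentry *child, *found = NULL;

    	/* xarray nodes are freed via RCU, so the walk itself needs
    	 * only the RCU read lock - no xa_lock, no parent d_lock. */
    	rcu_read_lock();
    	child = xas_next_entry(xas, U32_MAX);
    	if (!child)
    		goto out;

    	/* Plain spin_lock(): with the parent's d_lock no longer held,
    	 * there is no lock-ordering level left to annotate. */
    	spin_lock(&child->d_lock);
    	if (simple_positive(child))
    		found = dget_dlock(child);
    	spin_unlock(&child->d_lock);
    out:
    	rcu_read_unlock();
    	return found;
    }
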
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202307171640.e299f8d5-oliver.sang@intel.com Signed-off-by: Chuck Lever Message-Id: <169030957098.157536.9938425508695693348.stgit@manet.1015granger.net> Signed-off-by: Christian Brauner --- fs/libfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 3495f071093e..c78a48654a94 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -407,7 +407,7 @@ static struct dentry *offset_find_next(struct xa_state *xas) child = xas_next_entry(xas, U32_MAX); if (!child) goto out; - spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); + spin_lock(&child->d_lock); if (simple_positive(child)) found = dget_dlock(child); spin_unlock(&child->d_lock); @@ -425,17 +425,14 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } -static void offset_iterate_dir(struct dentry *dir, struct dir_context *ctx) +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { - struct inode *inode = d_inode(dir); struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); XA_STATE(xas, &so_ctx->xa, ctx->pos); struct dentry *dentry; while (true) { - spin_lock(&dir->d_lock); dentry = offset_find_next(&xas); - spin_unlock(&dir->d_lock); if (!dentry) break; @@ -465,7 +462,7 @@ static void offset_iterate_dir(struct dentry *dir, struct dir_context *ctx) * output buffer. * * On return, @ctx->pos contains an offset that will read the next entry - * in this directory when shmem_readdir() is called again with @ctx. + * in this directory when offset_readdir() is called again with @ctx. * * Return values: * %0 - Complete @@ -479,7 +476,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - offset_iterate_dir(dir, ctx); + offset_iterate_dir(d_inode(dir), ctx); return 0; } From 3c1b7528d8969a8e89c77cd5eb867503152547b1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 3 Aug 2023 22:46:11 -0700 Subject: [PATCH 13/20] shmem: move spinlock into shmem_recalc_inode() to fix quota support Commit "shmem: fix quota lock nesting in huge hole handling" was not so good: Smatch caught shmem_recalc_inode()'s shmem_inode_unacct_blocks() descending into quota_send_warning(): where blocking GFP_NOFS is used, yet shmem_recalc_inode() is called holding the shmem inode's info->lock. Yes, both __dquot_alloc_space() and __dquot_free_space() are commented "This operation can block, but only after everything is updated" - when calling flush_warnings() at the end - both its print_warning() and its quota_send_warning() may block. Rework shmem_recalc_inode() to take the shmem inode's info->lock inside, and drop it before calling shmem_inode_unacct_blocks(). And why were the spin_locks disabling interrupts? That was just a relic from when shmem_charge() and shmem_uncharge() were called while holding i_pages xa_lock: stop disabling interrupts for info->lock now. To help stop me from making the same mistake again, add a might_sleep() into shmem_inode_acct_block() and shmem_inode_unacct_blocks(); and those functions have grown, so let the compiler decide whether to inline them. 
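What the might_sleep() annotations buy, shown with a hypothetical
caller that reintroduces the old mistake (deliberately buggy code, not
part of the patch):

    static void buggy_caller(struct inode *inode)
    {
    	struct shmem_inode_info *info = SHMEM_I(inode);

    	spin_lock(&info->lock);
    	/*
    	 * BUG: shmem_inode_unacct_blocks() may now block in the quota
    	 * code. Its might_sleep() makes CONFIG_DEBUG_ATOMIC_SLEEP
    	 * complain on every run, instead of the problem surfacing
    	 * only when a quota warning actually has to be delivered.
    	 */
    	shmem_inode_unacct_blocks(inode, 1);
    	spin_unlock(&info->lock);
    }
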
Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-fsdevel/ffd7ca34-7f2a-44ee-b05d-b54d920ce076@moroto.mountain/ Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Message-Id: <29f48045-2cb5-7db-ecf1-72462f1bef5@google.com> Signed-off-by: Christian Brauner --- mm/shmem.c | 107 ++++++++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 63 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 81973914b294..b88dc7916e94 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -203,7 +203,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static inline int shmem_inode_acct_block(struct inode *inode, long pages) +static int shmem_inode_acct_block(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -212,6 +212,7 @@ static inline int shmem_inode_acct_block(struct inode *inode, long pages) if (shmem_acct_block(info->flags, pages)) return err; + might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { if (percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks - pages) > 0) @@ -235,11 +236,12 @@ static inline int shmem_inode_acct_block(struct inode *inode, long pages) return err; } -static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) +static void shmem_inode_unacct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + might_sleep(); /* when quotas */ dquot_free_block_nodirty(inode, pages); if (sbinfo->max_blocks) @@ -400,30 +402,45 @@ static void shmem_free_inode(struct super_block *sb) /** * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc + * @alloced: the change in number of pages allocated to inode + * @swapped: the change in number of pages swapped from inode * * We have to calculate the free blocks since the mm can drop * undirtied hole pages behind our back. * * But normally info->alloced == inode->i_mapping->nrpages + info->swapped * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) - * - * It has to be called with the spinlock held. */ -static void shmem_recalc_inode(struct inode *inode) +static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped) { struct shmem_inode_info *info = SHMEM_I(inode); long freed; - freed = info->alloced - info->swapped - inode->i_mapping->nrpages; - if (freed > 0) { + spin_lock(&info->lock); + info->alloced += alloced; + info->swapped += swapped; + freed = info->alloced - info->swapped - + READ_ONCE(inode->i_mapping->nrpages); + /* + * Special case: whereas normally shmem_recalc_inode() is called + * after i_mapping->nrpages has already been adjusted (up or down), + * shmem_writepage() has to raise swapped before nrpages is lowered - + * to stop a racing shmem_recalc_inode() from thinking that a page has + * been freed. Compensate here, to avoid the need for a followup call. 
+ */ + if (swapped > 0) + freed += swapped; + if (freed > 0) info->alloced -= freed; + spin_unlock(&info->lock); + + /* The quota case may block */ + if (freed > 0) shmem_inode_unacct_blocks(inode, freed); - } } bool shmem_charge(struct inode *inode, long pages) { - struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; if (shmem_inode_acct_block(inode, pages)) @@ -434,24 +451,16 @@ bool shmem_charge(struct inode *inode, long pages) mapping->nrpages += pages; xa_unlock_irq(&mapping->i_pages); - spin_lock_irq(&info->lock); - info->alloced += pages; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - + shmem_recalc_inode(inode, pages, 0); return true; } void shmem_uncharge(struct inode *inode, long pages) { - struct shmem_inode_info *info = SHMEM_I(inode); - + /* pages argument is currently unused: keep it to help debugging */ /* nrpages adjustment done by __filemap_remove_folio() or caller */ - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - /* which has called shmem_inode_unacct_blocks() if necessary */ - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, 0); } /* @@ -1091,10 +1100,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_batch_release(&fbatch); } - spin_lock_irq(&info->lock); - info->swapped -= nr_swaps_freed; - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); + shmem_recalc_inode(inode, 0, -nr_swaps_freed); } void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) @@ -1112,11 +1118,9 @@ static int shmem_getattr(struct mnt_idmap *idmap, struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - if (info->alloced - info->swapped != inode->i_mapping->nrpages) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - spin_unlock_irq(&info->lock); - } + if (info->alloced - info->swapped != inode->i_mapping->nrpages) + shmem_recalc_inode(inode, 0, 0); + if (info->fsflags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (info->fsflags & FS_IMMUTABLE_FL) @@ -1501,11 +1505,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { - spin_lock_irq(&info->lock); - shmem_recalc_inode(inode); - info->swapped++; - spin_unlock_irq(&info->lock); - + shmem_recalc_inode(inode, 0, 1); swap_shmem_alloc(swap); shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); @@ -1776,7 +1776,6 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, struct folio *folio, swp_entry_t swap) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); swp_entry_t swapin_error; void *old; @@ -1789,16 +1788,12 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, folio_wait_writeback(folio); delete_from_swap_cache(folio); - spin_lock_irq(&info->lock); /* - * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't - * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in - * shmem_evict_inode. + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks + * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) + * in shmem_evict_inode(). 
 	 */
-	info->alloced--;
-	info->swapped--;
-	shmem_recalc_inode(inode);
-	spin_unlock_irq(&info->lock);
+	shmem_recalc_inode(inode, -1, -1);
 	swap_free(swap);
 }
 
@@ -1885,10 +1880,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	if (error)
 		goto failed;
 
-	spin_lock_irq(&info->lock);
-	info->swapped--;
-	shmem_recalc_inode(inode);
-	spin_unlock_irq(&info->lock);
+	shmem_recalc_inode(inode, 0, -1);
 
 	if (sgp == SGP_WRITE)
 		folio_mark_accessed(folio);
@@ -2053,12 +2045,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 					charge_mm);
 		if (error)
 			goto unacct;
-		folio_add_lru(folio);
 
-		spin_lock_irq(&info->lock);
-		info->alloced += folio_nr_pages(folio);
-		shmem_recalc_inode(inode);
-		spin_unlock_irq(&info->lock);
+		folio_add_lru(folio);
+		shmem_recalc_inode(inode, folio_nr_pages(folio), 0);
 		alloced = true;
 
 		if (folio_test_pmd_mappable(folio) &&
@@ -2107,9 +2096,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 	if (alloced) {
 		folio_clear_dirty(folio);
 		filemap_remove_folio(folio);
-		spin_lock_irq(&info->lock);
-		shmem_recalc_inode(inode);
-		spin_unlock_irq(&info->lock);
+		shmem_recalc_inode(inode, 0, 0);
 	}
 	error = -EINVAL;
 	goto unlock;
@@ -2135,9 +2122,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 		folio_put(folio);
 	}
 	if (error == -ENOSPC && !once++) {
-		spin_lock_irq(&info->lock);
-		shmem_recalc_inode(inode);
-		spin_unlock_irq(&info->lock);
+		shmem_recalc_inode(inode, 0, 0);
 		goto repeat;
 	}
 	if (error == -EEXIST)
@@ -2650,11 +2635,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 	if (ret)
 		goto out_delete_from_cache;
 
-	spin_lock_irq(&info->lock);
-	info->alloced++;
-	shmem_recalc_inode(inode);
-	spin_unlock_irq(&info->lock);
-
+	shmem_recalc_inode(inode, 1, 0);
 	folio_unlock(folio);
 	return 0;
 out_delete_from_cache:

From 0200679fc7953177941e41c2a4241d0b6c2c5de8 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Tue, 1 Aug 2023 18:17:04 +0200
Subject: [PATCH 14/20] tmpfs: verify {g,u}id mount options correctly

A while ago we received the following report:

"The other outstanding issue I noticed comes from the fact that fsconfig
syscalls may occur in a different userns than that which called fsopen.
That means that resolving the uid/gid via current_user_ns() can save a
kuid that isn't mapped in the associated namespace when the filesystem
is finally mounted. This means that it is possible for an unprivileged
user to create files owned by any group in a tmpfs mount (since we can
set the SUID bit on the tmpfs directory), or a tmpfs that is owned by
any user, including the root group/user."

The contract for {g,u}id mount options and {g,u}id values in general set
from userspace has always been that they are translated according to the
caller's idmapping. In so far, tmpfs has been doing the correct thing.
But since tmpfs is mountable in unprivileged contexts it is also
necessary to verify that the resulting k{g,u}id is representable in the
namespace of the superblock to avoid such bugs as above.

The new mount api's cross-namespace delegation abilities are already
widely used. After having talked to a bunch of userspace this is the
most faithful solution with minimal regression risks. I know of one
user - systemd - that makes use of the new mount api in this way, and
they don't set unresolvable {g,u}ids. So the regression risk is minimal.
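The rule in one place, factored into a hypothetical helper for clarity
(make_kuid(), uid_valid() and kuid_has_mapping() are the real
primitives; tmpfs open-codes the same sequence in the hunk below):

    static kuid_t sanitize_mount_uid(struct user_namespace *fs_userns, u32 uid)
    {
    	/* Translate with the caller's idmapping, as mount options
    	 * always have been. */
    	kuid_t kuid = make_kuid(current_user_ns(), uid);

    	if (!uid_valid(kuid))
    		return INVALID_UID;

    	/* New: the fsconfig() caller may live in a different userns
    	 * than the superblock will (fc->user_ns); reject any kuid
    	 * that namespace cannot represent. */
    	if (!kuid_has_mapping(fs_userns, kuid))
    		return INVALID_UID;

    	return kuid;
    }
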
Link: https://lore.kernel.org/lkml/CALxfFW4BXhEwxR0Q5LSkg-8Vb4r2MONKCcUCVioehXQKr35eHg@mail.gmail.com Fixes: f32356261d44 ("vfs: Convert ramfs, shmem, tmpfs, devtmpfs, rootfs to use the new mount API") Reviewed-by: "Seth Forshee (DigitalOcean)" Reported-by: Seth Jenkins Message-Id: <20230801-vfs-fs_context-uidgid-v1-1-daf46a050bbf@kernel.org> Signed-off-by: Christian Brauner --- mm/shmem.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b88dc7916e94..678a7be46b89 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3793,6 +3793,8 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) unsigned long long size; char *rest; int opt; + kuid_t kuid; + kgid_t kgid; opt = fs_parse(fc, shmem_fs_parameters, param, &result); if (opt < 0) @@ -3828,14 +3830,32 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) ctx->mode = result.uint_32 & 07777; break; case Opt_uid: - ctx->uid = make_kuid(current_user_ns(), result.uint_32); - if (!uid_valid(ctx->uid)) + kuid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(kuid)) goto bad_value; + + /* + * The requested uid must be representable in the + * filesystem's idmapping. + */ + if (!kuid_has_mapping(fc->user_ns, kuid)) + goto bad_value; + + ctx->uid = kuid; break; case Opt_gid: - ctx->gid = make_kgid(current_user_ns(), result.uint_32); - if (!gid_valid(ctx->gid)) + kgid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(kgid)) goto bad_value; + + /* + * The requested gid must be representable in the + * filesystem's idmapping. + */ + if (!kgid_has_mapping(fc->user_ns, kgid)) + goto bad_value; + + ctx->gid = kgid; break; case Opt_huge: ctx->huge = result.uint_32; From 5de75970c9fd7220e394b76e6d20fbafa1369b5a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 8 Aug 2023 21:30:59 -0700 Subject: [PATCH 15/20] xattr: simple_xattr_set() return old_xattr to be freed tmpfs wants to support limited user extended attributes, but kernfs (or cgroupfs, the only kernfs with KERNFS_ROOT_SUPPORT_USER_XATTR) already supports user extended attributes through simple xattrs: but limited by a policy (128KiB per inode) too liberal to be used on tmpfs. To allow a different limiting policy for tmpfs, without affecting the policy for kernfs, change simple_xattr_set() to return the replaced or removed xattr (if any), leaving the caller to update their accounting then free the xattr (by simple_xattr_free(), renamed from the static free_simple_xattr()). 
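The new calling convention in miniature (update_accounting() is a
hypothetical policy hook standing in for what the kernfs and tmpfs
callers below each do with the displaced xattr's size):

    static int set_xattr_example(struct simple_xattrs *xattrs,
    			     const char *name, const void *value,
    			     size_t size, int flags)
    {
    	struct simple_xattr *old;

    	old = simple_xattr_set(xattrs, name, value, size, flags);
    	if (IS_ERR(old))	/* -ENOMEM, -EEXIST, -ENODATA, ... */
    		return PTR_ERR(old);

    	if (old) {		/* an xattr was replaced or removed */
    		update_accounting(old->size);	/* hypothetical */
    		simple_xattr_free(old);
    	}
    	return 0;		/* NULL: nothing was displaced */
    }
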
Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Reviewed-by: Carlos Maiolino Message-Id: <158c6585-2aa7-d4aa-90ff-f7c3f8fe407c@google.com> Signed-off-by: Christian Brauner --- fs/kernfs/inode.c | 46 +++++++++++++++++++++++--------------- fs/xattr.c | 51 +++++++++++++++++++------------------------ include/linux/xattr.h | 7 +++--- mm/shmem.c | 10 +++++---- 4 files changed, 61 insertions(+), 53 deletions(-) diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index b22b74d1a115..fec5d5f78f07 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -306,11 +306,17 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name, int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { + struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; - return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); + old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, @@ -342,7 +348,7 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; + struct simple_xattr *old_xattr; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { @@ -355,13 +361,18 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, goto dec_size_out; } - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); - - if (!ret && removed_size >= 0) - size = removed_size; - else if (!ret) + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) return 0; + + if (IS_ERR(old_xattr)) { + ret = PTR_ERR(old_xattr); + goto dec_size_out; + } + + ret = 0; + size = old_xattr->size; + simple_xattr_free(old_xattr); dec_size_out: atomic_sub(size, sz); dec_count_out: @@ -376,18 +387,19 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; - int ret; + struct simple_xattr *old_xattr; - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) + return 0; - if (removed_size >= 0) { - atomic_sub(removed_size, sz); - atomic_dec(nr); - } + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); - return ret; + atomic_sub(old_xattr->size, sz); + atomic_dec(nr); + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, diff --git a/fs/xattr.c b/fs/xattr.c index e7bbb7f57557..ba37a8f5cfd1 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1040,12 +1040,12 @@ const char *xattr_full_name(const struct xattr_handler *handler, EXPORT_SYMBOL(xattr_full_name); /** - * free_simple_xattr - free an xattr object + * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. 
*/ -static inline void free_simple_xattr(struct simple_xattr *xattr) +void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); @@ -1164,7 +1164,6 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr - * @removed_size: the size of the removed xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE @@ -1181,29 +1180,27 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * - * Return: On success zero and on error a negative error code is returned. + * Return: On success, the removed or replaced xattr is returned, to be freed + * by the caller; or NULL if none. On failure a negative error code is returned. */ -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags, - ssize_t *removed_size) +struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, + const char *name, const void *value, + size_t size, int flags) { - struct simple_xattr *xattr = NULL, *new_xattr = NULL; + struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; - if (removed_size) - *removed_size = -1; - /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); new_xattr->name = kstrdup(name, GFP_KERNEL); if (!new_xattr->name) { - free_simple_xattr(new_xattr); - return -ENOMEM; + simple_xattr_free(new_xattr); + return ERR_PTR(-ENOMEM); } } @@ -1217,12 +1214,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, else if (ret > 0) rbp = &(*rbp)->rb_right; else - xattr = rb_entry(*rbp, struct simple_xattr, rb_node); - if (xattr) + old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); + if (old_xattr) break; } - if (xattr) { + if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; @@ -1230,12 +1227,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, } if (new_xattr) - rb_replace_node(&xattr->rb_node, &new_xattr->rb_node, - &xattrs->rb_root); + rb_replace_node(&old_xattr->rb_node, + &new_xattr->rb_node, &xattrs->rb_root); else - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (!err && removed_size) - *removed_size = xattr->size; + rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. 
*/ if (flags & XATTR_REPLACE) { @@ -1260,12 +1255,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, out_unlock: write_unlock(&xattrs->lock); - if (err) - free_simple_xattr(new_xattr); - else - free_simple_xattr(xattr); - return err; - + if (!err) + return old_xattr; + simple_xattr_free(new_xattr); + return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) @@ -1386,7 +1379,7 @@ void simple_xattrs_free(struct simple_xattrs *xattrs) rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); - free_simple_xattr(xattr); + simple_xattr_free(xattr); rbp = rbp_next; } } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index d591ef59aa98..e37fe667ae04 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -116,11 +116,12 @@ struct simple_xattr { void simple_xattrs_init(struct simple_xattrs *xattrs); void simple_xattrs_free(struct simple_xattrs *xattrs); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); +void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size); -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags, - ssize_t *removed_size); +struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, + const char *name, const void *value, + size_t size, int flags); ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size); void simple_xattr_add(struct simple_xattrs *xattrs, diff --git a/mm/shmem.c b/mm/shmem.c index 678a7be46b89..b22d0bf1f5e3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3598,15 +3598,17 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); - int err; + struct simple_xattr *old_xattr; name = xattr_full_name(handler, name); - err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); - if (!err) { + old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); + if (!IS_ERR(old_xattr)) { + simple_xattr_free(old_xattr); + old_xattr = NULL; inode->i_ctime = current_time(inode); inode_inc_iversion(inode); } - return err; + return PTR_ERR(old_xattr); } static const struct xattr_handler shmem_security_xattr_handler = { From e07c469e979c104464300aaa3b7923f929055cd0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 8 Aug 2023 21:32:21 -0700 Subject: [PATCH 16/20] tmpfs: track free_ispace instead of free_inodes In preparation for assigning some inode space to extended attributes, keep track of free_ispace instead of number of free_inodes: as if one tmpfs inode (and accompanying dentry) occupies very approximately 1KiB. Unsigned long is large enough for free_ispace, on 64-bit and on 32-bit: but take care to enforce the maximum. And fix the nr_blocks maximum on 32-bit: S64_MAX would be too big for it there, so say LONG_MAX instead. Delete the incorrect limited<->unlimited blocks/inodes comment above shmem_reconfigure(): leave it to the error messages below to describe. 
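A worked example of the new bookkeeping (values illustrative,
BOGO_INODE_SIZE = 1024 as defined below):

    static void ispace_example(void)
    {
    	/* mount -o nr_inodes=4096: the pool is denominated in bytes */
    	unsigned long free_ispace = 4096UL * BOGO_INODE_SIZE;	/* 4194304 */

    	/* shmem_reserve_inode(): one inode costs BOGO_INODE_SIZE */
    	free_ispace -= BOGO_INODE_SIZE;				/* 4193280 */

    	/* shmem_statfs(): "df -i" still reports whole inodes */
    	unsigned long f_ffree = free_ispace / BOGO_INODE_SIZE;	/* 4095 */
    }

Until something other than inodes draws on the pool (the next patch
lets extended attributes do so), the visible numbers are identical to
what the old free_inodes counter produced.
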
Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Reviewed-by: Carlos Maiolino Message-Id: <4fe1739-d9e7-8dfd-5bce-12e7339711da@google.com> Signed-off-by: Christian Brauner --- include/linux/shmem_fs.h | 2 +- mm/shmem.c | 33 +++++++++++++++++---------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 9b2d2faff1d0..6b0c626620f5 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -54,7 +54,7 @@ struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ - unsigned long free_inodes; /* How many are left for allocation */ + unsigned long free_ispace; /* How much ispace left for allocation */ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ diff --git a/mm/shmem.c b/mm/shmem.c index b22d0bf1f5e3..8c8f0b1b700c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -90,6 +90,9 @@ static struct vfsmount *shm_mnt; /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Pretend that one inode + its dentry occupy this much memory */ +#define BOGO_INODE_SIZE 1024 + /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ #define SHORT_SYMLINK_LEN 128 @@ -137,7 +140,8 @@ static unsigned long shmem_default_max_inodes(void) { unsigned long nr_pages = totalram_pages(); - return min(nr_pages - totalhigh_pages(), nr_pages / 2); + return min3(nr_pages - totalhigh_pages(), nr_pages / 2, + ULONG_MAX / BOGO_INODE_SIZE); } #endif @@ -331,11 +335,11 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) if (!(sb->s_flags & SB_KERNMOUNT)) { raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { - if (!sbinfo->free_inodes) { + if (sbinfo->free_ispace < BOGO_INODE_SIZE) { raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } - sbinfo->free_inodes--; + sbinfo->free_ispace -= BOGO_INODE_SIZE; } if (inop) { ino = sbinfo->next_ino++; @@ -394,7 +398,7 @@ static void shmem_free_inode(struct super_block *sb) struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; + sbinfo->free_ispace += BOGO_INODE_SIZE; raw_spin_unlock(&sbinfo->stat_lock); } } @@ -3158,7 +3162,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; - buf->f_ffree = sbinfo->free_inodes; + buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE; } /* else leave those fields 0 like simple_statfs */ @@ -3818,13 +3822,13 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest || ctx->blocks > S64_MAX) + if (*rest || ctx->blocks > LONG_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; case Opt_nr_inodes: ctx->inodes = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE) goto bad_value; ctx->seen |= SHMEM_SEEN_INODES; break; @@ -4005,21 +4009,17 @@ static int shmem_parse_options(struct fs_context *fc, void *data) /* * Reconfigure a shmem filesystem. 
- * - * Note that we disallow change from limited->unlimited blocks/inodes while any - * are in use; but we must separately disallow unlimited->limited, because in - * that case we have no record of how much is already in use. */ static int shmem_reconfigure(struct fs_context *fc) { struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); - unsigned long inodes; + unsigned long used_isp; struct mempolicy *mpol = NULL; const char *err; raw_spin_lock(&sbinfo->stat_lock); - inodes = sbinfo->max_inodes - sbinfo->free_inodes; + used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -4037,7 +4037,7 @@ static int shmem_reconfigure(struct fs_context *fc) err = "Cannot retroactively limit inodes"; goto out; } - if (ctx->inodes < inodes) { + if (ctx->inodes * BOGO_INODE_SIZE < used_isp) { err = "Too few inodes for current use"; goto out; } @@ -4083,7 +4083,7 @@ static int shmem_reconfigure(struct fs_context *fc) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { sbinfo->max_inodes = ctx->inodes; - sbinfo->free_inodes = ctx->inodes - inodes; + sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp; } /* @@ -4214,7 +4214,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; #endif sbinfo->max_blocks = ctx->blocks; - sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE; if (sb->s_flags & SB_KERNMOUNT) { sbinfo->ino_batch = alloc_percpu(ino_t); if (!sbinfo->ino_batch) From 2daf18a7884dc03d5164ab9c7dc3f2ea70638469 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 8 Aug 2023 21:33:56 -0700 Subject: [PATCH 17/20] tmpfs,xattr: enable limited user extended attributes Enable "user." extended attributes on tmpfs, limiting them by tracking the space they occupy, and deducting that space from the limited ispace (unless tmpfs mounted with nr_inodes=0 to leave that ispace unlimited). tmpfs inodes and simple xattrs are both unswappable, and have to be in lowmem on a 32-bit highmem kernel: so the ispace limit is appropriate for xattrs, without any need for a further mount option. Add simple_xattr_space() to give approximate but deterministic estimate of the space taken up by each xattr: with simple_xattrs_free() outputting the space freed if required (but kernfs and even some tmpfs usages do not require that, so don't waste time on strlen'ing if not needed). Security and trusted xattrs were already supported: for consistency and simplicity, account them from the same pool; though there's a small risk that a tmpfs with enough space before would now be considered too small. When extended attributes are used, "df -i" does show more IUsed and less IFree than can be explained by the inodes: document that (manpage later). xfstests tests/generic which were not run on tmpfs before but now pass: 020 037 062 070 077 097 103 117 337 377 454 486 523 533 611 618 728 with no new failures. 
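A worked example of the charge, using the simple_xattr_space() estimate
added below (illustrative arithmetic, not new code):

    /* setxattr(path, "user.comment", "hello world", 11, 0) costs: */
    size_t cost = 40 + 11 + strlen("user.comment");	/* = 63 bytes */

Those 63 bytes come out of the same free_ispace pool as inode
reservations, and are given back when the xattr is replaced or removed,
or when the inode is evicted - which is exactly why "df -i" can show
more IUsed than the number of inodes alone explains.
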
Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Reviewed-by: Carlos Maiolino Message-Id: <2e63b26e-df46-5baa-c7d6-f9a8dd3282c5@google.com> Signed-off-by: Christian Brauner --- Documentation/filesystems/tmpfs.rst | 7 ++- fs/Kconfig | 4 +- fs/kernfs/dir.c | 2 +- fs/xattr.c | 28 ++++++++++- include/linux/xattr.h | 3 +- mm/shmem.c | 78 +++++++++++++++++++++++++---- 6 files changed, 106 insertions(+), 16 deletions(-) diff --git a/Documentation/filesystems/tmpfs.rst b/Documentation/filesystems/tmpfs.rst index 67422ee10e03..56a26c843dbe 100644 --- a/Documentation/filesystems/tmpfs.rst +++ b/Documentation/filesystems/tmpfs.rst @@ -21,8 +21,8 @@ explained further below, some of which can be reconfigured dynamically on the fly using a remount ('mount -o remount ...') of the filesystem. A tmpfs filesystem can be resized but it cannot be resized to a size below its current usage. tmpfs also supports POSIX ACLs, and extended attributes for the -trusted.* and security.* namespaces. ramfs does not use swap and you cannot -modify any parameter for a ramfs filesystem. The size limit of a ramfs +trusted.*, security.* and user.* namespaces. ramfs does not use swap and you +cannot modify any parameter for a ramfs filesystem. The size limit of a ramfs filesystem is how much memory you have available, and so care must be taken if used so to not run out of memory. @@ -97,6 +97,9 @@ mount with such options, since it allows any user with write access to use up all the memory on the machine; but enhances the scalability of that instance in a system with many CPUs making intensive use of it. +If nr_inodes is not 0, that limited space for inodes is also used up by +extended attributes: "df -i"'s IUsed and IUse% increase, IFree decreases. + tmpfs blocks may be swapped out, when there is a shortage of memory. tmpfs has a mount option to disable its use of swap: diff --git a/fs/Kconfig b/fs/Kconfig index 8218a71933f9..7da21f563192 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -205,8 +205,8 @@ config TMPFS_XATTR Extended attributes are name:value pairs associated with inodes by the kernel or by users (see the attr(5) manual page for details). - Currently this enables support for the trusted.* and - security.* namespaces. + This enables support for the trusted.*, security.* and user.* + namespaces. You need this for POSIX ACL support on tmpfs. diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5a1a4af9d3d2..660995856a04 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -556,7 +556,7 @@ void kernfs_put(struct kernfs_node *kn) kfree_const(kn->name); if (kn->iattr) { - simple_xattrs_free(&kn->iattr->xattrs); + simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); diff --git a/fs/xattr.c b/fs/xattr.c index ba37a8f5cfd1..2d607542281b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1039,6 +1039,26 @@ const char *xattr_full_name(const struct xattr_handler *handler, } EXPORT_SYMBOL(xattr_full_name); +/** + * simple_xattr_space - estimate the memory used by a simple xattr + * @name: the full name of the xattr + * @size: the size of its value + * + * This takes no account of how much larger the two slab objects actually are: + * that would depend on the slab implementation, when what is required is a + * deterministic number, which grows with name length and size and quantity. + * + * Return: The approximate number of bytes of memory used by such an xattr. 
+ */ +size_t simple_xattr_space(const char *name, size_t size) +{ + /* + * Use "40" instead of sizeof(struct simple_xattr), to return the + * same result on 32-bit and 64-bit, and even if simple_xattr grows. + */ + return 40 + size + strlen(name); +} + /** * simple_xattr_free - free an xattr object * @xattr: the xattr object @@ -1363,14 +1383,17 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy + * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ -void simple_xattrs_free(struct simple_xattrs *xattrs) +void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; + if (freed_space) + *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; @@ -1379,6 +1402,9 @@ void simple_xattrs_free(struct simple_xattrs *xattrs) rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, + xattr->size); simple_xattr_free(xattr); rbp = rbp_next; } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index e37fe667ae04..d20051865800 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -114,7 +114,8 @@ struct simple_xattr { }; void simple_xattrs_init(struct simple_xattrs *xattrs); -void simple_xattrs_free(struct simple_xattrs *xattrs); +void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space); +size_t simple_xattr_space(const char *name, size_t size); struct simple_xattr *simple_xattr_alloc(const void *value, size_t size); void simple_xattr_free(struct simple_xattr *xattr); int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, diff --git a/mm/shmem.c b/mm/shmem.c index 8c8f0b1b700c..ca43fb256b8e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -393,12 +393,12 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) return 0; } -static void shmem_free_inode(struct super_block *sb) +static void shmem_free_inode(struct super_block *sb, size_t freed_ispace) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { raw_spin_lock(&sbinfo->stat_lock); - sbinfo->free_ispace += BOGO_INODE_SIZE; + sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace; raw_spin_unlock(&sbinfo->stat_lock); } } @@ -1232,6 +1232,7 @@ static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + size_t freed = 0; if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); @@ -1258,9 +1259,9 @@ static void shmem_evict_inode(struct inode *inode) } } - simple_xattrs_free(&info->xattrs); + simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? 
&freed : NULL); + shmem_free_inode(inode->i_sb, freed); WARN_ON(inode->i_blocks); - shmem_free_inode(inode->i_sb); clear_inode(inode); #ifdef CONFIG_TMPFS_QUOTA dquot_free_inode(inode); @@ -2440,7 +2441,7 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, inode = new_inode(sb); if (!inode) { - shmem_free_inode(sb); + shmem_free_inode(sb, 0); return ERR_PTR(-ENOSPC); } @@ -3284,7 +3285,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry); if (ret) { if (inode->i_nlink) - shmem_free_inode(inode->i_sb); + shmem_free_inode(inode->i_sb, 0); goto out; } @@ -3304,7 +3305,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) - shmem_free_inode(inode->i_sb); + shmem_free_inode(inode->i_sb, 0); simple_offset_remove(shmem_get_offset_ctx(dir), dentry); @@ -3557,21 +3558,40 @@ static int shmem_initxattrs(struct inode *inode, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); const struct xattr *xattr; struct simple_xattr *new_xattr; + size_t ispace = 0; size_t len; + if (sbinfo->max_inodes) { + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + ispace += simple_xattr_space(xattr->name, + xattr->value_len + XATTR_SECURITY_PREFIX_LEN); + } + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } + } + for (xattr = xattr_array; xattr->name != NULL; xattr++) { new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); if (!new_xattr) - return -ENOMEM; + break; len = strlen(xattr->name) + 1; new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL); if (!new_xattr->name) { kvfree(new_xattr); - return -ENOMEM; + break; } memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, @@ -3582,6 +3602,16 @@ static int shmem_initxattrs(struct inode *inode, simple_xattr_add(&info->xattrs, new_xattr); } + if (xattr->name != NULL) { + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); + } + simple_xattrs_free(&info->xattrs, NULL); + return -ENOMEM; + } + return 0; } @@ -3602,16 +3632,39 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct simple_xattr *old_xattr; + size_t ispace = 0; name = xattr_full_name(handler, name); + if (value && sbinfo->max_inodes) { + ispace = simple_xattr_space(name, size); + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_ispace < ispace) + ispace = 0; + else + sbinfo->free_ispace -= ispace; + raw_spin_unlock(&sbinfo->stat_lock); + if (!ispace) + return -ENOSPC; + } + old_xattr = simple_xattr_set(&info->xattrs, name, value, size, flags); if (!IS_ERR(old_xattr)) { + ispace = 0; + if (old_xattr && sbinfo->max_inodes) + ispace = simple_xattr_space(old_xattr->name, + old_xattr->size); simple_xattr_free(old_xattr); old_xattr = NULL; inode->i_ctime = current_time(inode); inode_inc_iversion(inode); } + if (ispace) { + raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_ispace += ispace; + raw_spin_unlock(&sbinfo->stat_lock); + } return PTR_ERR(old_xattr); } @@ -3627,9 +3680,16 @@ static const struct xattr_handler 
shmem_trusted_xattr_handler = { .set = shmem_xattr_handler_set, }; +static const struct xattr_handler shmem_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; + static const struct xattr_handler *shmem_xattr_handlers[] = { &shmem_security_xattr_handler, &shmem_trusted_xattr_handler, + &shmem_user_xattr_handler, NULL }; From e88e0d366f9cfbb810b0c8509dc5d130d5a53e02 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 10 Aug 2023 23:27:07 -0700 Subject: [PATCH 18/20] tmpfs: trivial support for direct IO Depending upon your philosophical viewpoint, either tmpfs always does direct IO, or it cannot ever do direct IO; but whichever, if tmpfs is to stand in for a more sophisticated filesystem, it can be helpful for tmpfs to support O_DIRECT. So, give tmpfs a shmem_file_open() method, to set the FMODE_CAN_ODIRECT flag: then unchanged shmem_file_read_iter() and new shmem_file_write_iter() do the work (without any shmem_direct_IO() stub). Perhaps later, once the direct_IO method has been eliminated from all filesystems, generic_file_write_iter() will be such that tmpfs can again use it, even for O_DIRECT. xfstests auto generic which were not run on tmpfs before but now pass: 036 091 113 125 130 133 135 198 207 208 209 210 211 212 214 226 239 263 323 355 391 406 412 422 427 446 451 465 551 586 591 609 615 647 708 729 with no new failures. LTP dio tests which were not run on tmpfs before but now pass: dio01 through dio30, except for dio04 and dio10, which fail because tmpfs dio read and write allow odd count: tmpfs could be made stricter, but would that be an improvement? Signed-off-by: Hugh Dickins Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Message-Id: <6f2742-6f1f-cae9-7c5b-ed20fc53215@google.com> Signed-off-by: Christian Brauner --- mm/shmem.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ca43fb256b8e..b782edeb69aa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2388,6 +2388,12 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int shmem_file_open(struct inode *inode, struct file *file) +{ + file->f_mode |= FMODE_CAN_ODIRECT; + return generic_file_open(inode, file); +} + #ifdef CONFIG_TMPFS_XATTR static int shmem_initxattrs(struct inode *, const struct xattr *, void *); @@ -2839,6 +2845,28 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? 
From aa5b9178c01905d7691512b366cf2886dfe2680c Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 8 Aug 2023 21:36:12 -0700
Subject: [PATCH 19/20] mm: invalidation check mapping before folio_contains

Enabling tmpfs "direct IO" exposes it to invalidate_inode_pages2_range(),
which when swapping can hit the VM_BUG_ON_FOLIO(!folio_contains()): the
folio has been moved from page cache to swap cache (with folio->mapping
reset to NULL), but the folio_index() embedded in folio_contains() sees
swapcache, and so returns the swapcache_index() - whereas folio->index
would be the right one to check against the index from mapping's xarray.

There are different ways to fix this, but my preference is just to order
the checks in invalidate_inode_pages2_range() the same way that they are
in __filemap_get_folio() and find_lock_entries() and filemap_fault():
check folio->mapping before folio_contains().

Signed-off-by: Hugh Dickins
Reviewed-by: Jan Kara
Message-Id:
Signed-off-by: Christian Brauner
---
 mm/truncate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index 95d1291d269b..c3320e66d6ea 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -657,11 +657,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		}
 
 		folio_lock(folio);
-		VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
-		if (folio->mapping != mapping) {
+		if (unlikely(folio->mapping != mapping)) {
 			folio_unlock(folio);
 			continue;
 		}
+		VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
 		folio_wait_writeback(folio);
 
 		if (folio_mapped(folio))
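To see why the ordering is safe, consider a toy userspace model of the
reordered checks; the types are pared down and should_invalidate() is an
invented name, not kernel API:

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct address_space { int placeholder; };
struct folio { struct address_space *mapping; unsigned long index; };

/* Simplified: the real folio_contains() must also handle large folios. */
static bool folio_contains(struct folio *folio, unsigned long index)
{
	return folio->index == index;
}

/* Invented helper modeling the reordered checks in the hunk above. */
static bool should_invalidate(struct folio *folio,
			      struct address_space *mapping,
			      unsigned long index)
{
	if (folio->mapping != mapping)		/* e.g. moved to swap cache */
		return false;			/* skipped quietly, no BUG */
	assert(folio_contains(folio, index));	/* stands in for VM_BUG_ON_FOLIO */
	return true;
}

int main(void)
{
	struct address_space mapping = { 0 };
	struct folio folio = { .mapping = NULL, .index = 5 };	/* swapped out */

	/* Old order would have asserted here; new order just skips the folio. */
	if (!should_invalidate(&folio, &mapping, 5))
		return 0;
	return 1;
}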
Signed-off-by: Hugh Dickins
Reviewed-by: Jan Kara
Message-Id:
Signed-off-by: Christian Brauner
---
 fs/xattr.c | 4 ++--
 mm/shmem.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/xattr.c b/fs/xattr.c
index 2d607542281b..efd4736bc94b 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -1093,7 +1093,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
 	if (len < sizeof(*new_xattr))
 		return NULL;
 
-	new_xattr = kvmalloc(len, GFP_KERNEL);
+	new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT);
 	if (!new_xattr)
 		return NULL;
 
@@ -1217,7 +1217,7 @@ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
 	if (!new_xattr)
 		return ERR_PTR(-ENOMEM);
 
-	new_xattr->name = kstrdup(name, GFP_KERNEL);
+	new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
 	if (!new_xattr->name) {
 		simple_xattr_free(new_xattr);
 		return ERR_PTR(-ENOMEM);
diff --git a/mm/shmem.c b/mm/shmem.c
index b782edeb69aa..11298c797cdc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3616,7 +3616,7 @@ static int shmem_initxattrs(struct inode *inode,
 
 		len = strlen(xattr->name) + 1;
 		new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
-					  GFP_KERNEL);
+					  GFP_KERNEL_ACCOUNT);
 		if (!new_xattr->name) {
 			kvfree(new_xattr);
 			break;
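A rough way to observe the change from userspace: persistent xattrs set on
tmpfs are now charged to the caller's memory cgroup, so memory.current should
grow as attributes accumulate. A hedged sketch (the path, attribute count,
and value size are all assumptions):

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	char name[64], value[256];
	int i;

	memset(value, 'v', sizeof(value));
	for (i = 0; i < 1000; i++) {
		snprintf(name, sizeof(name), "user.attr%d", i);
		if (setxattr("/dev/shm/xattr-test", name, value,
			     sizeof(value), 0) != 0) {
			perror("setxattr");	/* ENOSPC once free_ispace runs out */
			break;
		}
	}
	/* Compare the memcg's memory.current before and after this run. */
	return 0;
}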