2005-04-16 15:20:36 -07:00
|
|
|
/*
|
|
|
|
* hugetlbpage-backed filesystem. Based on ramfs.
|
|
|
|
*
|
2012-12-06 10:39:54 +01:00
|
|
|
* Nadia Yvette Chambers, 2002
|
2005-04-16 15:20:36 -07:00
|
|
|
*
|
|
|
|
* Copyright (C) 2002 Linus Torvalds.
|
2016-01-14 15:21:52 -08:00
|
|
|
* License: GPL
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
|
2014-06-04 16:07:21 -07:00
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/thread_info.h>
|
|
|
|
#include <asm/current.h>
|
2015-09-08 15:01:54 -07:00
|
|
|
#include <linux/falloc.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/file.h>
|
2007-07-15 23:40:52 -07:00
|
|
|
#include <linux/kernel.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/string.h>
|
2006-01-11 12:17:46 -08:00
|
|
|
#include <linux/capability.h>
|
2007-07-15 23:40:52 -07:00
|
|
|
#include <linux/ctype.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/backing-dev.h>
|
|
|
|
#include <linux/hugetlb.h>
|
|
|
|
#include <linux/pagevec.h>
|
2018-11-01 23:07:26 +00:00
|
|
|
#include <linux/fs_parser.h>
|
2007-05-06 14:50:12 -07:00
|
|
|
#include <linux/mman.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/dnotify.h>
|
|
|
|
#include <linux/statfs.h>
|
|
|
|
#include <linux/security.h>
|
2009-09-22 16:43:33 -07:00
|
|
|
#include <linux/magic.h>
|
2010-09-08 10:19:35 +09:00
|
|
|
#include <linux/migrate.h>
|
2015-04-03 11:31:35 -04:00
|
|
|
#include <linux/uio.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2016-12-24 11:46:01 -08:00
|
|
|
#include <linux/uaccess.h>
|
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program running on the arm64 architecture, when the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still succeed in no-legacy
mode. This seems unfair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 16:03:34 -07:00
|
|
|
#include <linux/sched/mm.h>
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2006-06-28 04:26:44 -07:00
|
|
|
static const struct address_space_operations hugetlbfs_aops;
|
2006-03-28 01:56:42 -08:00
|
|
|
const struct file_operations hugetlbfs_file_operations;
|
2007-02-12 00:55:39 -08:00
|
|
|
static const struct inode_operations hugetlbfs_dir_inode_operations;
|
|
|
|
static const struct inode_operations hugetlbfs_inode_operations;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
/*
 * Tag recording the form of a size value; used by the max_val_type and
 * min_val_type members of struct hugetlbfs_fs_context.
 * NOTE(review): presumably "not given", "absolute" and "percentage" -
 * confirm against the mount option parser.
 */
enum hugetlbfs_size_type {
	NO_SIZE,
	SIZE_STD,
	SIZE_PERCENT,
};
|
|
|
|
|
|
|
|
struct hugetlbfs_fs_context {
|
2017-07-05 16:24:18 +01:00
|
|
|
struct hstate *hstate;
|
2018-11-01 23:07:26 +00:00
|
|
|
unsigned long long max_size_opt;
|
|
|
|
unsigned long long min_size_opt;
|
2017-07-05 16:24:18 +01:00
|
|
|
long max_hpages;
|
|
|
|
long nr_inodes;
|
|
|
|
long min_hpages;
|
2018-11-01 23:07:26 +00:00
|
|
|
enum hugetlbfs_size_type max_val_type;
|
|
|
|
enum hugetlbfs_size_type min_val_type;
|
2017-07-05 16:24:18 +01:00
|
|
|
kuid_t uid;
|
|
|
|
kgid_t gid;
|
|
|
|
umode_t mode;
|
2012-03-21 16:34:12 -07:00
|
|
|
};
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
int sysctl_hugetlb_shm_group;
|
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
/*
 * Identifiers for the mount options; each one is paired with its option
 * name in hugetlb_fs_parameters.
 */
enum hugetlb_param {
	Opt_gid,		/* "gid" */
	Opt_min_size,		/* "min_size" */
	Opt_mode,		/* "mode" */
	Opt_nr_inodes,		/* "nr_inodes" */
	Opt_pagesize,		/* "pagesize" */
	Opt_size,		/* "size" */
	Opt_uid,		/* "uid" */
};
|
|
|
|
|
2019-09-07 07:23:15 -04:00
|
|
|
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
|
2018-11-01 23:07:26 +00:00
|
|
|
fsparam_u32 ("gid", Opt_gid),
|
|
|
|
fsparam_string("min_size", Opt_min_size),
|
2021-07-23 15:50:44 -07:00
|
|
|
fsparam_u32oct("mode", Opt_mode),
|
2018-11-01 23:07:26 +00:00
|
|
|
fsparam_string("nr_inodes", Opt_nr_inodes),
|
|
|
|
fsparam_string("pagesize", Opt_pagesize),
|
|
|
|
fsparam_string("size", Opt_size),
|
|
|
|
fsparam_u32 ("uid", Opt_uid),
|
|
|
|
{}
|
|
|
|
};
|
|
|
|
|
2015-09-08 15:01:54 -07:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
 * CONFIG_NUMA variant: look up the shared policy kept in @inode's hugetlbfs
 * inode info for offset @index and store the result in @vma->vm_policy.
 */
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
					struct inode *inode, pgoff_t index)
{
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
}
|
|
|
|
|
|
|
|
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
mpol_cond_put(vma->vm_policy);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* !CONFIG_NUMA stub: nothing is looked up or stored. */
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
					struct inode *inode, pgoff_t index)
{
}
|
|
|
|
|
|
|
|
/* !CONFIG_NUMA stub: nothing to drop. */
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
}
|
|
|
|
#endif
|
|
|
|
|
hugetlbfs: check for pgoff value overflow
A vma with vm_pgoff large enough to overflow a loff_t type when
converted to a byte offset can be passed via the remap_file_pages system
call. The hugetlbfs mmap routine uses the byte offset to calculate
reservations and file size.
A sequence such as:
mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0);
remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0);
will result in the following when task exits/file closed,
kernel BUG at mm/hugetlb.c:749!
Call Trace:
hugetlbfs_evict_inode+0x2f/0x40
evict+0xcb/0x190
__dentry_kill+0xcb/0x150
__fput+0x164/0x1e0
task_work_run+0x84/0xa0
exit_to_usermode_loop+0x7d/0x80
do_syscall_64+0x18b/0x190
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The overflowed pgoff value causes hugetlbfs to try to set up a mapping
with a negative range (end < start) that leaves invalid state which
causes the BUG.
The previous overflow fix to this code was incomplete and did not take
the remap_file_pages system call into account.
[mike.kravetz@oracle.com: v3]
Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com
[akpm@linux-foundation.org: include mmdebug.h]
[akpm@linux-foundation.org: fix -ve left shift count on sh]
Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com
Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Nic Losby <blurbdust@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Yisheng Xie <xieyisheng1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-22 16:17:13 -07:00
|
|
|
/*
 * Mask used to validate a page offset value passed in via system calls.
 * The value is later converted to a byte offset held in a loff_t, which is
 * signed.  The mask therefore selects the upper PAGE_SHIFT + 1 bits of the
 * value: PAGE_SHIFT bits for the conversion, plus one extra bit to take
 * the sign bit into account.  A page offset with any of these bits set is
 * too large.
 */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - PAGE_SHIFT - 1))
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
|
|
|
|
{
|
2013-01-23 17:07:38 -05:00
|
|
|
struct inode *inode = file_inode(file);
|
2021-05-14 17:27:04 -07:00
|
|
|
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
loff_t len, vma_len;
|
|
|
|
int ret;
|
2008-07-23 21:27:41 -07:00
|
|
|
struct hstate *h = hstate_file(file);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
[PATCH] hugetlb: prepare_hugepage_range check offset too
(David:)
If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example,
because the given file offset is not hugepage aligned - then do_mmap_pgoff
will go to the unmap_and_free_vma backout path.
But at this stage the vma hasn't been marked as hugepage, and the backout path
will call unmap_region() on it. That will eventually call down to the
non-hugepage version of unmap_page_range(). On ppc64, at least, that will
cause serious problems if there are any existing hugepage pagetable entries in
the vicinity - for example if there are any other hugepage mappings under the
same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud
entries. I suspect this will also cause bad problems on ia64, though I don't
have a machine to test it on.
(Hugh:)
prepare_hugepage_range() should check file offset alignment when it checks
virtual address and length, to stop MAP_FIXED with a bad huge offset from
unmapping before it fails further down. PowerPC should apply the same
prepare_hugepage_range alignment checks as ia64 and all the others do.
Then none of the alignment checks in hugetlbfs_file_mmap are required (nor
is the check for too small a mapping); but even so, move up setting of
VM_HUGETLB and add a comment to warn of what David Gibson discovered - if
hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region
when unwinding from error will go the non-huge way, which may cause bad
behaviour on architectures (powerpc and ia64) which segregate their huge
mappings into a separate region of the address space.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-11-14 02:03:32 -08:00
|
|
|
/*
|
2007-08-30 23:56:40 -07:00
|
|
|
* vma address alignment (but not the pgoff alignment) has
|
|
|
|
* already been checked by prepare_hugepage_range. If you add
|
|
|
|
* any error returns here, do so after setting VM_HUGETLB, so
|
|
|
|
* is_vm_hugetlb_page tests below unmap_region go the right
|
2020-08-06 23:23:37 -07:00
|
|
|
* way when do_mmap unwinds (may be important on powerpc
|
2007-08-30 23:56:40 -07:00
|
|
|
* and ia64).
|
[PATCH] hugetlb: prepare_hugepage_range check offset too
(David:)
If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example,
because the given file offset is not hugepage aligned - then do_mmap_pgoff
will go to the unmap_and_free_vma backout path.
But at this stage the vma hasn't been marked as hugepage, and the backout path
will call unmap_region() on it. That will eventually call down to the
non-hugepage version of unmap_page_range(). On ppc64, at least, that will
cause serious problems if there are any existing hugepage pagetable entries in
the vicinity - for example if there are any other hugepage mappings under the
same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud
entries. I suspect this will also cause bad problems on ia64, though I don't
have a machine to test it on.
(Hugh:)
prepare_hugepage_range() should check file offset alignment when it checks
virtual address and length, to stop MAP_FIXED with a bad huge offset from
unmapping before it fails further down. PowerPC should apply the same
prepare_hugepage_range alignment checks as ia64 and all the others do.
Then none of the alignment checks in hugetlbfs_file_mmap are required (nor
is the check for too small a mapping); but even so, move up setting of
VM_HUGETLB and add a comment to warn of what David Gibson discovered - if
hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region
when unwinding from error will go the non-huge way, which may cause bad
behaviour on architectures (powerpc and ia64) which segregate their huge
mappings into a separate region of the address space.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-11-14 02:03:32 -08:00
|
|
|
*/
|
2013-04-17 15:58:27 -07:00
|
|
|
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
|
[PATCH] hugetlb: prepare_hugepage_range check offset too
(David:)
If hugetlbfs_file_mmap() returns a failure to do_mmap_pgoff() - for example,
because the given file offset is not hugepage aligned - then do_mmap_pgoff
will go to the unmap_and_free_vma backout path.
But at this stage the vma hasn't been marked as hugepage, and the backout path
will call unmap_region() on it. That will eventually call down to the
non-hugepage version of unmap_page_range(). On ppc64, at least, that will
cause serious problems if there are any existing hugepage pagetable entries in
the vicinity - for example if there are any other hugepage mappings under the
same PUD. unmap_page_range() will trigger a bad_pud() on the hugepage pud
entries. I suspect this will also cause bad problems on ia64, though I don't
have a machine to test it on.
(Hugh:)
prepare_hugepage_range() should check file offset alignment when it checks
virtual address and length, to stop MAP_FIXED with a bad huge offset from
unmapping before it fails further down. PowerPC should apply the same
prepare_hugepage_range alignment checks as ia64 and all the others do.
Then none of the alignment checks in hugetlbfs_file_mmap are required (nor
is the check for too small a mapping); but even so, move up setting of
VM_HUGETLB and add a comment to warn of what David Gibson discovered - if
hugetlbfs_file_mmap fails before setting it, do_mmap_pgoff's unmap_region
when unwinding from error will go the non-huge way, which may cause bad
behaviour on architectures (powerpc and ia64) which segregate their huge
mappings into a separate region of the address space.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Adam Litke <agl@us.ibm.com>
Acked-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-11-14 02:03:32 -08:00
|
|
|
vma->vm_ops = &hugetlb_vm_ops;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2021-05-14 17:27:04 -07:00
|
|
|
ret = seal_check_future_write(info->seals, vma);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2017-04-13 14:56:32 -07:00
|
|
|
/*
|
hugetlbfs: check for pgoff value overflow
A vma with vm_pgoff large enough to overflow a loff_t type when
converted to a byte offset can be passed via the remap_file_pages system
call. The hugetlbfs mmap routine uses the byte offset to calculate
reservations and file size.
A sequence such as:
mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0);
remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0);
will result in the following when task exits/file closed,
kernel BUG at mm/hugetlb.c:749!
Call Trace:
hugetlbfs_evict_inode+0x2f/0x40
evict+0xcb/0x190
__dentry_kill+0xcb/0x150
__fput+0x164/0x1e0
task_work_run+0x84/0xa0
exit_to_usermode_loop+0x7d/0x80
do_syscall_64+0x18b/0x190
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The overflowed pgoff value causes hugetlbfs to try to set up a mapping
with a negative range (end < start) that leaves invalid state which
causes the BUG.
The previous overflow fix to this code was incomplete and did not take
the remap_file_pages system call into account.
[mike.kravetz@oracle.com: v3]
Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com
[akpm@linux-foundation.org: include mmdebug.h]
[akpm@linux-foundation.org: fix -ve left shift count on sh]
Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com
Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Nic Losby <blurbdust@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Yisheng Xie <xieyisheng1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-22 16:17:13 -07:00
|
|
|
* page based offset in vm_pgoff could be sufficiently large to
|
2018-04-05 16:18:21 -07:00
|
|
|
* overflow a loff_t when converted to byte offset. This can
|
|
|
|
* only happen on architectures where sizeof(loff_t) ==
|
|
|
|
* sizeof(unsigned long). So, only check in those instances.
|
2017-04-13 14:56:32 -07:00
|
|
|
*/
|
2018-04-05 16:18:21 -07:00
|
|
|
if (sizeof(unsigned long) == sizeof(loff_t)) {
|
|
|
|
if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2017-04-13 14:56:32 -07:00
|
|
|
|
hugetlbfs: check for pgoff value overflow
A vma with vm_pgoff large enough to overflow a loff_t type when
converted to a byte offset can be passed via the remap_file_pages system
call. The hugetlbfs mmap routine uses the byte offset to calculate
reservations and file size.
A sequence such as:
mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0);
remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0);
will result in the following when task exits/file closed,
kernel BUG at mm/hugetlb.c:749!
Call Trace:
hugetlbfs_evict_inode+0x2f/0x40
evict+0xcb/0x190
__dentry_kill+0xcb/0x150
__fput+0x164/0x1e0
task_work_run+0x84/0xa0
exit_to_usermode_loop+0x7d/0x80
do_syscall_64+0x18b/0x190
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
The overflowed pgoff value causes hugetlbfs to try to set up a mapping
with a negative range (end < start) that leaves invalid state which
causes the BUG.
The previous overflow fix to this code was incomplete and did not take
the remap_file_pages system call into account.
[mike.kravetz@oracle.com: v3]
Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com
[akpm@linux-foundation.org: include mmdebug.h]
[akpm@linux-foundation.org: fix -ve left shift count on sh]
Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com
Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Nic Losby <blurbdust@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Yisheng Xie <xieyisheng1@huawei.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-03-22 16:17:13 -07:00
|
|
|
/* must be huge page aligned */
|
2011-07-25 17:11:49 -07:00
|
|
|
if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
|
2007-08-30 23:56:40 -07:00
|
|
|
return -EINVAL;
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
|
2017-04-13 14:56:32 -07:00
|
|
|
len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
|
|
|
|
/* check for overflow */
|
|
|
|
if (len < vma_len)
|
|
|
|
return -EINVAL;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2016-01-22 15:40:57 -05:00
|
|
|
inode_lock(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
file_accessed(file);
|
|
|
|
|
|
|
|
ret = -ENOMEM;
|
2021-02-24 12:09:54 -08:00
|
|
|
if (!hugetlb_reserve_pages(inode,
|
2008-07-23 21:27:41 -07:00
|
|
|
vma->vm_pgoff >> huge_page_order(h),
|
2009-02-10 14:02:27 +00:00
|
|
|
len >> huge_page_shift(h), vma,
|
|
|
|
vma->vm_flags))
|
2006-06-23 02:03:15 -07:00
|
|
|
goto out;
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 00:08:55 -08:00
|
|
|
|
2005-10-29 18:16:46 -07:00
|
|
|
ret = 0;
|
[PATCH] mmap zero-length hugetlb file with PROT_NONE to protect a hugetlb virtual area
Sometimes, applications need below call to be successful although
"/mnt/hugepages/file1" doesn't exist.
fd = open("/mnt/hugepages/file1", O_CREAT|O_RDWR, 0755);
*addr = mmap(NULL, 0x1024*1024*256, PROT_NONE, 0, fd, 0);
As for regular pages (or files), above call does work, but as for huge
pages, above call would fail because hugetlbfs_file_mmap would fail if
(!(vma->vm_flags & VM_WRITE) && len > inode->i_size).
This capability on huge page is useful on ia64 when the process wants to
protect one area on region 4, so other threads couldn't read/write this
area. A famous JVM (Java Virtual Machine) implementation on IA64 needs the
capability.
Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hugh@veritas.com>
[ Expand-on-mmap semantics again... this time matching normal fs's. wli ]
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-10 04:44:49 -07:00
|
|
|
if (vma->vm_flags & VM_WRITE && inode->i_size < len)
|
2017-04-13 14:56:32 -07:00
|
|
|
i_size_write(inode, len);
|
2005-04-16 15:20:36 -07:00
|
|
|
out:
|
2016-01-22 15:40:57 -05:00
|
|
|
inode_unlock(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2020-06-08 21:33:51 -07:00
|
|
|
* Called under mmap_write_lock(mm).
|
2005-04-16 15:20:36 -07:00
|
|
|
*/
|
|
|
|
|
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program running on the arm64 architecture, when the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still succeed in no-legacy
mode. This seems unfair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 16:03:34 -07:00
|
|
|
static unsigned long
|
|
|
|
hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff, unsigned long flags)
|
|
|
|
{
|
|
|
|
struct hstate *h = hstate_file(file);
|
|
|
|
struct vm_unmapped_area_info info;
|
|
|
|
|
|
|
|
info.flags = 0;
|
|
|
|
info.length = len;
|
|
|
|
info.low_limit = current->mm->mmap_base;
|
2022-04-09 19:17:28 +02:00
|
|
|
info.high_limit = arch_get_mmap_end(addr, len, flags);
|
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 16:03:34 -07:00
|
|
|
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
|
|
|
|
info.align_offset = 0;
|
|
|
|
return vm_unmapped_area(&info);
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long
|
|
|
|
hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff, unsigned long flags)
|
|
|
|
{
|
|
|
|
struct hstate *h = hstate_file(file);
|
|
|
|
struct vm_unmapped_area_info info;
|
|
|
|
|
|
|
|
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
|
|
|
|
info.length = len;
|
|
|
|
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
|
2022-04-21 16:35:46 -07:00
|
|
|
info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
|
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 16:03:34 -07:00
|
|
|
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
|
|
|
|
info.align_offset = 0;
|
|
|
|
addr = vm_unmapped_area(&info);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A failed mmap() very likely causes application failure,
|
|
|
|
* so fall back to the bottom-up function here. This scenario
|
|
|
|
* can happen with large stack limits and large mmap()
|
|
|
|
* allocations.
|
|
|
|
*/
|
|
|
|
if (unlikely(offset_in_page(addr))) {
|
|
|
|
VM_BUG_ON(addr != -ENOMEM);
|
|
|
|
info.flags = 0;
|
|
|
|
info.low_limit = current->mm->mmap_base;
|
2022-04-09 19:17:28 +02:00
|
|
|
info.high_limit = arch_get_mmap_end(addr, len, flags);
|
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 16:03:34 -07:00
|
|
|
addr = vm_unmapped_area(&info);
|
|
|
|
}
|
|
|
|
|
|
|
|
return addr;
|
|
|
|
}
|
|
|
|
|
2022-04-09 19:17:27 +02:00
|
|
|
unsigned long
|
|
|
|
generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
struct vm_area_struct *vma;
|
2008-07-23 21:27:41 -07:00
|
|
|
struct hstate *h = hstate_file(file);
|
2022-04-09 19:17:28 +02:00
|
|
|
const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-07-23 21:27:41 -07:00
|
|
|
if (len & ~huge_page_mask(h))
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
if (len > TASK_SIZE)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2007-05-06 14:50:12 -07:00
|
|
|
if (flags & MAP_FIXED) {
|
2008-07-23 21:27:41 -07:00
|
|
|
if (prepare_hugepage_range(file, addr, len))
|
2007-05-06 14:50:12 -07:00
|
|
|
return -EINVAL;
|
|
|
|
return addr;
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
if (addr) {
|
2008-07-23 21:27:41 -07:00
|
|
|
addr = ALIGN(addr, huge_page_size(h));
|
2005-04-16 15:20:36 -07:00
|
|
|
vma = find_vma(mm, addr);
|
2022-04-21 16:35:46 -07:00
|
|
|
if (mmap_end - len >= addr &&
|
mm: larger stack guard gap, between vmas
Stack guard page is a useful feature to reduce a risk of stack smashing
into a different mapping. We have been using a single page gap which
is sufficient to prevent having stack adjacent to a different mapping.
But this seems to be insufficient in the light of the stack usage in
userspace. E.g. glibc uses as large as 64kB alloca() in many commonly
used functions. Others use constructs liks gid_t buffer[NGROUPS_MAX]
which is 256kB or stack strings with MAX_ARG_STRLEN.
This will become especially dangerous for suid binaries and the default
no limit for the stack size limit because those applications can be
tricked to consume a large portion of the stack and a single glibc call
could jump over the guard page. These attacks are not theoretical,
unfortunatelly.
Make those attacks less probable by increasing the stack guard gap
to 1MB (on systems with 4k pages; but make it depend on the page size
because systems with larger base pages might cap stack allocations in
the PAGE_SIZE units) which should cover larger alloca() and VLA stack
allocations. It is obviously not a full fix because the problem is
somehow inherent, but it should reduce attack space a lot.
One could argue that the gap size should be configurable from userspace,
but that can be done later when somebody finds that the new 1MB is wrong
for some special case applications. For now, add a kernel command line
option (stack_guard_gap) to specify the stack gap size (in page units).
Implementation wise, first delete all the old code for stack guard page:
because although we could get away with accounting one extra page in a
stack vma, accounting a larger gap can break userspace - case in point,
a program run with "ulimit -S -v 20000" failed when the 1MB gap was
counted for RLIMIT_AS; similar problems could come with RLIMIT_MLOCK
and strict non-overcommit mode.
Instead of keeping gap inside the stack vma, maintain the stack guard
gap as a gap between vmas: using vm_start_gap() in place of vm_start
(or vm_end_gap() in place of vm_end if VM_GROWSUP) in just those few
places which need to respect the gap - mainly arch_get_unmapped_area(),
and and the vma tree's subtree_gap support for that.
Original-patch-by: Oleg Nesterov <oleg@redhat.com>
Original-patch-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Tested-by: Helge Deller <deller@gmx.de> # parisc
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-06-19 04:03:24 -07:00
|
|
|
(!vma || addr + len <= vm_start_gap(vma)))
|
2005-04-16 15:20:36 -07:00
|
|
|
return addr;
|
|
|
|
}
|
|
|
|
|
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
In a 32-bit program, running on arm64 architecture. When the address
space below mmap base is completely exhausted, shmat() for huge pages will
return ENOMEM, but shmat() for normal pages can still success on no-legacy
mode. This seems not fair.
For normal pages, the calling trace of get_unmapped_area() is:
=> mm->get_unmapped_area()
if on legacy mode,
=> arch_get_unmapped_area()
=> vm_unmapped_area()
if on no-legacy mode,
=> arch_get_unmapped_area_topdown()
=> vm_unmapped_area()
For huge pages, the calling trace of get_unmapped_area() is:
=> file->f_op->get_unmapped_area()
=> hugetlb_get_unmapped_area()
=> vm_unmapped_area()
To solve this issue, we only need to make hugetlb_get_unmapped_area() take
the same way as mm->get_unmapped_area(). Add *bottomup() and *topdown()
for hugetlbfs, and check current mm->get_unmapped_area() to decide which
one to use. If mm->get_unmapped_area is equal to
arch_get_unmapped_area_topdown(), hugetlb_get_unmapped_area() calls
topdown routine, otherwise calls bottomup routine.
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Shijie Hu <hushijie3@huawei.com>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Will Deacon <will@kernel.org>
Cc: Xiaoming Ni <nixiaoming@huawei.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: yangerkun <yangerkun@huawei.com>
Cc: ChenGang <cg.chen@huawei.com>
Cc: Chen Jie <chenjie6@huawei.com>
Link: http://lkml.kernel.org/r/20200518065338.113664-1-hushijie3@huawei.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-03 16:03:34 -07:00
|
|
|
/*
|
|
|
|
* Use mm->get_unmapped_area value as a hint to use topdown routine.
|
|
|
|
* If architectures have special needs, they should define their own
|
|
|
|
* version of hugetlb_get_unmapped_area.
|
|
|
|
*/
|
|
|
|
if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
|
|
|
|
return hugetlb_get_unmapped_area_topdown(file, addr, len,
|
|
|
|
pgoff, flags);
|
|
|
|
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
|
|
|
|
pgoff, flags);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2022-04-09 19:17:27 +02:00
|
|
|
|
|
|
|
#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
/*
 * Default hugetlb_get_unmapped_area: used only when the architecture has
 * not defined HAVE_ARCH_HUGETLB_UNMAPPED_AREA.  Simply forwards every
 * argument to the generic implementation.
 */
static unsigned long hugetlb_get_unmapped_area(struct file *file,
					       unsigned long addr,
					       unsigned long len,
					       unsigned long pgoff,
					       unsigned long flags)
{
	return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff,
						 flags);
}
#endif
|
|
|
|
|
2007-10-16 01:26:22 -07:00
|
|
|
/*
|
|
|
|
* Support for read() - Find the page attached to f_mapping and copy out the
|
2022-07-26 22:29:17 +08:00
|
|
|
* data. This provides functionality similar to filemap_read().
|
2007-10-16 01:26:22 -07:00
|
|
|
*/
|
2015-04-03 11:31:35 -04:00
|
|
|
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
2007-10-16 01:26:22 -07:00
|
|
|
{
|
2015-04-03 11:31:35 -04:00
|
|
|
struct file *file = iocb->ki_filp;
|
|
|
|
struct hstate *h = hstate_file(file);
|
|
|
|
struct address_space *mapping = file->f_mapping;
|
2007-10-16 01:26:22 -07:00
|
|
|
struct inode *inode = mapping->host;
|
2015-04-03 11:31:35 -04:00
|
|
|
unsigned long index = iocb->ki_pos >> huge_page_shift(h);
|
|
|
|
unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
|
2007-10-16 01:26:22 -07:00
|
|
|
unsigned long end_index;
|
|
|
|
loff_t isize;
|
|
|
|
ssize_t retval = 0;
|
|
|
|
|
2015-04-03 11:31:35 -04:00
|
|
|
while (iov_iter_count(to)) {
|
2007-10-16 01:26:22 -07:00
|
|
|
struct page *page;
|
2015-04-03 11:31:35 -04:00
|
|
|
size_t nr, copied;
|
2007-10-16 01:26:22 -07:00
|
|
|
|
|
|
|
/* nr is the maximum number of bytes to copy from this page */
|
2008-07-23 21:27:41 -07:00
|
|
|
nr = huge_page_size(h);
|
2012-03-21 16:34:08 -07:00
|
|
|
isize = i_size_read(inode);
|
|
|
|
if (!isize)
|
2015-04-03 11:31:35 -04:00
|
|
|
break;
|
2012-03-21 16:34:08 -07:00
|
|
|
end_index = (isize - 1) >> huge_page_shift(h);
|
2015-04-03 11:31:35 -04:00
|
|
|
if (index > end_index)
|
|
|
|
break;
|
|
|
|
if (index == end_index) {
|
2008-07-23 21:27:41 -07:00
|
|
|
nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
|
2012-03-21 16:34:08 -07:00
|
|
|
if (nr <= offset)
|
2015-04-03 11:31:35 -04:00
|
|
|
break;
|
2007-10-16 01:26:22 -07:00
|
|
|
}
|
|
|
|
nr = nr - offset;
|
|
|
|
|
|
|
|
/* Find the page */
|
2012-03-21 16:34:08 -07:00
|
|
|
page = find_lock_page(mapping, index);
|
2007-10-16 01:26:22 -07:00
|
|
|
if (unlikely(page == NULL)) {
|
|
|
|
/*
|
|
|
|
* We have a HOLE, zero out the user-buffer for the
|
|
|
|
* length of the hole or request.
|
|
|
|
*/
|
2015-04-03 11:31:35 -04:00
|
|
|
copied = iov_iter_zero(nr, to);
|
2007-10-16 01:26:22 -07:00
|
|
|
} else {
|
2012-03-21 16:34:08 -07:00
|
|
|
unlock_page(page);
|
|
|
|
|
2007-10-16 01:26:22 -07:00
|
|
|
/*
|
|
|
|
* We have the page, copy it to user space buffer.
|
|
|
|
*/
|
2022-06-23 17:24:09 -04:00
|
|
|
copied = copy_page_to_iter(page, offset, nr, to);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
|
|
|
put_page(page);
|
2007-10-16 01:26:22 -07:00
|
|
|
}
|
2015-04-03 11:31:35 -04:00
|
|
|
offset += copied;
|
|
|
|
retval += copied;
|
|
|
|
if (copied != nr && iov_iter_count(to)) {
|
|
|
|
if (!retval)
|
|
|
|
retval = -EFAULT;
|
|
|
|
break;
|
2007-10-16 01:26:22 -07:00
|
|
|
}
|
2008-07-23 21:27:41 -07:00
|
|
|
index += offset >> huge_page_shift(h);
|
|
|
|
offset &= ~huge_page_mask(h);
|
2007-10-16 01:26:22 -07:00
|
|
|
}
|
2015-04-03 11:31:35 -04:00
|
|
|
iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
|
2007-10-16 01:26:22 -07:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2007-10-16 01:25:03 -07:00
|
|
|
static int hugetlbfs_write_begin(struct file *file,
|
|
|
|
struct address_space *mapping,
|
2022-02-22 14:31:43 -05:00
|
|
|
loff_t pos, unsigned len,
|
2007-10-16 01:25:03 -07:00
|
|
|
struct page **pagep, void **fsdata)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2007-10-16 01:25:03 -07:00
|
|
|
static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
|
|
|
|
loff_t pos, unsigned len, unsigned copied,
|
|
|
|
struct page *page, void *fsdata)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2007-10-16 01:25:03 -07:00
|
|
|
BUG();
|
2005-04-16 15:20:36 -07:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2022-09-14 15:18:04 -07:00
|
|
|
/*
 * Drop @page from the page cache, clearing its dirty and uptodate
 * flags first.
 */
static void hugetlb_delete_from_page_cache(struct page *page)
{
	/* Flags are cleared before the page leaves the cache. */
	ClearPageDirty(page);
	ClearPageUptodate(page);

	delete_from_page_cache(page);
}
|
|
|
|
|
2016-01-15 16:57:40 -08:00
|
|
|
static void
|
2022-05-12 20:22:55 -07:00
|
|
|
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
|
|
|
|
zap_flags_t zap_flags)
|
2016-01-15 16:57:40 -08:00
|
|
|
{
|
|
|
|
struct vm_area_struct *vma;
|
|
|
|
|
|
|
|
/*
|
2022-01-14 14:08:30 -08:00
|
|
|
* end == 0 indicates that the entire range after start should be
|
|
|
|
* unmapped. Note, end is exclusive, whereas the interval tree takes
|
|
|
|
* an inclusive "last".
|
2016-01-15 16:57:40 -08:00
|
|
|
*/
|
2022-01-14 14:08:30 -08:00
|
|
|
vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
|
2016-01-15 16:57:40 -08:00
|
|
|
unsigned long v_offset;
|
|
|
|
unsigned long v_end;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Can the expression below overflow on 32-bit arches?
|
|
|
|
* No, because the interval tree returns us only those vmas
|
|
|
|
* which overlap the truncated area starting at pgoff,
|
|
|
|
* and no vma on a 32-bit arch can span beyond the 4GB.
|
|
|
|
*/
|
|
|
|
if (vma->vm_pgoff < start)
|
|
|
|
v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
|
|
|
|
else
|
|
|
|
v_offset = 0;
|
|
|
|
|
|
|
|
if (!end)
|
|
|
|
v_end = vma->vm_end;
|
|
|
|
else {
|
|
|
|
v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
|
|
|
|
+ vma->vm_start;
|
|
|
|
if (v_end > vma->vm_end)
|
|
|
|
v_end = vma->vm_end;
|
|
|
|
}
|
|
|
|
|
|
|
|
unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
|
2022-05-12 20:22:55 -07:00
|
|
|
NULL, zap_flags);
|
2016-01-15 16:57:40 -08:00
|
|
|
}
|
|
|
|
}
|
2015-09-08 15:01:41 -07:00
|
|
|
|
2022-09-14 15:18:05 -07:00
|
|
|
/*
|
|
|
|
* Called with hugetlb fault mutex held.
|
|
|
|
* Returns true if page was actually removed, false otherwise.
|
|
|
|
*/
|
|
|
|
static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
|
|
|
|
struct address_space *mapping,
|
|
|
|
struct folio *folio, pgoff_t index,
|
|
|
|
bool truncate_op)
|
|
|
|
{
|
|
|
|
bool ret = false;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If folio is mapped, it was faulted in after being
|
|
|
|
* unmapped in caller. Unmap (again) while holding
|
|
|
|
* the fault mutex. The mutex will prevent faults
|
|
|
|
* until we finish removing the folio.
|
|
|
|
*/
|
|
|
|
if (unlikely(folio_mapped(folio))) {
|
|
|
|
i_mmap_lock_write(mapping);
|
|
|
|
hugetlb_vmdelete_list(&mapping->i_mmap,
|
|
|
|
index * pages_per_huge_page(h),
|
|
|
|
(index + 1) * pages_per_huge_page(h),
|
|
|
|
ZAP_FLAG_DROP_MARKER);
|
|
|
|
i_mmap_unlock_write(mapping);
|
|
|
|
}
|
|
|
|
|
|
|
|
folio_lock(folio);
|
|
|
|
/*
|
|
|
|
* After locking page, make sure mapping is the same.
|
|
|
|
* We could have raced with page fault populate and
|
|
|
|
* backout code.
|
|
|
|
*/
|
|
|
|
if (folio_mapping(folio) == mapping) {
|
|
|
|
/*
|
|
|
|
* We must remove the folio from page cache before removing
|
|
|
|
* the region/ reserve map (hugetlb_unreserve_pages). In
|
|
|
|
* rare out of memory conditions, removal of the region/reserve
|
|
|
|
* map could fail. Correspondingly, the subpool and global
|
|
|
|
* reserve usage count can need to be adjusted.
|
|
|
|
*/
|
|
|
|
VM_BUG_ON(HPageRestoreReserve(&folio->page));
|
|
|
|
hugetlb_delete_from_page_cache(&folio->page);
|
|
|
|
ret = true;
|
|
|
|
if (!truncate_op) {
|
|
|
|
if (unlikely(hugetlb_unreserve_pages(inode, index,
|
|
|
|
index + 1, 1)))
|
|
|
|
hugetlb_fix_reserve_counts(inode);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
folio_unlock(folio);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-09-08 15:01:41 -07:00
|
|
|
/*
|
|
|
|
* remove_inode_hugepages handles two distinct cases: truncation and hole
|
|
|
|
* punch. There are subtle differences in operation for each case.
|
2016-01-15 16:57:40 -08:00
|
|
|
*
|
2015-09-08 15:01:41 -07:00
|
|
|
* truncation is indicated by end of range being LLONG_MAX
|
|
|
|
* In this case, we first scan the range and release found pages.
|
2021-02-24 12:10:21 -08:00
|
|
|
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
|
2022-09-14 15:18:05 -07:00
|
|
|
* maps and global counts. Page faults can race with truncation.
|
|
|
|
* During faults, hugetlb_no_page() checks i_size before page allocation,
|
|
|
|
* and again after obtaining page table lock. It will 'back out'
|
|
|
|
* allocations in the truncated range.
|
2015-09-08 15:01:41 -07:00
|
|
|
* hole punch is indicated if end is not LLONG_MAX
|
|
|
|
* In the hole punch case we scan the range and release found pages.
|
2021-02-24 12:10:21 -08:00
|
|
|
* Only when releasing a page is the associated region/reserve map
|
|
|
|
* deleted. The region/reserve map for ranges without associated
|
2019-01-08 15:23:32 -08:00
|
|
|
* pages are not modified. Page faults can race with hole punch.
|
|
|
|
* This is indicated if we find a mapped page.
|
2015-09-08 15:01:41 -07:00
|
|
|
* Note: If the passed end of range value is beyond the end of file, but
|
|
|
|
* not LLONG_MAX this routine still performs a hole punch operation.
|
|
|
|
*/
|
|
|
|
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
|
|
|
loff_t lend)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2008-07-23 21:27:41 -07:00
|
|
|
struct hstate *h = hstate_inode(inode);
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 00:08:55 -08:00
|
|
|
struct address_space *mapping = &inode->i_data;
|
2008-07-23 21:27:41 -07:00
|
|
|
const pgoff_t start = lstart >> huge_page_shift(h);
|
2015-09-08 15:01:41 -07:00
|
|
|
const pgoff_t end = lend >> huge_page_shift(h);
|
2022-06-04 16:39:04 -04:00
|
|
|
struct folio_batch fbatch;
|
2017-09-06 16:21:18 -07:00
|
|
|
pgoff_t next, index;
|
2006-06-23 02:03:15 -07:00
|
|
|
int i, freed = 0;
|
2015-09-08 15:01:41 -07:00
|
|
|
bool truncate_op = (lend == LLONG_MAX);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2022-06-04 16:39:04 -04:00
|
|
|
folio_batch_init(&fbatch);
|
2005-04-16 15:20:36 -07:00
|
|
|
next = start;
|
2022-06-04 16:39:04 -04:00
|
|
|
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
|
|
|
|
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
|
|
|
|
struct folio *folio = fbatch.folios[i];
|
2021-05-04 18:33:34 -07:00
|
|
|
u32 hash = 0;
|
2015-09-08 15:01:41 -07:00
|
|
|
|
2022-06-04 16:39:04 -04:00
|
|
|
index = folio->index;
|
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series is not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-14 15:18:02 -07:00
|
|
|
hash = hugetlb_fault_mutex_hash(mapping, index);
|
|
|
|
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
2019-01-08 15:23:32 -08:00
|
|
|
|
2016-01-15 16:57:40 -08:00
|
|
|
/*
|
2022-09-14 15:18:05 -07:00
|
|
|
* Remove folio that was part of folio_batch.
|
2016-01-15 16:57:40 -08:00
|
|
|
*/
|
2022-09-14 15:18:05 -07:00
|
|
|
if (remove_inode_single_folio(h, inode, mapping, folio,
|
|
|
|
index, truncate_op))
|
|
|
|
freed++;
|
|
|
|
|
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series is not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-14 15:18:02 -07:00
|
|
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2022-06-04 16:39:04 -04:00
|
|
|
folio_batch_release(&fbatch);
|
2015-11-20 15:57:13 -08:00
|
|
|
cond_resched();
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2015-09-08 15:01:41 -07:00
|
|
|
|
|
|
|
if (truncate_op)
|
|
|
|
(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2010-06-04 19:52:12 -04:00
|
|
|
static void hugetlbfs_evict_inode(struct inode *inode)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protects a
region structure to a fine grained lock, and this difference hinders it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-03 14:47:25 -07:00
|
|
|
struct resv_map *resv_map;
|
|
|
|
|
2015-09-08 15:01:41 -07:00
|
|
|
remove_inode_hugepages(inode, 0, LLONG_MAX);
|
2019-05-13 17:22:55 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get the resv_map from the address space embedded in the inode.
|
|
|
|
* This is the address space which points to any resv_map allocated
|
|
|
|
* at inode creation time. If this is a device special inode,
|
|
|
|
* i_mapping may not point to the original address space.
|
|
|
|
*/
|
|
|
|
resv_map = (struct resv_map *)(&inode->i_data)->private_data;
|
|
|
|
/* Only regular and link inodes have associated reserve maps */
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protects a
region structure to a fine grained lock, and this difference hinders it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-03 14:47:25 -07:00
|
|
|
if (resv_map)
|
|
|
|
resv_map_release(&resv_map->refs);
|
2012-05-03 14:48:02 +02:00
|
|
|
clear_inode(inode);
|
2005-10-29 18:16:43 -07:00
|
|
|
}
|
|
|
|
|
2021-02-24 12:10:25 -08:00
|
|
|
static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-10-28 10:38:43 -07:00
|
|
|
pgoff_t pgoff;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct address_space *mapping = inode->i_mapping;
|
2008-07-23 21:27:41 -07:00
|
|
|
struct hstate *h = hstate_inode(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2008-07-23 21:27:41 -07:00
|
|
|
BUG_ON(offset & ~huge_page_mask(h));
|
2006-10-28 10:38:43 -07:00
|
|
|
pgoff = offset >> PAGE_SHIFT;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2020-04-01 21:11:08 -07:00
|
|
|
i_size_write(inode, offset);
|
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series are not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use of i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-14 15:18:02 -07:00
|
|
|
i_mmap_lock_write(mapping);
|
2017-09-08 16:15:08 -07:00
|
|
|
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
|
2022-05-12 20:22:55 -07:00
|
|
|
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
|
|
|
|
ZAP_FLAG_DROP_MARKER);
|
2018-12-28 00:39:42 -08:00
|
|
|
i_mmap_unlock_write(mapping);
|
2019-01-08 15:23:32 -08:00
|
|
|
remove_inode_hugepages(inode, offset, LLONG_MAX);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2022-06-13 13:36:48 -07:00
|
|
|
static void hugetlbfs_zero_partial_page(struct hstate *h,
|
|
|
|
struct address_space *mapping,
|
|
|
|
loff_t start,
|
|
|
|
loff_t end)
|
|
|
|
{
|
|
|
|
pgoff_t idx = start >> huge_page_shift(h);
|
|
|
|
struct folio *folio;
|
|
|
|
|
|
|
|
folio = filemap_lock_folio(mapping, idx);
|
|
|
|
if (!folio)
|
|
|
|
return;
|
|
|
|
|
|
|
|
start = start & ~huge_page_mask(h);
|
|
|
|
end = end & ~huge_page_mask(h);
|
|
|
|
if (!end)
|
|
|
|
end = huge_page_size(h);
|
|
|
|
|
|
|
|
folio_zero_segment(folio, (size_t)start, (size_t)end);
|
|
|
|
|
|
|
|
folio_unlock(folio);
|
|
|
|
folio_put(folio);
|
|
|
|
}
|
|
|
|
|
2015-09-08 15:01:54 -07:00
|
|
|
/*
 * Punch a hole of @len bytes starting at @offset in a hugetlbfs file.
 *
 * Huge pages that lie entirely inside the range are unmapped and removed
 * from the file; a partially covered huge page at either edge of the range
 * is zeroed instead.
 *
 * Returns 0 on success, or -EPERM when the inode carries F_SEAL_WRITE or
 * F_SEAL_FUTURE_WRITE.
 */
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	loff_t hpage_size = huge_page_size(h);
	loff_t range_end = offset + len;
	loff_t hole_start, hole_end;
	long ret = 0;

	/* [hole_start, hole_end) covers only the full pages within the hole. */
	hole_start = round_up(offset, hpage_size);
	hole_end = round_down(range_end, hpage_size);

	inode_lock(inode);

	/* seals are protected by i_rwsem */
	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
		ret = -EPERM;
		goto out_unlock;
	}

	i_mmap_lock_write(mapping);

	/* Leading partial page: zero from offset up to the first full page. */
	if (offset < hole_start)
		hugetlbfs_zero_partial_page(h, mapping, offset,
					    min(range_end, hole_start));

	/* Unmap users of the full pages, if there are any mappings at all. */
	if (hole_end > hole_start &&
	    !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
		hugetlb_vmdelete_list(&mapping->i_mmap,
				      hole_start >> PAGE_SHIFT,
				      hole_end >> PAGE_SHIFT, 0);

	/* Trailing partial page: zero from the last full page to range end. */
	if (range_end > hole_end && range_end > hole_start)
		hugetlbfs_zero_partial_page(h, mapping, hole_end, range_end);

	i_mmap_unlock_write(mapping);

	/* Finally remove the full pages from the file. */
	if (hole_end > hole_start)
		remove_inode_hugepages(inode, hole_start, hole_end);

out_unlock:
	inode_unlock(inode);
	return ret;
}
|
|
|
|
|
|
|
|
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
|
|
|
|
loff_t len)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(file);
|
2018-01-31 16:19:25 -08:00
|
|
|
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
|
2015-09-08 15:01:54 -07:00
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
struct hstate *h = hstate_inode(inode);
|
|
|
|
struct vm_area_struct pseudo_vma;
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
loff_t hpage_size = huge_page_size(h);
|
|
|
|
unsigned long hpage_shift = huge_page_shift(h);
|
|
|
|
pgoff_t start, index, end;
|
|
|
|
int error;
|
|
|
|
u32 hash;
|
|
|
|
|
|
|
|
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
if (mode & FALLOC_FL_PUNCH_HOLE)
|
|
|
|
return hugetlbfs_punch_hole(inode, offset, len);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Default preallocate case.
|
|
|
|
* For this range, start is rounded down and end is rounded up
|
|
|
|
* as well as being converted to page offsets.
|
|
|
|
*/
|
|
|
|
start = offset >> hpage_shift;
|
|
|
|
end = (offset + len + hpage_size - 1) >> hpage_shift;
|
|
|
|
|
2016-01-22 15:40:57 -05:00
|
|
|
inode_lock(inode);
|
2015-09-08 15:01:54 -07:00
|
|
|
|
|
|
|
/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
|
|
|
|
error = inode_newsize_ok(inode, offset + len);
|
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
|
2018-01-31 16:19:25 -08:00
|
|
|
if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
|
|
|
|
error = -EPERM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2015-09-08 15:01:54 -07:00
|
|
|
/*
|
|
|
|
* Initialize a pseudo vma as this is required by the huge page
|
|
|
|
* allocation routines. If NUMA is configured, use page index
|
|
|
|
* as input to create an allocation policy.
|
|
|
|
*/
|
2018-07-26 16:37:30 -07:00
|
|
|
vma_init(&pseudo_vma, mm);
|
2015-09-08 15:01:54 -07:00
|
|
|
pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
|
|
|
|
pseudo_vma.vm_file = file;
|
|
|
|
|
|
|
|
for (index = start; index < end; index++) {
|
|
|
|
/*
|
|
|
|
* This is supposed to be the vaddr where the page is being
|
|
|
|
* faulted in, but we have no vaddr here.
|
|
|
|
*/
|
|
|
|
struct page *page;
|
|
|
|
unsigned long addr;
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* fallocate(2) manpage permits EINTR; we may have been
|
|
|
|
* interrupted because we are using up too much memory.
|
|
|
|
*/
|
|
|
|
if (signal_pending(current)) {
|
|
|
|
error = -EINTR;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Set numa allocation policy based on index */
|
|
|
|
hugetlb_set_vma_policy(&pseudo_vma, inode, index);
|
|
|
|
|
|
|
|
/* addr is the offset within the file (zero based) */
|
|
|
|
addr = index * hpage_size;
|
|
|
|
|
hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race
Patch series "hugetlb: Use new vma lock for huge pmd sharing
synchronization", v2.
hugetlb fault scalability regressions have recently been reported [1].
This is not the first such report, as regressions were also noted when
commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") was added [2] in v5.7. At that time, a proposal to
address the regression was suggested [3] but went nowhere.
The regression and benefit of this patch series are not evident when
using the vm_scalability benchmark reported in [2] on a recent kernel.
Results from running,
"./usemem -n 48 --prealloc --prefault -O -U 3448054972"
48 sample Avg
next-20220913 next-20220913 next-20220913
unmodified revert i_mmap_sema locking vma sema locking, this series
-----------------------------------------------------------------------------
498150 KB/s 501934 KB/s 504793 KB/s
The recent regression report [1] notes page fault and fork latency of
shared hugetlb mappings. To measure this, I created two simple programs:
1) map a shared hugetlb area, write fault all pages, unmap area
Do this in a continuous loop to measure faults per second
2) map a shared hugetlb area, write fault a few pages, fork and exit
Do this in a continuous loop to measure forks per second
These programs were run on a 48 CPU VM with 320GB memory. The shared
mapping size was 250GB. For comparison, a single instance of the program
was run. Then, multiple instances were run in parallel to introduce
lock contention. Changing the locking scheme results in a significant
performance benefit.
test instances unmodified revert vma
--------------------------------------------------------------------------
faults per sec 1 393043 395680 389932
faults per sec 24 71405 81191 79048
forks per sec 1 2802 2747 2725
forks per sec 24 439 536 500
Combined faults 24 1621 68070 53662
Combined forks 24 358 67 142
Combined test is when running both faulting program and forking program
simultaneously.
Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which
depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in
the fault path to establish pmd sharing, so this is moved back to
huge_pmd_share. With c0d0381ade79 reverted, this race is exposed:
Faulting thread Unsharing thread
... ...
ptep = huge_pte_offset()
or
ptep = huge_pte_alloc()
...
i_mmap_lock_write
lock page table
ptep invalid <------------------------ huge_pmd_unshare()
Could be in a previously unlock_page_table
sharing process or worse i_mmap_unlock_write
...
ptl = huge_pte_lock(ptep)
get/update pte
set_pte_at(pte, ptep)
Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When
the new vma lock is put to use in patch 8, this will handle the fault/file
truncation races. This is explained in patch 9 where code associated with
these races is cleaned up.
Patches 3 - 5 restructure existing code in preparation for using the new
vma lock (rw semaphore) for pmd sharing synchronization. The idea is that
this semaphore will be held in read mode for the duration of fault
processing, and held in write mode for unmap operations which may call
huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to
synchronize huge pmd sharing. However it is only required in the fault
path when setting up sharing, and will be acquired in huge_pmd_share().
Patch 6 adds the new vma lock and all supporting routines, but does not
actually change code to use the new lock.
Patch 7 refactors code in preparation for using the new lock. And, patch
8 finally adds code to make use of this new vma lock. Unfortunately, the
fault code and truncate/hole punch code would naturally take locks in the
opposite order which could lead to deadlock. Since the performance of
page faults is more important, the truncation/hole punch code is modified
to back out and take locks in the correct order if necessary.
[1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/
[2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/
[3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/
This patch (of 9):
Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing
synchronization") added code to take i_mmap_rwsem in read mode for the
duration of fault processing. The use of i_mmap_rwsem to prevent
fault/truncate races depends on this. However, this has been shown to
cause performance/scaling issues. As a result, that code will be
reverted. Since the use of i_mmap_rwsem to address page fault/truncate races
depends on this, it must also be reverted.
In a subsequent patch, code will be added to detect the fault/truncate
race and back out operations as required.
Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com
Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: James Houghton <jthoughton@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Prakash Sangappa <prakash.sangappa@oracle.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2022-09-14 15:18:02 -07:00
|
|
|
/* mutex taken here, fault path and hole punch */
|
2019-11-30 17:57:02 -08:00
|
|
|
hash = hugetlb_fault_mutex_hash(mapping, index);
|
2015-09-08 15:01:54 -07:00
|
|
|
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
|
|
|
|
|
|
|
/* See if already present in mapping to avoid alloc/free */
|
|
|
|
page = find_get_page(mapping, index);
|
|
|
|
if (page) {
|
|
|
|
put_page(page);
|
|
|
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
|
|
|
hugetlb_drop_vma_policy(&pseudo_vma);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-02-24 12:10:11 -08:00
|
|
|
/*
|
|
|
|
* Allocate page without setting the avoid_reserve argument.
|
|
|
|
* There certainly are no reserves associated with the
|
|
|
|
* pseudo_vma. However, there could be shared mappings with
|
|
|
|
* reserves for the file at the inode level. If we fallocate
|
|
|
|
* pages in these areas, we need to consume the reserves
|
|
|
|
* to keep reservation accounting consistent.
|
|
|
|
*/
|
|
|
|
page = alloc_huge_page(&pseudo_vma, addr, 0);
|
2015-09-08 15:01:54 -07:00
|
|
|
hugetlb_drop_vma_policy(&pseudo_vma);
|
|
|
|
if (IS_ERR(page)) {
|
|
|
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
|
|
|
error = PTR_ERR(page);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
clear_huge_page(page, addr, pages_per_huge_page(h));
|
|
|
|
__SetPageUptodate(page);
|
2022-09-14 15:18:04 -07:00
|
|
|
error = hugetlb_add_to_page_cache(page, mapping, index);
|
2015-09-08 15:01:54 -07:00
|
|
|
if (unlikely(error)) {
|
2021-06-15 18:23:29 -07:00
|
|
|
restore_reserve_on_error(h, &pseudo_vma, addr, page);
|
2015-09-08 15:01:54 -07:00
|
|
|
put_page(page);
|
|
|
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
|
|
|
|
|
2021-02-24 12:08:56 -08:00
|
|
|
SetHPageMigratable(page);
|
2015-09-08 15:01:54 -07:00
|
|
|
/*
|
2022-09-14 15:18:04 -07:00
|
|
|
* unlock_page because locked by hugetlb_add_to_page_cache()
|
2021-02-04 18:32:03 -08:00
|
|
|
* put_page() due to reference from alloc_huge_page()
|
2015-09-08 15:01:54 -07:00
|
|
|
*/
|
|
|
|
unlock_page(page);
|
fs/hugetlbfs/inode.c: change put_page/unlock_page order in hugetlbfs_fallocate()
hugetlbfs_fallocate() currently performs put_page() before unlock_page().
This scenario opens a small time window, from the time the page is added
to the page cache, until it is unlocked, in which the page might be
removed from the page-cache by another core. If the page is removed
during this time window, it might cause a memory corruption, as the
wrong page will be unlocked.
It is arguable whether this scenario can happen in a real system, and
there are several mitigating factors. The issue was found by code
inspection (actually grep), and not by actually triggering the flow.
Yet, since putting the page before unlocking is incorrect it should be
fixed, if only to prevent future breakage or someone copy-pasting this
code.
Mike said:
"I am of the opinion that this does not need to be sent to stable.
Although the ordering in current code is incorrect, there is no way
for this to be a problem with current locking. In addition, I verified
that the perhaps bigger issue with sys_fadvise64(POSIX_FADV_DONTNEED)
for hugetlbfs and other filesystems is addressed in 3a77d214807c ("mm:
fadvise: avoid fadvise for fs without backing device")"
Link: http://lkml.kernel.org/r/20170826191124.51642-1-namit@vmware.com
Fixes: 70c3547e36f5c ("hugetlbfs: add hugetlbfs_fallocate()")
Signed-off-by: Nadav Amit <namit@vmware.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Eric Biggers <ebiggers3@gmail.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-29 16:11:33 -08:00
|
|
|
put_page(page);
|
2015-09-08 15:01:54 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
|
|
|
|
i_size_write(inode, offset + len);
|
2016-09-14 07:48:04 -07:00
|
|
|
inode->i_ctime = current_time(inode);
|
2015-09-08 15:01:54 -07:00
|
|
|
out:
|
2016-01-22 15:40:57 -05:00
|
|
|
inode_unlock(inode);
|
2015-09-08 15:01:54 -07:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
|
|
|
|
struct dentry *dentry, struct iattr *attr)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2015-03-17 22:25:59 +00:00
|
|
|
struct inode *inode = d_inode(dentry);
|
2008-07-23 21:27:41 -07:00
|
|
|
struct hstate *h = hstate_inode(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
int error;
|
|
|
|
unsigned int ia_valid = attr->ia_valid;
|
2018-01-31 16:19:25 -08:00
|
|
|
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2021-01-21 14:19:26 +01:00
|
|
|
error = setattr_prepare(&init_user_ns, dentry, attr);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (error)
|
2010-06-04 11:30:02 +02:00
|
|
|
return error;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
if (ia_valid & ATTR_SIZE) {
|
2018-01-31 16:19:25 -08:00
|
|
|
loff_t oldsize = inode->i_size;
|
|
|
|
loff_t newsize = attr->ia_size;
|
|
|
|
|
|
|
|
if (newsize & ~huge_page_mask(h))
|
2010-06-04 11:30:02 +02:00
|
|
|
return -EINVAL;
|
2021-02-24 12:10:18 -08:00
|
|
|
/* protected by i_rwsem */
|
2018-01-31 16:19:25 -08:00
|
|
|
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
|
|
|
|
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
|
|
|
|
return -EPERM;
|
2021-02-24 12:10:25 -08:00
|
|
|
hugetlb_vmtruncate(inode, newsize);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
2010-06-04 11:30:02 +02:00
|
|
|
|
2021-01-21 14:19:26 +01:00
|
|
|
setattr_copy(&init_user_ns, inode, attr);
|
2010-06-04 11:30:02 +02:00
|
|
|
mark_inode_dirty(inode);
|
|
|
|
return 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2011-07-24 20:20:48 -04:00
|
|
|
static struct inode *hugetlbfs_get_root(struct super_block *sb,
|
2018-11-01 23:07:26 +00:00
|
|
|
struct hugetlbfs_fs_context *ctx)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
inode = new_inode(sb);
|
|
|
|
if (inode) {
|
2010-10-23 11:19:54 -04:00
|
|
|
inode->i_ino = get_next_ino();
|
2018-11-01 23:07:26 +00:00
|
|
|
inode->i_mode = S_IFDIR | ctx->mode;
|
|
|
|
inode->i_uid = ctx->uid;
|
|
|
|
inode->i_gid = ctx->gid;
|
2016-09-14 07:48:04 -07:00
|
|
|
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
|
2011-07-24 20:20:48 -04:00
|
|
|
inode->i_op = &hugetlbfs_dir_inode_operations;
|
|
|
|
inode->i_fop = &simple_dir_operations;
|
|
|
|
/* directory inodes start off with i_nlink == 2 (for "." entry) */
|
|
|
|
inc_nlink(inode);
|
2012-04-25 16:01:50 -07:00
|
|
|
lockdep_annotate_inode_mutex_key(inode);
|
2011-07-24 20:20:48 -04:00
|
|
|
}
|
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
2013-08-13 16:00:55 -07:00
|
|
|
/*
|
2014-12-12 16:54:24 -08:00
|
|
|
* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
|
2013-08-13 16:00:55 -07:00
|
|
|
* be taken from reclaim -- unlike regular filesystems. This needs an
|
2016-01-15 16:57:31 -08:00
|
|
|
* annotation because huge_pmd_share() does an allocation under hugetlb's
|
2014-12-12 16:54:24 -08:00
|
|
|
* i_mmap_rwsem.
|
2013-08-13 16:00:55 -07:00
|
|
|
*/
|
2014-12-12 16:54:24 -08:00
|
|
|
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
|
2013-08-13 16:00:55 -07:00
|
|
|
|
2011-07-24 20:20:48 -04:00
|
|
|
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
|
|
|
|
struct inode *dir,
|
2011-07-24 23:17:40 -04:00
|
|
|
umode_t mode, dev_t dev)
|
2011-07-24 20:20:48 -04:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
2019-04-05 18:39:06 -07:00
|
|
|
struct resv_map *resv_map = NULL;
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protects a
region structure to a fine grained lock, and this difference hinders it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-03 14:47:25 -07:00
|
|
|
|
2019-04-05 18:39:06 -07:00
|
|
|
/*
|
|
|
|
* Reserve maps are only needed for inodes that can have associated
|
|
|
|
* page allocations.
|
|
|
|
*/
|
|
|
|
if (S_ISREG(mode) || S_ISLNK(mode)) {
|
|
|
|
resv_map = resv_map_alloc();
|
|
|
|
if (!resv_map)
|
|
|
|
return NULL;
|
|
|
|
}
|
2011-07-24 20:20:48 -04:00
|
|
|
|
|
|
|
inode = new_inode(sb);
|
|
|
|
if (inode) {
|
2018-01-31 16:19:25 -08:00
|
|
|
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
|
|
|
|
|
2011-07-24 20:20:48 -04:00
|
|
|
inode->i_ino = get_next_ino();
|
2021-01-21 14:19:25 +01:00
|
|
|
inode_init_owner(&init_user_ns, inode, dir, mode);
|
2014-12-12 16:54:24 -08:00
|
|
|
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
|
|
|
|
&hugetlbfs_i_mmap_rwsem_key);
|
2005-04-16 15:20:36 -07:00
|
|
|
inode->i_mapping->a_ops = &hugetlbfs_aops;
|
2016-09-14 07:48:04 -07:00
|
|
|
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse grained lock which protects a
region structure to a fine grained lock, and this difference hinders it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-03 14:47:25 -07:00
|
|
|
inode->i_mapping->private_data = resv_map;
|
2018-01-31 16:19:25 -08:00
|
|
|
info->seals = F_SEAL_SEAL;
|
2005-04-16 15:20:36 -07:00
|
|
|
switch (mode & S_IFMT) {
|
|
|
|
default:
|
|
|
|
init_special_inode(inode, mode, dev);
|
|
|
|
break;
|
|
|
|
case S_IFREG:
|
|
|
|
inode->i_op = &hugetlbfs_inode_operations;
|
|
|
|
inode->i_fop = &hugetlbfs_file_operations;
|
|
|
|
break;
|
|
|
|
case S_IFDIR:
|
|
|
|
inode->i_op = &hugetlbfs_dir_inode_operations;
|
|
|
|
inode->i_fop = &simple_dir_operations;
|
|
|
|
|
|
|
|
/* directory inodes start off with i_nlink == 2 (for "." entry) */
|
2006-09-30 23:29:04 -07:00
|
|
|
inc_nlink(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
|
|
|
case S_IFLNK:
|
|
|
|
inode->i_op = &page_symlink_inode_operations;
|
2015-11-17 01:07:57 -05:00
|
|
|
inode_nohighmem(inode);
|
2005-04-16 15:20:36 -07:00
|
|
|
break;
|
|
|
|
}
|
lockdep: Add helper function for dir vs file i_mutex annotation
Purely in-memory filesystems do not use the inode hash as the dcache
tells us if an entry already exists. As a result, they do not call
unlock_new_inode, and thus directory inodes do not get put into a
different lockdep class for i_sem.
We need the different lockdep classes, because the locking order for
i_mutex is different for directory inodes and regular inodes. Directory
inodes can do "readdir()", which takes i_mutex *before* possibly taking
mm->mmap_sem (due to a page fault while copying the directory entry to
user space).
In contrast, regular inodes can be mmap'ed, which takes mm->mmap_sem
before accessing i_mutex.
The two cases can never happen for the same inode, so no real deadlock
can occur, but without the different lockdep classes, lockdep cannot
understand that. As a result, if CONFIG_DEBUG_LOCK_ALLOC is set, this
can lead to false positives from lockdep like below:
find/645 is trying to acquire lock:
(&mm->mmap_sem){++++++}, at: [<ffffffff81109514>] might_fault+0x5c/0xac
but task is already holding lock:
(&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffff81149f34>]
vfs_readdir+0x5b/0xb4
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&sb->s_type->i_mutex_key#15){+.+.+.}:
[<ffffffff8108ac26>] lock_acquire+0xbf/0x103
[<ffffffff814db822>] __mutex_lock_common+0x4c/0x361
[<ffffffff814dbc46>] mutex_lock_nested+0x40/0x45
[<ffffffff811daa87>] hugetlbfs_file_mmap+0x82/0x110
[<ffffffff81111557>] mmap_region+0x258/0x432
[<ffffffff811119dd>] do_mmap_pgoff+0x2ac/0x306
[<ffffffff81111b4f>] sys_mmap_pgoff+0x118/0x16a
[<ffffffff8100c858>] sys_mmap+0x22/0x24
[<ffffffff814e3ec2>] system_call_fastpath+0x16/0x1b
-> #0 (&mm->mmap_sem){++++++}:
[<ffffffff8108a4bc>] __lock_acquire+0xa1a/0xcf7
[<ffffffff8108ac26>] lock_acquire+0xbf/0x103
[<ffffffff81109541>] might_fault+0x89/0xac
[<ffffffff81149cff>] filldir+0x6f/0xc7
[<ffffffff811586ea>] dcache_readdir+0x67/0x205
[<ffffffff81149f54>] vfs_readdir+0x7b/0xb4
[<ffffffff8114a073>] sys_getdents+0x7e/0xd1
[<ffffffff814e3ec2>] system_call_fastpath+0x16/0x1b
This patch moves the directory vs file lockdep annotation into a helper
function that can be called by in-memory filesystems and has hugetlbfs
call it.
Signed-off-by: Josh Boyer <jwboyer@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-08-25 07:48:12 -04:00
|
|
|
lockdep_annotate_inode_mutex_key(inode);
|
2019-04-05 18:39:06 -07:00
|
|
|
} else {
|
|
|
|
if (resv_map)
|
|
|
|
kref_put(&resv_map->refs, resv_map_release);
|
|
|
|
}
|
mm, hugetlb: unify region structure handling
Currently, to track reserved and allocated regions, we use two different
ways, depending on the mapping. For MAP_SHARED, we use
address_mapping's private_list, while for MAP_PRIVATE, we use a
resv_map.
Now, we are preparing to change a coarse-grained lock which protects a
region structure to a fine-grained lock, and this difference hinders it.
So, before changing it, unify region structure handling, consistently
using a resv_map regardless of the kind of mapping.
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-03 14:47:25 -07:00
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return inode;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* File creation. Allocate an inode, and we're done..
|
|
|
|
*/
|
2019-11-30 17:56:43 -08:00
|
|
|
static int do_hugetlbfs_mknod(struct inode *dir,
|
|
|
|
struct dentry *dentry,
|
|
|
|
umode_t mode,
|
|
|
|
dev_t dev,
|
|
|
|
bool tmpfile)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
int error = -ENOSPC;
|
2011-07-24 20:20:48 -04:00
|
|
|
|
|
|
|
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (inode) {
|
2016-09-14 07:48:04 -07:00
|
|
|
dir->i_ctime = dir->i_mtime = current_time(dir);
|
2019-11-30 17:56:43 -08:00
|
|
|
if (tmpfile) {
|
|
|
|
d_tmpfile(dentry, inode);
|
|
|
|
} else {
|
|
|
|
d_instantiate(dentry, inode);
|
|
|
|
dget(dentry);/* Extra count - pin the dentry in core */
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
error = 0;
|
|
|
|
}
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
/*
 * ->mknod() handler: create a named (non-tmpfile) node in @dir.
 *
 * @mnt_userns is accepted to match the method signature but is not
 * referenced here.  Returns whatever do_hugetlbfs_mknod() returns.
 */
static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
			struct dentry *dentry, umode_t mode, dev_t dev)
{
	const bool as_tmpfile = false;

	return do_hugetlbfs_mknod(dir, dentry, mode, dev, as_tmpfile);
}
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
/*
 * ->mkdir() handler: create a directory node via hugetlbfs_mknod() and, on
 * success, bump the parent's link count.
 *
 * Returns 0 on success or the error reported by hugetlbfs_mknod().
 */
static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
			struct dentry *dentry, umode_t mode)
{
	int err;

	err = hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0);
	if (err)
		return err;

	inc_nlink(dir);
	return 0;
}
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int hugetlbfs_create(struct user_namespace *mnt_userns,
|
|
|
|
struct inode *dir, struct dentry *dentry,
|
|
|
|
umode_t mode, bool excl)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2021-01-21 14:19:43 +01:00
|
|
|
return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
|
|
|
|
struct inode *dir, struct dentry *dentry,
|
|
|
|
umode_t mode)
|
2019-11-30 17:56:43 -08:00
|
|
|
{
|
|
|
|
return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
|
|
|
|
}
|
|
|
|
|
2021-01-21 14:19:43 +01:00
|
|
|
static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
|
|
|
|
struct inode *dir, struct dentry *dentry,
|
|
|
|
const char *symname)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
|
|
|
int error = -ENOSPC;
|
|
|
|
|
2011-07-24 20:20:48 -04:00
|
|
|
inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (inode) {
|
|
|
|
int l = strlen(symname)+1;
|
|
|
|
error = page_symlink(inode, symname, l);
|
|
|
|
if (!error) {
|
|
|
|
d_instantiate(dentry, inode);
|
|
|
|
dget(dentry);
|
|
|
|
} else
|
|
|
|
iput(inode);
|
|
|
|
}
|
2016-09-14 07:48:04 -07:00
|
|
|
dir->i_ctime = dir->i_mtime = current_time(dir);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2022-06-06 10:47:21 -04:00
|
|
|
#ifdef CONFIG_MIGRATION
/*
 * ->migrate_folio() handler for hugetlbfs mappings.
 *
 * Moves the mapping from @src to @dst, then hands the subpool association
 * (if @src has one) over to @dst and clears it on @src.  Without that
 * hand-over the usage accounting of an explicitly mounted filesystem would
 * be lost when a migrated page is later freed.
 *
 * For MIGRATE_SYNC_NO_COPY only the folio flags are migrated; for every
 * other mode folio_migrate_copy() is used.
 *
 * Returns MIGRATEPAGE_SUCCESS, or the non-success code reported by
 * migrate_huge_page_move_mapping().
 */
static int hugetlbfs_migrate_folio(struct address_space *mapping,
				struct folio *dst, struct folio *src,
				enum migrate_mode mode)
{
	int status = migrate_huge_page_move_mapping(mapping, dst, src);

	if (status != MIGRATEPAGE_SUCCESS)
		return status;

	if (hugetlb_page_subpool(&src->page)) {
		hugetlb_set_page_subpool(&dst->page,
					hugetlb_page_subpool(&src->page));
		hugetlb_set_page_subpool(&src->page, NULL);
	}

	if (mode == MIGRATE_SYNC_NO_COPY)
		folio_migrate_flags(dst, src);
	else
		folio_migrate_copy(dst, src);

	return MIGRATEPAGE_SUCCESS;
}
#else
/* Without CONFIG_MIGRATION the method slot is simply left empty. */
#define hugetlbfs_migrate_folio NULL
#endif
|
2010-09-08 10:19:35 +09:00
|
|
|
|
2017-07-10 15:47:50 -07:00
|
|
|
static int hugetlbfs_error_remove_page(struct address_space *mapping,
|
|
|
|
struct page *page)
|
|
|
|
{
|
|
|
|
struct inode *inode = mapping->host;
|
2017-11-02 15:59:41 -07:00
|
|
|
pgoff_t index = page->index;
|
2017-07-10 15:47:50 -07:00
|
|
|
|
2022-09-14 15:18:04 -07:00
|
|
|
hugetlb_delete_from_page_cache(page);
|
2017-11-02 15:59:41 -07:00
|
|
|
if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
|
|
|
|
hugetlb_fix_reserve_counts(inode);
|
|
|
|
|
2017-07-10 15:47:50 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-07-05 16:24:18 +01:00
|
|
|
/*
|
|
|
|
* Display the mount options in /proc/mounts.
|
|
|
|
*/
|
|
|
|
static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
|
|
|
|
{
|
|
|
|
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
|
|
|
|
struct hugepage_subpool *spool = sbinfo->spool;
|
|
|
|
unsigned long hpage_size = huge_page_size(sbinfo->hstate);
|
|
|
|
unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
|
|
|
|
char mod;
|
|
|
|
|
|
|
|
if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
|
|
|
|
seq_printf(m, ",uid=%u",
|
|
|
|
from_kuid_munged(&init_user_ns, sbinfo->uid));
|
|
|
|
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
|
|
|
|
seq_printf(m, ",gid=%u",
|
|
|
|
from_kgid_munged(&init_user_ns, sbinfo->gid));
|
|
|
|
if (sbinfo->mode != 0755)
|
|
|
|
seq_printf(m, ",mode=%o", sbinfo->mode);
|
|
|
|
if (sbinfo->max_inodes != -1)
|
|
|
|
seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);
|
|
|
|
|
|
|
|
hpage_size /= 1024;
|
|
|
|
mod = 'K';
|
|
|
|
if (hpage_size >= 1024) {
|
|
|
|
hpage_size /= 1024;
|
|
|
|
mod = 'M';
|
|
|
|
}
|
|
|
|
seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
|
|
|
|
if (spool) {
|
|
|
|
if (spool->max_hpages != -1)
|
|
|
|
seq_printf(m, ",size=%llu",
|
|
|
|
(unsigned long long)spool->max_hpages << hpage_shift);
|
|
|
|
if (spool->min_hpages != -1)
|
|
|
|
seq_printf(m, ",min_size=%llu",
|
|
|
|
(unsigned long long)spool->min_hpages << hpage_shift);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-06-23 02:02:58 -07:00
|
|
|
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2006-06-23 02:02:58 -07:00
|
|
|
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
|
2015-03-17 22:25:59 +00:00
|
|
|
struct hstate *h = hstate_inode(d_inode(dentry));
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
buf->f_type = HUGETLBFS_MAGIC;
|
2008-07-23 21:27:41 -07:00
|
|
|
buf->f_bsize = huge_page_size(h);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (sbinfo) {
|
|
|
|
spin_lock(&sbinfo->stat_lock);
|
2022-07-26 22:29:18 +08:00
|
|
|
/* If no limits set, just report 0 or -1 for max/free/used
|
2005-11-21 21:32:24 -08:00
|
|
|
* blocks, like simple_statfs() */
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:12 -07:00
|
|
|
if (sbinfo->spool) {
|
|
|
|
long free_pages;
|
|
|
|
|
2022-05-09 18:20:50 -07:00
|
|
|
spin_lock_irq(&sbinfo->spool->lock);
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:12 -07:00
|
|
|
buf->f_blocks = sbinfo->spool->max_hpages;
|
|
|
|
free_pages = sbinfo->spool->max_hpages
|
|
|
|
- sbinfo->spool->used_hpages;
|
|
|
|
buf->f_bavail = buf->f_bfree = free_pages;
|
2022-05-09 18:20:50 -07:00
|
|
|
spin_unlock_irq(&sbinfo->spool->lock);
|
2005-11-21 21:32:24 -08:00
|
|
|
buf->f_files = sbinfo->max_inodes;
|
|
|
|
buf->f_ffree = sbinfo->free_inodes;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
|
|
}
|
|
|
|
buf->f_namelen = NAME_MAX;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void hugetlbfs_put_super(struct super_block *sb)
|
|
|
|
{
|
|
|
|
struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);
|
|
|
|
|
|
|
|
if (sbi) {
|
|
|
|
sb->s_fs_info = NULL;
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:12 -07:00
|
|
|
|
|
|
|
if (sbi->spool)
|
|
|
|
hugepage_put_subpool(sbi->spool);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
kfree(sbi);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-10-29 18:16:42 -07:00
|
|
|
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
|
|
|
|
{
|
|
|
|
if (sbinfo->free_inodes >= 0) {
|
|
|
|
spin_lock(&sbinfo->stat_lock);
|
|
|
|
if (unlikely(!sbinfo->free_inodes)) {
|
|
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
sbinfo->free_inodes--;
|
|
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
|
|
|
|
{
|
|
|
|
if (sbinfo->free_inodes >= 0) {
|
|
|
|
spin_lock(&sbinfo->stat_lock);
|
|
|
|
sbinfo->free_inodes++;
|
|
|
|
spin_unlock(&sbinfo->stat_lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-12-06 20:33:20 -08:00
|
|
|
/*
 * Cache from which hugetlbfs_alloc_inode() allocates struct
 * hugetlbfs_inode_info objects and to which hugetlbfs_free_inode()
 * returns them.
 */
static struct kmem_cache *hugetlbfs_inode_cachep;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
|
|
|
|
{
|
2005-10-29 18:16:42 -07:00
|
|
|
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
|
2005-04-16 15:20:36 -07:00
|
|
|
struct hugetlbfs_inode_info *p;
|
|
|
|
|
2005-10-29 18:16:42 -07:00
|
|
|
if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
|
|
|
|
return NULL;
|
2022-03-22 14:41:03 -07:00
|
|
|
p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
|
2005-10-29 18:16:42 -07:00
|
|
|
if (unlikely(!p)) {
|
|
|
|
hugetlbfs_inc_free_inodes(sbinfo);
|
2005-04-16 15:20:36 -07:00
|
|
|
return NULL;
|
2005-10-29 18:16:42 -07:00
|
|
|
}
|
2017-03-31 15:12:01 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Any time after allocation, hugetlbfs_destroy_inode can be called
|
|
|
|
* for the inode. mpol_free_shared_policy is unconditionally called
|
|
|
|
* as part of hugetlbfs_destroy_inode. So, initialize policy here
|
|
|
|
* in case of a quick call to destroy.
|
|
|
|
*
|
|
|
|
* Note that the policy is initialized even if we are creating a
|
|
|
|
* private inode. This simplifies hugetlbfs_destroy_inode.
|
|
|
|
*/
|
|
|
|
mpol_shared_policy_init(&p->policy, NULL);
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return &p->vfs_inode;
|
|
|
|
}
|
|
|
|
|
2019-04-15 23:16:38 -04:00
|
|
|
static void hugetlbfs_free_inode(struct inode *inode)
|
2011-01-07 17:49:49 +11:00
|
|
|
{
|
|
|
|
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
|
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
static void hugetlbfs_destroy_inode(struct inode *inode)
|
|
|
|
{
|
2005-10-29 18:16:42 -07:00
|
|
|
hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
|
2005-04-16 15:20:36 -07:00
|
|
|
mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
|
|
|
|
}
|
|
|
|
|
2006-06-28 04:26:44 -07:00
|
|
|
static const struct address_space_operations hugetlbfs_aops = {
|
2007-10-16 01:25:03 -07:00
|
|
|
.write_begin = hugetlbfs_write_begin,
|
|
|
|
.write_end = hugetlbfs_write_end,
|
2022-02-09 20:22:13 +00:00
|
|
|
.dirty_folio = noop_dirty_folio,
|
2022-06-06 10:47:21 -04:00
|
|
|
.migrate_folio = hugetlbfs_migrate_folio,
|
2017-07-10 15:47:50 -07:00
|
|
|
.error_remove_page = hugetlbfs_error_remove_page,
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
2005-10-29 18:16:42 -07:00
|
|
|
|
2008-07-25 19:45:34 -07:00
|
|
|
static void init_once(void *foo)
|
2005-10-29 18:16:42 -07:00
|
|
|
{
|
|
|
|
struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
|
|
|
|
|
2007-05-16 22:10:57 -07:00
|
|
|
inode_init_once(&ei->vfs_inode);
|
2005-10-29 18:16:42 -07:00
|
|
|
}
|
|
|
|
|
2006-03-28 01:56:42 -08:00
|
|
|
const struct file_operations hugetlbfs_file_operations = {
|
2015-04-03 11:31:35 -04:00
|
|
|
.read_iter = hugetlbfs_read_iter,
|
2005-04-16 15:20:36 -07:00
|
|
|
.mmap = hugetlbfs_file_mmap,
|
2010-05-26 17:53:41 +02:00
|
|
|
.fsync = noop_fsync,
|
2005-04-16 15:20:36 -07:00
|
|
|
.get_unmapped_area = hugetlb_get_unmapped_area,
|
2015-09-08 15:01:54 -07:00
|
|
|
.llseek = default_llseek,
|
|
|
|
.fallocate = hugetlbfs_fallocate,
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
2007-02-12 00:55:39 -08:00
|
|
|
static const struct inode_operations hugetlbfs_dir_inode_operations = {
|
2005-04-16 15:20:36 -07:00
|
|
|
.create = hugetlbfs_create,
|
|
|
|
.lookup = simple_lookup,
|
|
|
|
.link = simple_link,
|
|
|
|
.unlink = simple_unlink,
|
|
|
|
.symlink = hugetlbfs_symlink,
|
|
|
|
.mkdir = hugetlbfs_mkdir,
|
|
|
|
.rmdir = simple_rmdir,
|
|
|
|
.mknod = hugetlbfs_mknod,
|
|
|
|
.rename = simple_rename,
|
|
|
|
.setattr = hugetlbfs_setattr,
|
2019-11-30 17:56:43 -08:00
|
|
|
.tmpfile = hugetlbfs_tmpfile,
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
2007-02-12 00:55:39 -08:00
|
|
|
static const struct inode_operations hugetlbfs_inode_operations = {
|
2005-04-16 15:20:36 -07:00
|
|
|
.setattr = hugetlbfs_setattr,
|
|
|
|
};
|
|
|
|
|
2007-02-12 00:55:41 -08:00
|
|
|
static const struct super_operations hugetlbfs_ops = {
|
2005-04-16 15:20:36 -07:00
|
|
|
.alloc_inode = hugetlbfs_alloc_inode,
|
2019-04-15 23:16:38 -04:00
|
|
|
.free_inode = hugetlbfs_free_inode,
|
2005-04-16 15:20:36 -07:00
|
|
|
.destroy_inode = hugetlbfs_destroy_inode,
|
2010-06-04 19:52:12 -04:00
|
|
|
.evict_inode = hugetlbfs_evict_inode,
|
2005-04-16 15:20:36 -07:00
|
|
|
.statfs = hugetlbfs_statfs,
|
|
|
|
.put_super = hugetlbfs_put_super,
|
2017-07-05 16:24:18 +01:00
|
|
|
.show_options = hugetlbfs_show_options,
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
2015-04-15 16:13:42 -07:00
|
|
|
/*
|
|
|
|
* Convert size option passed from command line to number of huge pages
|
|
|
|
* in the pool specified by hstate. Size option could be in bytes
|
|
|
|
* (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
|
|
|
|
*/
|
2017-07-05 16:24:18 +01:00
|
|
|
static long
|
2015-04-15 16:13:42 -07:00
|
|
|
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
|
2017-07-05 16:24:18 +01:00
|
|
|
enum hugetlbfs_size_type val_type)
|
2015-04-15 16:13:42 -07:00
|
|
|
{
|
|
|
|
if (val_type == NO_SIZE)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
if (val_type == SIZE_PERCENT) {
|
|
|
|
size_opt <<= huge_page_shift(h);
|
|
|
|
size_opt *= h->max_huge_pages;
|
|
|
|
do_div(size_opt, 100);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_opt >>= huge_page_shift(h);
|
|
|
|
return size_opt;
|
|
|
|
}
|
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
/*
|
|
|
|
* Parse one mount parameter.
|
|
|
|
*/
|
|
|
|
static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2018-11-01 23:07:26 +00:00
|
|
|
struct hugetlbfs_fs_context *ctx = fc->fs_private;
|
|
|
|
struct fs_parse_result result;
|
|
|
|
char *rest;
|
|
|
|
unsigned long ps;
|
|
|
|
int opt;
|
|
|
|
|
2019-09-07 07:23:15 -04:00
|
|
|
opt = fs_parse(fc, hugetlb_fs_parameters, param, &result);
|
2018-11-01 23:07:26 +00:00
|
|
|
if (opt < 0)
|
|
|
|
return opt;
|
|
|
|
|
|
|
|
switch (opt) {
|
|
|
|
case Opt_uid:
|
|
|
|
ctx->uid = make_kuid(current_user_ns(), result.uint_32);
|
|
|
|
if (!uid_valid(ctx->uid))
|
|
|
|
goto bad_val;
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
case Opt_gid:
|
|
|
|
ctx->gid = make_kgid(current_user_ns(), result.uint_32);
|
|
|
|
if (!gid_valid(ctx->gid))
|
|
|
|
goto bad_val;
|
|
|
|
return 0;
|
2007-07-15 23:40:52 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
case Opt_mode:
|
|
|
|
ctx->mode = result.uint_32 & 01777U;
|
|
|
|
return 0;
|
2007-07-15 23:40:52 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
case Opt_size:
|
|
|
|
/* memparse() will accept a K/M/G without a digit */
|
|
|
|
if (!isdigit(param->string[0]))
|
|
|
|
goto bad_val;
|
|
|
|
ctx->max_size_opt = memparse(param->string, &rest);
|
|
|
|
ctx->max_val_type = SIZE_STD;
|
|
|
|
if (*rest == '%')
|
|
|
|
ctx->max_val_type = SIZE_PERCENT;
|
|
|
|
return 0;
|
2007-07-15 23:40:52 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
case Opt_nr_inodes:
|
|
|
|
/* memparse() will accept a K/M/G without a digit */
|
|
|
|
if (!isdigit(param->string[0]))
|
|
|
|
goto bad_val;
|
|
|
|
ctx->nr_inodes = memparse(param->string, &rest);
|
|
|
|
return 0;
|
2007-07-15 23:40:52 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
case Opt_pagesize:
|
|
|
|
ps = memparse(param->string, &rest);
|
|
|
|
ctx->hstate = size_to_hstate(ps);
|
|
|
|
if (!ctx->hstate) {
|
2022-07-26 22:29:14 +08:00
|
|
|
pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
|
2018-11-01 23:07:26 +00:00
|
|
|
return -EINVAL;
|
2007-07-15 23:40:52 -07:00
|
|
|
}
|
2018-11-01 23:07:26 +00:00
|
|
|
return 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
case Opt_min_size:
|
|
|
|
/* memparse() will accept a K/M/G without a digit */
|
|
|
|
if (!isdigit(param->string[0]))
|
|
|
|
goto bad_val;
|
|
|
|
ctx->min_size_opt = memparse(param->string, &rest);
|
|
|
|
ctx->min_val_type = SIZE_STD;
|
|
|
|
if (*rest == '%')
|
|
|
|
ctx->min_val_type = SIZE_PERCENT;
|
|
|
|
return 0;
|
2007-07-15 23:40:52 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2008-07-23 21:27:43 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
bad_val:
|
2019-12-21 21:34:06 -05:00
|
|
|
return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
|
2018-11-01 23:07:26 +00:00
|
|
|
param->string, param->key);
|
|
|
|
}
|
2015-04-15 16:13:42 -07:00
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
/*
|
|
|
|
* Validate the parsed options.
|
|
|
|
*/
|
|
|
|
static int hugetlbfs_validate(struct fs_context *fc)
|
|
|
|
{
|
|
|
|
struct hugetlbfs_fs_context *ctx = fc->fs_private;
|
2008-07-23 21:27:43 -07:00
|
|
|
|
2015-04-15 16:13:42 -07:00
|
|
|
/*
|
|
|
|
* Use huge page pool size (in hstate) to convert the size
|
|
|
|
* options to number of huge pages. If NO_SIZE, -1 is returned.
|
|
|
|
*/
|
2018-11-01 23:07:26 +00:00
|
|
|
ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
|
|
|
|
ctx->max_size_opt,
|
|
|
|
ctx->max_val_type);
|
|
|
|
ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate,
|
|
|
|
ctx->min_size_opt,
|
|
|
|
ctx->min_val_type);
|
2015-04-15 16:13:42 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If max_size was specified, then min_size must be smaller
|
|
|
|
*/
|
2018-11-01 23:07:26 +00:00
|
|
|
if (ctx->max_val_type > NO_SIZE &&
|
|
|
|
ctx->min_hpages > ctx->max_hpages) {
|
|
|
|
pr_err("Minimum size can not be greater than maximum size\n");
|
2015-04-15 16:13:42 -07:00
|
|
|
return -EINVAL;
|
2008-07-23 21:27:43 -07:00
|
|
|
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2018-11-01 23:07:26 +00:00
|
|
|
hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2018-11-01 23:07:26 +00:00
|
|
|
struct hugetlbfs_fs_context *ctx = fc->fs_private;
|
2005-04-16 15:20:36 -07:00
|
|
|
struct hugetlbfs_sb_info *sbinfo;
|
|
|
|
|
|
|
|
sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
|
|
|
|
if (!sbinfo)
|
|
|
|
return -ENOMEM;
|
|
|
|
sb->s_fs_info = sbinfo;
|
|
|
|
spin_lock_init(&sbinfo->stat_lock);
|
2018-11-01 23:07:26 +00:00
|
|
|
sbinfo->hstate = ctx->hstate;
|
|
|
|
sbinfo->max_inodes = ctx->nr_inodes;
|
|
|
|
sbinfo->free_inodes = ctx->nr_inodes;
|
|
|
|
sbinfo->spool = NULL;
|
|
|
|
sbinfo->uid = ctx->uid;
|
|
|
|
sbinfo->gid = ctx->gid;
|
|
|
|
sbinfo->mode = ctx->mode;
|
2017-07-05 16:24:18 +01:00
|
|
|
|
2015-04-15 16:13:42 -07:00
|
|
|
/*
|
|
|
|
* Allocate and initialize subpool if maximum or minimum size is
|
2021-02-24 12:10:21 -08:00
|
|
|
* specified. Any needed reservations (for minimum size) are taken
|
2022-07-26 22:29:17 +08:00
|
|
|
* when the subpool is created.
|
2015-04-15 16:13:42 -07:00
|
|
|
*/
|
2018-11-01 23:07:26 +00:00
|
|
|
if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
|
|
|
|
sbinfo->spool = hugepage_new_subpool(ctx->hstate,
|
|
|
|
ctx->max_hpages,
|
|
|
|
ctx->min_hpages);
|
hugepages: fix use after free bug in "quota" handling
hugetlbfs_{get,put}_quota() are badly named. They don't interact with the
general quota handling code, and they don't much resemble its behaviour.
Rather than being about maintaining limits on on-disk block usage by
particular users, they are instead about maintaining limits on in-memory
page usage (including anonymous MAP_PRIVATE copied-on-write pages)
associated with a particular hugetlbfs filesystem instance.
Worse, they work by having callbacks to the hugetlbfs filesystem code from
the low-level page handling code, in particular from free_huge_page().
This is a layering violation of itself, but more importantly, if the
kernel does a get_user_pages() on hugepages (which can happen from KVM
amongst others), then the free_huge_page() can be delayed until after the
associated inode has already been freed. If an unmount occurs at the
wrong time, even the hugetlbfs superblock where the "quota" limits are
stored may have been freed.
Andrew Barry proposed a patch to fix this by having hugepages, instead of
storing a pointer to their address_space and reaching the superblock from
there, had the hugepages store pointers directly to the superblock,
bumping the reference count as appropriate to avoid it being freed.
Andrew Morton rejected that version, however, on the grounds that it made
the existing layering violation worse.
This is a reworked version of Andrew's patch, which removes the extra, and
some of the existing, layering violation. It works by introducing the
concept of a hugepage "subpool" at the lower hugepage mm layer - that is a
finite logical pool of hugepages to allocate from. hugetlbfs now creates
a subpool for each filesystem instance with a page limit set, and a
pointer to the subpool gets added to each allocated hugepage, instead of
the address_space pointer used now. The subpool has its own lifetime and
is only freed once all pages in it _and_ all other references to it (i.e.
superblocks) are gone.
subpools are optional - a NULL subpool pointer is taken by the code to
mean that no subpool limits are in effect.
Previous discussion of this bug found in: "Fix refcounting in hugetlbfs
quota handling.". See: https://lkml.org/lkml/2011/8/11/28 or
http://marc.info/?l=linux-mm&m=126928970510627&w=1
v2: Fixed a bug spotted by Hillf Danton, and removed the extra parameter to
alloc_huge_page() - since it already takes the vma, it is not necessary.
Signed-off-by: Andrew Barry <abarry@cray.com>
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:34:12 -07:00
|
|
|
if (!sbinfo->spool)
|
|
|
|
goto out_free;
|
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
sb->s_maxbytes = MAX_LFS_FILESIZE;
|
2018-11-01 23:07:26 +00:00
|
|
|
sb->s_blocksize = huge_page_size(ctx->hstate);
|
|
|
|
sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
|
2005-04-16 15:20:36 -07:00
|
|
|
sb->s_magic = HUGETLBFS_MAGIC;
|
|
|
|
sb->s_op = &hugetlbfs_ops;
|
|
|
|
sb->s_time_gran = 1;
|
2020-08-11 18:31:35 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Due to the special and limited functionality of hugetlbfs, it does
|
|
|
|
* not work well as a stacking filesystem.
|
|
|
|
*/
|
|
|
|
sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
|
2018-11-01 23:07:26 +00:00
|
|
|
sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
|
2012-01-08 22:15:13 -05:00
|
|
|
if (!sb->s_root)
|
2005-04-16 15:20:36 -07:00
|
|
|
goto out_free;
|
|
|
|
return 0;
|
|
|
|
out_free:
|
2014-06-04 16:10:40 -07:00
|
|
|
kfree(sbinfo->spool);
|
2005-04-16 15:20:36 -07:00
|
|
|
kfree(sbinfo);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
static int hugetlbfs_get_tree(struct fs_context *fc)
|
|
|
|
{
|
|
|
|
int err = hugetlbfs_validate(fc);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2019-06-01 20:48:55 -04:00
|
|
|
return get_tree_nodev(fc, hugetlbfs_fill_super);
|
2018-11-01 23:07:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void hugetlbfs_fs_context_free(struct fs_context *fc)
|
|
|
|
{
|
|
|
|
kfree(fc->fs_private);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const struct fs_context_operations hugetlbfs_fs_context_ops = {
|
|
|
|
.free = hugetlbfs_fs_context_free,
|
|
|
|
.parse_param = hugetlbfs_parse_param,
|
|
|
|
.get_tree = hugetlbfs_get_tree,
|
|
|
|
};
|
|
|
|
|
|
|
|
static int hugetlbfs_init_fs_context(struct fs_context *fc)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2018-11-01 23:07:26 +00:00
|
|
|
struct hugetlbfs_fs_context *ctx;
|
|
|
|
|
|
|
|
ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL);
|
|
|
|
if (!ctx)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ctx->max_hpages = -1; /* No limit on size by default */
|
|
|
|
ctx->nr_inodes = -1; /* No limit on number of inodes by default */
|
|
|
|
ctx->uid = current_fsuid();
|
|
|
|
ctx->gid = current_fsgid();
|
|
|
|
ctx->mode = 0755;
|
|
|
|
ctx->hstate = &default_hstate;
|
|
|
|
ctx->min_hpages = -1; /* No default minimum size */
|
|
|
|
ctx->max_val_type = NO_SIZE;
|
|
|
|
ctx->min_val_type = NO_SIZE;
|
|
|
|
fc->fs_private = ctx;
|
|
|
|
fc->ops = &hugetlbfs_fs_context_ops;
|
|
|
|
return 0;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct file_system_type hugetlbfs_fs_type = {
|
2018-11-01 23:07:26 +00:00
|
|
|
.name = "hugetlbfs",
|
|
|
|
.init_fs_context = hugetlbfs_init_fs_context,
|
2019-09-07 07:23:15 -04:00
|
|
|
.parameters = hugetlb_fs_parameters,
|
2018-11-01 23:07:26 +00:00
|
|
|
.kill_sb = kill_litter_super,
|
2005-04-16 15:20:36 -07:00
|
|
|
};
|
|
|
|
|
2012-12-11 16:01:34 -08:00
|
|
|
/*
 * Internal mounts created by init_hugetlbfs_fs(), indexed by hstate
 * index.  A slot is NULL when no internal mount exists for that hstate;
 * hugetlb_file_setup() then fails with -ENOENT.
 */
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2009-09-23 15:56:05 -07:00
|
|
|
static int can_do_hugetlb_shm(void)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
2012-02-07 16:19:25 -08:00
|
|
|
kgid_t shm_group;
|
|
|
|
shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);
|
|
|
|
return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2012-12-11 16:01:34 -08:00
|
|
|
/*
 * Map @page_size_log to the index of the hstate that hstate_sizelog()
 * selects for it.  Returns -1 when there is no such hstate.
 */
static int get_hstate_idx(int page_size_log)
{
	struct hstate *h = hstate_sizelog(page_size_log);

	return h ? hstate_index(h) : -1;
}
|
|
|
|
|
2013-05-07 16:18:13 -07:00
|
|
|
/*
|
|
|
|
* Note that size should be aligned to proper hugepage size in caller side,
|
|
|
|
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
|
|
|
|
*/
|
|
|
|
struct file *hugetlb_file_setup(const char *name, size_t size,
|
2021-11-08 18:31:27 -08:00
|
|
|
vm_flags_t acctflag, int creat_flags,
|
|
|
|
int page_size_log)
|
2005-04-16 15:20:36 -07:00
|
|
|
{
|
|
|
|
struct inode *inode;
|
2018-06-09 09:50:46 -04:00
|
|
|
struct vfsmount *mnt;
|
2012-12-11 16:01:34 -08:00
|
|
|
int hstate_idx;
|
2018-06-09 09:50:46 -04:00
|
|
|
struct file *file;
|
2012-12-11 16:01:34 -08:00
|
|
|
|
|
|
|
hstate_idx = get_hstate_idx(page_size_log);
|
|
|
|
if (hstate_idx < 0)
|
|
|
|
return ERR_PTR(-ENODEV);
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2018-06-09 09:50:46 -04:00
|
|
|
mnt = hugetlbfs_vfsmount[hstate_idx];
|
|
|
|
if (!mnt)
|
2007-05-06 14:50:18 -07:00
|
|
|
return ERR_PTR(-ENOENT);
|
|
|
|
|
2009-09-23 15:56:05 -07:00
|
|
|
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
|
2021-11-08 18:31:27 -08:00
|
|
|
struct ucounts *ucounts = current_ucounts();
|
|
|
|
|
|
|
|
if (user_shm_lock(size, ucounts)) {
|
|
|
|
pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
|
2012-03-21 16:34:13 -07:00
|
|
|
current->comm, current->pid);
|
2021-11-08 18:31:27 -08:00
|
|
|
user_shm_unlock(size, ucounts);
|
2009-08-24 16:30:28 +01:00
|
|
|
}
|
2021-11-08 18:31:27 -08:00
|
|
|
return ERR_PTR(-EPERM);
|
2009-03-31 15:21:26 -07:00
|
|
|
}
|
2005-04-16 15:20:36 -07:00
|
|
|
|
2012-09-12 20:11:55 -07:00
|
|
|
file = ERR_PTR(-ENOSPC);
|
2018-06-09 09:50:46 -04:00
|
|
|
inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
|
2005-04-16 15:20:36 -07:00
|
|
|
if (!inode)
|
2018-06-09 09:50:46 -04:00
|
|
|
goto out;
|
2015-08-06 15:46:55 -07:00
|
|
|
if (creat_flags == HUGETLB_SHMFS_INODE)
|
|
|
|
inode->i_flags |= S_PRIVATE;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
|
|
|
inode->i_size = size;
|
2011-10-28 14:13:28 +02:00
|
|
|
clear_nlink(inode);
|
2007-10-16 23:31:13 -07:00
|
|
|
|
2021-02-24 12:09:54 -08:00
|
|
|
if (!hugetlb_reserve_pages(inode, 0,
|
2018-06-09 09:50:46 -04:00
|
|
|
size >> huge_page_shift(hstate_inode(inode)), NULL,
|
|
|
|
acctflag))
|
|
|
|
file = ERR_PTR(-ENOMEM);
|
|
|
|
else
|
|
|
|
file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
|
|
|
|
&hugetlbfs_file_operations);
|
|
|
|
if (!IS_ERR(file))
|
|
|
|
return file;
|
2005-04-16 15:20:36 -07:00
|
|
|
|
[PATCH] hugepage: Strict page reservation for hugepage inodes
These days, hugepages are demand-allocated at first fault time. There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.
A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations. In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available. In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.
The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated. MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL. MAP_PRIVATE mappings can still
trigger an OOM. (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)
This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.
This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).
Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-22 00:08:55 -08:00
|
|
|
iput(inode);
|
2018-06-09 09:50:46 -04:00
|
|
|
out:
|
2012-09-12 20:11:55 -07:00
|
|
|
return file;
|
2005-04-16 15:20:36 -07:00
|
|
|
}
|
|
|
|
|
2018-11-01 23:07:26 +00:00
|
|
|
/*
 * Create the internal (SB_KERNMOUNT) hugetlbfs mount for hstate @h.
 *
 * Returns the new vfsmount, or an ERR_PTR() if either the mount context
 * or the mount itself could not be created.  A failure is logged.
 */
static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
{
	struct fs_context *fc;
	struct vfsmount *mnt;

	fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
	if (IS_ERR(fc)) {
		mnt = ERR_CAST(fc);
	} else {
		struct hugetlbfs_fs_context *ctx = fc->fs_private;

		/* Override the default hstate chosen by init_fs_context. */
		ctx->hstate = h;
		mnt = fc_mount(fc);
		put_fs_context(fc);
	}

	/* Terminate the message with '\n' like every other log line here. */
	if (IS_ERR(mnt))
		pr_err("Cannot mount internal hugetlbfs for page size %luK\n",
		       huge_page_size(h) / SZ_1K);
	return mnt;
}
|
|
|
|
|
2005-04-16 15:20:36 -07:00
|
|
|
/*
 * Module initialisation: create the inode slab cache, register the
 * filesystem type and set up one internal mount per hstate.
 *
 * The mount for the default hstate is mandatory; a failure there undoes
 * the registration and the cache.  Mounts for the other hstates are
 * optional and simply leave a NULL slot in hugetlbfs_vfsmount[] on
 * failure.
 *
 * Returns 0 on success, -ENOTSUPP when huge pages are not supported,
 * -ENOMEM when the slab cache cannot be created, or the error from
 * register_filesystem() / the default mount.
 */
static int __init init_hugetlbfs_fs(void)
{
	struct vfsmount *mnt;
	struct hstate *h;
	int error;
	int i;

	if (!hugepages_supported()) {
		pr_info("disabling because there are no supported hugepage sizes\n");
		return -ENOTSUPP;
	}

	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
					sizeof(struct hugetlbfs_inode_info),
					0, SLAB_ACCOUNT, init_once);
	if (!hugetlbfs_inode_cachep)
		return -ENOMEM;

	error = register_filesystem(&hugetlbfs_fs_type);
	if (error)
		goto out_free;

	/* default hstate mount is required */
	mnt = mount_one_hugetlbfs(&default_hstate);
	if (IS_ERR(mnt)) {
		error = PTR_ERR(mnt);
		goto out_unreg;
	}
	hugetlbfs_vfsmount[default_hstate_idx] = mnt;

	/* other hstates are optional */
	i = 0;
	for_each_hstate(h) {
		/* The default hstate was already mounted above. */
		if (i != default_hstate_idx) {
			mnt = mount_one_hugetlbfs(h);
			hugetlbfs_vfsmount[i] = IS_ERR(mnt) ? NULL : mnt;
		}
		i++;
	}

	return 0;

out_unreg:
	(void)unregister_filesystem(&hugetlbfs_fs_type);
out_free:
	kmem_cache_destroy(hugetlbfs_inode_cachep);
	return error;
}
|
2016-01-14 15:21:52 -08:00
|
|
|
/* Hook init_hugetlbfs_fs() into kernel start-up through fs_initcall(). */
fs_initcall(init_hugetlbfs_fs)
|