mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
synced 2025-01-09 15:29:16 +00:00
c4608d1bf7
commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") incurred a regression in the stress-ng pthread benchmark [1]. This is because THPs are now much more likely to be allocated for pthread stack areas than before. A pthread's stack area is allocated by mmap without the VM_GROWSDOWN or VM_GROWSUP flag, so the kernel can't tell whether it is a stack area or not. The MAP_STACK flag is used to mark the stack area, but it is a no-op on Linux. Map MAP_STACK to VM_NOHUGEPAGE to prevent THPs from being allocated for such stack areas. With this change the stack area looks like: fffd18e10000-fffd19610000 rw-p 00000000 00:00 0 Size: 8192 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 12 kB Pss: 12 kB Pss_Dirty: 12 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 12 kB Referenced: 12 kB Anonymous: 12 kB KSM: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB FilePmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd wr mr mw me ac nh The "nh" flag is set. [1] https://lore.kernel.org/linux-mm/202312192310.56367035-oliver.sang@intel.com/ Link: https://lkml.kernel.org/r/20231221065943.2803551-2-shy828301@gmail.com Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries") Signed-off-by: Yang Shi <yang@os.amperecomputing.com> Reported-by: kernel test robot <oliver.sang@intel.com> Tested-by: Oliver Sang <oliver.sang@intel.com> Reviewed-by: Yin Fengwei <fengwei.yin@intel.com> Cc: Rik van Riel <riel@surriel.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Christopher Lameter <cl@linux.com> Cc: Huang, Ying <ying.huang@intel.com> Cc: <stable@vger.kerenl.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
199 lines
4.6 KiB
C
199 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_MMAN_H
|
|
#define _LINUX_MMAN_H
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/percpu_counter.h>
|
|
|
|
#include <linux/atomic.h>
|
|
#include <uapi/linux/mman.h>
|
|
|
|
/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 *
 * Any flag below that the headers included so far did not define is given
 * the value 0, so OR-ing it into a mask or AND-ing it with a flags word
 * contributes nothing.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_ABOVE4G
#define MAP_ABOVE4G 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif
|
|
|
|
/*
 * The historical set of flags that all mmap implementations implicitly
 * support when a ->mmap_validate() op is not provided in file_operations.
 *
 * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the
 * kernel.
 *
 * The mask is a single parenthesised expression; every line but the last
 * must end in a backslash so the whole OR chain stays inside the macro.
 */
#define LEGACY_MAP_MASK (MAP_SHARED \
		| MAP_PRIVATE \
		| MAP_FIXED \
		| MAP_ANONYMOUS \
		| MAP_DENYWRITE \
		| MAP_EXECUTABLE \
		| MAP_UNINITIALIZED \
		| MAP_GROWSDOWN \
		| MAP_LOCKED \
		| MAP_NORESERVE \
		| MAP_POPULATE \
		| MAP_NONBLOCK \
		| MAP_STACK \
		| MAP_HUGETLB \
		| MAP_32BIT \
		| MAP_ABOVE4G \
		| MAP_HUGE_2MB \
		| MAP_HUGE_1GB)
|
|
|
|
/*
 * Overcommit state shared with the rest of the kernel.  These are
 * declarations only; the objects are defined outside this header.
 * vm_committed_as is the counter updated by vm_acct_memory() below.
 */
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
extern struct percpu_counter vm_committed_as;
|
|
|
|
/*
 * Batch size handed to percpu_counter_add_batch() for vm_committed_as.
 *
 * With CONFIG_SMP the batch is a real variable and mm_compute_batch() is
 * implemented outside this header.  Without it the batch is the constant 0
 * and mm_compute_batch() is an empty stub, so callers need no #ifdef.
 */
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif
|
|
|
|
/*
 * Prototype only; the definition is not in this header.
 * NOTE(review): presumably reports the committed-memory total tracked in
 * vm_committed_as -- confirm against the definition.
 */
unsigned long vm_memory_committed(void);
|
|
|
|
static inline void vm_acct_memory(long pages)
|
|
{
|
|
percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
|
|
}
|
|
|
|
/*
 * vm_unacct_memory - inverse of vm_acct_memory()
 * @pages: amount to subtract from vm_committed_as
 *
 * Implemented by handing the negated count to vm_acct_memory().
 */
static inline void vm_unacct_memory(long pages)
{
	vm_acct_memory(-pages);
}
|
|
|
|
/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 *
 * When an architecture provides no override, the defaults below expand to
 * 0, so they add no bits when OR-ed into the result of
 * calc_vm_prot_bits() / calc_vm_flag_bits().
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(flags) 0
#endif
|
|
|
|
#ifndef arch_validate_prot
|
|
/*
|
|
* This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have
|
|
* already been masked out.
|
|
*
|
|
* Returns true if the prot flags are valid
|
|
*/
|
|
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
|
|
{
|
|
return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0;
|
|
}
|
|
#define arch_validate_prot arch_validate_prot
|
|
#endif
|
|
|
|
#ifndef arch_validate_flags
/*
 * This is called from mmap() and mprotect() with the updated vma->vm_flags.
 *
 * @flags: VM_* flags to check; this default implementation accepts any value
 *
 * Returns true if the VM_* flags are valid.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
	return true;
}
/* Self-define so later "#ifndef arch_validate_flags" checks see it. */
#define arch_validate_flags arch_validate_flags
#endif
|
|
|
|
/*
 * Optimisation macro.  It is equivalent to:
 *      (x & bit1) ? bit2 : 0
 * but this version is faster.
 * ("bit1" and "bit2" must be single bits)
 *
 * If either bit is 0 the result is 0.  Otherwise the masked bit is moved
 * from bit1's position to bit2's by multiplying (bit1 <= bit2) or dividing
 * (bit1 > bit2) by the ratio of the two bits.
 */
#define _calc_vm_trans(x, bit1, bit2) \
  ((!(bit1) || !(bit2)) ? 0 : \
  ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
   : ((x) & (bit1)) / ((bit1) / (bit2))))
|
|
|
|
/*
|
|
* Combine the mmap "prot" argument into "vm_flags" used internally.
|
|
*/
|
|
static inline unsigned long
|
|
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
|
|
{
|
|
return _calc_vm_trans(prot, PROT_READ, VM_READ ) |
|
|
_calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
|
|
_calc_vm_trans(prot, PROT_EXEC, VM_EXEC) |
|
|
arch_calc_vm_prot_bits(prot, pkey);
|
|
}
|
|
|
|
/*
|
|
* Combine the mmap "flags" argument into "vm_flags" used internally.
|
|
*/
|
|
static inline unsigned long
|
|
calc_vm_flag_bits(unsigned long flags)
|
|
{
|
|
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
|
|
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
|
|
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
|
|
_calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
|
|
arch_calc_vm_flag_bits(flags);
|
|
}
|
|
|
|
/*
 * Prototype only; the definition is not in this header.
 * NOTE(review): presumably returns the overcommit limit derived from the
 * sysctl_overcommit_* settings -- confirm against the definition.
 */
unsigned long vm_commit_limit(void);
|
|
|
|
/*
|
|
* Denies creating a writable executable mapping or gaining executable permissions.
|
|
*
|
|
* This denies the following:
|
|
*
|
|
* a) mmap(PROT_WRITE | PROT_EXEC)
|
|
*
|
|
* b) mmap(PROT_WRITE)
|
|
* mprotect(PROT_EXEC)
|
|
*
|
|
* c) mmap(PROT_WRITE)
|
|
* mprotect(PROT_READ)
|
|
* mprotect(PROT_EXEC)
|
|
*
|
|
* But allows the following:
|
|
*
|
|
* d) mmap(PROT_READ | PROT_EXEC)
|
|
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
|
|
*/
|
|
static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
|
|
{
|
|
if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
|
|
return false;
|
|
|
|
if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
|
|
return true;
|
|
|
|
if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
#endif /* _LINUX_MMAN_H */
|