x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels

_PAGE_NUMA is currently an alias of _PAGE_PROTNONE to trap NUMA hinting
faults on x86.  Care is taken such that _PAGE_NUMA is used only in
situations where the VMA flags distinguish between NUMA hinting faults
and prot_none faults.  This decision was x86-specific and is conceptually
difficult, requiring special casing to distinguish between PROTNONE
and NUMA ptes based on context.

Fundamentally, we only need the _PAGE_NUMA bit to tell the difference
between an entry that is really unmapped and a page that is protected
for NUMA hinting faults, because a fault will be trapped whenever the
PTE is not present.

Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset.
This patch shrinks the maximum possible swap size (from 16TB to 8TB) and
uses the freed bit to uniquely distinguish between NUMA hinting ptes and
swap ptes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Noonan <steven@uplinklabs.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Mel Gorman 2014-06-04 16:06:30 -07:00 committed by Linus Torvalds
parent 4468dd76f5
commit c46a7c817e
8 changed files with 76 additions and 50 deletions

View File

@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte)
return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA); return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
} }
#define pte_present_nonuma pte_present_nonuma
static inline int pte_present_nonuma(pte_t pte)
{
return pte_val(pte) & (_PAGE_PRESENT);
}
#define pte_numa pte_numa #define pte_numa pte_numa
static inline int pte_numa(pte_t pte) static inline int pte_numa(pte_t pte)
{ {

View File

@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
static inline int pte_special(pte_t pte) static inline int pte_special(pte_t pte)
{ {
return pte_flags(pte) & _PAGE_SPECIAL; return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
(_PAGE_PRESENT|_PAGE_SPECIAL);
} }
static inline unsigned long pte_pfn(pte_t pte) static inline unsigned long pte_pfn(pte_t pte)
@ -452,6 +453,12 @@ static inline int pte_present(pte_t a)
_PAGE_NUMA); _PAGE_NUMA);
} }
#define pte_present_nonuma pte_present_nonuma
static inline int pte_present_nonuma(pte_t a)
{
return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}
#define pte_accessible pte_accessible #define pte_accessible pte_accessible
static inline bool pte_accessible(struct mm_struct *mm, pte_t a) static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
{ {
@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
static inline pte_t pte_swp_mksoft_dirty(pte_t pte) static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{ {
VM_BUG_ON(pte_present(pte)); VM_BUG_ON(pte_present_nonuma(pte));
return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
} }
static inline int pte_swp_soft_dirty(pte_t pte) static inline int pte_swp_soft_dirty(pte_t pte)
{ {
VM_BUG_ON(pte_present(pte)); VM_BUG_ON(pte_present_nonuma(pte));
return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
} }
static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{ {
VM_BUG_ON(pte_present(pte)); VM_BUG_ON(pte_present_nonuma(pte));
return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
} }

View File

@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
/* Encode and de-code a swap entry */ /* Encode and de-code a swap entry */
#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #ifdef CONFIG_NUMA_BALANCING
/* Automatic NUMA balancing needs to be distinguishable from swap entries */
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
#else #else
#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
#endif
#else
#ifdef CONFIG_NUMA_BALANCING
#error Incompatible format for automatic NUMA balancing
#endif
#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
#endif #endif

View File

@ -16,15 +16,26 @@
#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
#define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ #define _PAGE_BIT_SOFTW1 9 /* available for programmer */
#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ #define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/*
* Swap offsets on configurations that allow automatic NUMA balancing use the
* bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
* swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
* maximum possible swap space from 16TB to 8TB.
*/
#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
/* If _PAGE_BIT_PRESENT is clear, we use these: */ /* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */ /* - if the user mapped it with PROT_NONE; pte_present gives true */
#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@ -40,7 +51,7 @@
#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@ -61,14 +72,27 @@
* they do not conflict with each other. * they do not conflict with each other.
*/ */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
#ifdef CONFIG_MEM_SOFT_DIRTY #ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else #else
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
#endif #endif
/*
* _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
* that is not present. The hinting fault gathers numa placement statistics
* (see pte_numa()). The bit is always zero when the PTE is not present.
*
* The bit picked must be always zero when the pmd is present and not
* present, so that we don't lose information when we set it while
* atomically clearing the present bit.
*/
#ifdef CONFIG_NUMA_BALANCING
#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
#else
#define _PAGE_NUMA (_AT(pteval_t, 0))
#endif
/* /*
* Tracking soft dirty bit when a page goes to a swap is tricky. * Tracking soft dirty bit when a page goes to a swap is tricky.
* We need a bit which can be stored in pte _and_ not conflict * We need a bit which can be stored in pte _and_ not conflict
@ -94,26 +118,6 @@
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
/*
* _PAGE_NUMA indicates that this page will trigger a numa hinting
* minor page fault to gather numa placement statistics (see
* pte_numa()). The bit picked (8) is within the range between
* _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
* require changes to the swp entry format because that bit is always
* zero when the pte is not present.
*
* The bit picked must be always zero when the pmd is present and not
* present, so that we don't lose information when we set it while
* atomically clearing the present bit.
*
* Because we shared the same bit (8) with _PAGE_PROTNONE this can be
* interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
* couldn't reach, like handle_mm_fault() (see access_error in
* arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
* handle_mm_fault() to be invoked).
*/
#define _PAGE_NUMA _PAGE_PROTNONE
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY) _PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
@ -122,8 +126,8 @@
/* Set of bits not changed in pte_modify */ /* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
_PAGE_SOFT_DIRTY) _PAGE_SOFT_DIRTY | _PAGE_NUMA)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0) #define _PAGE_CACHE_WB (0)

View File

@ -35,7 +35,7 @@ enum {
static int pte_testbit(pte_t pte) static int pte_testbit(pte_t pte)
{ {
return pte_flags(pte) & _PAGE_UNUSED1; return pte_flags(pte) & _PAGE_SOFTW1;
} }
struct split_state { struct split_state {

View File

@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
# define pte_accessible(mm, pte) ((void)(pte), 1) # define pte_accessible(mm, pte) ((void)(pte), 1)
#endif #endif
#ifndef pte_present_nonuma
#define pte_present_nonuma(pte) pte_present(pte)
#endif
#ifndef flush_tlb_fix_spurious_fault #ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif #endif
@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
static inline int pte_numa(pte_t pte) static inline int pte_numa(pte_t pte)
{ {
return (pte_flags(pte) & return (pte_flags(pte) &
(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
} }
#endif #endif
@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte)
static inline int pmd_numa(pmd_t pmd) static inline int pmd_numa(pmd_t pmd)
{ {
return (pmd_flags(pmd) & return (pmd_flags(pmd) &
(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA;
} }
#endif #endif

View File

@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
/* check whether a pte points to a swap entry */ /* check whether a pte points to a swap entry */
static inline int is_swap_pte(pte_t pte) static inline int is_swap_pte(pte_t pte)
{ {
return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte);
} }
#endif #endif

View File

@ -756,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pte_pfn(pte); unsigned long pfn = pte_pfn(pte);
if (HAVE_PTE_SPECIAL) { if (HAVE_PTE_SPECIAL) {
if (likely(!pte_special(pte))) if (likely(!pte_special(pte) || pte_numa(pte)))
goto check_pfn; goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL; return NULL;
@ -782,14 +782,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
} }
} }
if (is_zero_pfn(pfn))
return NULL;
check_pfn: check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) { if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL); print_bad_pte(vma, addr, pte, NULL);
return NULL; return NULL;
} }
if (is_zero_pfn(pfn))
return NULL;
/* /*
* NOTE! We still have PageReserved() pages in the page tables. * NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist. * eg. VDSO mappings can cause them to exist.
@ -1722,13 +1723,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/* /*
* If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault * If FOLL_FORCE is set then do not force a full fault as the hinting
* would be called on PROT_NONE ranges. We must never invoke * fault information is unrelated to the reference behaviour of a task
* handle_mm_fault on PROT_NONE ranges or the NUMA hinting * using the address space
* page faults would unprotect the PROT_NONE ranges if
* _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
* bitflag. So to avoid that, don't set FOLL_NUMA if
* FOLL_FORCE is set.
*/ */
if (!(gup_flags & FOLL_FORCE)) if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA; gup_flags |= FOLL_NUMA;