New Feature:

* Randomize the per-cpu entry areas
 Cleanups:
 * Have CR3_ADDR_MASK use PHYSICAL_PAGE_MASK instead of open
   coding it
 * Move to "native" set_memory_rox() helper
 * Clean up pmd_get_atomic() and i386-PAE
 * Remove some unused page table size macros
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEV76QKkVc4xCGURexaDWVMHDJkrAFAmOc53UACgkQaDWVMHDJ
 krCUHw//SGZ+La0hLZLAiAiZTXLZZHpYkOmg1Oj1+11qSU11uZzTFqDpauhaKpRS
 cJCSh+D+RXe5e2ipgt0+Zl0hESLt7pJf8258OE4ra0DL/IlyO9uqruAs9Kn3eRS/
 Fk76nG8gdEU+JKJqpG02GqOLslYQuIy96n9hpuj1x25b614+uezPfC7S4XEat0NT
 MbJQ+jnVDf16aJIJkzT+iSwhubDVeh+bSHeO0SSCzX23WLUqDeg5NvlyxoCHGbBh
 UpUTWggV/0pYAkBKRHToeJs8qTWREwuuH/8JGewpe9A0tjdB5wyZfNL2PuracweN
 9MauXC3T5f0+Ca4yIIaPq1fF7Ny/PR2dBFihk27rOD0N7tjaZxNwal2pB1sZcmvZ
 +PAokjyTPVH5ZXjkMYGGAUe1jyjwr2+TgFSZxhTnDuGtyVQiY4pihGKOifLCX6tv
 x6khvYeTBw7wfaDRtKEAf+2kLHYn+71HszHP/8bNKX9T03h+Zf0i1wdZu5xbM5Gc
 VK2wR7bCC+UftJJYG0pldcHg2qaF19RBHK2tLwp7zngUv7lTbkKfkgKjre73KV2a
 D4b76lrqdUMo6UYwYdw7WtDyarZS4OVLq2DcNhwwMddBCaX8kyN5a4AqwQlZYJ0u
 dM+kuMofE8U3yMxmMhJimkZUsj09yLHIqfynY0jbAcU3nhKZZNY=
 =wwVF
 -----END PGP SIGNATURE-----

Merge tag 'x86_mm_for_6.2_v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Dave Hansen:
 "New Feature:

   - Randomize the per-cpu entry areas

  Cleanups:

   - Have CR3_ADDR_MASK use PHYSICAL_PAGE_MASK instead of open coding it

   - Move to "native" set_memory_rox() helper

   - Clean up pmd_get_atomic() and i386-PAE

   - Remove some unused page table size macros"

* tag 'x86_mm_for_6.2_v2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (35 commits)
  x86/mm: Ensure forced page table splitting
  x86/kasan: Populate shadow for shared chunk of the CPU entry area
  x86/kasan: Add helpers to align shadow addresses up and down
  x86/kasan: Rename local CPU_ENTRY_AREA variables to shorten names
  x86/mm: Populate KASAN shadow for entire per-CPU range of CPU entry area
  x86/mm: Recompute physical address for every page of per-CPU CEA mapping
  x86/mm: Rename __change_page_attr_set_clr(.checkalias)
  x86/mm: Inhibit _PAGE_NX changes from cpa_process_alias()
  x86/mm: Untangle __change_page_attr_set_clr(.checkalias)
  x86/mm: Add a few comments
  x86/mm: Fix CR3_ADDR_MASK
  x86/mm: Remove P*D_PAGE_MASK and P*D_PAGE_SIZE macros
  mm: Convert __HAVE_ARCH_P..P_GET to the new style
  mm: Remove pointless barrier() after pmdp_get_lockless()
  x86/mm/pae: Get rid of set_64bit()
  x86_64: Remove pointless set_64bit() usage
  x86/mm/pae: Be consistent with pXXp_get_and_clear()
  x86/mm/pae: Use WRITE_ONCE()
  x86/mm/pae: Don't (ab)use atomic64
  mm/gup: Fix the lockless PMD access
  ...
This commit is contained in:
Linus Torvalds 2022-12-17 14:06:53 -06:00
commit 4f292c4de4
55 changed files with 358 additions and 397 deletions

View File

@ -10,11 +10,11 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/set_memory.h>
#include <asm/fncpy.h>
#include <asm/tlb.h>
#include <asm/cacheflush.h>
#include <asm/set_memory.h>
#include <asm/mach/map.h>
@ -74,8 +74,7 @@ void *omap_sram_push(void *funcp, unsigned long size)
dst = fncpy(sram, funcp, size);
set_memory_ro(base, pages);
set_memory_x(base, pages);
set_memory_rox(base, pages);
return dst;
}
@ -126,8 +125,7 @@ static void __init omap_detect_and_map_sram(void)
base = (unsigned long)omap_sram_base;
pages = PAGE_ALIGN(omap_sram_size) / PAGE_SIZE;
set_memory_ro(base, pages);
set_memory_x(base, pages);
set_memory_rox(base, pages);
}
static void (*_omap_sram_reprogram_clock)(u32 dpllctl, u32 ckctl);

View File

@ -14,11 +14,11 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/set_memory.h>
#include <asm/fncpy.h>
#include <asm/tlb.h>
#include <asm/cacheflush.h>
#include <asm/set_memory.h>
#include <asm/mach/map.h>
@ -96,8 +96,7 @@ void *omap_sram_push(void *funcp, unsigned long size)
dst = fncpy(sram, funcp, size);
set_memory_ro(base, pages);
set_memory_x(base, pages);
set_memory_rox(base, pages);
return dst;
}
@ -217,8 +216,7 @@ static void __init omap2_map_sram(void)
base = (unsigned long)omap_sram_base;
pages = PAGE_ALIGN(omap_sram_size) / PAGE_SIZE;
set_memory_ro(base, pages);
set_memory_x(base, pages);
set_memory_rox(base, pages);
}
static void (*_omap2_sram_ddr_init)(u32 *slow_dll_ctrl, u32 fast_dll_ctrl,

View File

@ -46,7 +46,7 @@ config MIPS
select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC
select GENERIC_SMP_IDLE_THREAD
select GENERIC_TIME_VSYSCALL
select GUP_GET_PTE_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT
select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT
select HAVE_ARCH_COMPILER_H
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KGDB if MIPS_FP_SUPPORT

View File

@ -263,7 +263,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p
}
#ifdef CONFIG_PPC_16K_PAGES
#define __HAVE_ARCH_PTEP_GET
#define ptep_get ptep_get
static inline pte_t ptep_get(pte_t *ptep)
{
pte_basic_t val = READ_ONCE(ptep->pte);

View File

@ -20,12 +20,12 @@
#include <linux/kdebug.h>
#include <linux/slab.h>
#include <linux/moduleloader.h>
#include <linux/set_memory.h>
#include <asm/code-patching.h>
#include <asm/cacheflush.h>
#include <asm/sstep.h>
#include <asm/sections.h>
#include <asm/inst.h>
#include <asm/set_memory.h>
#include <linux/uaccess.h>
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@ -134,10 +134,9 @@ void *alloc_insn_page(void)
if (!page)
return NULL;
if (strict_module_rwx_enabled()) {
set_memory_ro((unsigned long)page, 1);
set_memory_x((unsigned long)page, 1);
}
if (strict_module_rwx_enabled())
set_memory_rox((unsigned long)page, 1);
return page;
}

View File

@ -24,7 +24,7 @@ config SUPERH
select GENERIC_PCI_IOMAP if PCI
select GENERIC_SCHED_CLOCK
select GENERIC_SMP_IDLE_THREAD
select GUP_GET_PTE_LOW_HIGH if X2TLB
select GUP_GET_PXX_LOW_HIGH if X2TLB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_KGDB
select HAVE_ARCH_SECCOMP_FILTER

View File

@ -28,9 +28,15 @@
#define pmd_ERROR(e) \
printk("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))
typedef struct { unsigned long long pmd; } pmd_t;
/*
 * A PMD entry, viewable either as one 64-bit value (pmd) or as its two
 * words (pmd_low / pmd_high).
 *
 * This has to be a union: the words must alias the 64-bit value so that a
 * store through one view is visible through the other, and so that the
 * type does not grow beyond the storage of the entry itself.  pmd_val()
 * and __pmd() below only ever touch the .pmd member.
 */
typedef union {
	struct {
		unsigned long pmd_low;
		unsigned long pmd_high;
	};
	unsigned long long pmd;
} pmd_t;
#define pmd_val(x) ((x).pmd)
#define __pmd(x) ((pmd_t) { (x) } )
#define __pmd(x) ((pmd_t) { .pmd = (x) } )
static inline pmd_t *pud_pgtable(pud_t pud)
{

View File

@ -58,11 +58,7 @@
#define pud_populate(mm, pud, pmd) \
set_pud(pud, __pud(_PAGE_TABLE + __pa(pmd)))
#ifdef CONFIG_64BIT
#define set_pud(pudptr, pudval) set_64bit((u64 *) (pudptr), pud_val(pudval))
#else
#define set_pud(pudptr, pudval) (*(pudptr) = (pudval))
#endif
static inline int pgd_newpage(pgd_t pgd)
{
@ -71,11 +67,7 @@ static inline int pgd_newpage(pgd_t pgd)
static inline void pgd_mkuptodate(pgd_t pgd) { pgd_val(pgd) &= ~_PAGE_NEWPAGE; }
#ifdef CONFIG_64BIT
#define set_pmd(pmdptr, pmdval) set_64bit((u64 *) (pmdptr), pmd_val(pmdval))
#else
#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
#endif
static inline void pud_clear (pud_t *pud)
{

View File

@ -159,7 +159,7 @@ config X86
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
select GENERIC_VDSO_TIME_NS
select GUP_GET_PTE_LOW_HIGH if X86_PAE
select GUP_GET_PXX_LOW_HIGH if X86_PAE
select HARDIRQS_SW_RESEND
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
select HAVE_ACPI_APEI if ACPI

View File

@ -7,34 +7,6 @@
* you need to test for the feature in boot_cpu_data.
*/
/*
* CMPXCHG8B only writes to the target if we had the previous
* value in registers, otherwise it acts as a read and gives us the
* "new previous" value. That is why there is a loop. Preloading
* EDX:EAX is a performance optimization: in the common case it means
* we need only one locked operation.
*
* A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very
* least an FPU save and/or %cr0.ts manipulation.
*
* cmpxchg8b must be used with the lock prefix here to allow the
* instruction to be executed atomically. We need to have the reader
* side to see the coherent 64bit value.
*/
static inline void set_64bit(volatile u64 *ptr, u64 value)
{
u32 low = value;
u32 high = value >> 32;
u64 prev = *ptr;
asm volatile("\n1:\t"
LOCK_PREFIX "cmpxchg8b %0\n\t"
"jnz 1b"
: "=m" (*ptr), "+A" (prev)
: "b" (low), "c" (high)
: "memory");
}
#ifdef CONFIG_X86_CMPXCHG64
#define arch_cmpxchg64(ptr, o, n) \
((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \

View File

@ -2,11 +2,6 @@
#ifndef _ASM_X86_CMPXCHG_64_H
#define _ASM_X86_CMPXCHG_64_H
static inline void set_64bit(volatile u64 *ptr, u64 val)
{
*ptr = val;
}
#define arch_cmpxchg64(ptr, o, n) \
({ \
BUILD_BUG_ON(sizeof(*(ptr)) != 8); \

View File

@ -130,10 +130,6 @@ struct cpu_entry_area {
};
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
/* Total size includes the readonly IDT mapping page as well: */
#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)
DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);

View File

@ -28,9 +28,12 @@
#ifdef CONFIG_KASAN
void __init kasan_early_init(void);
void __init kasan_init(void);
void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid);
#else
static inline void kasan_early_init(void) { }
static inline void kasan_init(void) { }
static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size,
int nid) { }
#endif
#endif

View File

@ -11,20 +11,14 @@
#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
/* Cast *PAGE_MASK to a signed type so that it is sign-extended if
/* Cast P*D_MASK to a signed type so that it is sign-extended if
virtual addresses are 32-bits but physical addresses are larger
(ie, 32-bit PAE). */
#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_PAGE_MASK) & __PHYSICAL_MASK)
#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_PAGE_MASK) & __PHYSICAL_MASK)
#define PHYSICAL_PMD_PAGE_MASK (((signed long)PMD_MASK) & __PHYSICAL_MASK)
#define PHYSICAL_PUD_PAGE_MASK (((signed long)PUD_MASK) & __PHYSICAL_MASK)
#define HPAGE_SHIFT PMD_SHIFT
#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)

View File

@ -2,8 +2,6 @@
#ifndef _ASM_X86_PGTABLE_3LEVEL_H
#define _ASM_X86_PGTABLE_3LEVEL_H
#include <asm/atomic64_32.h>
/*
* Intel Physical Address Extension (PAE) Mode - three-level page
* tables on PPro+ CPUs.
@ -21,7 +19,15 @@
pr_err("%s:%d: bad pgd %p(%016Lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
/* Rules for using set_pte: the pte being assigned *must* be
/*
 * pxx_xchg64() - store _val into the 64-bit page-table entry at _ptr and
 * evaluate to the entry's previous contents.
 *
 * _pxx selects the level by token pasting (it is used with pte, pmd and
 * pud): the pointer is viewed as a _pxx##val_t and the result is wrapped
 * with native_make_##_pxx().  The store is retried with try_cmpxchg64()
 * until it succeeds; _o carries the expected old value (presumably
 * refreshed by try_cmpxchg64() on failure - confirm against its
 * definition).
 */
#define pxx_xchg64(_pxx, _ptr, _val) ({ \
_pxx##val_t *_p = (_pxx##val_t *)_ptr; \
_pxx##val_t _o = *_p; \
do { } while (!try_cmpxchg64(_p, &_o, (_val))); \
native_make_##_pxx(_o); \
})
/*
* Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
* not possible, use pte_get_and_clear to obtain the old pte
@ -29,75 +35,19 @@
*/
static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
ptep->pte_high = pte.pte_high;
WRITE_ONCE(ptep->pte_high, pte.pte_high);
smp_wmb();
ptep->pte_low = pte.pte_low;
}
#define pmd_read_atomic pmd_read_atomic
/*
* pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
* a "*pmdp" dereference done by GCC. Problem is, in certain places
* where pte_offset_map_lock() is called, concurrent page faults are
* allowed, if the mmap_lock is held for reading. An example is mincore
* vs page faults vs MADV_DONTNEED. On the page fault side
* pmd_populate() rightfully does a set_64bit(), but if we're reading the
* pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
* because GCC will not read the 64-bit value of the pmd atomically.
*
* To fix this all places running pte_offset_map_lock() while holding the
* mmap_lock in read mode, shall read the pmdp pointer using this
* function to know if the pmd is null or not, and in turn to know if
* they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
* operations.
*
* Without THP if the mmap_lock is held for reading, the pmd can only
* transition from null to not null while pmd_read_atomic() runs. So
* we can always return atomic pmd values with this function.
*
* With THP if the mmap_lock is held for reading, the pmd can become
* trans_huge or none or point to a pte (and in turn become "stable")
* at any time under pmd_read_atomic(). We could read it truly
* atomically here with an atomic64_read() for the THP enabled case (and
* it would be a whole lot simpler), but to avoid using cmpxchg8b we
* only return an atomic pmdval if the low part of the pmdval is later
* found to be stable (i.e. pointing to a pte). We are also returning a
* 'none' (zero) pmdval if the low part of the pmd is zero.
*
* In some cases the high and low part of the pmdval returned may not be
* consistent if THP is enabled (the low part may point to previously
* mapped hugepage, while the high part may point to a more recently
* mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
* needs the low part of the pmd to be read atomically to decide if the
* pmd is unstable or not, with the only exception when the low part
* of the pmd is zero, in which case we return a 'none' pmd.
*/
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
pmdval_t ret;
u32 *tmp = (u32 *)pmdp;
ret = (pmdval_t) (*tmp);
if (ret) {
/*
* If the low part is null, we must not read the high part
* or we can end up with a partial pmd.
*/
smp_rmb();
ret |= ((pmdval_t)*(tmp + 1)) << 32;
}
return (pmd_t) { ret };
WRITE_ONCE(ptep->pte_low, pte.pte_low);
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
{
set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
pxx_xchg64(pte, ptep, native_pte_val(pte));
}
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
pxx_xchg64(pmd, pmdp, native_pmd_val(pmd));
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
@ -105,7 +55,7 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pud.p4d.pgd = pti_set_user_pgtbl(&pudp->p4d.pgd, pud.p4d.pgd);
#endif
set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
pxx_xchg64(pud, pudp, native_pud_val(pud));
}
/*
@ -116,17 +66,16 @@ static inline void native_set_pud(pud_t *pudp, pud_t pud)
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
ptep->pte_low = 0;
WRITE_ONCE(ptep->pte_low, 0);
smp_wmb();
ptep->pte_high = 0;
WRITE_ONCE(ptep->pte_high, 0);
}
static inline void native_pmd_clear(pmd_t *pmd)
static inline void native_pmd_clear(pmd_t *pmdp)
{
u32 *tmp = (u32 *)pmd;
*tmp = 0;
WRITE_ONCE(pmdp->pmd_low, 0);
smp_wmb();
*(tmp + 1) = 0;
WRITE_ONCE(pmdp->pmd_high, 0);
}
static inline void native_pud_clear(pud_t *pudp)
@ -149,41 +98,26 @@ static inline void pud_clear(pud_t *pudp)
*/
}
#ifdef CONFIG_SMP
static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
{
pte_t res;
return pxx_xchg64(pte, ptep, 0ULL);
}
res.pte = (pteval_t)arch_atomic64_xchg((atomic64_t *)ptep, 0);
static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
return pxx_xchg64(pmd, pmdp, 0ULL);
}
return res;
static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
{
return pxx_xchg64(pud, pudp, 0ULL);
}
#else
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
union split_pmd {
struct {
u32 pmd_low;
u32 pmd_high;
};
pmd_t pmd;
};
#ifdef CONFIG_SMP
static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
{
union split_pmd res, *orig = (union split_pmd *)pmdp;
/* xchg acts as a barrier before setting of the high bits */
res.pmd_low = xchg(&orig->pmd_low, 0);
res.pmd_high = orig->pmd_high;
orig->pmd_high = 0;
return res.pmd;
}
#else
#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
#ifndef pmdp_establish
@ -199,55 +133,18 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
* anybody.
*/
if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
union split_pmd old, new, *ptr;
ptr = (union split_pmd *)pmdp;
new.pmd = pmd;
/* xchg acts as a barrier before setting of the high bits */
old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
old.pmd_high = ptr->pmd_high;
ptr->pmd_high = new.pmd_high;
return old.pmd;
old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low);
old.pmd_high = READ_ONCE(pmdp->pmd_high);
WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high);
return old;
}
do {
old = *pmdp;
} while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd);
return old;
return pxx_xchg64(pmd, pmdp, pmd.pmd);
}
#endif
#ifdef CONFIG_SMP
union split_pud {
struct {
u32 pud_low;
u32 pud_high;
};
pud_t pud;
};
static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
{
union split_pud res, *orig = (union split_pud *)pudp;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pti_set_user_pgtbl(&pudp->p4d.pgd, __pgd(0));
#endif
/* xchg acts as a barrier before setting of the high bits */
res.pud_low = xchg(&orig->pud_low, 0);
res.pud_high = orig->pud_high;
orig->pud_high = 0;
return res.pud;
}
#else
#define native_pudp_get_and_clear(xp) native_local_pudp_get_and_clear(xp)
#endif
/* Encode and de-code a swap entry */
#define SWP_TYPE_BITS 5

View File

@ -18,6 +18,13 @@ typedef union {
};
pteval_t pte;
} pte_t;
/*
 * PMD entry as a union: the whole value as a pmdval_t (pmd), or its
 * pmd_low / pmd_high words for code that needs to access the two words
 * individually.
 */
typedef union {
struct {
unsigned long pmd_low, pmd_high;
};
pmdval_t pmd;
} pmd_t;
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI))

View File

@ -19,6 +19,7 @@ typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
typedef struct { pteval_t pte; } pte_t;
typedef struct { pmdval_t pmd; } pmd_t;
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled;

View File

@ -11,6 +11,12 @@
#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
#ifdef CONFIG_X86_32
#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \
(CPU_ENTRY_AREA_SIZE * NR_CPUS) - \
CPU_ENTRY_AREA_BASE)
#else
#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE
#endif
#endif /* _ASM_X86_PGTABLE_AREAS_H */

View File

@ -361,11 +361,9 @@ static inline pudval_t native_pud_val(pud_t pud)
#endif
#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
static inline pmd_t native_make_pmd(pmdval_t val)
{
return (pmd_t) { val };
return (pmd_t) { .pmd = val };
}
static inline pmdval_t native_pmd_val(pmd_t pmd)

View File

@ -35,7 +35,7 @@
*/
#ifdef CONFIG_X86_64
/* Mask off the address space ID and SME encryption bits. */
#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
#define CR3_ADDR_MASK __sme_clr(PHYSICAL_PAGE_MASK)
#define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63)

View File

@ -6,6 +6,9 @@
#include <asm/page.h>
#include <asm-generic/set_memory.h>
#define set_memory_rox set_memory_rox
int set_memory_rox(unsigned long addr, int numpages);
/*
* The set_memory_* API can be used to change various attributes of a virtual
* address range. The attributes include:

View File

@ -2142,11 +2142,6 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
{
struct text_poke_loc *tp;
if (unlikely(system_state == SYSTEM_BOOTING)) {
text_poke_early(addr, opcode, len);
return;
}
text_poke_flush(addr);
tp = &tp_vec[tp_vec_nr++];
@ -2168,11 +2163,6 @@ void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *
{
struct text_poke_loc tp;
if (unlikely(system_state == SYSTEM_BOOTING)) {
text_poke_early(addr, opcode, len);
return;
}
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}

View File

@ -504,7 +504,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
}
a = aper + iommu_size;
iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
iommu_size -= round_up(a, PMD_SIZE) - a;
if (iommu_size < 64*1024*1024) {
pr_warn("PCI-DMA: Warning: Small IOMMU %luMB."

View File

@ -24,10 +24,10 @@
#include <linux/module.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <trace/syscall.h>
#include <asm/set_memory.h>
#include <asm/kprobes.h>
#include <asm/ftrace.h>
#include <asm/nops.h>
@ -423,9 +423,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
if (likely(system_state != SYSTEM_BOOTING))
set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
set_memory_rox((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
tramp_free(trampoline);

View File

@ -203,7 +203,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
/* Is the address not 2M aligned? */
if (load_delta & ~PMD_PAGE_MASK)
if (load_delta & ~PMD_MASK)
for (;;);
/* Include the SME encryption mask in the fixup value */

View File

@ -266,7 +266,7 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
/* CPU entry area is always used for CPU entry */
if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
CPU_ENTRY_AREA_TOTAL_SIZE))
CPU_ENTRY_AREA_MAP_SIZE))
return true;
/*

View File

@ -43,6 +43,7 @@
#include <linux/objtool.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
#include <linux/set_memory.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@ -51,7 +52,6 @@
#include <asm/alternative.h>
#include <asm/insn.h>
#include <asm/debugreg.h>
#include <asm/set_memory.h>
#include <asm/ibt.h>
#include "common.h"
@ -414,17 +414,11 @@ void *alloc_insn_page(void)
if (!page)
return NULL;
/*
* First make the page read-only, and only then make it executable to
* prevent it from being W+X in between.
*/
set_memory_ro((unsigned long)page, 1);
/*
* TODO: Once additional kernel code protection mechanisms are set, ensure
* that the page was not maliciously altered and it is still zeroed.
*/
set_memory_x((unsigned long)page, 1);
set_memory_rox((unsigned long)page, 1);
return page;
}

View File

@ -9,22 +9,60 @@
#include <asm/cpu_entry_area.h>
#include <asm/fixmap.h>
#include <asm/desc.h>
#include <asm/kasan.h>
static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
#endif
#ifdef CONFIG_X86_32
static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset);
/* Look up the entry-area slot index that was recorded for @cpu. */
static __always_inline unsigned int cea_offset(unsigned int cpu)
{
	unsigned int slot = per_cpu(_cea_offset, cpu);

	return slot;
}
/*
 * Assign every possible CPU a randomly chosen slot index and record it in
 * that CPU's _cea_offset, redrawing whenever the candidate collides with
 * an index already seen during the scan.
 */
static __init void init_cea_offsets(void)
{
unsigned int max_cea;
unsigned int i, j;
/*
 * Number of CPU_ENTRY_AREA_SIZE sized slots that fit in the map once one
 * page has been set aside.
 */
max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE;
/* O(sodding terrible) */
for_each_possible_cpu(i) {
unsigned int cea;
again:
/* Draw a candidate slot; presumably in [0, max_cea) - confirm. */
cea = prandom_u32_max(max_cea);
/*
 * Compare the candidate against the offsets of the CPUs visited up to
 * and including i.  Note that CPU i's own (not yet assigned in this
 * pass) offset is compared as well, before the scan stops.
 */
for_each_possible_cpu(j) {
if (cea_offset(j) == cea)
goto again;
if (i == j)
break;
}
/* No collision found: commit the slot for CPU i. */
per_cpu(_cea_offset, i) = cea;
}
}
#else /* !X86_64 */
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
/* In this configuration the slot index is simply the CPU number. */
static __always_inline unsigned int cea_offset(unsigned int cpu)
{
return cpu;
}
/* Nothing to set up when offsets are derived directly from the CPU number. */
static inline void init_cea_offsets(void) { }
#endif
/* Is called from entry code, so must be noinstr */
noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
return (struct cpu_entry_area *) va;
@ -148,6 +186,9 @@ static void __init setup_cpu_entry_area(unsigned int cpu)
pgprot_t tss_prot = PAGE_KERNEL;
#endif
kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE,
early_cpu_to_node(cpu));
cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot);
cea_map_percpu_pages(&cea->entry_stack_page,
@ -201,7 +242,6 @@ static __init void setup_cpu_entry_area_ptes(void)
/* The +1 is for the readonly IDT: */
BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
start = CPU_ENTRY_AREA_BASE;
@ -217,6 +257,8 @@ void __init setup_cpu_entry_areas(void)
{
unsigned int cpu;
init_cea_offsets();
setup_cpu_entry_area_ptes();
for_each_possible_cpu(cpu)

View File

@ -801,7 +801,7 @@ void __init poking_init(void)
spinlock_t *ptl;
pte_t *ptep;
poking_mm = copy_init_mm();
poking_mm = mm_alloc();
BUG_ON(!poking_mm);
/*

View File

@ -316,10 +316,33 @@ void __init kasan_early_init(void)
kasan_map_early_shadow(init_top_pgt);
}
/* Map @va to its KASAN shadow address and round that down to a page boundary. */
static unsigned long kasan_mem_to_shadow_align_down(unsigned long va)
{
	void *shadow_ptr = kasan_mem_to_shadow((void *)va);
	unsigned long shadow_addr = (unsigned long)shadow_ptr;

	return round_down(shadow_addr, PAGE_SIZE);
}
/* Map @va to its KASAN shadow address and round that up to a page boundary. */
static unsigned long kasan_mem_to_shadow_align_up(unsigned long va)
{
	void *shadow_ptr = kasan_mem_to_shadow((void *)va);
	unsigned long shadow_addr = (unsigned long)shadow_ptr;

	return round_up(shadow_addr, PAGE_SIZE);
}
/*
 * Populate the KASAN shadow covering the @size bytes starting at @va.
 * The shadow range is widened to whole pages (start rounded down, end
 * rounded up) and populated with @nid passed through to
 * kasan_populate_shadow().
 */
void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
{
	const unsigned long first = (unsigned long)va;
	const unsigned long lo = kasan_mem_to_shadow_align_down(first);
	const unsigned long hi = kasan_mem_to_shadow_align_up(first + size);

	kasan_populate_shadow(lo, hi, nid);
}
void __init kasan_init(void)
{
unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end;
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
@ -360,16 +383,10 @@ void __init kasan_init(void)
map_range(&pfn_mapped[i]);
}
shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
shadow_cpu_entry_begin = (void *)round_down(
(unsigned long)shadow_cpu_entry_begin, PAGE_SIZE);
shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
CPU_ENTRY_AREA_MAP_SIZE);
shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
shadow_cpu_entry_end = (void *)round_up(
(unsigned long)shadow_cpu_entry_end, PAGE_SIZE);
shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE);
shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU);
shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE +
CPU_ENTRY_AREA_MAP_SIZE);
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
@ -391,12 +408,18 @@ void __init kasan_init(void)
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)VMALLOC_END + 1),
shadow_cpu_entry_begin);
(void *)shadow_cea_begin);
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
(unsigned long)shadow_cpu_entry_end, 0);
/*
* Populate the shadow for the shared portion of the CPU entry area.
* Shadows for the per-CPU areas are mapped on-demand, as each CPU's
* area is randomly placed somewhere in the 512GiB range and mapping
* the entire 512GiB range is prohibitively expensive.
*/
kasan_populate_shadow(shadow_cea_begin,
shadow_cea_per_cpu_begin, 0);
kasan_populate_early_shadow(shadow_cpu_entry_end,
kasan_populate_early_shadow((void *)shadow_cea_end,
kasan_mem_to_shadow((void *)__START_KERNEL_map));
kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),

View File

@ -26,7 +26,7 @@ SYM_FUNC_START(sme_encrypt_execute)
* RCX - virtual address of the encryption workarea, including:
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
* - intermediate copy buffer (PMD_PAGE_SIZE)
* - intermediate copy buffer (PMD_SIZE)
* R8 - physical address of the pagetables to use for encryption
*/
@ -123,7 +123,7 @@ SYM_FUNC_START(__enc_copy)
wbinvd /* Invalidate any cache entries */
/* Copy/encrypt up to 2MB at a time */
movq $PMD_PAGE_SIZE, %r12
movq $PMD_SIZE, %r12
1:
cmpq %r12, %r9
jnb 2f

View File

@ -93,7 +93,7 @@ struct sme_populate_pgd_data {
* section is 2MB aligned to allow for simple pagetable setup using only
* PMD entries (see vmlinux.lds.S).
*/
static char sme_workarea[2 * PMD_PAGE_SIZE] __section(".init.scratch");
static char sme_workarea[2 * PMD_SIZE] __section(".init.scratch");
static char sme_cmdline_arg[] __initdata = "mem_encrypt";
static char sme_cmdline_on[] __initdata = "on";
@ -198,8 +198,8 @@ static void __init __sme_map_range_pmd(struct sme_populate_pgd_data *ppd)
while (ppd->vaddr < ppd->vaddr_end) {
sme_populate_pgd_large(ppd);
ppd->vaddr += PMD_PAGE_SIZE;
ppd->paddr += PMD_PAGE_SIZE;
ppd->vaddr += PMD_SIZE;
ppd->paddr += PMD_SIZE;
}
}
@ -225,11 +225,11 @@ static void __init __sme_map_range(struct sme_populate_pgd_data *ppd,
vaddr_end = ppd->vaddr_end;
/* If start is not 2MB aligned, create PTE entries */
ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_PAGE_SIZE);
ppd->vaddr_end = ALIGN(ppd->vaddr, PMD_SIZE);
__sme_map_range_pte(ppd);
/* Create PMD entries */
ppd->vaddr_end = vaddr_end & PMD_PAGE_MASK;
ppd->vaddr_end = vaddr_end & PMD_MASK;
__sme_map_range_pmd(ppd);
/* If end is not 2MB aligned, create PTE entries */
@ -325,7 +325,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
/* Physical addresses gives us the identity mapped virtual addresses */
kernel_start = __pa_symbol(_text);
kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE);
kernel_end = ALIGN(__pa_symbol(_end), PMD_SIZE);
kernel_len = kernel_end - kernel_start;
initrd_start = 0;
@ -355,12 +355,12 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* executable encryption area size:
* stack page (PAGE_SIZE)
* encryption routine page (PAGE_SIZE)
* intermediate copy buffer (PMD_PAGE_SIZE)
* intermediate copy buffer (PMD_SIZE)
* pagetable structures for the encryption of the kernel
* pagetable structures for workarea (in case not currently mapped)
*/
execute_start = workarea_start;
execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE;
execute_end = execute_start + (PAGE_SIZE * 2) + PMD_SIZE;
execute_len = execute_end - execute_start;
/*
@ -383,7 +383,7 @@ void __init sme_encrypt_kernel(struct boot_params *bp)
* before it is mapped.
*/
workarea_len = execute_len + pgtable_area_len;
workarea_end = ALIGN(workarea_start + workarea_len, PMD_PAGE_SIZE);
workarea_end = ALIGN(workarea_start + workarea_len, PMD_SIZE);
/*
* Set the address to the start of where newly created pagetable

View File

@ -220,6 +220,23 @@ within_inclusive(unsigned long addr, unsigned long start, unsigned long end)
#ifdef CONFIG_X86_64
/*
* The kernel image is mapped into two places in the virtual address space
* (addresses without KASLR, of course):
*
* 1. The kernel direct map (0xffff880000000000)
* 2. The "high kernel map" (0xffffffff81000000)
*
* We actually execute out of #2. If we get the address of a kernel symbol, it
* points to #2, but almost all physical-to-virtual translations point to #1.
*
* This is so that we can have both a directmap of all physical memory *and*
* take full advantage of the limited (s32) immediate addressing range (2G)
* of x86_64.
*
* See Documentation/x86/x86_64/mm.rst for more detail.
*/
static inline unsigned long highmap_start_pfn(void)
{
return __pa_symbol(_text) >> PAGE_SHIFT;
@ -605,10 +622,6 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star
{
unsigned long end;
/* Kernel text is rw at boot up */
if (system_state == SYSTEM_BOOTING)
return new;
/*
* 32-bit has some unfixable W+X issues, like EFI code
* and writeable data being in the same page. Disable
@ -765,11 +778,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
switch (level) {
case PG_LEVEL_1G:
phys_addr = (phys_addr_t)pud_pfn(*(pud_t *)pte) << PAGE_SHIFT;
offset = virt_addr & ~PUD_PAGE_MASK;
offset = virt_addr & ~PUD_MASK;
break;
case PG_LEVEL_2M:
phys_addr = (phys_addr_t)pmd_pfn(*(pmd_t *)pte) << PAGE_SHIFT;
offset = virt_addr & ~PMD_PAGE_MASK;
offset = virt_addr & ~PMD_MASK;
break;
default:
phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@ -1059,7 +1072,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
case PG_LEVEL_1G:
ref_prot = pud_pgprot(*(pud_t *)kpte);
ref_pfn = pud_pfn(*(pud_t *)kpte);
pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
pfninc = PMD_SIZE >> PAGE_SHIFT;
lpaddr = address & PUD_MASK;
lpinc = PMD_SIZE;
/*
@ -1646,8 +1659,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
return err;
}
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary);
/*
* Check the directmap and "high kernel map" 'aliases'.
*/
static int cpa_process_alias(struct cpa_data *cpa)
{
struct cpa_data alias_cpa;
@ -1671,6 +1687,12 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
/* Directmap always has NX set, do not modify. */
if (__supported_pte_mask & _PAGE_NX) {
alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
}
cpa->force_flush_all = 1;
ret = __change_page_attr_set_clr(&alias_cpa, 0);
@ -1693,6 +1715,15 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
alias_cpa.curpage = 0;
/*
* [_text, _brk_end) also covers data, do not modify NX except
* in cases where the highmap is the primary target.
*/
if (__supported_pte_mask & _PAGE_NX) {
alias_cpa.mask_clr.pgprot &= ~_PAGE_NX;
alias_cpa.mask_set.pgprot &= ~_PAGE_NX;
}
cpa->force_flush_all = 1;
/*
* The high mapping range is imprecise, so ignore the
@ -1705,12 +1736,19 @@ static int cpa_process_alias(struct cpa_data *cpa)
return 0;
}
static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
static int __change_page_attr_set_clr(struct cpa_data *cpa, int primary)
{
unsigned long numpages = cpa->numpages;
unsigned long rempages = numpages;
int ret = 0;
/*
* No changes, easy!
*/
if (!(pgprot_val(cpa->mask_set) | pgprot_val(cpa->mask_clr)) &&
!cpa->force_split)
return ret;
while (rempages) {
/*
* Store the remaining nr of pages for the large page
@ -1723,13 +1761,13 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
ret = __change_page_attr(cpa, checkalias);
ret = __change_page_attr(cpa, primary);
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
if (ret)
goto out;
if (checkalias) {
if (primary && !(cpa->flags & CPA_NO_CHECK_ALIAS)) {
ret = cpa_process_alias(cpa);
if (ret)
goto out;
@ -1757,7 +1795,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
struct page **pages)
{
struct cpa_data cpa;
int ret, cache, checkalias;
int ret, cache;
memset(&cpa, 0, sizeof(cpa));
@ -1803,20 +1841,11 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cpa.numpages = numpages;
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
cpa.flags = 0;
cpa.flags = in_flag;
cpa.curpage = 0;
cpa.force_split = force_split;
if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa.flags |= in_flag;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
/* Has caller explicitly disabled alias checking? */
if (in_flag & CPA_NO_CHECK_ALIAS)
checkalias = 0;
ret = __change_page_attr_set_clr(&cpa, checkalias);
ret = __change_page_attr_set_clr(&cpa, 1);
/*
* Check whether we really changed something:
@ -2047,6 +2076,16 @@ int set_memory_ro(unsigned long addr, int numpages)
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
}
/*
 * Make @numpages pages starting at @addr read-only and executable with a
 * single change_page_attr_clear() call: _PAGE_RW is always cleared, and
 * _PAGE_NX is cleared too when __supported_pte_mask has that bit set.
 * Returns whatever change_page_attr_clear() returns.
 */
int set_memory_rox(unsigned long addr, int numpages)
{
	pgprot_t clr_mask = __pgprot(_PAGE_RW);

	/* Only ask for NX to be cleared if the bit is in the supported mask. */
	if ((__supported_pte_mask & _PAGE_NX) != 0)
		clr_mask.pgprot |= _PAGE_NX;

	return change_page_attr_clear(&addr, numpages, clr_mask, 0);
}
int set_memory_rw(unsigned long addr, int numpages)
{
return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
@ -2059,11 +2098,9 @@ int set_memory_np(unsigned long addr, int numpages)
int set_memory_np_noalias(unsigned long addr, int numpages)
{
int cpa_flags = CPA_NO_CHECK_ALIAS;
return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
__pgprot(_PAGE_PRESENT), 0,
cpa_flags, NULL);
CPA_NO_CHECK_ALIAS, NULL);
}
int set_memory_4k(unsigned long addr, int numpages)
@ -2280,7 +2317,7 @@ static int __set_pages_p(struct page *page, int numpages)
.numpages = numpages,
.mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.mask_clr = __pgprot(0),
.flags = 0};
.flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting present flag. Otherwise,
@ -2288,7 +2325,7 @@ static int __set_pages_p(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
return __change_page_attr_set_clr(&cpa, 0);
return __change_page_attr_set_clr(&cpa, 1);
}
static int __set_pages_np(struct page *page, int numpages)
@ -2299,7 +2336,7 @@ static int __set_pages_np(struct page *page, int numpages)
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.flags = 0};
.flags = CPA_NO_CHECK_ALIAS };
/*
* No alias checking needed for setting not present flag. Otherwise,
@ -2307,7 +2344,7 @@ static int __set_pages_np(struct page *page, int numpages)
* mappings (this adds to complexity if we want to do this from
* atomic context especially). Let's keep it simple!
*/
return __change_page_attr_set_clr(&cpa, 0);
return __change_page_attr_set_clr(&cpa, 1);
}
int set_direct_map_invalid_noflush(struct page *page)
@ -2378,7 +2415,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
.flags = 0,
.flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
@ -2391,7 +2428,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
retval = __change_page_attr_set_clr(&cpa, 0);
retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
out:
@ -2421,12 +2458,12 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
.numpages = numpages,
.mask_set = __pgprot(0),
.mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
.flags = 0,
.flags = CPA_NO_CHECK_ALIAS,
};
WARN_ONCE(num_online_cpus() > 1, "Don't call after initializing SMP");
retval = __change_page_attr_set_clr(&cpa, 0);
retval = __change_page_attr_set_clr(&cpa, 1);
__flush_tlb_all();
return retval;

View File

@ -592,7 +592,7 @@ static void pti_set_kernel_image_nonglobal(void)
* of the image.
*/
unsigned long start = PFN_ALIGN(_text);
unsigned long end = ALIGN((unsigned long)_end, PMD_PAGE_SIZE);
unsigned long end = ALIGN((unsigned long)_end, PMD_SIZE);
/*
* This clears _PAGE_GLOBAL from the entire kernel image.

View File

@ -174,7 +174,6 @@ static int modify_irte(struct irq_2_iommu *irq_iommu,
index = irq_iommu->irte_index + irq_iommu->sub_handle;
irte = &iommu->ir_table->base[index];
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE)
if ((irte->pst == 1) || (irte_modified->pst == 1)) {
bool ret;
@ -188,11 +187,9 @@ static int modify_irte(struct irq_2_iommu *irq_iommu,
* same as the old value.
*/
WARN_ON(!ret);
} else
#endif
{
set_64bit(&irte->low, irte_modified->low);
set_64bit(&irte->high, irte_modified->high);
} else {
WRITE_ONCE(irte->low, irte_modified->low);
WRITE_ONCE(irte->high, irte_modified->high);
}
__iommu_flush_cache(iommu, irte, sizeof(*irte));
@ -250,8 +247,8 @@ static int clear_entries(struct irq_2_iommu *irq_iommu)
end = start + (1 << irq_iommu->irte_mask);
for (entry = start; entry < end; entry++) {
set_64bit(&entry->low, 0);
set_64bit(&entry->high, 0);
WRITE_ONCE(entry->low, 0);
WRITE_ONCE(entry->high, 0);
}
bitmap_release_region(iommu->ir_table->bitmap, index,
irq_iommu->irte_mask);

View File

@ -10,9 +10,9 @@
#include <linux/genalloc.h>
#include <linux/mm.h>
#include <linux/sram.h>
#include <linux/set_memory.h>
#include <asm/fncpy.h>
#include <asm/set_memory.h>
#include "sram.h"
@ -106,10 +106,7 @@ void *sram_exec_copy(struct gen_pool *pool, void *dst, void *src,
dst_cpy = fncpy(dst, src, size);
ret = set_memory_ro((unsigned long)base, pages);
if (ret)
goto error_out;
ret = set_memory_x((unsigned long)base, pages);
ret = set_memory_rox((unsigned long)base, pages);
if (ret)
goto error_out;

View File

@ -860,8 +860,7 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
set_vm_flush_reset_perms(hdr);
set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
}
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);

View File

@ -309,24 +309,28 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
ptep_get_and_clear(mm, addr, ptep);
}
#ifndef __HAVE_ARCH_PTEP_GET
#ifndef ptep_get
/*
 * Default PTE accessor: read the entry that @ptep points at through
 * READ_ONCE() and return the value.
 */
static inline pte_t ptep_get(pte_t *ptep)
{
	pte_t entry = READ_ONCE(*ptep);

	return entry;
}
#endif
#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
#ifndef pmdp_get
/*
 * Default PMD accessor: read the entry that @pmdp points at through
 * READ_ONCE() and return the value.
 */
static inline pmd_t pmdp_get(pmd_t *pmdp)
{
	pmd_t entry = READ_ONCE(*pmdp);

	return entry;
}
#endif
#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
/*
* WARNING: only to be used in the get_user_pages_fast() implementation.
*
* With get_user_pages_fast(), we walk down the pagetables without taking any
* locks. For this we would like to load the pointers atomically, but sometimes
* that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
* we do have is the guarantee that a PTE will only either go from not present
* to present, or present to not present or both -- it will not switch to a
* completely different present page without a TLB flush in between; something
* that we are blocking by holding interrupts off.
* For walking the pagetables without holding any locks. Some architectures
* (e.g. x86-32 PAE) cannot load the entries atomically without using expensive
* instructions. We are guaranteed that a PTE will only either go from not
* present to present, or present to not present -- it will not switch to a
* completely different present page without a TLB flush in between; which we
* are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
*
@ -361,15 +365,42 @@ static inline pte_t ptep_get_lockless(pte_t *ptep)
return pte;
}
#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
#define ptep_get_lockless ptep_get_lockless
#if CONFIG_PGTABLE_LEVELS > 2
/*
 * Lockless read of a PMD that is stored as two halves (pmd_low / pmd_high).
 *
 * The low half is loaded, then the high half, and finally the low half is
 * compared against memory again. The whole sequence is repeated until the
 * low half is unchanged across the attempt. The smp_rmb() calls keep the
 * three loads in program order.
 *
 * Returns the pmd_t assembled from the two halves.
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
pmd_t pmd;
do {
/* Load the low half first ... */
pmd.pmd_low = pmdp->pmd_low;
smp_rmb();
/* ... then the high half ... */
pmd.pmd_high = pmdp->pmd_high;
smp_rmb();
/* ... and retry if the low half changed while we were reading. */
} while (unlikely(pmd.pmd_low != pmdp->pmd_low));
return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */
/*
* We require that the PTE can be read atomically.
*/
#ifndef ptep_get_lockless
/*
 * Fallback used when ptep_get_lockless has not been defined earlier:
 * the lockless read is just a plain ptep_get().
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
	pte_t entry = ptep_get(ptep);

	return entry;
}
#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
#endif
#ifndef pmdp_get_lockless
/*
 * Fallback used when pmdp_get_lockless has not been defined earlier:
 * the lockless read is just a plain pmdp_get().
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
	pmd_t entry = pmdp_get(pmdp);

	return entry;
}
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
@ -1313,18 +1344,6 @@ static inline int pud_trans_unstable(pud_t *pud)
#endif
}
#ifndef pmd_read_atomic
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
/*
* Depend on compiler for an atomic pmd read. NOTE: this is
* only going to work, if the pmdval_t isn't larger than
* an unsigned long.
*/
return *pmdp;
}
#endif
#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif
@ -1351,13 +1370,13 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
*/
static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
{
pmd_t pmdval = pmd_read_atomic(pmd);
pmd_t pmdval = pmdp_get_lockless(pmd);
/*
* The barrier will stabilize the pmdval in a register or on
* the stack so that it will stop changing under the code.
*
* When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
* pmd_read_atomic is allowed to return a not atomic pmdval
* pmdp_get_lockless is allowed to return a not atomic pmdval
* (for example pointing to an hugepage that has never been
* mapped in the pmd). The below checks will only care about
* the low part of the pmd with 32bit PAE x86 anyway, with the

View File

@ -65,6 +65,7 @@ extern void sched_dead(struct task_struct *p);
void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);
extern void mm_cache_init(void);
extern void proc_caches_init(void);
extern void fork_init(void);
@ -90,7 +91,6 @@ extern void exit_itimers(struct task_struct *);
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
struct mm_struct *copy_init_mm(void);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);

View File

@ -14,6 +14,16 @@ static inline int set_memory_x(unsigned long addr, int numpages) { return 0; }
static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; }
#endif
#ifndef set_memory_rox
/*
 * Generic fallback: make the range read-only first and, only if that
 * succeeded, make it executable. The first non-zero return value from
 * either step is handed back to the caller; 0 means both steps returned 0.
 */
static inline int set_memory_rox(unsigned long addr, int numpages)
{
	int err;

	err = set_memory_ro(addr, numpages);
	if (!err)
		err = set_memory_x(addr, numpages);

	return err;
}
#endif
#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP
static inline int set_direct_map_invalid_noflush(struct page *page)
{

View File

@ -863,6 +863,7 @@ static void __init mm_init(void)
/* Should be run after espfix64 is set up. */
pti_init();
kmsan_init_runtime();
mm_cache_init();
}
#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET
@ -998,7 +999,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
sort_main_extable();
trap_init();
mm_init();
poking_init();
ftrace_init();
/* trace_printk can be enabled here */
@ -1137,7 +1138,6 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
taskstats_init_early();
delayacct_init();
poking_init();
check_bugs();
acpi_subsystem_init();

View File

@ -494,8 +494,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
refcount_set(&kvalue->refcnt, 1);
bpf_map_inc(map);
set_memory_ro((long)st_map->image, 1);
set_memory_x((long)st_map->image, 1);
set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* Pair with smp_load_acquire() during lookup_elem().

View File

@ -868,8 +868,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
return pack;
}
@ -887,8 +886,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
if (ptr) {
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
}
goto out;
}

View File

@ -468,8 +468,7 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
if (err < 0)
goto out;
set_memory_ro((long)im->image, 1);
set_memory_x((long)im->image, 1);
set_memory_rox((long)im->image, 1);
WARN_ON(tr->cur_image && tr->selector == 0);
WARN_ON(!tr->cur_image && tr->selector);

View File

@ -7493,7 +7493,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pud_leaf_size(pud);
pmdp = pmd_offset_lockless(pudp, pud, addr);
pmd = READ_ONCE(*pmdp);
pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;

View File

@ -2607,11 +2607,6 @@ struct task_struct * __init fork_idle(int cpu)
return task;
}
struct mm_struct *copy_init_mm(void)
{
return dup_mm(NULL, &init_mm);
}
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
@ -3030,10 +3025,27 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
void __init proc_caches_init(void)
/*
 * Create the "mm_struct" slab cache and store it in mm_cachep.
 */
void __init mm_cache_init(void)
{
	/*
	 * The mm_cpumask sits at the tail of mm_struct and its size depends
	 * on the maximum CPU number this system can have, hotplug included
	 * (nr_cpu_ids), so the object size is computed at run time.
	 */
	unsigned int mm_size = sizeof(struct mm_struct) + cpumask_size();

	/*
	 * The offset and size of the saved_auxv field are passed as the two
	 * arguments that follow the slab flags.
	 */
	mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size,
			ARCH_MIN_MMSTRUCT_ALIGN,
			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
			offsetof(struct mm_struct, saved_auxv),
			sizeof_field(struct mm_struct, saved_auxv),
			NULL);
}
void __init proc_caches_init(void)
{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@ -3051,19 +3063,6 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
/*
* The mm_cpumask is located at the end of mm_struct, and is
* dynamically sized based on the maximum CPU number this system
* can have, taking hotplug into account (nr_cpu_ids).
*/
mm_size = sizeof(struct mm_struct) + cpumask_size();
mm_cachep = kmem_cache_create_usercopy("mm_struct",
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
offsetof(struct mm_struct, saved_auxv),
sizeof_field(struct mm_struct, saved_auxv),
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();

View File

@ -1078,7 +1078,7 @@ config GUP_TEST
comment "GUP_TEST needs to have DEBUG_FS enabled"
depends on !GUP_TEST && !DEBUG_FS
config GUP_GET_PTE_LOW_HIGH
config GUP_GET_PXX_LOW_HIGH
bool
config ARCH_HAS_PTE_SPECIAL

View File

@ -2721,7 +2721,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
pmd_t pmd = READ_ONCE(*pmdp);
pmd_t pmd = pmdp_get_lockless(pmdp);
next = pmd_addr_end(addr, end);
if (!pmd_present(pmd))

View File

@ -361,8 +361,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
* huge or device mapping one and compute corresponding pfn
* values.
*/
pmd = pmd_read_atomic(pmdp);
barrier();
pmd = pmdp_get_lockless(pmdp);
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;

View File

@ -857,7 +857,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
if (!*pmd)
return SCAN_PMD_NULL;
pmde = pmd_read_atomic(*pmd);
pmde = pmdp_get_lockless(*pmd);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* See comments in pmd_none_or_trans_huge_or_clear_bad() */

View File

@ -126,7 +126,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pmd_t pmdval = pmd_read_atomic(pmd);
pmd_t pmdval = pmdp_get_lockless(pmd);
if (!pmd_trans_unstable(&pmdval))
return 0;

View File

@ -297,7 +297,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
*/
static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
{
pmd_t pmdval = pmd_read_atomic(pmd);
pmd_t pmdval = pmdp_get_lockless(pmd);
/* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE

View File

@ -632,7 +632,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
break;
}
dst_pmdval = pmd_read_atomic(dst_pmd);
dst_pmdval = pmdp_get_lockless(dst_pmd);
/*
* If the dst_pmd is mapped as THP don't
* override it and just be strict.

View File

@ -4084,10 +4084,7 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
/* walk_pte_range() may call get_next_vma() */
vma = args->vma;
for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
pmd_t val = pmd_read_atomic(pmd + i);
/* for pmd_read_atomic() */
barrier();
pmd_t val = pmdp_get_lockless(pmd + i);
next = pmd_addr_end(addr, end);

View File

@ -124,8 +124,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
if (err < 0)
goto out;
set_memory_ro((long)image, 1);
set_memory_x((long)image, 1);
set_memory_rox((long)image, 1);
prog_ret = dummy_ops_call_op(image, args);
err = dummy_ops_copy_args(args);