Merge branch 'x86/cpu' into perf/core, to pick up dependent patches
Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit 9e6e87e62a
@@ -538,3 +538,26 @@ Description: Intel Energy and Performance Bias Hint (EPB)

		This attribute is present for all online CPUs supporting the
		Intel EPB feature.

+What:		/sys/devices/system/cpu/umwait_control
+		/sys/devices/system/cpu/umwait_control/enable_c02
+		/sys/devices/system/cpu/umwait_control/max_time
+Date:		May 2019
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Umwait control
+
+		enable_c02: Read/write interface to control umwait C0.2 state
+			Read returns C0.2 state status:
+				0: C0.2 is disabled
+				1: C0.2 is enabled
+
+			Write 'y' or '1' or 'on' to enable C0.2 state.
+			Write 'n' or '0' or 'off' to disable C0.2 state.
+
+			The interface is case insensitive.
+
+		max_time: Read/write interface to control umwait maximum time
+			  in TSC-quanta that the CPU can reside in either C0.1
+			  or C0.2 state. The time is an unsigned 32-bit number.
+			  Note that a value of zero means there is no limit.
+			  Low order two bits must be zero.
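A minimal user-space sketch of driving this interface follows. The paths are the ones documented above; the writes assume root privileges and error handling is kept minimal:

	#include <stdio.h>

	int main(void)
	{
		const char *base = "/sys/devices/system/cpu/umwait_control";
		char path[128];
		unsigned int max_time;
		FILE *f;

		/* Read the current max_time limit (TSC-quanta) */
		snprintf(path, sizeof(path), "%s/max_time", base);
		f = fopen(path, "r");
		if (f) {
			if (fscanf(f, "%u", &max_time) == 1)
				printf("umwait max_time: %u TSC-quanta\n", max_time);
			fclose(f);
		}

		/* Enable C0.2, per the ABI described above */
		snprintf(path, sizeof(path), "%s/enable_c02", base);
		f = fopen(path, "w");
		if (f) {
			fputs("1\n", f);
			fclose(f);
		}
		return 0;
	}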
@@ -2857,6 +2857,8 @@

	no5lvl		[X86-64] Disable 5-level paging mode. Forces
			kernel to use 4-level paging instead.

+	nofsgsbase	[X86] Disables FSGSBASE instructions.
+
	no_console_suspend
			[HW] Never suspend the console
			Disable suspending of consoles during suspend and
@@ -31,7 +31,7 @@ you probably needn't concern yourself with isdn4k-utils.
====================== =============== ========================================
GNU C                  4.6             gcc --version
GNU make               3.81            make --version
-binutils               2.20            ld -v
+binutils               2.21            ld -v
flex                   2.5.35          flex --version
bison                  2.0             bison --version
util-linux             2.10o           fdformat --version
@@ -77,9 +77,7 @@ You will need GNU make 3.81 or later to build the kernel.
Binutils
--------

-The build system has, as of 4.13, switched to using thin archives (`ar T`)
-rather than incremental linking (`ld -r`) for built-in.a intermediate steps.
-This requires binutils 2.20 or newer.
+Binutils 2.21 or newer is needed to build the kernel.

pkg-config
----------
@@ -108,3 +108,12 @@ We try to only use IST entries and the paranoid entry code for vectors
that absolutely need the more expensive check for the GS base - and we
generate all 'normal' entry points with the regular (faster) paranoid=0
variant.
+
+On an FSGSBASE system, however, user space can set GS without kernel
+interaction. This means the GS base value itself implies nothing: it may
+be a kernel value or a user space value. So there is no longer a safe
+way to check whether the exception is entering from user mode or kernel
+mode in the paranoid entry code path. Instead, the GSBASE value needs to
+be read out, saved, and the kernel GSBASE value written. On exit the saved
+GSBASE value needs to be restored unconditionally. The non-paranoid
+entry/exit code still uses SWAPGS unconditionally, as the state is known.

Documentation/x86/x86_64/fsgs.rst (new file, 199 lines):
@@ -0,0 +1,199 @@
.. SPDX-License-Identifier: GPL-2.0

Using FS and GS segments in user space applications
===================================================

The x86 architecture supports segmentation. Instructions which access
memory can use segment register based addressing modes. The following
notation is used to address a byte within a segment:

  Segment-register:Byte-address

The segment base address is added to the Byte-address to compute the
resulting virtual address which is accessed. This allows accessing multiple
instances of data with the identical Byte-address, i.e. with the same code.
The selection of a particular instance is purely based on the base address
in the segment register.

In 32-bit mode the CPU provides 6 segments, which also support segment
limits. The limits can be used to enforce address space protections.

In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is
always 0 to provide a full 64-bit address space. The FS and GS segments are
still functional in 64-bit mode.

Common FS and GS usage
----------------------

The FS segment is commonly used to address Thread Local Storage (TLS). FS
is usually managed by runtime code or a threading library. Variables
declared with the '__thread' storage class specifier are instantiated per
thread and the compiler emits the FS: address prefix for accesses to these
variables. Each thread has its own FS base address, so common code can be
used without complex address offset calculations to access the per-thread
instances. Applications should not use FS for other purposes when they use
runtimes or threading libraries which manage the per-thread FS.

The GS segment has no common use and can be used freely by
applications. GCC and Clang support GS based addressing via address space
identifiers.
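A minimal sketch of the '__thread' mechanism described above (plain
pthreads, nothing kernel-specific assumed); each thread touches only its
own instance of the variable, reached via the FS: prefix the compiler
emits::

  /* build with: gcc -pthread tls.c */
  #include <pthread.h>
  #include <stdio.h>

  static __thread int counter;	/* one instance per thread, via FS */

  static void *worker(void *arg)
  {
          counter++;	/* increments only this thread's copy */
          printf("thread %ld: counter = %d\n", (long)arg, counter);
          return NULL;
  }

  int main(void)
  {
          pthread_t t1, t2;

          pthread_create(&t1, NULL, worker, (void *)1L);
          pthread_create(&t2, NULL, worker, (void *)2L);
          pthread_join(t1, NULL);
          pthread_join(t2, NULL);
          return 0;
  }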
Reading and writing the FS/GS base address
------------------------------------------

There exist two mechanisms to read and write the FS/GS base address:

 - the arch_prctl() system call

 - the FSGSBASE instruction family

Accessing FS/GS base with arch_prctl()
--------------------------------------

The arch_prctl(2) based mechanism is available on all 64-bit CPUs and all
kernel versions.

Reading the base::

  arch_prctl(ARCH_GET_FS, &fsbase);
  arch_prctl(ARCH_GET_GS, &gsbase);

Writing the base::

  arch_prctl(ARCH_SET_FS, fsbase);
  arch_prctl(ARCH_SET_GS, gsbase);

The ARCH_SET_GS prctl may be disabled depending on kernel configuration
and security settings.
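A minimal, self-contained sketch of the prctl path. It uses the raw
syscall number, since not every glibc version exposes an arch_prctl()
wrapper; the ARCH_GET_GS/ARCH_SET_GS constants come from <asm/prctl.h>::

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <unistd.h>
  #include <sys/syscall.h>
  #include <asm/prctl.h>

  int main(void)
  {
          unsigned long gsbase = 0;
          static long target;	/* point GS base at this variable */

          syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase);
          printf("old GS base: %#lx\n", gsbase);

          /* May be rejected depending on kernel configuration */
          if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)&target))
                  perror("ARCH_SET_GS");

          syscall(SYS_arch_prctl, ARCH_GET_GS, &gsbase);
          printf("new GS base: %#lx\n", gsbase);
          return 0;
  }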
Accessing FS/GS base with the FSGSBASE instructions
---------------------------------------------------

With the Ivy Bridge CPU generation Intel introduced a new set of
instructions to access the FS and GS base registers directly from user
space. These instructions are also supported on AMD Family 17H CPUs. The
following instructions are available:

  =============== ===========================
  RDFSBASE %reg   Read the FS base register
  RDGSBASE %reg   Read the GS base register
  WRFSBASE %reg   Write the FS base register
  WRGSBASE %reg   Write the GS base register
  =============== ===========================

The instructions avoid the overhead of the arch_prctl() syscall and allow
more flexible usage of the FS/GS addressing modes in user space
applications. This does not prevent conflicts between threading libraries
and runtimes which utilize FS and applications which want to use it for
their own purposes.

FSGSBASE instructions enablement
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If
available, /proc/cpuinfo shows 'fsgsbase' in the flags entry of the CPUs.

The availability of the instructions does not enable them
automatically. The kernel has to enable them explicitly in CR4. The
reason for this is that older kernels make assumptions about the values in
the GS register and enforce them when the GS base is set via
arch_prctl(). Allowing user space to write arbitrary values to the GS base
would violate these assumptions and cause malfunction.

On kernels which do not enable FSGSBASE, execution of the FSGSBASE
instructions faults with a #UD exception.

The kernel provides reliable information about the enabled state in the
ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the
kernel has FSGSBASE instructions enabled and applications can use them.
The following code example shows how this detection works::

  #include <sys/auxv.h>
  #include <elf.h>

  /* Will be eventually in asm/hwcap.h */
  #ifndef HWCAP2_FSGSBASE
  #define HWCAP2_FSGSBASE (1 << 1)
  #endif

  ....

  unsigned val = getauxval(AT_HWCAP2);

  if (val & HWCAP2_FSGSBASE)
          printf("FSGSBASE enabled\n");
FSGSBASE instructions compiler support
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

GCC version 4.6.4 and newer provide intrinsics for the FSGSBASE
instructions. Clang supports them as well.

  =================== ===========================
  _readfsbase_u64()   Read the FS base register
  _readgsbase_u64()   Read the GS base register
  _writefsbase_u64()  Write the FS base register
  _writegsbase_u64()  Write the GS base register
  =================== ===========================

To utilize these intrinsics, <immintrin.h> must be included in the source
code and the compiler option -mfsgsbase has to be added.

Compiler support for FS/GS based addressing
-------------------------------------------

GCC version 6 and newer provide support for FS/GS based addressing via
Named Address Spaces. GCC implements the following address space
identifiers for x86:

  ========= ====================================
  __seg_fs  Variable is addressed relative to FS
  __seg_gs  Variable is addressed relative to GS
  ========= ====================================

The preprocessor symbols __SEG_FS and __SEG_GS are defined when these
address spaces are supported. Code which implements fallback modes should
check whether these symbols are defined. Usage example::

  #ifdef __SEG_GS

  long data0 = 0;
  long data1 = 1;

  long __seg_gs *ptr;

  /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */
  ....

  /* Set GS to point to data0 */
  _writegsbase_u64(&data0);

  /* Access offset 0 of GS */
  ptr = 0;
  printf("data0 = %ld\n", *ptr);

  /* Set GS to point to data1 */
  _writegsbase_u64(&data1);
  /* ptr still addresses offset 0! */
  printf("data1 = %ld\n", *ptr);

Clang does not provide the GCC address space identifiers, but it provides
address spaces via an attribute based mechanism in Clang 5 and newer
versions:

  ===================================== =====================================
  __attribute__((address_space(256)))   Variable is addressed relative to GS
  __attribute__((address_space(257)))   Variable is addressed relative to FS
  ===================================== =====================================
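A small sketch of the Clang variant (assuming Clang 5+ with -mfsgsbase,
and that the kernel has enabled FSGSBASE; address_space(256) marks the
pointee as GS-relative)::

  #include <immintrin.h>
  #include <stdio.h>

  /* pointer whose dereferences go through the GS segment */
  typedef long __attribute__((address_space(256))) *gs_ptr;

  int main(void)
  {
          long data = 42;
          gs_ptr ptr = 0;	/* GS-relative offset 0 */

          /* Check HWCAP2_FSGSBASE first, as shown earlier */
          _writegsbase_u64((unsigned long long)&data);
          printf("data = %ld\n", *ptr);	/* load via %gs: prefix */
          return 0;
  }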
FS/GS based addressing with inline assembly
-------------------------------------------

In case the compiler does not support address spaces, inline assembly can
be used for FS/GS based addressing mode::

	mov %fs:offset, %reg
	mov %gs:offset, %reg

	mov %reg, %fs:offset
	mov %reg, %gs:offset
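One possible C wrapper around the GS-relative load above (a sketch;
'offset' is a byte offset from the GS base)::

  static inline unsigned long gs_load(unsigned long offset)
  {
          unsigned long val;

          /* movq %gs:(%reg), %val: GS base + offset -> val */
          asm volatile("movq %%gs:(%1), %0" : "=r" (val) : "r" (offset));
          return val;
  }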
@@ -14,3 +14,4 @@ x86_64 Support
   fake-numa-for-cpusets
   cpu-hotplug-spec
   machinecheck
+   fsgs
@@ -17485,6 +17485,12 @@ Q: https://patchwork.linuxtv.org/project/linux-media/list/
S:	Maintained
F:	drivers/media/dvb-frontends/zd1301_demod*

+ZHAOXIN PROCESSOR SUPPORT
+M:	Tony W Wang-oc <TonyWWang-oc@zhaoxin.com>
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+F:	arch/x86/kernel/cpu/zhaoxin.c
+
ZPOOL COMPRESSED PAGE STORAGE API
M:	Dan Streetman <ddstreet@ieee.org>
L:	linux-mm@kvack.org
@@ -480,3 +480,16 @@ config CPU_SUP_UMC_32
	  CPU might render the kernel unbootable.

	  If unsure, say N.

+config CPU_SUP_ZHAOXIN
+	default y
+	bool "Support Zhaoxin processors" if PROCESSOR_SELECT
+	help
+	  This enables detection, tunings and quirks for Zhaoxin processors.
+
+	  You need this enabled if you want your kernel to run on a
+	  Zhaoxin CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on a Zhaoxin
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
@@ -6,6 +6,7 @@
 #include <asm/percpu.h>
 #include <asm/asm-offsets.h>
 #include <asm/processor-flags.h>
+#include <asm/inst.h>

 /*

@@ -337,6 +338,12 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm

+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+	rdgsbase \save_reg
+	GET_PERCPU_BASE \scratch_reg
+	wrgsbase \scratch_reg
+.endm
+
 #endif /* CONFIG_X86_64 */

 .macro STACKLEAK_ERASE
@@ -345,6 +352,39 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm

+#ifdef CONFIG_SMP
+
+/*
+ * CPU/node NR is loaded from the limit (size) field of a special segment
+ * descriptor entry in GDT.
+ */
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
+	movq	$__CPUNODE_SEG, \reg
+	lsl	\reg, \reg
+.endm
+
+/*
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
+ * We normally use %gs for accessing per-CPU data, but we are setting up
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
+ */
+.macro GET_PERCPU_BASE reg:req
+	ALTERNATIVE \
+		"LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
+		"RDPID	\reg", \
+		X86_FEATURE_RDPID
+	andq	$VDSO_CPUNODE_MASK, \reg
+	movq	__per_cpu_offset(, \reg, 8), \reg
+.endm
+
+#else
+
+.macro GET_PERCPU_BASE reg:req
+	movq	pcpu_unit_offsets(%rip), \reg
+.endm
+
+#endif /* CONFIG_SMP */

 /*
  * This does 'call enter_from_user_mode' unless we can avoid it based on
  * kernel config or using the static jump infrastructure.
@@ -38,6 +38,7 @@
 #include <asm/export.h>
 #include <asm/frame.h>
 #include <asm/nospec-branch.h>
+#include <asm/fsgsbase.h>
 #include <linux/err.h>

 #include "calling.h"
@@ -947,7 +948,6 @@ ENTRY(\sym)
	addq	$\ist_offset, CPU_TSS_IST(\shift_ist)
	.endif

-	/* these procedures expect "no swapgs" flag in ebx */
	.if \paranoid
	jmp	paranoid_exit
	.else
@@ -1164,24 +1164,21 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
 #endif

 /*
- * Save all registers in pt_regs, and switch gs if needed.
- * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Save all registers in pt_regs. Return GSBASE related information
+ * in EBX depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE	R/EBX
+ *     N        0 -> SWAPGS on exit
+ *              1 -> no SWAPGS on exit
+ *
+ *     Y        GSBASE value at entry, must be restored in paranoid_exit
 */
 ENTRY(paranoid_entry)
	UNWIND_HINT_FUNC
	cld
	PUSH_AND_CLEAR_REGS save_ret=1
	ENCODE_FRAME_POINTER 8
-	movl	$1, %ebx
-	movl	$MSR_GS_BASE, %ecx
-	rdmsr
-	testl	%edx, %edx
-	js	1f				/* negative -> in kernel */
-	SWAPGS
-	xorl	%ebx, %ebx
-
-1:
	/*
	 * Always stash CR3 in %r14.  This value will be restored,
	 * verbatim, at exit.  Needed if paranoid_entry interrupted
@@ -1191,9 +1188,49 @@ ENTRY(paranoid_entry)
	 * This is also why CS (stashed in the "iret frame" by the
	 * hardware at entry) can not be used: this may be a return
	 * to kernel code, but with a user CR3 value.
+	 *
+	 * Switching CR3 does not depend on kernel GSBASE so it can
+	 * be done before switching to the kernel GSBASE. This is
+	 * required for FSGSBASE because the kernel GSBASE has to
+	 * be retrieved from a kernel internal table.
	 */
	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14

+	/*
+	 * Handling GSBASE depends on the availability of FSGSBASE.
+	 *
+	 * Without FSGSBASE the kernel enforces that negative GSBASE
+	 * values indicate kernel GSBASE. With FSGSBASE no assumptions
+	 * can be made about the GSBASE value when entering from user
+	 * space.
+	 */
+	ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
+
+	/*
+	 * Read the current GSBASE and store it in %rbx unconditionally,
+	 * then retrieve and set the current CPU's kernel GSBASE. The stored
+	 * value has to be restored in paranoid_exit unconditionally.
+	 */
+	SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
+	ret
+
+.Lparanoid_entry_checkgs:
+	/* EBX = 1 -> kernel GSBASE active, no restore required */
+	movl	$1, %ebx
+	/*
+	 * The kernel-enforced convention is that a negative GSBASE
+	 * indicates a kernel value. No SWAPGS needed on entry and exit.
+	 */
+	movl	$MSR_GS_BASE, %ecx
+	rdmsr
+	testl	%edx, %edx
+	jns	.Lparanoid_entry_swapgs
+	ret
+
+.Lparanoid_entry_swapgs:
+	SWAPGS
+	/* EBX = 0 -> SWAPGS required on exit */
+	xorl	%ebx, %ebx
+	ret
 END(paranoid_entry)
@@ -1204,28 +1241,47 @@ END(paranoid_entry)
 *
 * We may be returning to very strange contexts (e.g. very early
 * in syscall entry), so checking for preemption here would
- * be complicated.  Fortunately, we there's no good reason
- * to try to handle preemption here.
+ * be complicated.  Fortunately, there's no good reason to try
+ * to handle preemption here.
 *
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * R/EBX contains the GSBASE related information depending on the
+ * availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE	R/EBX
+ *     N        0 -> SWAPGS on exit
+ *              1 -> no SWAPGS on exit
+ *
+ *     Y        User space GSBASE, must be restored unconditionally
 */
 ENTRY(paranoid_exit)
	UNWIND_HINT_REGS
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF_DEBUG
-	testl	%ebx, %ebx			/* swapgs needed? */
+
+	/* Handle GS depending on FSGSBASE availability */
+	ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop", X86_FEATURE_FSGSBASE
+
+	/* With FSGSBASE enabled, unconditionally restore GSBASE */
+	wrgsbase	%rbx
+	jmp	.Lparanoid_exit_no_swapgs
+
+.Lparanoid_exit_checkgs:
+	/* On non-FSGSBASE systems, conditionally do SWAPGS */
+	testl	%ebx, %ebx
	jnz	.Lparanoid_exit_no_swapgs
	TRACE_IRQS_IRETQ
	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
	SWAPGS_UNSAFE_STACK
	jmp	.Lparanoid_exit_restore

.Lparanoid_exit_no_swapgs:
	TRACE_IRQS_IRETQ_DEBUG
	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14

.Lparanoid_exit_restore:
	jmp	restore_regs_and_return_to_kernel
 END(paranoid_exit)

 /*
@@ -1636,10 +1692,27 @@ end_repeat_nmi:
	/* Always restore stashed CR3 value (see paranoid_entry) */
	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14

-	testl	%ebx, %ebx			/* swapgs needed? */
+	/*
+	 * The above invocation of paranoid_entry stored the GSBASE
+	 * related information in R/EBX depending on the availability
+	 * of FSGSBASE.
+	 *
+	 * If FSGSBASE is enabled, restore the saved GSBASE value
+	 * unconditionally, otherwise take the conditional SWAPGS path.
+	 */
+	ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
+
+	wrgsbase	%rbx
+	jmp	nmi_restore
+
+nmi_no_fsgsbase:
+	/* EBX == 0 -> invoke SWAPGS */
+	testl	%ebx, %ebx
	jnz	nmi_restore

nmi_swapgs:
	SWAPGS_UNSAFE_STACK

nmi_restore:
	POP_REGS
@@ -22,8 +22,8 @@ enum cpuid_leafs
	CPUID_LNX_3,
	CPUID_7_0_EBX,
	CPUID_D_1_EAX,
-	CPUID_F_0_EDX,
-	CPUID_F_1_EDX,
+	CPUID_LNX_4,
+	CPUID_7_1_EAX,
	CPUID_8000_0008_EBX,
	CPUID_6_EAX,
	CPUID_8000_000A_EDX,
@@ -271,13 +271,19 @@
 #define X86_FEATURE_XGETBV1		(10*32+ 2) /* XGETBV with ECX = 1 instruction */
 #define X86_FEATURE_XSAVES		(10*32+ 3) /* XSAVES/XRSTORS instructions */

-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */
-#define X86_FEATURE_CQM_LLC		(11*32+ 1) /* LLC QoS if 1 */
+/*
+ * Extended auxiliary flags: Linux defined - for features scattered in various
+ * CPUID levels like 0xf, etc.
+ *
+ * Reuse free bits when adding new feature flags!
+ */
+#define X86_FEATURE_CQM_LLC		(11*32+ 0) /* LLC QoS if 1 */
+#define X86_FEATURE_CQM_OCCUP_LLC	(11*32+ 1) /* LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL	(11*32+ 2) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL	(11*32+ 3) /* LLC Local MBM monitoring */

-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */
-#define X86_FEATURE_CQM_OCCUP_LLC	(12*32+ 0) /* LLC occupancy monitoring */
-#define X86_FEATURE_CQM_MBM_TOTAL	(12*32+ 1) /* LLC Total MBM monitoring */
-#define X86_FEATURE_CQM_MBM_LOCAL	(12*32+ 2) /* LLC Local MBM monitoring */
+/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+#define X86_FEATURE_AVX512_BF16		(12*32+ 5) /* AVX512 BFLOAT16 instructions */

 /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
 #define X86_FEATURE_CLZERO		(13*32+ 0) /* CLZERO instruction */
@@ -324,6 +330,7 @@
 #define X86_FEATURE_UMIP		(16*32+ 2) /* User Mode Instruction Protection */
 #define X86_FEATURE_PKU			(16*32+ 3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE		(16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_WAITPKG		(16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
 #define X86_FEATURE_AVX512_VBMI2	(16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
 #define X86_FEATURE_GFNI		(16*32+ 8) /* Galois Field New Instructions */
 #define X86_FEATURE_VAES		(16*32+ 9) /* Vector AES */
@@ -19,35 +19,62 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task);
 extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
 extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);

+/* Must be protected by X86_FEATURE_FSGSBASE check. */
+
+static __always_inline unsigned long rdfsbase(void)
+{
+	unsigned long fsbase;
+
+	asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
+
+	return fsbase;
+}
+
+static __always_inline unsigned long rdgsbase(void)
+{
+	unsigned long gsbase;
+
+	asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
+
+	return gsbase;
+}
+
+static __always_inline void wrfsbase(unsigned long fsbase)
+{
+	asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
+}
+
+static __always_inline void wrgsbase(unsigned long gsbase)
+{
+	asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
+}
+
+#include <asm/cpufeature.h>
+
 /* Helper functions for reading/writing FS/GS base */

 static inline unsigned long x86_fsbase_read_cpu(void)
 {
	unsigned long fsbase;

-	rdmsrl(MSR_FS_BASE, fsbase);
+	if (static_cpu_has(X86_FEATURE_FSGSBASE))
+		fsbase = rdfsbase();
+	else
+		rdmsrl(MSR_FS_BASE, fsbase);

	return fsbase;
 }

-static inline unsigned long x86_gsbase_read_cpu_inactive(void)
-{
-	unsigned long gsbase;
-
-	rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
-
-	return gsbase;
-}
-
 static inline void x86_fsbase_write_cpu(unsigned long fsbase)
 {
-	wrmsrl(MSR_FS_BASE, fsbase);
+	if (static_cpu_has(X86_FEATURE_FSGSBASE))
+		wrfsbase(fsbase);
+	else
+		wrmsrl(MSR_FS_BASE, fsbase);
 }

-static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
-	wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
-}
+extern unsigned long x86_gsbase_read_cpu_inactive(void);
+extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);

 #endif /* CONFIG_X86_64 */
@@ -306,6 +306,21 @@
	.endif
	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
	.endm
+
+	.macro RDPID opd
+	REG_TYPE rdpid_opd_type \opd
+	.if rdpid_opd_type == REG_TYPE_R64
+	R64_NUM rdpid_opd \opd
+	.else
+	R32_NUM rdpid_opd \opd
+	.endif
+	.byte 0xf3
+	.if rdpid_opd > 7
+	PFX_REX rdpid_opd 0
+	.endif
+	.byte 0x0f, 0xc7
+	MODRM 0xc0 rdpid_opd 0x7
+	.endm
 #endif

 #endif
@@ -77,6 +77,7 @@
 #define INTEL_FAM6_ATOM_GOLDMONT	0x5C /* Apollo Lake */
 #define INTEL_FAM6_ATOM_GOLDMONT_X	0x5F /* Denverton */
 #define INTEL_FAM6_ATOM_GOLDMONT_PLUS	0x7A /* Gemini Lake */
+#define INTEL_FAM6_ATOM_TREMONT_X	0x86 /* Jacobsville */

 /* Xeon Phi */
@@ -61,6 +61,15 @@
 #define MSR_PLATFORM_INFO_CPUID_FAULT_BIT	31
 #define MSR_PLATFORM_INFO_CPUID_FAULT		BIT_ULL(MSR_PLATFORM_INFO_CPUID_FAULT_BIT)

+#define MSR_IA32_UMWAIT_CONTROL			0xe1
+#define MSR_IA32_UMWAIT_CONTROL_C02_DISABLE	BIT(0)
+#define MSR_IA32_UMWAIT_CONTROL_RESERVED	BIT(1)
+/*
+ * The time field is bit[31:2], but representing a 32-bit value with
+ * bit[1:0] zero.
+ */
+#define MSR_IA32_UMWAIT_CONTROL_TIME_MASK	(~0x03U)
+
 #define MSR_PKG_CST_CONFIG_CONTROL	0x000000e2
 #define NHM_C3_AUTO_DEMOTE		(1UL << 25)
 #define NHM_C1_AUTO_DEMOTE		(1UL << 26)
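A small sketch of how these fields compose into a control value (the
helper name is illustrative, not a kernel API; it mirrors the masking
described in the comment above):

	#include <stdint.h>

	/* time in bits [31:2] (bits [1:0] must be zero), C0.2-disable in bit 0 */
	static inline uint32_t umwait_control(uint32_t max_time, int c02_disable)
	{
		return (max_time & ~0x03U) | (c02_disable ? 0x01U : 0x00U);
	}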
@@ -146,7 +146,8 @@ enum cpuid_regs_idx {
 #define X86_VENDOR_TRANSMETA	7
 #define X86_VENDOR_NSC		8
 #define X86_VENDOR_HYGON	9
-#define X86_VENDOR_NUM		10
+#define X86_VENDOR_ZHAOXIN	10
+#define X86_VENDOR_NUM		11

 #define X86_VENDOR_UNKNOWN	0xff
@@ -5,4 +5,7 @@
 /* MONITOR/MWAIT enabled in Ring 3 */
 #define HWCAP2_RING3MWAIT	(1 << 0)

+/* Kernel allows FSGSBASE instructions available in Ring 3 */
+#define HWCAP2_FSGSBASE		BIT(1)
+
 #endif
@@ -64,6 +64,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
			c->x86_stepping >= 0x0e))
			flags->bm_check = 1;
	}

+	if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
+		/*
+		 * All Zhaoxin CPUs that support C3 share cache.
+		 * And caches should not be flushed by software while
+		 * entering C3 type state.
+		 */
+		flags->bm_check = 1;
+		/*
+		 * On all recent Zhaoxin platforms, ARB_DISABLE is a nop.
+		 * So, set bm_control to zero to indicate that ARB_DISABLE
+		 * is not required while entering C3 type state.
+		 */
+		flags->bm_control = 0;
+	}
 }
 EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
@@ -24,6 +24,7 @@ obj-y += match.o
 obj-y			+= bugs.o
 obj-y			+= aperfmperf.o
 obj-y			+= cpuid-deps.o
+obj-y			+= umwait.o

 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
@@ -38,6 +39,7 @@ obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
 obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
+obj-$(CONFIG_CPU_SUP_ZHAOXIN)		+= zhaoxin.o

 obj-$(CONFIG_X86_MCE)			+= mce/
 obj-$(CONFIG_MTRR)			+= mtrr/
@@ -13,6 +13,7 @@
 #include <linux/percpu.h>
 #include <linux/cpufreq.h>
 #include <linux/smp.h>
+#include <linux/sched/isolation.h>

 #include "cpu.h"

@@ -85,6 +86,9 @@ unsigned int aperfmperf_get_khz(int cpu)
	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return 0;

+	if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
+		return 0;
+
	aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
	return per_cpu(samples.khz, cpu);
 }
@@ -101,9 +105,12 @@ void arch_freq_prepare_all(void)
	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return;

-	for_each_online_cpu(cpu)
+	for_each_online_cpu(cpu) {
+		if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
+			continue;
		if (!aperfmperf_snapshot_cpu(cpu, now, false))
			wait = true;
+	}

	if (wait)
		msleep(APERFMPERF_REFRESH_DELAY_MS);
@@ -117,6 +124,9 @@ unsigned int arch_freq_get_on_cpu(int cpu)
	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
		return 0;

+	if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
+		return 0;
+
	if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
		return per_cpu(samples.khz, cpu);
@@ -658,8 +658,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
	if (c->x86 < 0x17) {
		/* LLC is at the node level. */
		per_cpu(cpu_llc_id, cpu) = node_id;
-	} else if (c->x86 == 0x17 &&
-		   c->x86_model >= 0 && c->x86_model <= 0x1F) {
+	} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
		/*
		 * LLC is at the core complex level.
		 * Core complex ID is ApicId[3] for these processors.
@@ -366,6 +366,22 @@ out:
	cr4_clear_bits(X86_CR4_UMIP);
 }

+static __init int x86_nofsgsbase_setup(char *arg)
+{
+	/* Require an exact match without trailing characters. */
+	if (strlen(arg))
+		return 0;
+
+	/* Do not emit a message if the feature is not present. */
+	if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
+		return 1;
+
+	setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
+	pr_info("FSGSBASE disabled via kernel command line\n");
+	return 1;
+}
+__setup("nofsgsbase", x86_nofsgsbase_setup);
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -801,6 +817,30 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
	}
 }

+static void init_cqm(struct cpuinfo_x86 *c)
+{
+	if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
+		c->x86_cache_max_rmid  = -1;
+		c->x86_cache_occ_scale = -1;
+		return;
+	}
+
+	/* will be overridden if occupancy monitoring exists */
+	c->x86_cache_max_rmid = cpuid_ebx(0xf);
+
+	if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
+	    cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
+	    cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
+		u32 eax, ebx, ecx, edx;
+
+		/* QoS sub-leaf, EAX=0Fh, ECX=1 */
+		cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
+
+		c->x86_cache_max_rmid  = ecx;
+		c->x86_cache_occ_scale = ebx;
+	}
+}
+
 void get_cpu_cap(struct cpuinfo_x86 *c)
 {
	u32 eax, ebx, ecx, edx;
@@ -823,6 +863,12 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
		c->x86_capability[CPUID_7_0_EBX] = ebx;
		c->x86_capability[CPUID_7_ECX] = ecx;
		c->x86_capability[CPUID_7_EDX] = edx;
+
+		/* Check valid sub-leaf index before accessing it */
+		if (eax >= 1) {
+			cpuid_count(0x00000007, 1, &eax, &ebx, &ecx, &edx);
+			c->x86_capability[CPUID_7_1_EAX] = eax;
+		}
	}

	/* Extended state features: level 0x0000000d */
@@ -832,33 +878,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
		c->x86_capability[CPUID_D_1_EAX] = eax;
	}

-	/* Additional Intel-defined flags: level 0x0000000F */
-	if (c->cpuid_level >= 0x0000000F) {
-
-		/* QoS sub-leaf, EAX=0Fh, ECX=0 */
-		cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
-		c->x86_capability[CPUID_F_0_EDX] = edx;
-
-		if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
-			/* will be overridden if occupancy monitoring exists */
-			c->x86_cache_max_rmid = ebx;
-
-			/* QoS sub-leaf, EAX=0Fh, ECX=1 */
-			cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
-			c->x86_capability[CPUID_F_1_EDX] = edx;
-
-			if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
-			    ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
-			     (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
-				c->x86_cache_max_rmid = ecx;
-				c->x86_cache_occ_scale = ebx;
-			}
-		} else {
-			c->x86_cache_max_rmid = -1;
-			c->x86_cache_occ_scale = -1;
-		}
-	}
-
	/* AMD-defined flags: level 0x80000001 */
	eax = cpuid_eax(0x80000000);
	c->extended_cpuid_level = eax;
@@ -889,6 +908,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)

	init_scattered_cpuid_features(c);
	init_speculation_control(c);
+	init_cqm(c);

	/*
	 * Clear/Set all flags overridden by options, after probe.
@@ -1367,6 +1387,12 @@ static void identify_cpu(struct cpuinfo_x86 *c)
	setup_smap(c);
	setup_umip(c);

+	/* Enable FSGSBASE instructions if available. */
+	if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+		cr4_set_bits(X86_CR4_FSGSBASE);
+		elf_hwcap2 |= HWCAP2_FSGSBASE;
+	}
+
	/*
	 * The vendor-specific functions might have changed features.
	 * Now we do "generic changes."
@@ -59,6 +59,10 @@ static const struct cpuid_dep cpuid_deps[] = {
	{ X86_FEATURE_AVX512_4VNNIW,	X86_FEATURE_AVX512F   },
	{ X86_FEATURE_AVX512_4FMAPS,	X86_FEATURE_AVX512F   },
	{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_CQM_OCCUP_LLC,	X86_FEATURE_CQM_LLC   },
+	{ X86_FEATURE_CQM_MBM_TOTAL,	X86_FEATURE_CQM_LLC   },
+	{ X86_FEATURE_CQM_MBM_LOCAL,	X86_FEATURE_CQM_LLC   },
+	{ X86_FEATURE_AVX512_BF16,	X86_FEATURE_AVX512VL  },
	{}
 };
@@ -26,6 +26,10 @@ struct cpuid_bit {
 static const struct cpuid_bit cpuid_bits[] = {
	{ X86_FEATURE_APERFMPERF,	CPUID_ECX, 0, 0x00000006, 0 },
	{ X86_FEATURE_EPB,		CPUID_ECX, 3, 0x00000006, 0 },
+	{ X86_FEATURE_CQM_LLC,		CPUID_EDX, 1, 0x0000000f, 0 },
+	{ X86_FEATURE_CQM_OCCUP_LLC,	CPUID_EDX, 0, 0x0000000f, 1 },
+	{ X86_FEATURE_CQM_MBM_TOTAL,	CPUID_EDX, 1, 0x0000000f, 1 },
+	{ X86_FEATURE_CQM_MBM_LOCAL,	CPUID_EDX, 2, 0x0000000f, 1 },
	{ X86_FEATURE_CAT_L3,		CPUID_EBX, 1, 0x00000010, 0 },
	{ X86_FEATURE_CAT_L2,		CPUID_EBX, 2, 0x00000010, 0 },
	{ X86_FEATURE_CDP_L3,		CPUID_ECX, 2, 0x00000010, 1 },

arch/x86/kernel/cpu/umwait.c (new file, 200 lines):

@@ -0,0 +1,200 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/syscore_ops.h>
#include <linux/suspend.h>
#include <linux/cpu.h>

#include <asm/msr.h>

#define UMWAIT_C02_ENABLE	0

#define UMWAIT_CTRL_VAL(max_time, c02_disable)				\
	(((max_time) & MSR_IA32_UMWAIT_CONTROL_TIME_MASK) |		\
	((c02_disable) & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE))

/*
 * Cache IA32_UMWAIT_CONTROL MSR. This is a systemwide control. By default,
 * umwait max time is 100000 in TSC-quanta and C0.2 is enabled.
 */
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);

/*
 * Serialize access to umwait_control_cached and IA32_UMWAIT_CONTROL MSR in
 * the sysfs write functions.
 */
static DEFINE_MUTEX(umwait_lock);

static void umwait_update_control_msr(void *unused)
{
	lockdep_assert_irqs_disabled();
	wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0);
}

/*
 * The CPU hotplug callback sets the control MSR to the global control
 * value.
 *
 * Disable interrupts so the read of umwait_control_cached and the WRMSR
 * are protected against a concurrent sysfs write. Otherwise the sysfs
 * write could update the cached value after it had been read on this CPU
 * and issue the IPI before the old value had been written. The IPI would
 * interrupt, write the new value and after return from IPI the previous
 * value would be written by this CPU.
 *
 * With interrupts disabled the upcoming CPU either sees the new control
 * value or the IPI is updating this CPU to the new control value after
 * interrupts have been reenabled.
 */
static int umwait_cpu_online(unsigned int cpu)
{
	local_irq_disable();
	umwait_update_control_msr(NULL);
	local_irq_enable();
	return 0;
}

/*
 * On resume, restore IA32_UMWAIT_CONTROL MSR on the boot processor which
 * is the only active CPU at this time. The MSR is set up on the APs via the
 * CPU hotplug callback.
 *
 * This function is invoked on resume from suspend and hibernation. On
 * resume from suspend the restore should not be required, but we neither
 * trust the firmware nor does it matter if the same value is written
 * again.
 */
static void umwait_syscore_resume(void)
{
	umwait_update_control_msr(NULL);
}

static struct syscore_ops umwait_syscore_ops = {
	.resume	= umwait_syscore_resume,
};

/* sysfs interface */

/*
 * When bit 0 in IA32_UMWAIT_CONTROL MSR is 1, C0.2 is disabled.
 * Otherwise, C0.2 is enabled.
 */
static inline bool umwait_ctrl_c02_enabled(u32 ctrl)
{
	return !(ctrl & MSR_IA32_UMWAIT_CONTROL_C02_DISABLE);
}

static inline u32 umwait_ctrl_max_time(u32 ctrl)
{
	return ctrl & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;
}

static inline void umwait_update_control(u32 maxtime, bool c02_enable)
{
	u32 ctrl = maxtime & MSR_IA32_UMWAIT_CONTROL_TIME_MASK;

	if (!c02_enable)
		ctrl |= MSR_IA32_UMWAIT_CONTROL_C02_DISABLE;

	WRITE_ONCE(umwait_control_cached, ctrl);
	/* Propagate to all CPUs */
	on_each_cpu(umwait_update_control_msr, NULL, 1);
}

static ssize_t
enable_c02_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	u32 ctrl = READ_ONCE(umwait_control_cached);

	return sprintf(buf, "%d\n", umwait_ctrl_c02_enabled(ctrl));
}

static ssize_t enable_c02_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t count)
{
	bool c02_enable;
	u32 ctrl;
	int ret;

	ret = kstrtobool(buf, &c02_enable);
	if (ret)
		return ret;

	mutex_lock(&umwait_lock);

	ctrl = READ_ONCE(umwait_control_cached);
	if (c02_enable != umwait_ctrl_c02_enabled(ctrl))
		umwait_update_control(ctrl, c02_enable);

	mutex_unlock(&umwait_lock);

	return count;
}
static DEVICE_ATTR_RW(enable_c02);

static ssize_t
max_time_show(struct device *kobj, struct device_attribute *attr, char *buf)
{
	u32 ctrl = READ_ONCE(umwait_control_cached);

	return sprintf(buf, "%u\n", umwait_ctrl_max_time(ctrl));
}

static ssize_t max_time_store(struct device *kobj,
			      struct device_attribute *attr,
			      const char *buf, size_t count)
{
	u32 max_time, ctrl;
	int ret;

	ret = kstrtou32(buf, 0, &max_time);
	if (ret)
		return ret;

	/* bits[1:0] must be zero */
	if (max_time & ~MSR_IA32_UMWAIT_CONTROL_TIME_MASK)
		return -EINVAL;

	mutex_lock(&umwait_lock);

	ctrl = READ_ONCE(umwait_control_cached);
	if (max_time != umwait_ctrl_max_time(ctrl))
		umwait_update_control(max_time, umwait_ctrl_c02_enabled(ctrl));

	mutex_unlock(&umwait_lock);

	return count;
}
static DEVICE_ATTR_RW(max_time);

static struct attribute *umwait_attrs[] = {
	&dev_attr_enable_c02.attr,
	&dev_attr_max_time.attr,
	NULL
};

static struct attribute_group umwait_attr_group = {
	.attrs = umwait_attrs,
	.name = "umwait_control",
};

static int __init umwait_init(void)
{
	struct device *dev;
	int ret;

	if (!boot_cpu_has(X86_FEATURE_WAITPKG))
		return -ENODEV;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
				umwait_cpu_online, NULL);
	if (ret < 0)
		return ret;

	register_syscore_ops(&umwait_syscore_ops);

	/*
	 * Add umwait control interface. Ignore failure, so at least the
	 * default values are set up in case the machine manages to boot.
	 */
	dev = cpu_subsys.dev_root;
	return sysfs_create_group(&dev->kobj, &umwait_attr_group);
}
device_initcall(umwait_init);
arch/x86/kernel/cpu/zhaoxin.c (new file, 167 lines):

@@ -0,0 +1,167 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/sched.h>
#include <linux/sched/clock.h>

#include <asm/cpufeature.h>

#include "cpu.h"

#define MSR_ZHAOXIN_FCR57 0x00001257

#define ACE_PRESENT	(1 << 6)
#define ACE_ENABLED	(1 << 7)
#define ACE_FCR		(1 << 7)	/* MSR_ZHAOXIN_FCR */

#define RNG_PRESENT	(1 << 2)
#define RNG_ENABLED	(1 << 3)
#define RNG_ENABLE	(1 << 8)	/* MSR_ZHAOXIN_RNG */

#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW	0x00200000
#define X86_VMX_FEATURE_PROC_CTLS_VNMI		0x00400000
#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS	0x80000000
#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC	0x00000001
#define X86_VMX_FEATURE_PROC_CTLS2_EPT		0x00000002
#define X86_VMX_FEATURE_PROC_CTLS2_VPID		0x00000020

static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
{
	u32 lo, hi;

	/* Test for Extended Feature Flags presence */
	if (cpuid_eax(0xC0000000) >= 0xC0000001) {
		u32 tmp = cpuid_edx(0xC0000001);

		/* Enable ACE unit, if present and disabled */
		if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
			rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
			/* Enable ACE unit */
			lo |= ACE_FCR;
			wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
			pr_info("CPU: Enabled ACE h/w crypto\n");
		}

		/* Enable RNG unit, if present and disabled */
		if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
			rdmsr(MSR_ZHAOXIN_FCR57, lo, hi);
			/* Enable RNG unit */
			lo |= RNG_ENABLE;
			wrmsr(MSR_ZHAOXIN_FCR57, lo, hi);
			pr_info("CPU: Enabled h/w RNG\n");
		}

		/*
		 * Store Extended Feature Flags as word 5 of the CPU
		 * capability bit array
		 */
		c->x86_capability[CPUID_C000_0001_EDX] = cpuid_edx(0xC0000001);
	}

	if (c->x86 >= 0x6)
		set_cpu_cap(c, X86_FEATURE_REP_GOOD);

	cpu_detect_cache_sizes(c);
}

static void early_init_zhaoxin(struct cpuinfo_x86 *c)
{
	if (c->x86 >= 0x6)
		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
#ifdef CONFIG_X86_64
	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
#endif
	if (c->x86_power & (1 << 8)) {
		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
	}

	if (c->cpuid_level >= 0x00000001) {
		u32 eax, ebx, ecx, edx;

		cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
		/*
		 * If HTT (EDX[28]) is set, EBX[16:23] contains the number of
		 * apicids which are reserved per package. Store the resulting
		 * shift value for the package management code.
		 */
		if (edx & (1U << 28))
			c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
	}
}

static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
{
	u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;

	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
	msr_ctl = vmx_msr_high | vmx_msr_low;

	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
		set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
		set_cpu_cap(c, X86_FEATURE_VNMI);
	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		      vmx_msr_low, vmx_msr_high);
		msr_ctl2 = vmx_msr_high | vmx_msr_low;
		if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
		    (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
			set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
			set_cpu_cap(c, X86_FEATURE_EPT);
		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
			set_cpu_cap(c, X86_FEATURE_VPID);
	}
}

static void init_zhaoxin(struct cpuinfo_x86 *c)
{
	early_init_zhaoxin(c);
	init_intel_cacheinfo(c);
	detect_num_cpu_cores(c);
#ifdef CONFIG_X86_32
	detect_ht(c);
#endif

	if (c->cpuid_level > 9) {
		unsigned int eax = cpuid_eax(10);

		/*
		 * Check for version and the number of counters:
		 * version (eax[7:0]) can't be 0;
		 * counters (eax[15:8]) should be greater than 1.
		 */
		if ((eax & 0xff) && (((eax >> 8) & 0xff) > 1))
			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
	}

	if (c->x86 >= 0x6)
		init_zhaoxin_cap(c);
#ifdef CONFIG_X86_64
	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif

	if (cpu_has(c, X86_FEATURE_VMX))
		zhaoxin_detect_vmx_virtcap(c);
}

#ifdef CONFIG_X86_32
static unsigned int
zhaoxin_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
	return size;
}
#endif

static const struct cpu_dev zhaoxin_cpu_dev = {
	.c_vendor	= "zhaoxin",
	.c_ident	= { " Shanghai " },
	.c_early_init	= early_init_zhaoxin,
	.c_init		= init_zhaoxin,
#ifdef CONFIG_X86_32
	.legacy_cache_size = zhaoxin_size_cache,
#endif
	.c_x86_vendor	= X86_VENDOR_ZHAOXIN,
};

cpu_dev_register(zhaoxin_cpu_dev);
@@ -161,6 +161,40 @@ enum which_selector {
	GS
 };

+/*
+ * Out of line to be protected from kprobes. It is not used on Xen
+ * paravirt. When paravirt support is needed, it needs to be renamed
+ * with native_ prefix.
+ */
+static noinline unsigned long __rdgsbase_inactive(void)
+{
+	unsigned long gsbase;
+
+	lockdep_assert_irqs_disabled();
+
+	native_swapgs();
+	gsbase = rdgsbase();
+	native_swapgs();
+
+	return gsbase;
+}
+NOKPROBE_SYMBOL(__rdgsbase_inactive);
+
+/*
+ * Out of line to be protected from kprobes. It is not used on Xen
+ * paravirt. When paravirt support is needed, it needs to be renamed
+ * with native_ prefix.
+ */
+static noinline void __wrgsbase_inactive(unsigned long gsbase)
+{
+	lockdep_assert_irqs_disabled();
+
+	native_swapgs();
+	wrgsbase(gsbase);
+	native_swapgs();
+}
+NOKPROBE_SYMBOL(__wrgsbase_inactive);
+
 /*
  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
  * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
@@ -210,8 +244,22 @@ static __always_inline void save_fsgs(struct task_struct *task)
 {
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
-	save_base_legacy(task, task->thread.fsindex, FS);
-	save_base_legacy(task, task->thread.gsindex, GS);
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		unsigned long flags;
+
+		/*
+		 * If FSGSBASE is enabled, we can't make any useful guesses
+		 * about the base, and user code expects us to save the current
+		 * value.  Fortunately, reading the base directly is efficient.
+		 */
+		task->thread.fsbase = rdfsbase();
+		local_irq_save(flags);
+		task->thread.gsbase = __rdgsbase_inactive();
+		local_irq_restore(flags);
+	} else {
+		save_base_legacy(task, task->thread.fsindex, FS);
+		save_base_legacy(task, task->thread.gsindex, GS);
+	}
 }

 #if IS_ENABLED(CONFIG_KVM)
@@ -290,10 +338,22 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
 {
-	load_seg_legacy(prev->fsindex, prev->fsbase,
-			next->fsindex, next->fsbase, FS);
-	load_seg_legacy(prev->gsindex, prev->gsbase,
-			next->gsindex, next->gsbase, GS);
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		/* Update the FS and GS selectors if they could have changed. */
+		if (unlikely(prev->fsindex || next->fsindex))
+			loadseg(FS, next->fsindex);
+		if (unlikely(prev->gsindex || next->gsindex))
+			loadseg(GS, next->gsindex);
+
+		/* Update the bases. */
+		wrfsbase(next->fsbase);
+		__wrgsbase_inactive(next->gsbase);
+	} else {
+		load_seg_legacy(prev->fsindex, prev->fsbase,
+				next->fsindex, next->fsbase, FS);
+		load_seg_legacy(prev->gsindex, prev->gsbase,
+				next->gsindex, next->gsbase, GS);
+	}
 }

 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
@@ -339,13 +399,46 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
	return base;
 }

+unsigned long x86_gsbase_read_cpu_inactive(void)
+{
+	unsigned long gsbase;
+
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		unsigned long flags;
+
+		/* Interrupts are disabled here. */
+		local_irq_save(flags);
+		gsbase = __rdgsbase_inactive();
+		local_irq_restore(flags);
+	} else {
+		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+	}
+
+	return gsbase;
+}
+
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+		unsigned long flags;
+
+		/* Interrupts are disabled here. */
+		local_irq_save(flags);
+		__wrgsbase_inactive(gsbase);
+		local_irq_restore(flags);
+	} else {
+		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+	}
+}
+
 unsigned long x86_fsbase_read_task(struct task_struct *task)
 {
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
-	else if (task->thread.fsindex == 0)
+	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+		 (task->thread.fsindex == 0))
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
@@ -359,7 +452,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
-	else if (task->thread.gsindex == 0)
+	else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+		 (task->thread.gsindex == 0))
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
@@ -399,10 +493,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap_ptr = NULL;

-	savesegment(gs, p->thread.gsindex);
-	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
-	savesegment(fs, p->thread.fsindex);
-	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
+	save_fsgs(me);
+	p->thread.fsindex = me->thread.fsindex;
+	p->thread.fsbase = me->thread.fsbase;
+	p->thread.gsindex = me->thread.gsindex;
+	p->thread.gsbase = me->thread.gsbase;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
@@ -397,22 +397,12 @@ static int putreg(struct task_struct *child,
	case offsetof(struct user_regs_struct,fs_base):
		if (value >= TASK_SIZE_MAX)
			return -EIO;
-		/*
-		 * When changing the FS base, use do_arch_prctl_64()
-		 * to set the index to zero and to set the base
-		 * as requested.
-		 */
-		if (child->thread.fsbase != value)
-			return do_arch_prctl_64(child, ARCH_SET_FS, value);
+		x86_fsbase_write_task(child, value);
		return 0;
	case offsetof(struct user_regs_struct,gs_base):
-		/*
-		 * Exactly the same here as the %fs handling above.
-		 */
		if (value >= TASK_SIZE_MAX)
			return -EIO;
-		if (child->thread.gsbase != value)
-			return do_arch_prctl_64(child, ARCH_SET_GS, value);
+		x86_gsbase_write_task(child, value);
		return 0;
 #endif
	}
@@ -47,8 +47,6 @@ static const struct cpuid_reg reverse_cpuid[] = {
	[CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
	[CPUID_7_0_EBX]       = {         7, 0, CPUID_EBX},
	[CPUID_D_1_EAX]       = {       0xd, 1, CPUID_EAX},
-	[CPUID_F_0_EDX]       = {       0xf, 0, CPUID_EDX},
-	[CPUID_F_1_EDX]       = {       0xf, 1, CPUID_EDX},
	[CPUID_8000_0008_EBX] = {0x80000008, 0, CPUID_EBX},
	[CPUID_6_EAX]         = {         6, 0, CPUID_EAX},
	[CPUID_8000_000A_EDX] = {0x8000000a, 0, CPUID_EDX},
@@ -64,6 +64,7 @@ static void power_saving_mwait_init(void)
	case X86_VENDOR_HYGON:
	case X86_VENDOR_AMD:
	case X86_VENDOR_INTEL:
+	case X86_VENDOR_ZHAOXIN:
		/*
		 * AMD Fam10h TSC will tick in all
		 * C/P/S0/S1 states when this bit is set.
@@ -196,6 +196,7 @@ static void tsc_check_state(int state)
	case X86_VENDOR_AMD:
	case X86_VENDOR_INTEL:
	case X86_VENDOR_CENTAUR:
+	case X86_VENDOR_ZHAOXIN:
		/*
		 * AMD Fam10h TSC will tick in all
		 * C/P/S0/S1 states when this bit is set.
@@ -23,6 +23,10 @@
 #include <pthread.h>
 #include <asm/ldt.h>
 #include <sys/mman.h>
+#include <stddef.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include <setjmp.h>

 #ifndef __x86_64__
 # error This test is 64-bit only
@@ -71,6 +75,43 @@ static void sigsegv(int sig, siginfo_t *si, void *ctx_void)

 }

+static jmp_buf jmpbuf;
+
+static void sigill(int sig, siginfo_t *si, void *ctx_void)
+{
+	siglongjmp(jmpbuf, 1);
+}
+
+static bool have_fsgsbase;
+
+static inline unsigned long rdgsbase(void)
+{
+	unsigned long gsbase;
+
+	asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
+
+	return gsbase;
+}
+
+static inline unsigned long rdfsbase(void)
+{
+	unsigned long fsbase;
+
+	asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
+
+	return fsbase;
+}
+
+static inline void wrgsbase(unsigned long gsbase)
+{
+	asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
+}
+
+static inline void wrfsbase(unsigned long fsbase)
+{
+	asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
+}
+
 enum which_base { FS, GS };

 static unsigned long read_base(enum which_base which)
@@ -199,14 +240,16 @@ static void do_remote_base()
	       to_set, hard_zero ? " and clear gs" : "", sel);
 }

-void do_unexpected_base(void)
+static __thread int set_thread_area_entry_number = -1;
+
+static void do_unexpected_base(void)
 {
	/*
	 * The goal here is to try to arrange for GS == 0, GSBASE !=
	 * 0, and for the kernel to think that GSBASE == 0.
	 *
	 * To make the test as reliable as possible, this uses
-	 * explicit descriptorss.  (This is not the only way.  This
+	 * explicit descriptors.  (This is not the only way.  This
	 * could use ARCH_SET_GS with a low, nonzero base, but the
	 * relevant side effect of ARCH_SET_GS could change.)
	 */
@@ -239,7 +282,7 @@ void do_unexpected_base(void)
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
	memcpy(low_desc, &desc, sizeof(desc));

-	low_desc->entry_number = -1;
+	low_desc->entry_number = set_thread_area_entry_number;

	/* 32-bit set_thread_area */
	long ret;
@@ -254,6 +297,8 @@ void do_unexpected_base(void)
		return;
	}
	printf("\tother thread: using GDT slot %d\n", desc.entry_number);
+	set_thread_area_entry_number = desc.entry_number;
+
	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
 }

@@ -265,6 +310,34 @@ void do_unexpected_base(void)
	asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
 }

+void test_wrbase(unsigned short index, unsigned long base)
+{
+	unsigned short newindex;
+	unsigned long newbase;
+
+	printf("[RUN]\tGS = 0x%hx, GSBASE = 0x%lx\n", index, base);
+
+	asm volatile ("mov %0, %%gs" : : "rm" (index));
+	wrgsbase(base);
+
+	remote_base = 0;
+	ftx = 1;
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+	while (ftx != 0)
+		syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+	asm volatile ("mov %%gs, %0" : "=rm" (newindex));
+	newbase = rdgsbase();
+
+	if (newindex == index && newbase == base) {
+		printf("[OK]\tIndex and base were preserved\n");
+	} else {
+		printf("[FAIL]\tAfter switch, GS = 0x%hx and GSBASE = 0x%lx\n",
+		       newindex, newbase);
+		nerrs++;
+	}
+}
+
 static void *threadproc(void *ctx)
 {
	while (1) {
@@ -367,10 +440,93 @@ static void test_unexpected_base(void)
	}
 }

+#define USER_REGS_OFFSET(r) offsetof(struct user_regs_struct, r)
+
+static void test_ptrace_write_gsbase(void)
+{
+	int status;
+	pid_t child = fork();
+
+	if (child < 0)
+		err(1, "fork");
+
+	if (child == 0) {
+		printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n");
+
+		/*
+		 * Use the LDT setup and fetch the GSBASE from the LDT
+		 * by switching to the (nonzero) selector (again)
+		 */
+		do_unexpected_base();
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+
+		if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
+			err(1, "PTRACE_TRACEME");
+
+		raise(SIGTRAP);
+		_exit(0);
+	}
+
+	wait(&status);
+
+	if (WSTOPSIG(status) == SIGTRAP) {
+		unsigned long gs, base;
+		unsigned long gs_offset = USER_REGS_OFFSET(gs);
+		unsigned long base_offset = USER_REGS_OFFSET(gs_base);
+
+		gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
+
+		if (gs != 0x7) {
+			nerrs++;
+			printf("[FAIL]\tGS is not prepared with nonzero\n");
+			goto END;
+		}
+
+		if (ptrace(PTRACE_POKEUSER, child, base_offset, 0xFF) != 0)
+			err(1, "PTRACE_POKEUSER");
+
+		gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
+		base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
+
+		/*
+		 * On a non-FSGSBASE system, the nonzero selector will load
+		 * GSBASE (again). But what is tested here is whether the
+		 * selector value is changed or not by the GSBASE write in
+		 * a ptracer.
+		 */
+		if (gs != 0x7) {
+			nerrs++;
+			printf("[FAIL]\tGS changed to %lx\n", gs);
+		} else if (have_fsgsbase && (base != 0xFF)) {
+			nerrs++;
+			printf("[FAIL]\tGSBASE changed to %lx\n", base);
+		} else {
+			printf("[OK]\tGS remained 0x7");
+			if (have_fsgsbase)
+				printf(" and GSBASE changed to 0xFF");
+			printf("\n");
+		}
+	}
+
+END:
+	ptrace(PTRACE_CONT, child, NULL, NULL);
+}
+
 int main()
 {
	pthread_t thread;

+	/* Probe FSGSBASE */
+	sethandler(SIGILL, sigill, 0);
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		rdfsbase();
+		have_fsgsbase = true;
+		printf("\tFSGSBASE instructions are enabled\n");
+	} else {
+		printf("\tFSGSBASE instructions are disabled\n");
+	}
+	clearhandler(SIGILL);
+
	sethandler(SIGSEGV, sigsegv, 0);

	check_gs_value(0);
@@ -417,11 +573,28 @@ int main()

	test_unexpected_base();

+	if (have_fsgsbase) {
+		unsigned short ss;
+
+		asm volatile ("mov %%ss, %0" : "=rm" (ss));
+
+		test_wrbase(0, 0);
+		test_wrbase(0, 1);
+		test_wrbase(0, 0x200000000);
+		test_wrbase(0, 0xffffffffffffffff);
+		test_wrbase(ss, 0);
+		test_wrbase(ss, 1);
+		test_wrbase(ss, 0x200000000);
+		test_wrbase(ss, 0xffffffffffffffff);
+	}
+
	ftx = 3;  /* Kill the thread. */
	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);

	if (pthread_join(thread, NULL) != 0)
		err(1, "pthread_join");

+	test_ptrace_write_gsbase();
+
	return nerrs == 0 ? 0 : 1;
 }