Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git

commit ea2356f802
@ -194,8 +194,6 @@ is applicable::
	WDT	Watchdog support is enabled.
	X86-32	X86-32, aka i386 architecture is enabled.
	X86-64	X86-64 architecture is enabled.
		More X86-64 boot options can be found in
		Documentation/arch/x86/x86_64/boot-options.rst.
	X86	Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
	X86_UV	SGI UV support is enabled.
	XEN	Xen support is enabled
@ -213,7 +211,6 @@ Do not modify the syntax of boot loader parameters without extreme
need or coordination with <Documentation/arch/x86/boot.rst>.

There are also arch-specific kernel-parameters not documented here.
See for example <Documentation/arch/x86/x86_64/boot-options.rst>.

Note that ALL kernel parameters listed below are CASE SENSITIVE, and that
a trailing = on the name of any parameter states that that parameter will
@ -21,6 +21,10 @@
			strictly ACPI specification compliant.
			rsdt -- prefer RSDT over (default) XSDT
			copy_dsdt -- copy DSDT to memory
			nocmcff -- Disable firmware first mode for corrected
				errors. This disables parsing the HEST CMC error
				source to check if firmware has set the FF flag. This
				may result in duplicate corrected error reports.
			nospcr -- disable console in ACPI SPCR table as
				default _serial_ console on ARM64
			For ARM64, ONLY "acpi=off", "acpi=on", "acpi=force" or
@ -405,6 +409,8 @@
			not play well with APC CPU idle - disable it if you have
			APC and your system crashes randomly.

	apic		[APIC,X86-64] Use IO-APIC. Default.

	apic=		[APIC,X86,EARLY] Advanced Programmable Interrupt Controller
			Change the output verbosity while booting
			Format: { quiet (default) | verbose | debug }
@ -424,6 +430,10 @@
			useful so that a dump capture kernel won't be
			shot down by NMI

	apicpmtimer	Do APIC timer calibration using the pmtimer. Implies
			apicmaintimer. Useful when your PIT timer is totally
			broken.

	autoconf=	[IPV6]
			See Documentation/networking/ipv6.rst.
@ -1726,6 +1736,8 @@

			off: Disable GDS mitigation.

	gbpages		[X86] Use GB pages for kernel direct mappings.

	gcov_persist=	[GCOV] When non-zero (default), profiling data for
			kernel modules is saved and remains accessible via
			debugfs, even when the module is unloaded/reloaded.
@ -2008,12 +2020,21 @@

	idle=		[X86,EARLY]
			Format: idle=poll, idle=halt, idle=nomwait
			Poll forces a polling idle loop that can slightly
			improve the performance of waking up an idle CPU, but
			will use a lot of power and make the system run hot.
			Not recommended.

			idle=poll: Don't do power saving in the idle loop
			using HLT, but poll for rescheduling event. This will
			make the CPUs eat a lot more power, but may be useful
			to get slightly better performance in multiprocessor
			benchmarks. It also makes some profiling using
			performance counters more accurate. Please note that
			on systems with MONITOR/MWAIT support (like Intel
			EM64T CPUs) this option has no performance advantage
			over the normal idle loop. It may also interact badly
			with hyperthreading.

			idle=halt: Halt is forced to be used for CPU idle.
			In such case C2/C3 won't be used again.

			idle=nomwait: Disable mwait for CPU C-states

	idxd.sva=	[HW]
@ -2311,20 +2332,73 @@
			relaxed

	iommu=		[X86,EARLY]

			off
				Don't initialize and use any kind of IOMMU.

			force
				Force the use of the hardware IOMMU even when
				it is not actually needed (e.g. because < 3 GB
				memory).

			noforce
				Don't force hardware IOMMU usage when it is not
				needed. (default).

			biomerge
			panic
			nopanic
			merge
			nomerge

			soft
			pt		[X86]
			nopt		[X86]
			nobypass	[PPC/POWERNV]
			Use software bounce buffering (SWIOTLB) (default for
			Intel machines). This can be used to prevent the usage
			of an available hardware IOMMU.

			[X86]
			pt
			[X86]
			nopt
			[PPC/POWERNV]
			nobypass
				Disable IOMMU bypass, using IOMMU for PCI devices.

			[X86]
			AMD Gart HW IOMMU-specific options:

			<size>
				Set the size of the remapping area in bytes.

			allowed
				Overwrite iommu off workarounds for specific chipsets

			fullflush
				Flush IOMMU on each allocation (default).

			nofullflush
				Don't use IOMMU fullflush.

			memaper[=<order>]
				Allocate an own aperture over RAM with size
				32MB<<order. (default: order=1, i.e. 64MB)

			merge
				Do scatter-gather (SG) merging. Implies "force"
				(experimental).

			nomerge
				Don't do scatter-gather (SG) merging.

			noaperture
				Ask the IOMMU not to touch the aperture for AGP.

			noagp
				Don't initialize the AGP driver and use full aperture.

			panic
				Always panic when IOMMU overflows.

	iommu.forcedac=	[ARM64,X86,EARLY] Control IOVA allocation for PCI devices.
			Format: { "0" | "1" }
			0 - Try to allocate a 32-bit DMA address first, before
@ -2432,7 +2506,9 @@
			specified in the flag list (default: domain):

			nohz
			  Disable the tick when a single task runs.
			  Disable the tick when a single task runs as well as
			  disabling other kernel noises like having RCU callbacks
			  offloaded. This is equivalent to the nohz_full parameter.

			  A residual 1Hz tick is offloaded to workqueues, which you
			  need to affine to housekeeping through the global
@ -3259,9 +3335,77 @@
			devices can be requested on-demand with the
			/dev/loop-control interface.

	mce		[X86-32] Machine Check Exception
	mce=		[X86-{32,64}]

			Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.

			off
				disable machine check

			no_cmci
				disable CMCI(Corrected Machine Check Interrupt) that
				Intel processor supports. Usually this disablement is
				not recommended, but it might be handy if your
				hardware is misbehaving.

				Note that you'll get more problems without CMCI than
				with due to the shared banks, i.e. you might get
				duplicated error logs.

			dont_log_ce
				don't make logs for corrected errors. All events
				reported as corrected are silently cleared by OS. This
				option will be useful if you have no interest in any
				of corrected errors.

			ignore_ce
				disable features for corrected errors, e.g.
				polling timer and CMCI. All events reported as
				corrected are not cleared by OS and remained in its
				error banks.

				Usually this disablement is not recommended, however
				if there is an agent checking/clearing corrected
				errors (e.g. BIOS or hardware monitoring
				applications), conflicting with OS's error handling,
				and you cannot deactivate the agent, then this option
				will be a help.

			no_lmce
				do not opt-in to Local MCE delivery. Use legacy method
				to broadcast MCEs.

			bootlog
				enable logging of machine checks left over from
				booting. Disabled by default on AMD Fam10h and older
				because some BIOS leave bogus ones.

				If your BIOS doesn't do that it's a good idea to
				enable though to make sure you log even machine check
				events that result in a reboot. On Intel systems it is
				enabled by default.

			nobootlog
				disable boot machine check logging.

			monarchtimeout (number)
				sets the time in us to wait for other CPUs on machine
				checks. 0 to disable.

			bios_cmci_threshold
				don't overwrite the bios-set CMCI threshold. This boot
				option prevents Linux from overwriting the CMCI
				threshold set by the bios. Without this option, Linux
				always sets the CMCI threshold to 1. Enabling this may
				make memory predictive failure analysis less effective
				if the bios sets thresholds for memory errors since we
				will not see details for all errors.

			recovery
				force-enable recoverable machine check code paths

			Everything else is in sysfs now.

	mce=option	[X86-64] See Documentation/arch/x86/x86_64/boot-options.rst

	md=		[HW] RAID subsystems devices and level
			See Documentation/admin-guide/md.rst.
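
	(Illustrative aside, not part of the patch: the mce= sub-options listed
	above are given as the value of an mce= token on the kernel command
	line, for example "mce=off", "mce=no_cmci" or "mce=bootlog", exactly as
	described in the text.)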
@ -3887,6 +4031,8 @@
	noapic		[SMP,APIC,EARLY] Tells the kernel to not make use of any
			IOAPICs that may be present in the system.

	noapictimer	[APIC,X86] Don't set up the APIC timer

	noautogroup	Disable scheduler automatic task group creation.

	nocache		[ARM,EARLY]
@ -3934,6 +4080,8 @@
			register save and restore. The kernel will only save
			legacy floating-point registers on task switch.

	nogbpages	[X86] Do not use GB pages for kernel direct mappings.

	no_hash_pointers
			[KNL,EARLY]
			Force pointers printed to the console or buffers to be
@ -3960,6 +4108,8 @@
			the impact of the sleep instructions. This is also
			useful when using JTAG debugger.

	nohpet		[X86] Don't use the HPET timer.

	nohugeiomap	[KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings.

	nohugevmalloc	[KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings.
@ -4111,8 +4261,10 @@

	nosync		[HW,M68K] Disables sync negotiation for all devices.

	no_timer_check	[X86,APIC] Disables the code which tests for
			broken timer IRQ sources.
	no_timer_check	[X86,APIC] Disables the code which tests for broken
			timer IRQ sources, i.e., the IO-APIC timer. This can
			work around problems with incorrect timer
			initialization on some boards.

	no_uaccess_flush
			[PPC,EARLY] Don't flush the L1-D cache after accessing user data.
@ -4192,6 +4344,11 @@
			If given as an integer followed by 'U', it will
			divide each physical node into N emulated nodes.

	numa=noacpi	[X86] Don't parse the SRAT table for NUMA setup

	numa=nohmat	[X86] Don't parse the HMAT table for NUMA setup, or
			soft-reserved memory partitioning.

	numa_balancing=	[KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic
			NUMA balancing.
			Allowed values are enable and disable
@ -5715,6 +5872,55 @@
			reboot_cpu is s[mp]#### with #### being the processor
			to be used for rebooting.

			acpi
				Use the ACPI RESET_REG in the FADT. If ACPI is not
				configured or the ACPI reset does not work, the reboot
				path attempts the reset using the keyboard controller.

			bios
				Use the CPU reboot vector for warm reset

			cold
				Set the cold reboot flag

			default
				There are some built-in platform specific "quirks"
				- you may see: "reboot: <name> series board detected.
				Selecting <type> for reboots." In the case where you
				think the quirk is in error (e.g. you have newer BIOS,
				or newer board) using this option will ignore the
				built-in quirk table, and use the generic default
				reboot actions.

			efi
				Use efi reset_system runtime service. If EFI is not
				configured or the EFI reset does not work, the reboot
				path attempts the reset using the keyboard controller.

			force
				Don't stop other CPUs on reboot. This can make reboot
				more reliable in some cases.

			kbd
				Use the keyboard controller. cold reset (default)

			pci
				Use a write to the PCI config space register 0xcf9 to
				trigger reboot.

			triple
				Force a triple fault (init)

			warm
				Don't set the cold reboot flag

				Using warm reset will be much faster especially on big
				memory systems because the BIOS will not go through
				the memory check. Disadvantage is that not all
				hardware will be completely reinitialized on reboot so
				there may be boot problems on some systems.


	refscale.holdoff= [KNL]
			Set test-start holdoff period. The purpose of
			this parameter is to delay the start of the
@ -6106,7 +6312,16 @@

	serialnumber	[BUGS=X86-32]

	sev=option[,option...] [X86-64] See Documentation/arch/x86/x86_64/boot-options.rst
	sev=option[,option...] [X86-64]

			debug
				Enable debug messages.

			nosnp
				Do not enable SEV-SNP (applies to host/hypervisor
				only). Setting 'nosnp' avoids the RMP check overhead
				in memory accesses when users do not want to run
				SEV-SNP guests.

	shapers=	[NET]
			Maximal number of shapers.
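
	(Illustrative aside, not part of the patch: following the
	"sev=option[,option...]" format above, a host kernel can be booted
	with e.g. "sev=debug", "sev=nosnp", or both combined as
	"sev=debug,nosnp".)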
@ -130,8 +130,126 @@ SNP feature support.

More details in AMD64 APM[1] Vol 2: 15.34.10 SEV_STATUS MSR

Reverse Map Table (RMP)
=======================

The RMP is a structure in system memory that is used to ensure a one-to-one
mapping between system physical addresses and guest physical addresses. Each
page of memory that is potentially assignable to guests has one entry within
the RMP.

The RMP table can be either contiguous in memory or a collection of segments
in memory.

Contiguous RMP
--------------

Support for this form of the RMP is present when support for SEV-SNP is
present, which can be determined using the CPUID instruction::

  0x8000001f[eax]:
          Bit[4] indicates support for SEV-SNP

The location of the RMP is identified to the hardware through two MSRs::

  0xc0010132 (RMP_BASE):
          System physical address of the first byte of the RMP

  0xc0010133 (RMP_END):
          System physical address of the last byte of the RMP

Hardware requires that RMP_BASE and (RMP_END + 1) be 8KB aligned, but SEV
firmware increases the alignment requirement to require a 1MB alignment.

The RMP consists of a 16KB region used for processor bookkeeping followed
by the RMP entries, which are 16 bytes in size. The size of the RMP
determines the range of physical memory that the hypervisor can assign to
SEV-SNP guests. The RMP covers the system physical address from::

  0 to ((RMP_END + 1 - RMP_BASE - 16KB) / 16B) x 4KB.

The current Linux support relies on BIOS to allocate/reserve the memory for
the RMP and to set RMP_BASE and RMP_END appropriately. Linux uses the MSR
values to locate the RMP and determine the size of the RMP. The RMP must
cover all of system memory in order for Linux to enable SEV-SNP.
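
As an illustrative aside (not part of the patch), the coverage formula above
can be checked with a few lines of plain C; the RMP_BASE/RMP_END values used
here are hypothetical and chosen only to make the arithmetic visible::

  /* Sketch: how much memory a contiguous RMP covers, per the formula above. */
  #include <stdint.h>
  #include <stdio.h>

  #define RMP_BOOKKEEPING (16 * 1024ULL)  /* 16KB processor bookkeeping area    */
  #define RMP_ENTRY_SIZE  16ULL           /* one 16-byte RMP entry per 4KB page */
  #define PAGE_4K         (4 * 1024ULL)

  int main(void)
  {
          /* Hypothetical, 1MB-aligned values sized to cover 64GB of RAM */
          uint64_t rmp_base = 0x100000000ULL;
          uint64_t rmp_end  = rmp_base + RMP_BOOKKEEPING +
                              ((64ULL << 30) / PAGE_4K) * RMP_ENTRY_SIZE - 1;
          uint64_t covered  = (rmp_end + 1 - rmp_base - RMP_BOOKKEEPING) /
                              RMP_ENTRY_SIZE * PAGE_4K;

          printf("RMP covers physical addresses 0 .. %#llx\n",
                 (unsigned long long)covered);  /* prints 0x1000000000 (64GB) */
          return 0;
  }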

Segmented RMP
-------------

Segmented RMP support is a new way of representing the layout of an RMP.
Initial RMP support required the RMP table to be contiguous in memory.
RMP accesses from a NUMA node on which the RMP doesn't reside
can take longer than accesses from a NUMA node on which the RMP resides.
Segmented RMP support allows the RMP entries to be located on the same
node as the memory the RMP is covering, potentially reducing latency
associated with accessing an RMP entry associated with the memory. Each
RMP segment covers a specific range of system physical addresses.

Support for this form of the RMP can be determined using the CPUID
instruction::

  0x8000001f[eax]:
          Bit[23] indicates support for segmented RMP

If supported, segmented RMP attributes can be found using the CPUID
instruction::

  0x80000025[eax]:
          Bits[5:0]  minimum supported RMP segment size
          Bits[11:6] maximum supported RMP segment size

  0x80000025[ebx]:
          Bits[9:0]  number of cacheable RMP segment definitions
          Bit[10]    indicates if the number of cacheable RMP segments
                     is a hard limit

To enable a segmented RMP, a new MSR is available::

  0xc0010136 (RMP_CFG):
          Bit[0]     indicates if segmented RMP is enabled
          Bits[13:8] contains the size of memory covered by an RMP
                     segment (expressed as a power of 2)

The RMP segment size defined in the RMP_CFG MSR applies to all segments
of the RMP. Therefore each RMP segment covers a specific range of system
physical addresses. For example, if the RMP_CFG MSR value is 0x2401, then
the RMP segment coverage value is 0x24 => 36, meaning the size of memory
covered by an RMP segment is 64GB (1 << 36). So the first RMP segment
covers physical addresses from 0 to 0xF_FFFF_FFFF, the second RMP segment
covers physical addresses from 0x10_0000_0000 to 0x1F_FFFF_FFFF, etc.

When a segmented RMP is enabled, RMP_BASE points to the RMP bookkeeping
area as it does today (16K in size). However, instead of RMP entries
beginning immediately after the bookkeeping area, there is a 4K RMP
segment table (RST). Each entry in the RST is 8-bytes in size and represents
an RMP segment::

  Bits[19:0]  mapped size (in GB)
              The mapped size can be less than the defined segment size.
              A value of zero indicates that no RMP exists for the range
              of system physical addresses associated with this segment.
  Bits[51:20] segment physical address
              This address is left shift 20-bits (or just masked when
              read) to form the physical address of the segment (1MB
              alignment).
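
As an illustrative aside (not part of the patch), the two fields of an RST
entry can be extracted with simple masks; the entry value below is invented::

  /* Sketch: pull the mapped size and segment address out of one RST entry. */
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint64_t rst_entry = 0x0000123450000040ULL;  /* hypothetical RST entry */
          uint64_t mapped_gb = rst_entry & 0xfffffULL; /* Bits[19:0]: size in GB */
          uint64_t seg_paddr = rst_entry & 0x000ffffffff00000ULL; /* Bits[51:20]  */

          if (!mapped_gb)
                  printf("no RMP for this segment's address range\n");
          else
                  printf("segment RMP at %#llx maps %llu GB\n",
                         (unsigned long long)seg_paddr,
                         (unsigned long long)mapped_gb);
          return 0;
  }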

The RST can hold 512 segment entries but can be limited in size to the number
of cacheable RMP segments (CPUID 0x80000025_EBX[9:0]) if the number of cacheable
RMP segments is a hard limit (CPUID 0x80000025_EBX[10]).

The current Linux support relies on BIOS to allocate/reserve the memory for
the segmented RMP (the bookkeeping area, RST, and all segments), build the RST
and to set RMP_BASE, RMP_END, and RMP_CFG appropriately. Linux uses the MSR
values to locate the RMP and determine the size and location of the RMP
segments. The RMP must cover all of system memory in order for Linux to enable
SEV-SNP.

More details in the AMD64 APM Vol 2, section "15.36.3 Reverse Map Table",
docID: 24593.

Secure VM Service Module (SVSM)
===============================

SNP provides a feature called Virtual Machine Privilege Levels (VMPL) which
defines four privilege levels at which guest software can run. The most
privileged level is 0 and numerically higher numbers have lesser privileges.
@ -384,6 +384,16 @@ When monitoring is enabled all MON groups will also contain:
	Available only with debug option. The identifier used by hardware
	for the monitor group. On x86 this is the RMID.

When the "mba_MBps" mount option is used all CTRL_MON groups will also contain:

"mba_MBps_event":
	Reading this file shows which memory bandwidth event is used
	as input to the software feedback loop that keeps memory bandwidth
	below the value specified in the schemata file. Writing the
	name of one of the supported memory bandwidth events found in
	/sys/fs/resctrl/info/L3_MON/mon_features changes the input
	event.

Resource allocation rules
-------------------------
|
||||
|
@ -135,6 +135,10 @@ Thread-related topology information in the kernel:
|
||||
The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
|
||||
"core_id."
|
||||
|
||||
- topology_logical_core_id();
|
||||
|
||||
The logical core ID to which a thread belongs.
|
||||
|
||||
|
||||
|
||||
System topology examples
|
||||
|
@ -1,312 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================
|
||||
AMD64 Specific Boot Options
|
||||
===========================
|
||||
|
||||
There are many others (usually documented in driver documentation), but
|
||||
only the AMD64 specific ones are listed here.
|
||||
|
||||
Machine check
|
||||
=============
|
||||
Please see Documentation/arch/x86/x86_64/machinecheck.rst for sysfs runtime tunables.
|
||||
|
||||
mce=off
|
||||
Disable machine check
|
||||
mce=no_cmci
|
||||
Disable CMCI(Corrected Machine Check Interrupt) that
|
||||
Intel processor supports. Usually this disablement is
|
||||
not recommended, but it might be handy if your hardware
|
||||
is misbehaving.
|
||||
Note that you'll get more problems without CMCI than with
|
||||
due to the shared banks, i.e. you might get duplicated
|
||||
error logs.
|
||||
mce=dont_log_ce
|
||||
Don't make logs for corrected errors. All events reported
|
||||
as corrected are silently cleared by OS.
|
||||
This option will be useful if you have no interest in any
|
||||
of corrected errors.
|
||||
mce=ignore_ce
|
||||
Disable features for corrected errors, e.g. polling timer
|
||||
and CMCI. All events reported as corrected are not cleared
|
||||
by OS and remained in its error banks.
|
||||
Usually this disablement is not recommended, however if
|
||||
there is an agent checking/clearing corrected errors
|
||||
(e.g. BIOS or hardware monitoring applications), conflicting
|
||||
with OS's error handling, and you cannot deactivate the agent,
|
||||
then this option will be a help.
|
||||
mce=no_lmce
|
||||
Do not opt-in to Local MCE delivery. Use legacy method
|
||||
to broadcast MCEs.
|
||||
mce=bootlog
|
||||
Enable logging of machine checks left over from booting.
|
||||
Disabled by default on AMD Fam10h and older because some BIOS
|
||||
leave bogus ones.
|
||||
If your BIOS doesn't do that it's a good idea to enable though
|
||||
to make sure you log even machine check events that result
|
||||
in a reboot. On Intel systems it is enabled by default.
|
||||
mce=nobootlog
|
||||
Disable boot machine check logging.
|
||||
mce=monarchtimeout (number)
|
||||
monarchtimeout:
|
||||
Sets the time in us to wait for other CPUs on machine checks. 0
|
||||
to disable.
|
||||
mce=bios_cmci_threshold
|
||||
Don't overwrite the bios-set CMCI threshold. This boot option
|
||||
prevents Linux from overwriting the CMCI threshold set by the
|
||||
bios. Without this option, Linux always sets the CMCI
|
||||
threshold to 1. Enabling this may make memory predictive failure
|
||||
analysis less effective if the bios sets thresholds for memory
|
||||
errors since we will not see details for all errors.
|
||||
mce=recovery
|
||||
Force-enable recoverable machine check code paths
|
||||
|
||||
nomce (for compatibility with i386)
|
||||
same as mce=off
|
||||
|
||||
Everything else is in sysfs now.
|
||||
|
||||
APICs
|
||||
=====
|
||||
|
||||
apic
|
||||
Use IO-APIC. Default
|
||||
|
||||
noapic
|
||||
Don't use the IO-APIC.
|
||||
|
||||
disableapic
|
||||
Don't use the local APIC
|
||||
|
||||
nolapic
|
||||
Don't use the local APIC (alias for i386 compatibility)
|
||||
|
||||
pirq=...
|
||||
See Documentation/arch/x86/i386/IO-APIC.rst
|
||||
|
||||
noapictimer
|
||||
Don't set up the APIC timer
|
||||
|
||||
no_timer_check
|
||||
Don't check the IO-APIC timer. This can work around
|
||||
problems with incorrect timer initialization on some boards.
|
||||
|
||||
apicpmtimer
|
||||
Do APIC timer calibration using the pmtimer. Implies
|
||||
apicmaintimer. Useful when your PIT timer is totally broken.
|
||||
|
||||
Timing
|
||||
======
|
||||
|
||||
notsc
|
||||
Deprecated, use tsc=unstable instead.
|
||||
|
||||
nohpet
|
||||
Don't use the HPET timer.
|
||||
|
||||
Idle loop
|
||||
=========
|
||||
|
||||
idle=poll
|
||||
Don't do power saving in the idle loop using HLT, but poll for rescheduling
|
||||
event. This will make the CPUs eat a lot more power, but may be useful
|
||||
to get slightly better performance in multiprocessor benchmarks. It also
|
||||
makes some profiling using performance counters more accurate.
|
||||
Please note that on systems with MONITOR/MWAIT support (like Intel EM64T
|
||||
CPUs) this option has no performance advantage over the normal idle loop.
|
||||
It may also interact badly with hyperthreading.
|
||||
|
||||
Rebooting
|
||||
=========
|
||||
|
||||
reboot=b[ios] | t[riple] | k[bd] | a[cpi] | e[fi] | p[ci] [, [w]arm | [c]old]
|
||||
bios
|
||||
Use the CPU reboot vector for warm reset
|
||||
warm
|
||||
Don't set the cold reboot flag
|
||||
cold
|
||||
Set the cold reboot flag
|
||||
triple
|
||||
Force a triple fault (init)
|
||||
kbd
|
||||
Use the keyboard controller. cold reset (default)
|
||||
acpi
|
||||
Use the ACPI RESET_REG in the FADT. If ACPI is not configured or
|
||||
the ACPI reset does not work, the reboot path attempts the reset
|
||||
using the keyboard controller.
|
||||
efi
|
||||
Use efi reset_system runtime service. If EFI is not configured or
|
||||
the EFI reset does not work, the reboot path attempts the reset using
|
||||
the keyboard controller.
|
||||
pci
|
||||
Use a write to the PCI config space register 0xcf9 to trigger reboot.
|
||||
|
||||
Using warm reset will be much faster especially on big memory
|
||||
systems because the BIOS will not go through the memory check.
|
||||
Disadvantage is that not all hardware will be completely reinitialized
|
||||
on reboot so there may be boot problems on some systems.
|
||||
|
||||
reboot=force
|
||||
Don't stop other CPUs on reboot. This can make reboot more reliable
|
||||
in some cases.
|
||||
|
||||
reboot=default
|
||||
There are some built-in platform specific "quirks" - you may see:
|
||||
"reboot: <name> series board detected. Selecting <type> for reboots."
|
||||
In the case where you think the quirk is in error (e.g. you have
|
||||
newer BIOS, or newer board) using this option will ignore the built-in
|
||||
quirk table, and use the generic default reboot actions.
|
||||
|
||||
NUMA
|
||||
====
|
||||
|
||||
numa=off
|
||||
Only set up a single NUMA node spanning all memory.
|
||||
|
||||
numa=noacpi
|
||||
Don't parse the SRAT table for NUMA setup
|
||||
|
||||
numa=nohmat
|
||||
Don't parse the HMAT table for NUMA setup, or soft-reserved memory
|
||||
partitioning.
|
||||
|
||||
ACPI
|
||||
====
|
||||
|
||||
acpi=off
|
||||
Don't enable ACPI
|
||||
acpi=ht
|
||||
Use ACPI boot table parsing, but don't enable ACPI interpreter
|
||||
acpi=force
|
||||
Force ACPI on (currently not needed)
|
||||
acpi=strict
|
||||
Disable out of spec ACPI workarounds.
|
||||
acpi_sci={edge,level,high,low}
|
||||
Set up ACPI SCI interrupt.
|
||||
acpi=noirq
|
||||
Don't route interrupts
|
||||
acpi=nocmcff
|
||||
Disable firmware first mode for corrected errors. This
|
||||
disables parsing the HEST CMC error source to check if
|
||||
firmware has set the FF flag. This may result in
|
||||
duplicate corrected error reports.
|
||||
|
||||
PCI
|
||||
===
|
||||
|
||||
pci=off
|
||||
Don't use PCI
|
||||
pci=conf1
|
||||
Use conf1 access.
|
||||
pci=conf2
|
||||
Use conf2 access.
|
||||
pci=rom
|
||||
Assign ROMs.
|
||||
pci=assign-busses
|
||||
Assign busses
|
||||
pci=irqmask=MASK
|
||||
Set PCI interrupt mask to MASK
|
||||
pci=lastbus=NUMBER
|
||||
Scan up to NUMBER busses, no matter what the mptable says.
|
||||
pci=noacpi
|
||||
Don't use ACPI to set up PCI interrupt routing.
|
||||
|
||||
IOMMU (input/output memory management unit)
|
||||
===========================================
|
||||
Multiple x86-64 PCI-DMA mapping implementations exist, for example:
|
||||
|
||||
1. <kernel/dma/direct.c>: use no hardware/software IOMMU at all
|
||||
(e.g. because you have < 3 GB memory).
|
||||
Kernel boot message: "PCI-DMA: Disabling IOMMU"
|
||||
|
||||
2. <arch/x86/kernel/amd_gart_64.c>: AMD GART based hardware IOMMU.
|
||||
Kernel boot message: "PCI-DMA: using GART IOMMU"
|
||||
|
||||
3. <arch/x86_64/kernel/pci-swiotlb.c> : Software IOMMU implementation. Used
|
||||
e.g. if there is no hardware IOMMU in the system and it is need because
|
||||
you have >3GB memory or told the kernel to us it (iommu=soft))
|
||||
Kernel boot message: "PCI-DMA: Using software bounce buffering
|
||||
for IO (SWIOTLB)"
|
||||
|
||||
::
|
||||
|
||||
iommu=[<size>][,noagp][,off][,force][,noforce]
|
||||
[,memaper[=<order>]][,merge][,fullflush][,nomerge]
|
||||
[,noaperture]
|
||||
|
||||
General iommu options:
|
||||
|
||||
off
|
||||
Don't initialize and use any kind of IOMMU.
|
||||
noforce
|
||||
Don't force hardware IOMMU usage when it is not needed. (default).
|
||||
force
|
||||
Force the use of the hardware IOMMU even when it is
|
||||
not actually needed (e.g. because < 3 GB memory).
|
||||
soft
|
||||
Use software bounce buffering (SWIOTLB) (default for
|
||||
Intel machines). This can be used to prevent the usage
|
||||
of an available hardware IOMMU.
|
||||
|
||||
iommu options only relevant to the AMD GART hardware IOMMU:
|
||||
|
||||
<size>
|
||||
Set the size of the remapping area in bytes.
|
||||
allowed
|
||||
Overwrite iommu off workarounds for specific chipsets.
|
||||
fullflush
|
||||
Flush IOMMU on each allocation (default).
|
||||
nofullflush
|
||||
Don't use IOMMU fullflush.
|
||||
memaper[=<order>]
|
||||
Allocate an own aperture over RAM with size 32MB<<order.
|
||||
(default: order=1, i.e. 64MB)
|
||||
merge
|
||||
Do scatter-gather (SG) merging. Implies "force" (experimental).
|
||||
nomerge
|
||||
Don't do scatter-gather (SG) merging.
|
||||
noaperture
|
||||
Ask the IOMMU not to touch the aperture for AGP.
|
||||
noagp
|
||||
Don't initialize the AGP driver and use full aperture.
|
||||
panic
|
||||
Always panic when IOMMU overflows.
|
||||
|
||||
iommu options only relevant to the software bounce buffering (SWIOTLB) IOMMU
|
||||
implementation:
|
||||
|
||||
swiotlb=<slots>[,force,noforce]
|
||||
<slots>
|
||||
Prereserve that many 2K slots for the software IO bounce buffering.
|
||||
force
|
||||
Force all IO through the software TLB.
|
||||
noforce
|
||||
Do not initialize the software TLB.
|
||||
|
||||
|
||||
Miscellaneous
|
||||
=============
|
||||
|
||||
nogbpages
|
||||
Do not use GB pages for kernel direct mappings.
|
||||
gbpages
|
||||
Use GB pages for kernel direct mappings.
|
||||
|
||||
|
||||
AMD SEV (Secure Encrypted Virtualization)
|
||||
=========================================
|
||||
Options relating to AMD SEV, specified via the following format:
|
||||
|
||||
::
|
||||
|
||||
sev=option1[,option2]
|
||||
|
||||
The available options are:
|
||||
|
||||
debug
|
||||
Enable debug messages.
|
||||
|
||||
nosnp
|
||||
Do not enable SEV-SNP (applies to host/hypervisor only). Setting
|
||||
'nosnp' avoids the RMP check overhead in memory accesses when
|
||||
users do not want to run SEV-SNP guests.
|
@ -18,7 +18,7 @@ For more information on the features of cpusets, see
Documentation/admin-guide/cgroup-v1/cpusets.rst.
There are a number of different configurations you can use for your needs. For
more information on the numa=fake command line option and its various ways of
configuring fake nodes, see Documentation/arch/x86/x86_64/boot-options.rst.
configuring fake nodes, see Documentation/admin-guide/kernel-parameters.txt

For the purposes of this introduction, we'll assume a very primitive NUMA
emulation setup of "numa=fake=4*512,". This will split our system memory into
|
@ -7,7 +7,6 @@ x86_64 Support
.. toctree::
   :maxdepth: 2

   boot-options
   uefi
   mm
   5level-paging
|
@ -127,29 +127,6 @@ void crash_smp_send_stop(void)
|
||||
cpus_stopped = 1;
|
||||
}
|
||||
|
||||
static void machine_kexec_mask_interrupts(void)
|
||||
{
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
void machine_crash_shutdown(struct pt_regs *regs)
|
||||
{
|
||||
local_irq_disable();
|
||||
|
@ -149,6 +149,7 @@ config ARM64
|
||||
select GENERIC_IDLE_POLL_SETUP
|
||||
select GENERIC_IOREMAP
|
||||
select GENERIC_IRQ_IPI
|
||||
select GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
|
||||
select GENERIC_IRQ_PROBE
|
||||
select GENERIC_IRQ_SHOW
|
||||
select GENERIC_IRQ_SHOW_LEVEL
|
||||
|
@ -207,37 +207,6 @@ void machine_kexec(struct kimage *kimage)
|
||||
BUG(); /* Should never get here. */
|
||||
}
|
||||
|
||||
static void machine_kexec_mask_interrupts(void)
|
||||
{
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
int ret;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* First try to remove the active state. If this
|
||||
* fails, try to EOI the interrupt.
|
||||
*/
|
||||
ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
|
||||
|
||||
if (ret && irqd_irq_inprogress(&desc->irq_data) &&
|
||||
chip->irq_eoi)
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* machine_crash_shutdown - shutdown non-crashing cpus and save registers
|
||||
*/
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <asm/break.h>
|
||||
#include <linux/stringify.h>
|
||||
#include <linux/objtool.h>
|
||||
|
||||
#ifndef CONFIG_DEBUG_BUGVERBOSE
|
||||
#define _BUGVERBOSE_LOCATION(file, line)
|
||||
@ -33,25 +34,25 @@
|
||||
|
||||
#define ASM_BUG_FLAGS(flags) \
|
||||
__BUG_ENTRY(flags) \
|
||||
break BRK_BUG
|
||||
break BRK_BUG;
|
||||
|
||||
#define ASM_BUG() ASM_BUG_FLAGS(0)
|
||||
|
||||
#define __BUG_FLAGS(flags) \
|
||||
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)));
|
||||
#define __BUG_FLAGS(flags, extra) \
|
||||
asm_inline volatile (__stringify(ASM_BUG_FLAGS(flags)) \
|
||||
extra);
|
||||
|
||||
#define __WARN_FLAGS(flags) \
|
||||
do { \
|
||||
instrumentation_begin(); \
|
||||
__BUG_FLAGS(BUGFLAG_WARNING|(flags)); \
|
||||
annotate_reachable(); \
|
||||
__BUG_FLAGS(BUGFLAG_WARNING|(flags), ANNOTATE_REACHABLE(10001b));\
|
||||
instrumentation_end(); \
|
||||
} while (0)
|
||||
|
||||
#define BUG() \
|
||||
do { \
|
||||
instrumentation_begin(); \
|
||||
__BUG_FLAGS(0); \
|
||||
__BUG_FLAGS(0, ""); \
|
||||
unreachable(); \
|
||||
} while (0)
|
||||
|
||||
|
@ -61,7 +61,6 @@ struct pt_regs;
|
||||
extern void kexec_smp_wait(void); /* get and clear naca physid, wait for
|
||||
master to copy new code to 0 */
|
||||
extern void default_machine_kexec(struct kimage *image);
|
||||
extern void machine_kexec_mask_interrupts(void);
|
||||
|
||||
void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer,
|
||||
unsigned long start_address) __noreturn;
|
||||
|
@ -22,28 +22,6 @@
|
||||
#include <asm/setup.h>
|
||||
#include <asm/firmware.h>
|
||||
|
||||
void machine_kexec_mask_interrupts(void) {
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CRASH_DUMP
|
||||
void machine_crash_shutdown(struct pt_regs *regs)
|
||||
{
|
||||
|
@ -7,6 +7,7 @@
|
||||
* Copyright (C) 2005 IBM Corporation.
|
||||
*/
|
||||
|
||||
#include <linux/irq.h>
|
||||
#include <linux/kexec.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/string.h>
|
||||
|
@ -114,29 +114,6 @@ void machine_shutdown(void)
|
||||
#endif
|
||||
}
|
||||
|
||||
static void machine_kexec_mask_interrupts(void)
|
||||
{
|
||||
unsigned int i;
|
||||
struct irq_desc *desc;
|
||||
|
||||
for_each_irq_desc(i, desc) {
|
||||
struct irq_chip *chip;
|
||||
|
||||
chip = irq_desc_get_chip(desc);
|
||||
if (!chip)
|
||||
continue;
|
||||
|
||||
if (chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
|
||||
chip->irq_eoi(&desc->irq_data);
|
||||
|
||||
if (chip->irq_mask)
|
||||
chip->irq_mask(&desc->irq_data);
|
||||
|
||||
if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data))
|
||||
chip->irq_disable(&desc->irq_data);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* machine_crash_shutdown - Prepare to kexec after a kernel crash
|
||||
*
|
||||
|
@ -981,7 +981,7 @@ static int cfdiag_push_sample(struct perf_event *event,
|
||||
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
|
||||
raw.frag.size = cpuhw->usedss;
|
||||
raw.frag.data = cpuhw->stop;
|
||||
perf_sample_save_raw_data(&data, &raw);
|
||||
perf_sample_save_raw_data(&data, event, &raw);
|
||||
}
|
||||
|
||||
overflow = perf_event_overflow(event, &data, ®s);
|
||||
|
@ -981,7 +981,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
|
||||
cpuhw->flags &= ~PMU_F_ENABLED;
|
||||
}
|
||||
|
||||
/* perf_exclude_event() - Filter event
|
||||
/* perf_event_exclude() - Filter event
|
||||
* @event: The perf event
|
||||
* @regs: pt_regs structure
|
||||
* @sde_regs: Sample-data-entry (sde) regs structure
|
||||
@ -990,7 +990,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu)
|
||||
*
|
||||
* Return non-zero if the event shall be excluded.
|
||||
*/
|
||||
static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs,
|
||||
static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs,
|
||||
struct perf_sf_sde_regs *sde_regs)
|
||||
{
|
||||
if (event->attr.exclude_user && user_mode(regs))
|
||||
@ -1073,7 +1073,7 @@ static int perf_push_sample(struct perf_event *event,
|
||||
data.tid_entry.pid = basic->hpp & LPP_PID_MASK;
|
||||
|
||||
overflow = 0;
|
||||
if (perf_exclude_event(event, ®s, sde_regs))
|
||||
if (perf_event_exclude(event, ®s, sde_regs))
|
||||
goto out;
|
||||
if (perf_event_overflow(event, &data, ®s)) {
|
||||
overflow = 1;
|
||||
|
@ -478,7 +478,7 @@ static int paicrypt_push_sample(size_t rawsize, struct paicrypt_map *cpump,
|
||||
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
|
||||
raw.frag.size = rawsize;
|
||||
raw.frag.data = cpump->save;
|
||||
perf_sample_save_raw_data(&data, &raw);
|
||||
perf_sample_save_raw_data(&data, event, &raw);
|
||||
}
|
||||
|
||||
overflow = perf_event_overflow(event, &data, ®s);
|
||||
|
@ -503,7 +503,7 @@ static int paiext_push_sample(size_t rawsize, struct paiext_map *cpump,
|
||||
if (event->attr.sample_type & PERF_SAMPLE_RAW) {
|
||||
raw.frag.size = rawsize;
|
||||
raw.frag.data = cpump->save;
|
||||
perf_sample_save_raw_data(&data, &raw);
|
||||
perf_sample_save_raw_data(&data, event, &raw);
|
||||
}
|
||||
|
||||
overflow = perf_event_overflow(event, &data, ®s);
|
||||
|
@ -97,7 +97,7 @@ config IOMMU_DEBUG
|
||||
code. When you use it make sure you have a big enough
|
||||
IOMMU/AGP aperture. Most of the options enabled by this can
|
||||
be set more finegrained using the iommu= command line
|
||||
options. See Documentation/arch/x86/x86_64/boot-options.rst for more
|
||||
options. See Documentation/admin-guide/kernel-parameters.txt for more
|
||||
details.
|
||||
|
||||
config IOMMU_LEAK
|
||||
|
@ -25,10 +25,6 @@
|
||||
#include "efi.h"
|
||||
|
||||
#include <generated/compile.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/uts.h>
|
||||
#include <linux/utsname.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <generated/utsversion.h>
|
||||
#include <generated/utsrelease.h>
|
||||
|
||||
|
@ -777,15 +777,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
|
||||
|
||||
val = sev_es_rd_ghcb_msr();
|
||||
|
||||
if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
|
||||
"Wrong PSC response code: 0x%x\n",
|
||||
(unsigned int)GHCB_RESP_CODE(val)))
|
||||
if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
|
||||
goto e_term;
|
||||
|
||||
if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
|
||||
"Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
|
||||
op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
|
||||
paddr, GHCB_MSR_PSC_RESP_VAL(val)))
|
||||
if (GHCB_MSR_PSC_RESP_VAL(val))
|
||||
goto e_term;
|
||||
|
||||
/* Page validation must be performed after changing to private */
|
||||
@ -821,7 +816,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
|
||||
early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
|
||||
}
|
||||
|
||||
void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
|
||||
void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
|
||||
unsigned long npages)
|
||||
{
|
||||
/*
|
||||
@ -2361,8 +2356,8 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
|
||||
call.rax = SVSM_CORE_CALL(SVSM_CORE_REMAP_CA);
|
||||
call.rcx = pa;
|
||||
ret = svsm_perform_call_protocol(&call);
|
||||
if (ret)
|
||||
panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out);
|
||||
while (ret)
|
||||
cpu_relax(); /* too early to panic */
|
||||
|
||||
RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
|
||||
RIP_REL_REF(boot_svsm_caa_pa) = pa;
|
||||
|
@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void)
|
||||
*
|
||||
* Return: XSAVE area size on success, 0 otherwise.
|
||||
*/
|
||||
static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
|
||||
static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
|
||||
{
|
||||
const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
|
||||
u64 xfeatures_found = 0;
|
||||
@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
|
||||
}
|
||||
|
||||
static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
|
||||
struct cpuid_leaf *leaf)
|
||||
static int __head
|
||||
snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
|
||||
struct cpuid_leaf *leaf)
|
||||
{
|
||||
struct cpuid_leaf leaf_hv = *leaf;
|
||||
|
||||
@ -1243,7 +1244,7 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs
|
||||
__pval_terminate(pfn, action, page_size, ret, svsm_ret);
|
||||
}
|
||||
|
||||
static void svsm_pval_4k_page(unsigned long paddr, bool validate)
|
||||
static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
|
||||
{
|
||||
struct svsm_pvalidate_call *pc;
|
||||
struct svsm_call call = {};
|
||||
@ -1275,12 +1276,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate)
|
||||
|
||||
ret = svsm_perform_call_protocol(&call);
|
||||
if (ret)
|
||||
svsm_pval_terminate(pc, ret, call.rax_out);
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
|
||||
|
||||
native_local_irq_restore(flags);
|
||||
}
|
||||
|
||||
static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate)
|
||||
static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
|
||||
bool validate)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@ -1293,7 +1295,7 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val
|
||||
} else {
|
||||
ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
|
||||
if (ret)
|
||||
__pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0);
|
||||
sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0

obj-y += tdx.o tdx-shared.o tdcall.o
obj-y += debug.o tdcall.o tdx.o tdx-shared.o

arch/x86/coco/tdx/debug.c (new file, 69 lines)
@ -0,0 +1,69 @@
// SPDX-License-Identifier: GPL-2.0

#undef pr_fmt
#define pr_fmt(fmt) "tdx: " fmt

#include <linux/array_size.h>
#include <linux/printk.h>
#include <asm/tdx.h>

#define DEF_TDX_ATTR_NAME(_name) [TDX_ATTR_##_name##_BIT] = __stringify(_name)

static __initdata const char *tdx_attributes[] = {
	DEF_TDX_ATTR_NAME(DEBUG),
	DEF_TDX_ATTR_NAME(HGS_PLUS_PROF),
	DEF_TDX_ATTR_NAME(PERF_PROF),
	DEF_TDX_ATTR_NAME(PMT_PROF),
	DEF_TDX_ATTR_NAME(ICSSD),
	DEF_TDX_ATTR_NAME(LASS),
	DEF_TDX_ATTR_NAME(SEPT_VE_DISABLE),
	DEF_TDX_ATTR_NAME(MIGRTABLE),
	DEF_TDX_ATTR_NAME(PKS),
	DEF_TDX_ATTR_NAME(KL),
	DEF_TDX_ATTR_NAME(TPA),
	DEF_TDX_ATTR_NAME(PERFMON),
};

#define DEF_TD_CTLS_NAME(_name) [TD_CTLS_##_name##_BIT] = __stringify(_name)

static __initdata const char *tdcs_td_ctls[] = {
	DEF_TD_CTLS_NAME(PENDING_VE_DISABLE),
	DEF_TD_CTLS_NAME(ENUM_TOPOLOGY),
	DEF_TD_CTLS_NAME(VIRT_CPUID2),
	DEF_TD_CTLS_NAME(REDUCE_VE),
	DEF_TD_CTLS_NAME(LOCK),
};

void __init tdx_dump_attributes(u64 td_attr)
{
	pr_info("Attributes:");

	for (int i = 0; i < ARRAY_SIZE(tdx_attributes); i++) {
		if (!tdx_attributes[i])
			continue;
		if (td_attr & BIT(i))
			pr_cont(" %s", tdx_attributes[i]);
		td_attr &= ~BIT(i);
	}

	if (td_attr)
		pr_cont(" unknown:%#llx", td_attr);
	pr_cont("\n");

}

void __init tdx_dump_td_ctls(u64 td_ctls)
{
	pr_info("TD_CTLS:");

	for (int i = 0; i < ARRAY_SIZE(tdcs_td_ctls); i++) {
		if (!tdcs_td_ctls[i])
			continue;
		if (td_ctls & BIT(i))
			pr_cont(" %s", tdcs_td_ctls[i]);
		td_ctls &= ~BIT(i);
	}
	if (td_ctls)
		pr_cont(" unknown:%#llx", td_ctls);
	pr_cont("\n");
}
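
(Illustrative aside, not part of the patch: given the pr_fmt/pr_info/pr_cont
calls above, these helpers produce boot log lines roughly of the form
"tdx: Attributes: SEPT_VE_DISABLE" and "tdx: TD_CTLS: PENDING_VE_DISABLE
ENUM_TOPOLOGY", with an " unknown:0x..." suffix appended for any set bits
that have no name in the tables.)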
@ -32,9 +32,6 @@
|
||||
#define VE_GET_PORT_NUM(e) ((e) >> 16)
|
||||
#define VE_IS_IO_STRING(e) ((e) & BIT(4))
|
||||
|
||||
#define ATTR_DEBUG BIT(0)
|
||||
#define ATTR_SEPT_VE_DISABLE BIT(28)
|
||||
|
||||
/* TDX Module call error codes */
|
||||
#define TDCALL_RETURN_CODE(a) ((a) >> 32)
|
||||
#define TDCALL_INVALID_OPERAND 0xc0000100
|
||||
@ -200,14 +197,14 @@ static void __noreturn tdx_panic(const char *msg)
|
||||
*
|
||||
* TDX 1.0 does not allow the guest to disable SEPT #VE on its own. The VMM
|
||||
* controls if the guest will receive such #VE with TD attribute
|
||||
* ATTR_SEPT_VE_DISABLE.
|
||||
* TDX_ATTR_SEPT_VE_DISABLE.
|
||||
*
|
||||
* Newer TDX modules allow the guest to control if it wants to receive SEPT
|
||||
* violation #VEs.
|
||||
*
|
||||
* Check if the feature is available and disable SEPT #VE if possible.
|
||||
*
|
||||
* If the TD is allowed to disable/enable SEPT #VEs, the ATTR_SEPT_VE_DISABLE
|
||||
* If the TD is allowed to disable/enable SEPT #VEs, the TDX_ATTR_SEPT_VE_DISABLE
|
||||
* attribute is no longer reliable. It reflects the initial state of the
|
||||
* control for the TD, but it will not be updated if someone (e.g. bootloader)
|
||||
* changes it before the kernel starts. Kernel must check TDCS_TD_CTLS bit to
|
||||
@ -216,14 +213,14 @@ static void __noreturn tdx_panic(const char *msg)
|
||||
static void disable_sept_ve(u64 td_attr)
|
||||
{
|
||||
const char *msg = "TD misconfiguration: SEPT #VE has to be disabled";
|
||||
bool debug = td_attr & ATTR_DEBUG;
|
||||
bool debug = td_attr & TDX_ATTR_DEBUG;
|
||||
u64 config, controls;
|
||||
|
||||
/* Is this TD allowed to disable SEPT #VE */
|
||||
tdg_vm_rd(TDCS_CONFIG_FLAGS, &config);
|
||||
if (!(config & TDCS_CONFIG_FLEXIBLE_PENDING_VE)) {
|
||||
/* No SEPT #VE controls for the guest: check the attribute */
|
||||
if (td_attr & ATTR_SEPT_VE_DISABLE)
|
||||
if (td_attr & TDX_ATTR_SEPT_VE_DISABLE)
|
||||
return;
|
||||
|
||||
/* Relax SEPT_VE_DISABLE check for debug TD for backtraces */
|
||||
@ -274,6 +271,20 @@ static void enable_cpu_topology_enumeration(void)
|
||||
tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_ENUM_TOPOLOGY, TD_CTLS_ENUM_TOPOLOGY);
|
||||
}
|
||||
|
||||
static void reduce_unnecessary_ve(void)
|
||||
{
|
||||
u64 err = tdg_vm_wr(TDCS_TD_CTLS, TD_CTLS_REDUCE_VE, TD_CTLS_REDUCE_VE);
|
||||
|
||||
if (err == TDX_SUCCESS)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Enabling REDUCE_VE includes ENUM_TOPOLOGY. Only try to
|
||||
* enable ENUM_TOPOLOGY if REDUCE_VE was not successful.
|
||||
*/
|
||||
enable_cpu_topology_enumeration();
|
||||
}
|
||||
|
||||
static void tdx_setup(u64 *cc_mask)
|
||||
{
|
||||
struct tdx_module_args args = {};
|
||||
@ -305,7 +316,8 @@ static void tdx_setup(u64 *cc_mask)
|
||||
tdg_vm_wr(TDCS_NOTIFY_ENABLES, 0, -1ULL);
|
||||
|
||||
disable_sept_ve(td_attr);
|
||||
enable_cpu_topology_enumeration();
|
||||
|
||||
reduce_unnecessary_ve();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1025,6 +1037,20 @@ static void tdx_kexec_finish(void)
|
||||
}
|
||||
}
|
||||
|
||||
static __init void tdx_announce(void)
|
||||
{
|
||||
struct tdx_module_args args = {};
|
||||
u64 controls;
|
||||
|
||||
pr_info("Guest detected\n");
|
||||
|
||||
tdcall(TDG_VP_INFO, &args);
|
||||
tdx_dump_attributes(args.rdx);
|
||||
|
||||
tdg_vm_rd(TDCS_TD_CTLS, &controls);
|
||||
tdx_dump_td_ctls(controls);
|
||||
}
|
||||
|
||||
void __init tdx_early_init(void)
|
||||
{
|
||||
u64 cc_mask;
|
||||
@ -1094,5 +1120,5 @@ void __init tdx_early_init(void)
|
||||
*/
|
||||
x86_cpuinit.parallel_bringup = false;
|
||||
|
||||
pr_info("Guest detected\n");
|
||||
tdx_announce();
|
||||
}
|
||||
|
@ -308,10 +308,9 @@ SYM_CODE_END(xen_error_entry)
|
||||
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
|
||||
.endif
|
||||
|
||||
call \cfunc
|
||||
|
||||
/* For some configurations \cfunc ends up being a noreturn. */
|
||||
REACHABLE
|
||||
ANNOTATE_REACHABLE
|
||||
call \cfunc
|
||||
|
||||
jmp error_return
|
||||
.endm
|
||||
@ -529,10 +528,10 @@ SYM_CODE_START(\asmsym)
|
||||
movq %rsp, %rdi /* pt_regs pointer into first argument */
|
||||
movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
|
||||
movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
|
||||
call \cfunc
|
||||
|
||||
/* For some configurations \cfunc ends up being a noreturn. */
|
||||
REACHABLE
|
||||
ANNOTATE_REACHABLE
|
||||
call \cfunc
|
||||
|
||||
jmp paranoid_exit
|
||||
|
||||
|
@ -1001,8 +1001,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
|
||||
if (!x86_perf_event_set_period(event))
|
||||
continue;
|
||||
|
||||
if (has_branch_stack(event))
|
||||
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
|
||||
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
|
||||
|
||||
if (perf_event_overflow(event, &data, regs))
|
||||
x86_pmu_stop(event, 0);
|
||||
|
@ -31,6 +31,8 @@ static u32 ibs_caps;
|
||||
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
|
||||
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
|
||||
|
||||
/* attr.config2 */
|
||||
#define IBS_SW_FILTER_MASK 1
|
||||
|
||||
/*
|
||||
* IBS states:
|
||||
@ -290,6 +292,16 @@ static int perf_ibs_init(struct perf_event *event)
|
||||
if (has_branch_stack(event))
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* handle exclude_{user,kernel} in the IRQ handler */
|
||||
if (event->attr.exclude_host || event->attr.exclude_guest ||
|
||||
event->attr.exclude_idle)
|
||||
return -EINVAL;
|
||||
|
||||
if (!(event->attr.config2 & IBS_SW_FILTER_MASK) &&
|
||||
(event->attr.exclude_kernel || event->attr.exclude_user ||
|
||||
event->attr.exclude_hv))
|
||||
return -EINVAL;
|
||||
|
||||
ret = validate_group(event);
|
||||
if (ret)
|
||||
return ret;
|
||||
@ -550,24 +562,14 @@ static struct attribute *attrs_empty[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group empty_format_group = {
|
||||
.name = "format",
|
||||
.attrs = attrs_empty,
|
||||
};
|
||||
|
||||
static struct attribute_group empty_caps_group = {
|
||||
.name = "caps",
|
||||
.attrs = attrs_empty,
|
||||
};
|
||||
|
||||
static const struct attribute_group *empty_attr_groups[] = {
&empty_format_group,
&empty_caps_group,
NULL,
};

PMU_FORMAT_ATTR(rand_en, "config:57");
PMU_FORMAT_ATTR(cnt_ctl, "config:19");
PMU_FORMAT_ATTR(swfilt, "config2:0");
PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59");
PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16");
PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1");
@@ -578,8 +580,9 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int
return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0;
}

static struct attribute *rand_en_attrs[] = {
static struct attribute *fetch_attrs[] = {
&format_attr_rand_en.attr,
&format_attr_swfilt.attr,
NULL,
};

@@ -593,9 +596,9 @@ static struct attribute *zen4_ibs_extensions_attrs[] = {
NULL,
};

static struct attribute_group group_rand_en = {
static struct attribute_group group_fetch_formats = {
.name = "format",
.attrs = rand_en_attrs,
.attrs = fetch_attrs,
};

static struct attribute_group group_fetch_l3missonly = {
@@ -611,7 +614,7 @@ static struct attribute_group group_zen4_ibs_extensions = {
};

static const struct attribute_group *fetch_attr_groups[] = {
&group_rand_en,
&group_fetch_formats,
&empty_caps_group,
NULL,
};
@@ -628,6 +631,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i)
return ibs_caps & IBS_CAPS_OPCNT ? attr->mode : 0;
}

static struct attribute *op_attrs[] = {
&format_attr_swfilt.attr,
NULL,
};

static struct attribute *cnt_ctl_attrs[] = {
&format_attr_cnt_ctl.attr,
NULL,
@@ -638,6 +646,11 @@ static struct attribute *op_l3missonly_attrs[] = {
NULL,
};

static struct attribute_group group_op_formats = {
.name = "format",
.attrs = op_attrs,
};

static struct attribute_group group_cnt_ctl = {
.name = "format",
.attrs = cnt_ctl_attrs,
@@ -650,6 +663,12 @@ static struct attribute_group group_op_l3missonly = {
.is_visible = zen4_ibs_extensions_is_visible,
};

static const struct attribute_group *op_attr_groups[] = {
&group_op_formats,
&empty_caps_group,
NULL,
};

static const struct attribute_group *op_attr_update[] = {
&group_cnt_ctl,
&group_op_l3missonly,
@@ -667,7 +686,6 @@ static struct perf_ibs perf_ibs_fetch = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
@@ -691,7 +709,6 @@ static struct perf_ibs perf_ibs_op = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
.capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
@@ -1111,6 +1128,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
regs.flags |= PERF_EFLAGS_EXACT;
}

if ((event->attr.config2 & IBS_SW_FILTER_MASK) &&
perf_exclude_event(event, &regs)) {
throttle = perf_event_account_interrupt(event);
goto out;
}

if (event->attr.sample_type & PERF_SAMPLE_RAW) {
raw = (struct perf_raw_record){
.frag = {
@@ -1118,7 +1141,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
.data = ibs_data.data,
},
};
perf_sample_save_raw_data(&data, &raw);
perf_sample_save_raw_data(&data, event, &raw);
}

if (perf_ibs == &perf_ibs_op)
@@ -1129,8 +1152,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
* recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack.
*/
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
perf_sample_save_callchain(&data, event, iregs);
perf_sample_save_callchain(&data, event, iregs);

throttle = perf_event_overflow(event, &data, &regs);
out:
@@ -1228,7 +1250,7 @@ static __init int perf_ibs_op_init(void)
if (ibs_caps & IBS_CAPS_ZEN4)
perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY;

perf_ibs_op.pmu.attr_groups = empty_attr_groups;
perf_ibs_op.pmu.attr_groups = op_attr_groups;
perf_ibs_op.pmu.attr_update = op_attr_update;

return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");

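The IBS hunks above add a "swfilt" format bit (config2:0) and make perf_ibs_handle_irq() drop samples rejected by perf_exclude_event() instead of refusing exclude_* attributes at event creation. Below is a minimal userspace sketch of opening such an event; it is illustrative only and not part of the patch. The helper name and the idea of reading the PMU type from /sys/bus/event_source/devices/ibs_op/type are assumptions.

#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Hypothetical helper: open an ibs_op event with swfilt (config2 bit 0) set,
 * so kernel-mode samples are filtered in software by perf_exclude_event(). */
static int open_ibs_op_swfilt(int ibs_op_pmu_type, pid_t pid, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = ibs_op_pmu_type;	/* dynamic PMU type read from sysfs */
	attr.config2 = 1;		/* swfilt, "config2:0" in the patch */
	attr.sample_period = 100000;
	attr.exclude_kernel = 1;	/* honored via the new software filter */

	return syscall(SYS_perf_event_open, &attr, pid, cpu, -1, 0);
}
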
@@ -1707,8 +1707,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)

perf_sample_data_init(&data, 0, event->hw.last_period);

if (has_branch_stack(event))
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);
perf_sample_save_brstack(&data, event, &cpuc->lbr_stack, NULL);

if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);

@@ -5371,42 +5371,32 @@ static __init void intel_clovertown_quirk(void)
x86_pmu.pebs_constraints = NULL;
}

static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_HASWELL, 3, 0x0000001f),
INTEL_CPU_DESC(INTEL_HASWELL_L, 1, 0x0000001e),
INTEL_CPU_DESC(INTEL_HASWELL_G, 1, 0x00000015),
INTEL_CPU_DESC(INTEL_HASWELL_X, 2, 0x00000037),
INTEL_CPU_DESC(INTEL_HASWELL_X, 4, 0x0000000a),
INTEL_CPU_DESC(INTEL_BROADWELL, 4, 0x00000023),
INTEL_CPU_DESC(INTEL_BROADWELL_G, 1, 0x00000014),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 2, 0x00000010),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 3, 0x07000009),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 4, 0x0f000009),
INTEL_CPU_DESC(INTEL_BROADWELL_D, 5, 0x0e000002),
INTEL_CPU_DESC(INTEL_BROADWELL_X, 1, 0x0b000014),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 4, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 5, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 6, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 7, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_X, 11, 0x00000000),
INTEL_CPU_DESC(INTEL_SKYLAKE_L, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_SKYLAKE, 3, 0x0000007c),
INTEL_CPU_DESC(INTEL_KABYLAKE, 9, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 9, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 10, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 11, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE_L, 12, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 10, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 11, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 12, 0x0000004e),
INTEL_CPU_DESC(INTEL_KABYLAKE, 13, 0x0000004e),
static const struct x86_cpu_id isolation_ucodes[] = {
X86_MATCH_VFM_STEPS(INTEL_HASWELL, 3, 3, 0x0000001f),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_L, 1, 1, 0x0000001e),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_G, 1, 1, 0x00000015),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 2, 2, 0x00000037),
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 4, 4, 0x0000000a),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL, 4, 4, 0x00000023),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_G, 1, 1, 0x00000014),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 2, 2, 0x00000010),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 3, 3, 0x07000009),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 4, 4, 0x0f000009),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 5, 5, 0x0e000002),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_X, 1, 1, 0x0b000014),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 3, 3, 0x00000021),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x00000000),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 11, 11, 0x00000000),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_L, 3, 3, 0x0000007c),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE, 3, 3, 0x0000007c),
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE, 9, 13, 0x0000004e),
X86_MATCH_VFM_STEPS(INTEL_KABYLAKE_L, 9, 12, 0x0000004e),
{}
};

static void intel_check_pebs_isolation(void)
{
x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes);
x86_pmu.pebs_no_isolation = !x86_match_min_microcode_rev(isolation_ucodes);
}

static __init void intel_pebs_isolation_quirk(void)
@@ -5416,16 +5406,16 @@ static __init void intel_pebs_isolation_quirk(void)
intel_check_pebs_isolation();
}

static const struct x86_cpu_desc pebs_ucodes[] = {
INTEL_CPU_DESC(INTEL_SANDYBRIDGE, 7, 0x00000028),
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 6, 0x00000618),
INTEL_CPU_DESC(INTEL_SANDYBRIDGE_X, 7, 0x0000070c),
static const struct x86_cpu_id pebs_ucodes[] = {
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE, 7, 7, 0x00000028),
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 6, 6, 0x00000618),
X86_MATCH_VFM_STEPS(INTEL_SANDYBRIDGE_X, 7, 7, 0x0000070c),
{}
};

static bool intel_snb_pebs_broken(void)
{
return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
return !x86_match_min_microcode_rev(pebs_ucodes);
}

static void intel_snb_check_microcode(void)

@ -1789,8 +1789,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
|
||||
* previous PMI context or an (I)RET happened between the record and
|
||||
* PMI.
|
||||
*/
|
||||
if (sample_type & PERF_SAMPLE_CALLCHAIN)
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
|
||||
/*
|
||||
* We use the interrupt regs as a base because the PEBS record does not
|
||||
@ -1889,8 +1888,7 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
|
||||
if (x86_pmu.intel_cap.pebs_format >= 3)
|
||||
setup_pebs_time(event, data, pebs->tsc);
|
||||
|
||||
if (has_branch_stack(event))
|
||||
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
|
||||
perf_sample_save_brstack(data, event, &cpuc->lbr_stack, NULL);
|
||||
}
|
||||
|
||||
static void adaptive_pebs_save_regs(struct pt_regs *regs,
|
||||
@ -1917,8 +1915,6 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs,
|
||||
}
|
||||
|
||||
#define PEBS_LATENCY_MASK 0xffff
|
||||
#define PEBS_CACHE_LATENCY_OFFSET 32
|
||||
#define PEBS_RETIRE_LATENCY_OFFSET 32
|
||||
|
||||
/*
|
||||
* With adaptive PEBS the layout depends on what fields are configured.
|
||||
@ -1932,8 +1928,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct pebs_basic *basic = __pebs;
|
||||
void *next_record = basic + 1;
|
||||
u64 sample_type;
|
||||
u64 format_size;
|
||||
u64 sample_type, format_group;
|
||||
struct pebs_meminfo *meminfo = NULL;
|
||||
struct pebs_gprs *gprs = NULL;
|
||||
struct x86_perf_regs *perf_regs;
|
||||
@ -1945,7 +1940,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
perf_regs->xmm_regs = NULL;
|
||||
|
||||
sample_type = event->attr.sample_type;
|
||||
format_size = basic->format_size;
|
||||
format_group = basic->format_group;
|
||||
perf_sample_data_init(data, 0, event->hw.last_period);
|
||||
data->period = event->hw.last_period;
|
||||
|
||||
@ -1957,8 +1952,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
* previous PMI context or an (I)RET happened between the record and
|
||||
* PMI.
|
||||
*/
|
||||
if (sample_type & PERF_SAMPLE_CALLCHAIN)
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
perf_sample_save_callchain(data, event, iregs);
|
||||
|
||||
*regs = *iregs;
|
||||
/* The ip in basic is EventingIP */
|
||||
@ -1967,7 +1961,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
|
||||
if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
|
||||
if (x86_pmu.flags & PMU_FL_RETIRE_LATENCY)
|
||||
data->weight.var3_w = format_size >> PEBS_RETIRE_LATENCY_OFFSET & PEBS_LATENCY_MASK;
|
||||
data->weight.var3_w = basic->retire_latency;
|
||||
else
|
||||
data->weight.var3_w = 0;
|
||||
}
|
||||
@ -1977,12 +1971,12 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
* But PERF_SAMPLE_TRANSACTION needs gprs->ax.
|
||||
* Save the pointer here but process later.
|
||||
*/
|
||||
if (format_size & PEBS_DATACFG_MEMINFO) {
|
||||
if (format_group & PEBS_DATACFG_MEMINFO) {
|
||||
meminfo = next_record;
|
||||
next_record = meminfo + 1;
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_GP) {
|
||||
if (format_group & PEBS_DATACFG_GP) {
|
||||
gprs = next_record;
|
||||
next_record = gprs + 1;
|
||||
|
||||
@ -1995,14 +1989,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
adaptive_pebs_save_regs(regs, gprs);
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_MEMINFO) {
|
||||
if (format_group & PEBS_DATACFG_MEMINFO) {
|
||||
if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
|
||||
u64 weight = meminfo->latency;
|
||||
u64 latency = x86_pmu.flags & PMU_FL_INSTR_LATENCY ?
|
||||
meminfo->cache_latency : meminfo->mem_latency;
|
||||
|
||||
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) {
|
||||
data->weight.var2_w = weight & PEBS_LATENCY_MASK;
|
||||
weight >>= PEBS_CACHE_LATENCY_OFFSET;
|
||||
}
|
||||
if (x86_pmu.flags & PMU_FL_INSTR_LATENCY)
|
||||
data->weight.var2_w = meminfo->instr_latency;
|
||||
|
||||
/*
|
||||
* Although meminfo::latency is defined as a u64,
|
||||
@ -2010,12 +2003,13 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
* in practice on Ice Lake and earlier platforms.
|
||||
*/
|
||||
if (sample_type & PERF_SAMPLE_WEIGHT) {
|
||||
data->weight.full = weight ?:
|
||||
data->weight.full = latency ?:
|
||||
intel_get_tsx_weight(meminfo->tsx_tuning);
|
||||
} else {
|
||||
data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
|
||||
data->weight.var1_dw = (u32)latency ?:
|
||||
intel_get_tsx_weight(meminfo->tsx_tuning);
|
||||
}
|
||||
|
||||
data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
|
||||
}
|
||||
|
||||
@ -2036,16 +2030,16 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
}
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_XMMS) {
|
||||
if (format_group & PEBS_DATACFG_XMMS) {
|
||||
struct pebs_xmm *xmm = next_record;
|
||||
|
||||
next_record = xmm + 1;
|
||||
perf_regs->xmm_regs = xmm->xmm;
|
||||
}
|
||||
|
||||
if (format_size & PEBS_DATACFG_LBRS) {
|
||||
if (format_group & PEBS_DATACFG_LBRS) {
|
||||
struct lbr_entry *lbr = next_record;
|
||||
int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
|
||||
int num_lbr = ((format_group >> PEBS_DATACFG_LBR_SHIFT)
|
||||
& 0xff) + 1;
|
||||
next_record = next_record + num_lbr * sizeof(struct lbr_entry);
|
||||
|
||||
@ -2055,11 +2049,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
|
||||
}
|
||||
}
|
||||
|
||||
WARN_ONCE(next_record != __pebs + (format_size >> 48),
|
||||
"PEBS record size %llu, expected %llu, config %llx\n",
|
||||
format_size >> 48,
|
||||
WARN_ONCE(next_record != __pebs + basic->format_size,
|
||||
"PEBS record size %u, expected %llu, config %llx\n",
|
||||
basic->format_size,
|
||||
(u64)(next_record - __pebs),
|
||||
basic->format_size);
|
||||
format_group);
|
||||
}
|
||||
|
||||
static inline void *
|
||||
@ -2170,46 +2164,33 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
|
||||
return 0;
|
||||
}
|
||||
|
||||
typedef void (*setup_fn)(struct perf_event *, struct pt_regs *, void *,
|
||||
struct perf_sample_data *, struct pt_regs *);
|
||||
|
||||
static struct pt_regs dummy_iregs;
|
||||
|
||||
static __always_inline void
|
||||
__intel_pmu_pebs_event(struct perf_event *event,
|
||||
struct pt_regs *iregs,
|
||||
struct pt_regs *regs,
|
||||
struct perf_sample_data *data,
|
||||
void *base, void *top,
|
||||
int bit, int count,
|
||||
void (*setup_sample)(struct perf_event *,
|
||||
struct pt_regs *,
|
||||
void *,
|
||||
struct perf_sample_data *,
|
||||
struct pt_regs *))
|
||||
void *at,
|
||||
setup_fn setup_sample)
|
||||
{
|
||||
setup_sample(event, iregs, at, data, regs);
|
||||
perf_event_output(event, data, regs);
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
__intel_pmu_pebs_last_event(struct perf_event *event,
|
||||
struct pt_regs *iregs,
|
||||
struct pt_regs *regs,
|
||||
struct perf_sample_data *data,
|
||||
void *at,
|
||||
int count,
|
||||
setup_fn setup_sample)
|
||||
{
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
struct x86_perf_regs perf_regs;
|
||||
struct pt_regs *regs = &perf_regs.regs;
|
||||
void *at = get_next_pebs_record_by_bit(base, top, bit);
|
||||
static struct pt_regs dummy_iregs;
|
||||
|
||||
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
|
||||
/*
|
||||
* Now, auto-reload is only enabled in fixed period mode.
|
||||
* The reload value is always hwc->sample_period.
|
||||
* May need to change it, if auto-reload is enabled in
|
||||
* freq mode later.
|
||||
*/
|
||||
intel_pmu_save_and_restart_reload(event, count);
|
||||
} else if (!intel_pmu_save_and_restart(event))
|
||||
return;
|
||||
|
||||
if (!iregs)
|
||||
iregs = &dummy_iregs;
|
||||
|
||||
while (count > 1) {
|
||||
setup_sample(event, iregs, at, data, regs);
|
||||
perf_event_output(event, data, regs);
|
||||
at += cpuc->pebs_record_size;
|
||||
at = get_next_pebs_record_by_bit(at, top, bit);
|
||||
count--;
|
||||
}
|
||||
|
||||
setup_sample(event, iregs, at, data, regs);
|
||||
if (iregs == &dummy_iregs) {
|
||||
@ -2228,6 +2209,44 @@ __intel_pmu_pebs_event(struct perf_event *event,
|
||||
if (perf_event_overflow(event, data, regs))
|
||||
x86_pmu_stop(event, 0);
|
||||
}
|
||||
|
||||
if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
|
||||
/*
|
||||
* Now, auto-reload is only enabled in fixed period mode.
|
||||
* The reload value is always hwc->sample_period.
|
||||
* May need to change it, if auto-reload is enabled in
|
||||
* freq mode later.
|
||||
*/
|
||||
intel_pmu_save_and_restart_reload(event, count);
|
||||
} else
|
||||
intel_pmu_save_and_restart(event);
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
__intel_pmu_pebs_events(struct perf_event *event,
|
||||
struct pt_regs *iregs,
|
||||
struct perf_sample_data *data,
|
||||
void *base, void *top,
|
||||
int bit, int count,
|
||||
setup_fn setup_sample)
|
||||
{
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct x86_perf_regs perf_regs;
|
||||
struct pt_regs *regs = &perf_regs.regs;
|
||||
void *at = get_next_pebs_record_by_bit(base, top, bit);
|
||||
int cnt = count;
|
||||
|
||||
if (!iregs)
|
||||
iregs = &dummy_iregs;
|
||||
|
||||
while (cnt > 1) {
|
||||
__intel_pmu_pebs_event(event, iregs, regs, data, at, setup_sample);
|
||||
at += cpuc->pebs_record_size;
|
||||
at = get_next_pebs_record_by_bit(at, top, bit);
|
||||
cnt--;
|
||||
}
|
||||
|
||||
__intel_pmu_pebs_last_event(event, iregs, regs, data, at, count, setup_sample);
|
||||
}
|
||||
|
||||
static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data)
|
||||
@ -2264,8 +2283,8 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_
|
||||
return;
|
||||
}
|
||||
|
||||
__intel_pmu_pebs_event(event, iregs, data, at, top, 0, n,
|
||||
setup_pebs_fixed_sample_data);
|
||||
__intel_pmu_pebs_events(event, iregs, data, at, top, 0, n,
|
||||
setup_pebs_fixed_sample_data);
|
||||
}
|
||||
|
||||
static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size)
|
||||
@ -2396,9 +2415,9 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
|
||||
}
|
||||
|
||||
if (counts[bit]) {
|
||||
__intel_pmu_pebs_event(event, iregs, data, base,
|
||||
top, bit, counts[bit],
|
||||
setup_pebs_fixed_sample_data);
|
||||
__intel_pmu_pebs_events(event, iregs, data, base,
|
||||
top, bit, counts[bit],
|
||||
setup_pebs_fixed_sample_data);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2406,8 +2425,12 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d
|
||||
static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data)
|
||||
{
|
||||
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
|
||||
void *last[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS];
|
||||
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
||||
struct debug_store *ds = cpuc->ds;
|
||||
struct x86_perf_regs perf_regs;
|
||||
struct pt_regs *regs = &perf_regs.regs;
|
||||
struct pebs_basic *basic;
|
||||
struct perf_event *event;
|
||||
void *base, *at, *top;
|
||||
int bit;
|
||||
@ -2429,30 +2452,41 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
|
||||
return;
|
||||
}
|
||||
|
||||
for (at = base; at < top; at += cpuc->pebs_record_size) {
|
||||
if (!iregs)
|
||||
iregs = &dummy_iregs;
|
||||
|
||||
/* Process all but the last event for each counter. */
|
||||
for (at = base; at < top; at += basic->format_size) {
|
||||
u64 pebs_status;
|
||||
|
||||
pebs_status = get_pebs_status(at) & cpuc->pebs_enabled;
|
||||
pebs_status &= mask;
|
||||
basic = at;
|
||||
if (basic->format_size != cpuc->pebs_record_size)
|
||||
continue;
|
||||
|
||||
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX)
|
||||
counts[bit]++;
|
||||
pebs_status = basic->applicable_counters & cpuc->pebs_enabled & mask;
|
||||
for_each_set_bit(bit, (unsigned long *)&pebs_status, X86_PMC_IDX_MAX) {
|
||||
event = cpuc->events[bit];
|
||||
|
||||
if (WARN_ON_ONCE(!event) ||
|
||||
WARN_ON_ONCE(!event->attr.precise_ip))
|
||||
continue;
|
||||
|
||||
if (counts[bit]++) {
|
||||
__intel_pmu_pebs_event(event, iregs, regs, data, last[bit],
|
||||
setup_pebs_adaptive_sample_data);
|
||||
}
|
||||
last[bit] = at;
|
||||
}
|
||||
}
|
||||
|
||||
for_each_set_bit(bit, (unsigned long *)&mask, X86_PMC_IDX_MAX) {
|
||||
if (counts[bit] == 0)
|
||||
if (!counts[bit])
|
||||
continue;
|
||||
|
||||
event = cpuc->events[bit];
|
||||
if (WARN_ON_ONCE(!event))
|
||||
continue;
|
||||
|
||||
if (WARN_ON_ONCE(!event->attr.precise_ip))
|
||||
continue;
|
||||
|
||||
__intel_pmu_pebs_event(event, iregs, data, base,
|
||||
top, bit, counts[bit],
|
||||
setup_pebs_adaptive_sample_data);
|
||||
__intel_pmu_pebs_last_event(event, iregs, regs, data, last[bit],
|
||||
counts[bit], setup_pebs_adaptive_sample_data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/device.h>
|
||||
|
||||
#include <asm/cpuid.h>
|
||||
#include <asm/perf_event.h>
|
||||
#include <asm/insn.h>
|
||||
#include <asm/io.h>
|
||||
@ -201,10 +202,10 @@ static int __init pt_pmu_hw_init(void)
|
||||
* otherwise, zero for numerator stands for "not enumerated"
|
||||
* as per SDM
|
||||
*/
|
||||
if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
|
||||
if (boot_cpu_data.cpuid_level >= CPUID_LEAF_TSC) {
|
||||
u32 eax, ebx, ecx, edx;
|
||||
|
||||
cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
|
||||
cpuid(CPUID_LEAF_TSC, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
pt_pmu.tsc_art_num = ebx;
|
||||
pt_pmu.tsc_art_den = eax;
|
||||
|
@ -37,9 +37,6 @@ struct topa_entry {
|
||||
u64 rsvd4 : 12;
|
||||
};
|
||||
|
||||
/* TSC to Core Crystal Clock Ratio */
|
||||
#define CPUID_TSC_LEAF 0x15
|
||||
|
||||
struct pt_pmu {
|
||||
struct pmu pmu;
|
||||
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
|
||||
|
@ -39,6 +39,10 @@
|
||||
* event: rapl_energy_psys
|
||||
* perf code: 0x5
|
||||
*
|
||||
* core counter: consumption of a single physical core
|
||||
* event: rapl_energy_core (power_core PMU)
|
||||
* perf code: 0x1
|
||||
*
|
||||
* We manage those counters as free running (read-only). They may be
|
||||
* used simultaneously by other tools, such as turbostat.
|
||||
*
|
||||
@ -70,18 +74,22 @@ MODULE_LICENSE("GPL");
|
||||
/*
|
||||
* RAPL energy status counters
|
||||
*/
|
||||
enum perf_rapl_events {
|
||||
enum perf_rapl_pkg_events {
|
||||
PERF_RAPL_PP0 = 0, /* all cores */
|
||||
PERF_RAPL_PKG, /* entire package */
|
||||
PERF_RAPL_RAM, /* DRAM */
|
||||
PERF_RAPL_PP1, /* gpu */
|
||||
PERF_RAPL_PSYS, /* psys */
|
||||
|
||||
PERF_RAPL_MAX,
|
||||
NR_RAPL_DOMAINS = PERF_RAPL_MAX,
|
||||
PERF_RAPL_PKG_EVENTS_MAX,
|
||||
NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
|
||||
};
|
||||
|
||||
static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
|
||||
#define PERF_RAPL_CORE 0 /* single core */
|
||||
#define PERF_RAPL_CORE_EVENTS_MAX 1
|
||||
#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX
|
||||
|
||||
static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
|
||||
"pp0-core",
|
||||
"package",
|
||||
"dram",
|
||||
@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
|
||||
"psys",
|
||||
};
|
||||
|
||||
static const char *const rapl_core_domain_name __initconst = "core";
|
||||
|
||||
/*
|
||||
* event code: LSB 8 bits, passed in attr->config
|
||||
* any other bit is reserved
|
||||
@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \
|
||||
* considered as either pkg-scope or die-scope, and we are considering
|
||||
* them as die-scope.
|
||||
*/
|
||||
#define rapl_pmu_is_pkg_scope() \
|
||||
#define rapl_pkg_pmu_is_pkg_scope() \
|
||||
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \
|
||||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
|
||||
|
||||
@ -129,7 +139,8 @@ struct rapl_pmu {
|
||||
struct rapl_pmus {
|
||||
struct pmu pmu;
|
||||
unsigned int nr_rapl_pmu;
|
||||
struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu);
|
||||
unsigned int cntr_mask;
|
||||
struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu);
|
||||
};
|
||||
|
||||
enum rapl_unit_quirk {
|
||||
@ -139,44 +150,43 @@ enum rapl_unit_quirk {
|
||||
};
|
||||
|
||||
struct rapl_model {
|
||||
struct perf_msr *rapl_msrs;
|
||||
unsigned long events;
|
||||
struct perf_msr *rapl_pkg_msrs;
|
||||
struct perf_msr *rapl_core_msrs;
|
||||
unsigned long pkg_events;
|
||||
unsigned long core_events;
|
||||
unsigned int msr_power_unit;
|
||||
enum rapl_unit_quirk unit_quirk;
|
||||
};
|
||||
|
||||
/* 1/2^hw_unit Joule */
|
||||
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
|
||||
static struct rapl_pmus *rapl_pmus;
|
||||
static unsigned int rapl_cntr_mask;
|
||||
static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
|
||||
static int rapl_core_hw_unit __read_mostly;
|
||||
static struct rapl_pmus *rapl_pmus_pkg;
|
||||
static struct rapl_pmus *rapl_pmus_core;
|
||||
static u64 rapl_timer_ms;
|
||||
static struct perf_msr *rapl_msrs;
|
||||
static struct rapl_model *rapl_model;
|
||||
|
||||
/*
|
||||
* Helper functions to get the correct topology macros according to the
|
||||
* Helper function to get the correct topology id according to the
|
||||
* RAPL PMU scope.
|
||||
*/
|
||||
static inline unsigned int get_rapl_pmu_idx(int cpu)
|
||||
static inline unsigned int get_rapl_pmu_idx(int cpu, int scope)
|
||||
{
|
||||
return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
|
||||
topology_logical_die_id(cpu);
|
||||
}
|
||||
|
||||
static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
|
||||
{
|
||||
return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
|
||||
topology_die_cpumask(cpu);
|
||||
}
|
||||
|
||||
static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
|
||||
{
|
||||
unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
|
||||
|
||||
/*
|
||||
* The unsigned check also catches the '-1' return value for non
|
||||
* existent mappings in the topology map.
|
||||
* Returns unsigned int, which converts the '-1' return value
|
||||
* (for non-existent mappings in topology map) to UINT_MAX, so
|
||||
* the error check in the caller is simplified.
|
||||
*/
|
||||
return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
|
||||
switch (scope) {
|
||||
case PERF_PMU_SCOPE_PKG:
|
||||
return topology_logical_package_id(cpu);
|
||||
case PERF_PMU_SCOPE_DIE:
|
||||
return topology_logical_die_id(cpu);
|
||||
case PERF_PMU_SCOPE_CORE:
|
||||
return topology_logical_core_id(cpu);
|
||||
default:
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
static inline u64 rapl_read_counter(struct perf_event *event)
|
||||
@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event)
|
||||
return raw;
|
||||
}
|
||||
|
||||
static inline u64 rapl_scale(u64 v, int cfg)
|
||||
static inline u64 rapl_scale(u64 v, struct perf_event *event)
|
||||
{
|
||||
if (cfg > NR_RAPL_DOMAINS) {
|
||||
pr_warn("Invalid domain %d, failed to scale data\n", cfg);
|
||||
return v;
|
||||
}
|
||||
int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1];
|
||||
|
||||
if (event->pmu->scope == PERF_PMU_SCOPE_CORE)
|
||||
hw_unit = rapl_core_hw_unit;
|
||||
|
||||
/*
|
||||
* scale delta to smallest unit (1/2^32)
|
||||
* users must then scale back: count * 1/(1e9*2^32) to get Joules
|
||||
* or use ldexp(count, -32).
|
||||
* Watts = Joules/Time delta
|
||||
*/
|
||||
return v << (32 - rapl_hw_unit[cfg - 1]);
|
||||
return v << (32 - hw_unit);
|
||||
}
|
||||
|
||||
static u64 rapl_event_update(struct perf_event *event)
|
||||
@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event)
|
||||
delta = (new_raw_count << shift) - (prev_raw_count << shift);
|
||||
delta >>= shift;
|
||||
|
||||
sdelta = rapl_scale(delta, event->hw.config);
|
||||
sdelta = rapl_scale(delta, event);
|
||||
|
||||
local64_add(sdelta, &event->count);
|
||||
|
||||
@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
|
||||
|
||||
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
|
||||
{
|
||||
struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
|
||||
struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
|
||||
struct perf_event *event;
|
||||
unsigned long flags;
|
||||
|
||||
if (!pmu->n_active)
|
||||
if (!rapl_pmu->n_active)
|
||||
return HRTIMER_NORESTART;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
list_for_each_entry(event, &pmu->active_list, active_entry)
|
||||
list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
|
||||
rapl_event_update(event);
|
||||
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
|
||||
hrtimer_forward_now(hrtimer, pmu->timer_interval);
|
||||
hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
|
||||
|
||||
return HRTIMER_RESTART;
|
||||
}
|
||||
|
||||
static void rapl_hrtimer_init(struct rapl_pmu *pmu)
|
||||
static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
|
||||
{
|
||||
struct hrtimer *hr = &pmu->hrtimer;
|
||||
struct hrtimer *hr = &rapl_pmu->hrtimer;
|
||||
|
||||
hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
hr->function = rapl_hrtimer_handle;
|
||||
}
|
||||
|
||||
static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
|
||||
static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
|
||||
struct perf_event *event)
|
||||
{
|
||||
if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
|
||||
@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
|
||||
|
||||
event->hw.state = 0;
|
||||
|
||||
list_add_tail(&event->active_entry, &pmu->active_list);
|
||||
list_add_tail(&event->active_entry, &rapl_pmu->active_list);
|
||||
|
||||
local64_set(&event->hw.prev_count, rapl_read_counter(event));
|
||||
|
||||
pmu->n_active++;
|
||||
if (pmu->n_active == 1)
|
||||
rapl_start_hrtimer(pmu);
|
||||
rapl_pmu->n_active++;
|
||||
if (rapl_pmu->n_active == 1)
|
||||
rapl_start_hrtimer(rapl_pmu);
|
||||
}
|
||||
|
||||
static void rapl_pmu_event_start(struct perf_event *event, int mode)
|
||||
{
|
||||
struct rapl_pmu *pmu = event->pmu_private;
|
||||
struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
__rapl_pmu_event_start(pmu, event);
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
__rapl_pmu_event_start(rapl_pmu, event);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
}
|
||||
|
||||
static void rapl_pmu_event_stop(struct perf_event *event, int mode)
|
||||
{
|
||||
struct rapl_pmu *pmu = event->pmu_private;
|
||||
struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
/* mark event as deactivated and stopped */
|
||||
if (!(hwc->state & PERF_HES_STOPPED)) {
|
||||
WARN_ON_ONCE(pmu->n_active <= 0);
|
||||
pmu->n_active--;
|
||||
if (pmu->n_active == 0)
|
||||
hrtimer_cancel(&pmu->hrtimer);
|
||||
WARN_ON_ONCE(rapl_pmu->n_active <= 0);
|
||||
rapl_pmu->n_active--;
|
||||
if (rapl_pmu->n_active == 0)
|
||||
hrtimer_cancel(&rapl_pmu->hrtimer);
|
||||
|
||||
list_del(&event->active_entry);
|
||||
|
||||
@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
|
||||
hwc->state |= PERF_HES_UPTODATE;
|
||||
}
|
||||
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
}
|
||||
|
||||
static int rapl_pmu_event_add(struct perf_event *event, int mode)
|
||||
{
|
||||
struct rapl_pmu *pmu = event->pmu_private;
|
||||
struct rapl_pmu *rapl_pmu = event->pmu_private;
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&pmu->lock, flags);
|
||||
raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
|
||||
|
||||
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
|
||||
|
||||
if (mode & PERF_EF_START)
|
||||
__rapl_pmu_event_start(pmu, event);
|
||||
__rapl_pmu_event_start(rapl_pmu, event);
|
||||
|
||||
raw_spin_unlock_irqrestore(&pmu->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags)
|
||||
static int rapl_pmu_event_init(struct perf_event *event)
|
||||
{
|
||||
u64 cfg = event->attr.config & RAPL_EVENT_MASK;
|
||||
int bit, ret = 0;
|
||||
struct rapl_pmu *pmu;
|
||||
int bit, rapl_pmus_scope, ret = 0;
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
unsigned int rapl_pmu_idx;
|
||||
struct rapl_pmus *rapl_pmus;
|
||||
|
||||
/* only look at RAPL events */
|
||||
if (event->attr.type != rapl_pmus->pmu.type)
|
||||
return -ENOENT;
|
||||
/* unsupported modes and filters */
|
||||
if (event->attr.sample_period) /* no sampling */
|
||||
return -EINVAL;
|
||||
|
||||
/* check only supported bits are set */
|
||||
if (event->attr.config & ~RAPL_EVENT_MASK)
|
||||
@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event)
|
||||
if (event->cpu < 0)
|
||||
return -EINVAL;
|
||||
|
||||
if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
|
||||
rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
|
||||
if (!rapl_pmus)
|
||||
return -EINVAL;
|
||||
rapl_pmus_scope = rapl_pmus->pmu.scope;
|
||||
|
||||
cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
|
||||
bit = cfg - 1;
|
||||
if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) {
|
||||
/* only look at RAPL package events */
|
||||
if (event->attr.type != rapl_pmus_pkg->pmu.type)
|
||||
return -ENOENT;
|
||||
|
||||
cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
|
||||
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
return -EINVAL;
|
||||
|
||||
bit = cfg - 1;
|
||||
event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
|
||||
} else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) {
|
||||
/* only look at RAPL core events */
|
||||
if (event->attr.type != rapl_pmus_core->pmu.type)
|
||||
return -ENOENT;
|
||||
|
||||
cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1);
|
||||
if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
|
||||
return -EINVAL;
|
||||
|
||||
bit = cfg - 1;
|
||||
event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
|
||||
} else
|
||||
return -EINVAL;
|
||||
|
||||
/* check event supported */
|
||||
if (!(rapl_cntr_mask & (1 << bit)))
|
||||
if (!(rapl_pmus->cntr_mask & (1 << bit)))
|
||||
return -EINVAL;
|
||||
|
||||
/* unsupported modes and filters */
|
||||
if (event->attr.sample_period) /* no sampling */
|
||||
rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope);
|
||||
if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu)
|
||||
return -EINVAL;
|
||||
|
||||
/* must be done before validate_group */
|
||||
pmu = cpu_to_rapl_pmu(event->cpu);
|
||||
if (!pmu)
|
||||
rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
|
||||
if (!rapl_pmu)
|
||||
return -EINVAL;
|
||||
event->pmu_private = pmu;
|
||||
event->hw.event_base = rapl_msrs[bit].msr;
|
||||
|
||||
event->pmu_private = rapl_pmu;
|
||||
event->hw.config = cfg;
|
||||
event->hw.idx = bit;
|
||||
|
||||
@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
|
||||
RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");
|
||||
RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01");
|
||||
|
||||
RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");
|
||||
RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules");
|
||||
|
||||
/*
|
||||
* we compute in 0.23 nJ increments regardless of MSR
|
||||
@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890
|
||||
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");
|
||||
RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10");
|
||||
|
||||
/*
|
||||
* There are no default events, but we need to create
|
||||
@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct attribute_group *rapl_core_attr_groups[] = {
|
||||
&rapl_pmu_format_group,
|
||||
&rapl_pmu_events_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute *rapl_events_cores[] = {
|
||||
EVENT_PTR(rapl_cores),
|
||||
EVENT_PTR(rapl_cores_unit),
|
||||
@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = {
|
||||
.attrs = rapl_events_psys,
|
||||
};
|
||||
|
||||
static struct attribute *rapl_events_core[] = {
|
||||
EVENT_PTR(rapl_core),
|
||||
EVENT_PTR(rapl_core_unit),
|
||||
EVENT_PTR(rapl_core_scale),
|
||||
NULL,
|
||||
};
|
||||
|
||||
static struct attribute_group rapl_events_core_group = {
|
||||
.name = "events",
|
||||
.attrs = rapl_events_core,
|
||||
};
|
||||
|
||||
static bool test_msr(int idx, void *data)
|
||||
{
|
||||
return test_bit(idx, (unsigned long *) data);
|
||||
@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
|
||||
};
|
||||
|
||||
/*
|
||||
* Force to PERF_RAPL_MAX size due to:
|
||||
* - perf_msr_probe(PERF_RAPL_MAX)
|
||||
* Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
|
||||
* - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
|
||||
* - want to use same event codes across both architectures
|
||||
*/
|
||||
static struct perf_msr amd_rapl_msrs[] = {
|
||||
static struct perf_msr amd_rapl_pkg_msrs[] = {
|
||||
[PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 },
|
||||
[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
|
||||
[PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 },
|
||||
@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = {
|
||||
[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
|
||||
};
|
||||
|
||||
static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
static struct perf_msr amd_rapl_core_msrs[] = {
|
||||
[PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group,
|
||||
test_msr, false, RAPL_MSR_MASK },
|
||||
};
|
||||
|
||||
static int rapl_check_hw_unit(void)
|
||||
{
|
||||
u64 msr_rapl_power_unit_bits;
|
||||
int i;
|
||||
|
||||
/* protect rdmsrl() to handle virtualization */
|
||||
if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
|
||||
if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
|
||||
return -1;
|
||||
for (i = 0; i < NR_RAPL_DOMAINS; i++)
|
||||
rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
|
||||
rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
|
||||
switch (rm->unit_quirk) {
|
||||
rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
|
||||
|
||||
switch (rapl_model->unit_quirk) {
|
||||
/*
|
||||
* DRAM domain on HSW server and KNL has fixed energy unit which can be
|
||||
* different than the unit from power unit MSR. See
|
||||
@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
* of 2. Datasheet, September 2014, Reference Number: 330784-001 "
|
||||
*/
|
||||
case RAPL_UNIT_QUIRK_INTEL_HSW:
|
||||
rapl_hw_unit[PERF_RAPL_RAM] = 16;
|
||||
rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16;
|
||||
break;
|
||||
/* SPR uses a fixed energy unit for Psys domain. */
|
||||
case RAPL_UNIT_QUIRK_INTEL_SPR:
|
||||
rapl_hw_unit[PERF_RAPL_PSYS] = 0;
|
||||
rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Calculate the timer rate:
|
||||
* Use reference of 200W for scaling the timeout to avoid counter
|
||||
@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
* if hw unit is 32, then we use 2 ms 1/200/2
|
||||
*/
|
||||
rapl_timer_ms = 2;
|
||||
if (rapl_hw_unit[0] < 32) {
|
||||
if (rapl_pkg_hw_unit[0] < 32) {
|
||||
rapl_timer_ms = (1000 / (2 * 100));
|
||||
rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
|
||||
rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm)
|
||||
static void __init rapl_advertise(void)
|
||||
{
|
||||
int i;
|
||||
int num_counters = hweight32(rapl_pmus_pkg->cntr_mask);
|
||||
|
||||
if (rapl_pmus_core)
|
||||
num_counters += hweight32(rapl_pmus_core->cntr_mask);
|
||||
|
||||
pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
|
||||
hweight32(rapl_cntr_mask), rapl_timer_ms);
|
||||
num_counters, rapl_timer_ms);
|
||||
|
||||
for (i = 0; i < NR_RAPL_DOMAINS; i++) {
|
||||
if (rapl_cntr_mask & (1 << i)) {
|
||||
for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
|
||||
if (rapl_pmus_pkg->cntr_mask & (1 << i)) {
|
||||
pr_info("hw unit of domain %s 2^-%d Joules\n",
|
||||
rapl_domain_names[i], rapl_hw_unit[i]);
|
||||
rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE)))
|
||||
pr_info("hw unit of domain %s 2^-%d Joules\n",
|
||||
rapl_core_domain_name, rapl_core_hw_unit);
|
||||
}
|
||||
|
||||
static void cleanup_rapl_pmus(void)
|
||||
static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
|
||||
kfree(rapl_pmus->pmus[i]);
|
||||
kfree(rapl_pmus->rapl_pmu[i]);
|
||||
kfree(rapl_pmus);
|
||||
}
|
||||
|
||||
@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int __init init_rapl_pmu(void)
|
||||
static const struct attribute_group *rapl_core_attr_update[] = {
|
||||
&rapl_events_core_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus)
|
||||
{
|
||||
struct rapl_pmu *pmu;
|
||||
struct rapl_pmu *rapl_pmu;
|
||||
int idx;
|
||||
|
||||
for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) {
|
||||
pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
|
||||
if (!pmu)
|
||||
rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmu)
|
||||
goto free;
|
||||
|
||||
raw_spin_lock_init(&pmu->lock);
|
||||
INIT_LIST_HEAD(&pmu->active_list);
|
||||
pmu->pmu = &rapl_pmus->pmu;
|
||||
pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
rapl_hrtimer_init(pmu);
|
||||
raw_spin_lock_init(&rapl_pmu->lock);
|
||||
INIT_LIST_HEAD(&rapl_pmu->active_list);
|
||||
rapl_pmu->pmu = &rapl_pmus->pmu;
|
||||
rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
|
||||
rapl_hrtimer_init(rapl_pmu);
|
||||
|
||||
rapl_pmus->pmus[idx] = pmu;
|
||||
rapl_pmus->rapl_pmu[idx] = rapl_pmu;
|
||||
}
|
||||
|
||||
return 0;
|
||||
free:
|
||||
for (; idx > 0; idx--)
|
||||
kfree(rapl_pmus->pmus[idx - 1]);
|
||||
kfree(rapl_pmus->rapl_pmu[idx - 1]);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int __init init_rapl_pmus(void)
|
||||
static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope,
|
||||
const struct attribute_group **rapl_attr_groups,
|
||||
const struct attribute_group **rapl_attr_update)
|
||||
{
|
||||
int nr_rapl_pmu = topology_max_packages();
|
||||
int rapl_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
struct rapl_pmus *rapl_pmus;
|
||||
|
||||
if (!rapl_pmu_is_pkg_scope()) {
|
||||
nr_rapl_pmu *= topology_max_dies_per_package();
|
||||
rapl_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
}
|
||||
/*
|
||||
* rapl_pmu_scope must be either PKG, DIE or CORE
|
||||
*/
|
||||
if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE)
|
||||
nr_rapl_pmu *= topology_max_dies_per_package();
|
||||
else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE)
|
||||
nr_rapl_pmu *= topology_num_cores_per_package();
|
||||
else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG)
|
||||
return -EINVAL;
|
||||
|
||||
rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
|
||||
rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
|
||||
if (!rapl_pmus)
|
||||
return -ENOMEM;
|
||||
|
||||
*rapl_pmus_ptr = rapl_pmus;
|
||||
|
||||
rapl_pmus->nr_rapl_pmu = nr_rapl_pmu;
|
||||
rapl_pmus->pmu.attr_groups = rapl_attr_groups;
|
||||
rapl_pmus->pmu.attr_update = rapl_attr_update;
|
||||
@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void)
|
||||
rapl_pmus->pmu.module = THIS_MODULE;
|
||||
rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
|
||||
|
||||
return init_rapl_pmu();
|
||||
return init_rapl_pmu(rapl_pmus);
|
||||
}
|
||||
|
||||
static struct rapl_model model_snb = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_PP1),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_snbep = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_hsw = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PP1),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_hsx = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_knl = {
|
||||
.events = BIT(PERF_RAPL_PKG) |
|
||||
.pkg_events = BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_skl = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PP1) |
|
||||
BIT(PERF_RAPL_PSYS),
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_spr = {
|
||||
.events = BIT(PERF_RAPL_PP0) |
|
||||
.pkg_events = BIT(PERF_RAPL_PP0) |
|
||||
BIT(PERF_RAPL_PKG) |
|
||||
BIT(PERF_RAPL_RAM) |
|
||||
BIT(PERF_RAPL_PSYS),
|
||||
.unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR,
|
||||
.msr_power_unit = MSR_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = intel_rapl_spr_msrs,
|
||||
.rapl_pkg_msrs = intel_rapl_spr_msrs,
|
||||
};
|
||||
|
||||
static struct rapl_model model_amd_hygon = {
|
||||
.events = BIT(PERF_RAPL_PKG),
|
||||
.pkg_events = BIT(PERF_RAPL_PKG),
|
||||
.core_events = BIT(PERF_RAPL_CORE),
|
||||
.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
|
||||
.rapl_msrs = amd_rapl_msrs,
|
||||
.rapl_pkg_msrs = amd_rapl_pkg_msrs,
|
||||
.rapl_core_msrs = amd_rapl_core_msrs,
|
||||
};
|
||||
|
||||
static const struct x86_cpu_id rapl_model_match[] __initconst = {
|
||||
@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
|
||||
static int __init rapl_pmu_init(void)
|
||||
{
|
||||
const struct x86_cpu_id *id;
|
||||
struct rapl_model *rm;
|
||||
int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE;
|
||||
int ret;
|
||||
|
||||
if (rapl_pkg_pmu_is_pkg_scope())
|
||||
rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG;
|
||||
|
||||
id = x86_match_cpu(rapl_model_match);
|
||||
if (!id)
|
||||
return -ENODEV;
|
||||
|
||||
rm = (struct rapl_model *) id->driver_data;
|
||||
rapl_model = (struct rapl_model *) id->driver_data;
|
||||
|
||||
rapl_msrs = rm->rapl_msrs;
|
||||
|
||||
rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
|
||||
false, (void *) &rm->events);
|
||||
|
||||
ret = rapl_check_hw_unit(rm);
|
||||
ret = rapl_check_hw_unit();
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = init_rapl_pmus();
|
||||
ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups,
|
||||
rapl_attr_update);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
|
||||
rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs,
|
||||
PERF_RAPL_PKG_EVENTS_MAX, false,
|
||||
(void *) &rapl_model->pkg_events);
|
||||
|
||||
ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
if (rapl_model->core_events) {
|
||||
ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE,
|
||||
rapl_core_attr_groups,
|
||||
rapl_core_attr_update);
|
||||
if (ret) {
|
||||
pr_warn("power-core PMU initialization failed (%d)\n", ret);
|
||||
goto core_init_failed;
|
||||
}
|
||||
|
||||
rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
|
||||
PERF_RAPL_CORE_EVENTS_MAX, false,
|
||||
(void *) &rapl_model->core_events);
|
||||
|
||||
ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1);
|
||||
if (ret) {
|
||||
pr_warn("power-core PMU registration failed (%d)\n", ret);
|
||||
cleanup_rapl_pmus(rapl_pmus_core);
|
||||
}
|
||||
}
|
||||
|
||||
core_init_failed:
|
||||
rapl_advertise();
|
||||
return 0;
|
||||
|
||||
out:
|
||||
pr_warn("Initialization failed (%d), disabled\n", ret);
|
||||
cleanup_rapl_pmus();
|
||||
cleanup_rapl_pmus(rapl_pmus_pkg);
|
||||
return ret;
|
||||
}
|
||||
module_init(rapl_pmu_init);
|
||||
|
||||
static void __exit intel_rapl_exit(void)
|
||||
{
|
||||
perf_pmu_unregister(&rapl_pmus->pmu);
|
||||
cleanup_rapl_pmus();
|
||||
if (rapl_pmus_core) {
|
||||
perf_pmu_unregister(&rapl_pmus_core->pmu);
|
||||
cleanup_rapl_pmus(rapl_pmus_core);
|
||||
}
|
||||
perf_pmu_unregister(&rapl_pmus_pkg->pmu);
|
||||
cleanup_rapl_pmus(rapl_pmus_pkg);
|
||||
}
|
||||
module_exit(intel_rapl_exit);
|
||||
|
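For context on the RAPL rework above: the new "power_core" PMU exposes the per-core energy counter as event code 0x1 ("energy-core"), pre-scaled to 2^-32 Joule units exactly as advertised by the energy-core.scale attribute. The sketch below is a hypothetical userspace reader, not part of the patch; the helper name and reading the PMU type from /sys/bus/event_source/devices/power_core/type are assumptions.

#include <math.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Hypothetical helper: read the core-scope RAPL energy counter and convert
 * the pre-scaled count (units of 2^-32 J, see rapl_scale()) to Joules. */
static double read_core_energy_joules(int power_core_pmu_type, int cpu)
{
	struct perf_event_attr attr;
	unsigned long long count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = power_core_pmu_type;	/* from sysfs, not a fixed constant */
	attr.config = 0x1;			/* energy-core, "event=0x01" */

	fd = syscall(SYS_perf_event_open, &attr, -1, cpu, -1, 0);
	if (fd < 0)
		return -1.0;

	read(fd, &count, sizeof(count));
	close(fd);

	return ldexp((double)count, -32);	/* count * 2^-32 Joules */
}
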
@@ -664,7 +664,7 @@ void __init hv_vtom_init(void)
x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;

/* Set WB as the default cache mode. */
mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
}

#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */

@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
#include <linux/stringify.h>
|
||||
#include <linux/objtool.h>
|
||||
#include <asm/asm.h>
|
||||
|
||||
#define ALT_FLAGS_SHIFT 16
|
||||
@ -54,16 +55,6 @@
|
||||
#define LOCK_PREFIX ""
|
||||
#endif
|
||||
|
||||
/*
|
||||
* objtool annotation to ignore the alternatives and only consider the original
|
||||
* instruction(s).
|
||||
*/
|
||||
#define ANNOTATE_IGNORE_ALTERNATIVE \
|
||||
"999:\n\t" \
|
||||
".pushsection .discard.ignore_alts\n\t" \
|
||||
".long 999b\n\t" \
|
||||
".popsection\n\t"
|
||||
|
||||
/*
|
||||
* The patching flags are part of the upper bits of the @ft_flags parameter when
|
||||
* specifying them. The split is currently like this:
|
||||
@ -310,17 +301,6 @@ void nop_func(void);
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/*
|
||||
* objtool annotation to ignore the alternatives and only consider the original
|
||||
* instruction(s).
|
||||
*/
|
||||
.macro ANNOTATE_IGNORE_ALTERNATIVE
|
||||
.Lannotate_\@:
|
||||
.pushsection .discard.ignore_alts
|
||||
.long .Lannotate_\@
|
||||
.popsection
|
||||
.endm
|
||||
|
||||
/*
|
||||
* Issue one struct alt_instr descriptor entry (need to put it into
|
||||
* the section .altinstructions, see below). This entry contains
|
||||
|
@ -92,7 +92,7 @@ do { \
|
||||
do { \
|
||||
__auto_type __flags = BUGFLAG_WARNING|(flags); \
|
||||
instrumentation_begin(); \
|
||||
_BUG_FLAGS(ASM_UD2, __flags, ASM_REACHABLE); \
|
||||
_BUG_FLAGS(ASM_UD2, __flags, ANNOTATE_REACHABLE(1b)); \
|
||||
instrumentation_end(); \
|
||||
} while (0)
|
||||
|
||||
|
@@ -56,7 +56,6 @@
/* x86_cpu_id::flags */
#define X86_CPU_ID_FLAG_ENTRY_VALID BIT(0)

#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
* X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
* @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY
@@ -208,6 +207,7 @@
VFM_MODEL(vfm), \
X86_STEPPING_ANY, X86_FEATURE_ANY, data)

#define __X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins)
/**
* X86_MATCH_VFM_STEPPINGS - Match encoded vendor/family/model/stepping
* @vfm: Encoded 8-bits each for vendor, family, model
@@ -218,12 +218,13 @@
*
* feature is set to wildcard
*/
#define X86_MATCH_VFM_STEPPINGS(vfm, steppings, data) \
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
VFM_VENDOR(vfm), \
VFM_FAMILY(vfm), \
VFM_MODEL(vfm), \
steppings, X86_FEATURE_ANY, data)
#define X86_MATCH_VFM_STEPS(vfm, min_step, max_step, data) \
X86_MATCH_VENDORID_FAM_MODEL_STEPPINGS_FEATURE( \
VFM_VENDOR(vfm), \
VFM_FAMILY(vfm), \
VFM_MODEL(vfm), \
__X86_STEPPINGS(min_step, max_step), \
X86_FEATURE_ANY, data)

/**
* X86_MATCH_VFM_FEATURE - Match encoded vendor/family/model/feature
@@ -242,41 +243,7 @@
VFM_MODEL(vfm), \
X86_STEPPING_ANY, feature, data)

/*
* Match specific microcode revisions.
*
* vendor/family/model/stepping must be all set.
*
* Only checks against the boot CPU. When mixed-stepping configs are
* valid for a CPU model, add a quirk for every valid stepping and
* do the fine-tuning in the quirk handler.
*/

struct x86_cpu_desc {
u8 x86_family;
u8 x86_vendor;
u8 x86_model;
u8 x86_stepping;
u32 x86_microcode_rev;
};

#define INTEL_CPU_DESC(vfm, stepping, revision) { \
.x86_family = VFM_FAMILY(vfm), \
.x86_vendor = VFM_VENDOR(vfm), \
.x86_model = VFM_MODEL(vfm), \
.x86_stepping = (stepping), \
.x86_microcode_rev = (revision), \
}

#define AMD_CPU_DESC(fam, model, stepping, revision) { \
.x86_family = (fam), \
.x86_vendor = X86_VENDOR_AMD, \
.x86_model = (model), \
.x86_stepping = (stepping), \
.x86_microcode_rev = (revision), \
}

extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
extern bool x86_match_min_microcode_rev(const struct x86_cpu_id *table);

#endif /* _ASM_X86_CPU_DEVICE_ID */

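As a rough illustration of the new interface that replaces x86_cpu_desc above: a caller now builds a plain x86_cpu_id table with stepping ranges and tests it with x86_match_min_microcode_rev(), as the isolation_ucodes and pebs_ucodes tables earlier in this series do. This is a sketch only; the demo_* names and the 0x100 microcode revision are made up for the example.

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

/* Hypothetical table: require at least microcode revision 0x100 (made-up
 * value) on Skylake-X steppings 4 through 7. */
static const struct x86_cpu_id demo_min_ucodes[] = {
	X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 4, 7, 0x100),
	{}
};

static bool demo_microcode_ok(void)
{
	/* true when the boot CPU matches an entry and meets its revision */
	return x86_match_min_microcode_rev(demo_min_ucodes);
}
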
@@ -132,11 +132,12 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
|
||||
x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
|
||||
|
||||
/*
|
||||
* This macro is for detection of features which need kernel
|
||||
* infrastructure to be used. It may *not* directly test the CPU
|
||||
* itself. Use the cpu_has() family if you want true runtime
|
||||
* testing of CPU features, like in hypervisor code where you are
|
||||
* supporting a possible guest feature where host support for it
|
||||
* This is the default CPU features testing macro to use in code.
|
||||
*
|
||||
* It is for detection of features which need kernel infrastructure to be
|
||||
* used. It may *not* directly test the CPU itself. Use the cpu_has() family
|
||||
* if you want true runtime testing of CPU features, like in hypervisor code
|
||||
* where you are supporting a possible guest feature where host support for it
|
||||
* is not relevant.
|
||||
*/
|
||||
#define cpu_feature_enabled(bit) \
|
||||
@ -161,13 +162,6 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
|
||||
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
|
||||
|
||||
/*
|
||||
* Static testing of CPU features. Used the same as boot_cpu_has(). It
|
||||
* statically patches the target code for additional performance. Use
|
||||
* static_cpu_has() only in fast paths, where every cycle counts. Which
|
||||
* means that the boot_cpu_has() variant is already fast enough for the
|
||||
* majority of cases and you should stick to using it as it is generally
|
||||
* only two instructions: a RIP-relative MOV and a TEST.
|
||||
*
|
||||
* Do not use an "m" constraint for [cap_byte] here: gcc doesn't know
|
||||
* that this is only used on a fallback path and will sometimes cause
|
||||
* it to manifest the address of boot_cpu_data in a register, fouling
|
||||
|
@ -83,8 +83,8 @@
|
||||
#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* "centaur_mcr" Centaur MCRs (= MTRRs) */
|
||||
#define X86_FEATURE_K8 ( 3*32+ 4) /* Opteron, Athlon64 */
|
||||
#define X86_FEATURE_ZEN5 ( 3*32+ 5) /* CPU based on Zen5 microarchitecture */
|
||||
#define X86_FEATURE_P3 ( 3*32+ 6) /* P3 */
|
||||
#define X86_FEATURE_P4 ( 3*32+ 7) /* P4 */
|
||||
/* Free ( 3*32+ 6) */
|
||||
/* Free ( 3*32+ 7) */
|
||||
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* "constant_tsc" TSC ticks at a constant rate */
|
||||
#define X86_FEATURE_UP ( 3*32+ 9) /* "up" SMP kernel running on UP */
|
||||
#define X86_FEATURE_ART ( 3*32+10) /* "art" Always running timer (ART) */
|
||||
@ -451,6 +451,8 @@
|
||||
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* Virtual TSC_AUX */
|
||||
#define X86_FEATURE_SME_COHERENT (19*32+10) /* AMD hardware-enforced cache coherency */
|
||||
#define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" AMD SEV-ES full debug state swap support */
|
||||
#define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */
|
||||
#define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */
|
||||
#define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */
|
||||
|
||||
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
|
||||
|
@ -21,6 +21,13 @@ enum cpuid_regs_idx {
|
||||
CPUID_EDX,
|
||||
};
|
||||
|
||||
#define CPUID_LEAF_MWAIT 0x5
|
||||
#define CPUID_LEAF_DCA 0x9
|
||||
#define CPUID_LEAF_XSTATE 0x0d
|
||||
#define CPUID_LEAF_TSC 0x15
|
||||
#define CPUID_LEAF_FREQ 0x16
|
||||
#define CPUID_LEAF_TILE 0x1d
|
||||
|
||||
#ifdef CONFIG_X86_32
|
||||
bool have_cpuid_p(void);
|
||||
#else
|
||||
|
@ -12,10 +12,6 @@
|
||||
/* Bit 63 of XCR0 is reserved for future expansion */
|
||||
#define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63)))
|
||||
|
||||
#define XSTATE_CPUID 0x0000000d
|
||||
|
||||
#define TILE_CPUID 0x0000001d
|
||||
|
||||
#define FXSAVE_SIZE 512
|
||||
|
||||
#define XSAVE_HDR_SIZE 64
|
||||
|
@ -2,7 +2,7 @@
|
||||
#ifndef _ASM_X86_INIT_H
|
||||
#define _ASM_X86_INIT_H
|
||||
|
||||
#define __head __section(".head.text")
|
||||
#define __head __section(".head.text") __no_sanitize_undefined
|
||||
|
||||
struct x86_mapping_info {
|
||||
void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
|
||||
|
@ -100,8 +100,8 @@
|
||||
}
|
||||
|
||||
#define ASM_CALL_ARG0 \
|
||||
"call %c[__func] \n" \
|
||||
ASM_REACHABLE
|
||||
"1: call %c[__func] \n" \
|
||||
ANNOTATE_REACHABLE(1b)
|
||||
|
||||
#define ASM_CALL_ARG1 \
|
||||
"movq %[arg1], %%rdi \n" \
|
||||
|
@ -8,14 +8,9 @@
|
||||
# define PA_PGD 2
|
||||
# define PA_SWAP_PAGE 3
|
||||
# define PAGES_NR 4
|
||||
#else
|
||||
# define PA_CONTROL_PAGE 0
|
||||
# define VA_CONTROL_PAGE 1
|
||||
# define PA_TABLE_PAGE 2
|
||||
# define PA_SWAP_PAGE 3
|
||||
# define PAGES_NR 4
|
||||
#endif
|
||||
|
||||
# define KEXEC_CONTROL_PAGE_SIZE 4096
|
||||
# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
@ -43,7 +38,6 @@ struct kimage;
|
||||
/* Maximum address we can use for the control code buffer */
|
||||
# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
|
||||
|
||||
# define KEXEC_CONTROL_PAGE_SIZE 4096
|
||||
|
||||
/* The native architecture */
|
||||
# define KEXEC_ARCH KEXEC_ARCH_386
|
||||
@ -58,11 +52,12 @@ struct kimage;
|
||||
/* Maximum address we can use for the control pages */
|
||||
# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
|
||||
|
||||
/* Allocate one page for the pdp and the second for the code */
|
||||
# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
|
||||
|
||||
/* The native architecture */
|
||||
# define KEXEC_ARCH KEXEC_ARCH_X86_64
|
||||
|
||||
extern unsigned long kexec_va_control_page;
|
||||
extern unsigned long kexec_pa_table_page;
|
||||
extern unsigned long kexec_pa_swap_page;
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -125,7 +120,7 @@ relocate_kernel(unsigned long indirection_page,
|
||||
#else
|
||||
unsigned long
|
||||
relocate_kernel(unsigned long indirection_page,
|
||||
unsigned long page_list,
|
||||
unsigned long pa_control_page,
|
||||
unsigned long start_address,
|
||||
unsigned int preserve_context,
|
||||
unsigned int host_mem_enc_active);
|
||||
@ -145,6 +140,19 @@ struct kimage_arch {
|
||||
};
|
||||
#else
|
||||
struct kimage_arch {
|
||||
/*
|
||||
* This is a kimage control page, as it must not overlap with either
|
||||
* source or destination address ranges.
|
||||
*/
|
||||
pgd_t *pgd;
|
||||
/*
|
||||
* The virtual mapping of the control code page itself is used only
|
||||
* during the transition, while the current kernel's pages are all
|
||||
* in place. Thus the intermediate page table pages used to map it
|
||||
* are not control pages, but instead just normal pages obtained
|
||||
* with get_zeroed_page(). And have to be tracked (below) so that
|
||||
* they can be freed.
|
||||
*/
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
@ -37,6 +37,8 @@ typedef struct {
|
||||
*/
|
||||
atomic64_t tlb_gen;
|
||||
|
||||
unsigned long next_trim_cpumask;
|
||||
|
||||
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
||||
struct rw_semaphore ldt_usr_sem;
|
||||
struct ldt_struct *ldt;
|
||||
|
@ -151,6 +151,7 @@ static inline int init_new_context(struct task_struct *tsk,
|
||||
|
||||
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
|
||||
atomic64_set(&mm->context.tlb_gen, 0);
|
||||
mm->context.next_trim_cpumask = jiffies + HZ;
|
||||
|
||||
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
||||
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
|
||||
|

@ -644,6 +644,7 @@
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b
#define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0

@ -682,11 +683,12 @@
#define MSR_AMD64_SNP_SMT_PROT BIT_ULL(MSR_AMD64_SNP_SMT_PROT_BIT)
#define MSR_AMD64_SNP_RESV_BIT 18
#define MSR_AMD64_SNP_RESERVED_MASK GENMASK_ULL(63, MSR_AMD64_SNP_RESV_BIT)

#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f

#define MSR_AMD64_RMP_BASE 0xc0010132
#define MSR_AMD64_RMP_END 0xc0010133
#define MSR_AMD64_RMP_CFG 0xc0010136
#define MSR_AMD64_SEG_RMP_ENABLED_BIT 0
#define MSR_AMD64_SEG_RMP_ENABLED BIT_ULL(MSR_AMD64_SEG_RMP_ENABLED_BIT)
#define MSR_AMD64_RMP_SEGMENT_SHIFT(x) (((x) & GENMASK_ULL(13, 8)) >> 8)

#define MSR_SVSM_CAA 0xc001f000

@ -58,8 +58,8 @@ struct mtrr_state_type {
*/
# ifdef CONFIG_MTRR
void mtrr_bp_init(void);
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type);
void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type);
extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);

@ -75,9 +75,9 @@ void mtrr_disable(void);
void mtrr_enable(void);
void mtrr_generic_set_state(void);
# else
static inline void mtrr_overwrite_state(struct mtrr_var_range *var,
unsigned int num_var,
mtrr_type def_type)
static inline void guest_force_mtrr_state(struct mtrr_var_range *var,
unsigned int num_var,
mtrr_type def_type)
{
}

@ -15,7 +15,6 @@
#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
#define MWAIT_C1_SUBSTATE_MASK 0xf0

#define CPUID_MWAIT_LEAF 5
#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
#define CPUID5_ECX_INTERRUPT_BREAK 0x2

@ -179,18 +179,6 @@

#ifdef __ASSEMBLY__

/*
* This should be used immediately before an indirect jump/call. It tells
* objtool the subsequent indirect jump/call is vouched safe for retpoline
* builds.
*/
.macro ANNOTATE_RETPOLINE_SAFE
.Lhere_\@:
.pushsection .discard.retpoline_safe
.long .Lhere_\@
.popsection
.endm

/*
* (ab)use RETPOLINE_SAFE on RET to annotate away 'bare' RET instructions
* vs RETBleed validation.

@ -350,12 +338,6 @@

#else /* __ASSEMBLY__ */

#define ANNOTATE_RETPOLINE_SAFE \
"999:\n\t" \
".pushsection .discard.retpoline_safe\n\t" \
".long 999b\n\t" \
".popsection\n\t"

typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE];
extern retpoline_thunk_t __x86_indirect_thunk_array[];
extern retpoline_thunk_t __x86_indirect_call_thunk_array[];

@ -180,13 +180,6 @@ static inline void halt(void)
PVOP_VCALL0(irq.halt);
}

extern noinstr void pv_native_wbinvd(void);

static __always_inline void wbinvd(void)
{
PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT_XEN);
}

static inline u64 paravirt_read_msr(unsigned msr)
{
return PVOP_CALL1(u64, cpu.read_msr, msr);

@ -86,8 +86,6 @@ struct pv_cpu_ops {
void (*update_io_bitmap)(void);
#endif

void (*wbinvd)(void);

/* cpuid emulation, mostly so that caps bits can be disabled */
void (*cpuid)(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx);

@ -422,7 +422,9 @@ static inline bool is_topdown_idx(int idx)
*/

struct pebs_basic {
u64 format_size;
u64 format_group:32,
retire_latency:16,
format_size:16;
u64 ip;
u64 applicable_counters;
u64 tsc;

@ -431,7 +433,17 @@ struct pebs_basic {
struct pebs_meminfo {
u64 address;
u64 aux;
u64 latency;
union {
/* pre Alder Lake */
u64 mem_latency;
/* Alder Lake and later */
struct {
u64 instr_latency:16;
u64 pad2:16;
u64 cache_latency:16;
u64 pad3:16;
};
};
u64 tsx_tuning;
};
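
The anonymous union added to struct pebs_meminfo above lets the same 64-bit latency field be read either as the single pre-Alder-Lake memory latency or as the split instruction/cache latencies used on Alder Lake and later. A minimal userspace sketch of that layout (field names follow the patch, the sample value is invented, and little-endian bitfield ordering is assumed):

#include <stdint.h>
#include <stdio.h>

union pebs_latency {
	/* pre Alder Lake: one 64-bit memory latency */
	uint64_t mem_latency;
	/* Alder Lake and later: split instruction/cache latency */
	struct {
		uint64_t instr_latency:16;
		uint64_t pad2:16;
		uint64_t cache_latency:16;
		uint64_t pad3:16;
	};
};

int main(void)
{
	union pebs_latency l = { .mem_latency = 0x0000002000000040ULL };

	printf("legacy view: %llu\n", (unsigned long long)l.mem_latency);
	printf("split view: instr %u cache %u\n",
	       (unsigned)l.instr_latency, (unsigned)l.cache_latency);
	return 0;
}
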

@ -98,6 +98,7 @@ struct cpuinfo_topology {
// Logical ID mappings
u32 logical_pkg_id;
u32 logical_die_id;
u32 logical_core_id;

// AMD Node ID and Nodes per Package info
u32 amd_node_id;

@ -5,6 +5,7 @@
#include <asm-generic/sections.h>
#include <asm/extable.h>

extern char __relocate_kernel_start[], __relocate_kernel_end[];
extern char __brk_base[], __brk_limit[];
extern char __end_rodata_aligned[];

@ -49,7 +49,7 @@ extern unsigned long saved_video_mode;

extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
extern void startup_64_setup_gdt_idt(void);
extern void early_setup_idt(void);
extern void __init do_early_exception(struct pt_regs *regs, int trapnr);

@ -19,6 +19,32 @@
#define TDG_VM_RD 7
#define TDG_VM_WR 8

/* TDX attributes */
#define TDX_ATTR_DEBUG_BIT 0
#define TDX_ATTR_DEBUG BIT_ULL(TDX_ATTR_DEBUG_BIT)
#define TDX_ATTR_HGS_PLUS_PROF_BIT 4
#define TDX_ATTR_HGS_PLUS_PROF BIT_ULL(TDX_ATTR_HGS_PLUS_PROF_BIT)
#define TDX_ATTR_PERF_PROF_BIT 5
#define TDX_ATTR_PERF_PROF BIT_ULL(TDX_ATTR_PERF_PROF_BIT)
#define TDX_ATTR_PMT_PROF_BIT 6
#define TDX_ATTR_PMT_PROF BIT_ULL(TDX_ATTR_PMT_PROF_BIT)
#define TDX_ATTR_ICSSD_BIT 16
#define TDX_ATTR_ICSSD BIT_ULL(TDX_ATTR_ICSSD_BIT)
#define TDX_ATTR_LASS_BIT 27
#define TDX_ATTR_LASS BIT_ULL(TDX_ATTR_LASS_BIT)
#define TDX_ATTR_SEPT_VE_DISABLE_BIT 28
#define TDX_ATTR_SEPT_VE_DISABLE BIT_ULL(TDX_ATTR_SEPT_VE_DISABLE_BIT)
#define TDX_ATTR_MIGRTABLE_BIT 29
#define TDX_ATTR_MIGRTABLE BIT_ULL(TDX_ATTR_MIGRTABLE_BIT)
#define TDX_ATTR_PKS_BIT 30
#define TDX_ATTR_PKS BIT_ULL(TDX_ATTR_PKS_BIT)
#define TDX_ATTR_KL_BIT 31
#define TDX_ATTR_KL BIT_ULL(TDX_ATTR_KL_BIT)
#define TDX_ATTR_TPA_BIT 62
#define TDX_ATTR_TPA BIT_ULL(TDX_ATTR_TPA_BIT)
#define TDX_ATTR_PERFMON_BIT 63
#define TDX_ATTR_PERFMON BIT_ULL(TDX_ATTR_PERFMON_BIT)

/* TDX TD-Scope Metadata. To be used by TDG.VM.WR and TDG.VM.RD */
#define TDCS_CONFIG_FLAGS 0x1110000300000016
#define TDCS_TD_CTLS 0x1110000300000017

@ -29,8 +55,16 @@
#define TDCS_CONFIG_FLEXIBLE_PENDING_VE BIT_ULL(1)

/* TDCS_TD_CTLS bits */
#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(0)
#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(1)
#define TD_CTLS_PENDING_VE_DISABLE_BIT 0
#define TD_CTLS_PENDING_VE_DISABLE BIT_ULL(TD_CTLS_PENDING_VE_DISABLE_BIT)
#define TD_CTLS_ENUM_TOPOLOGY_BIT 1
#define TD_CTLS_ENUM_TOPOLOGY BIT_ULL(TD_CTLS_ENUM_TOPOLOGY_BIT)
#define TD_CTLS_VIRT_CPUID2_BIT 2
#define TD_CTLS_VIRT_CPUID2 BIT_ULL(TD_CTLS_VIRT_CPUID2_BIT)
#define TD_CTLS_REDUCE_VE_BIT 3
#define TD_CTLS_REDUCE_VE BIT_ULL(TD_CTLS_REDUCE_VE_BIT)
#define TD_CTLS_LOCK_BIT 63
#define TD_CTLS_LOCK BIT_ULL(TD_CTLS_LOCK_BIT)

/* TDX hypercall Leaf IDs */
#define TDVMCALL_MAP_GPA 0x10001

@ -115,7 +115,7 @@ static inline void wrpkru(u32 pkru)
}
#endif

static __always_inline void native_wbinvd(void)
static __always_inline void wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
}

@ -167,12 +167,6 @@ static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
}

static __always_inline void wbinvd(void)
{
native_wbinvd();
}

#endif /* CONFIG_PARAVIRT_XXL */

static __always_inline void clflush(volatile void *__p)

@ -66,6 +66,9 @@ int tdx_mcall_get_report0(u8 *reportdata, u8 *tdreport);

u64 tdx_hcall_get_quote(u8 *buf, size_t size);

void __init tdx_dump_attributes(u64 td_attr);
void __init tdx_dump_td_ctls(u64 td_ctls);

#else

static inline void tdx_early_init(void) { };

@ -222,6 +222,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
u8 trim_cpumask;
};

void flush_tlb_local(void);

@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu);
#define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id)
#define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id)
#define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id)
#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id)
#define topology_die_id(cpu) (cpu_data(cpu).topo.die_id)
#define topology_core_id(cpu) (cpu_data(cpu).topo.core_id)
#define topology_ppin(cpu) (cpu_data(cpu).ppin)

@ -13,6 +13,7 @@
#include <linux/sched.h>

#include <acpi/processor.h>
#include <asm/cpuid.h>
#include <asm/mwait.h>
#include <asm/special_insns.h>

@ -128,7 +129,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
unsigned int cstate_type; /* C-state type and not ACPI C-state type */
unsigned int num_cstate_subtype;

cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);

/* Check whether this particular cx_type (in CST) is supported or not */
cstate_type = (((cx->address >> MWAIT_SUBSTATE_SIZE) &

@ -172,7 +173,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
struct cpuinfo_x86 *c = &cpu_data(cpu);
long retval;

if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF)
if (!cpu_cstate_entry || c->cpuid_level < CPUID_LEAF_MWAIT)
return -1;

if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT)

@ -1854,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
return temp_state;
}

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();

switch_mm_irqs_off(NULL, prev_state.mm, current);

/* Clear the cpumask, to indicate no TLB flushing is needed anywhere */
cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm));

/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.

@ -1867,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
hw_breakpoint_restore();
}

__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;

static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
memcpy(dst, src, len);

@ -509,19 +509,19 @@ static struct clock_event_device lapic_clockevent = {
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);

static const struct x86_cpu_id deadline_match[] __initconst = {
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
X86_MATCH_VFM_STEPPINGS(INTEL_HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x2, 0x2, 0x3a), /* EP */
X86_MATCH_VFM_STEPS(INTEL_HASWELL_X, 0x4, 0x4, 0x0f), /* EX */

X86_MATCH_VFM(INTEL_BROADWELL_X, 0x0b000020),

X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
X86_MATCH_VFM_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x2, 0x2, 0x00000011),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x3, 0x3, 0x0700000e),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x4, 0x4, 0x0f00000c),
X86_MATCH_VFM_STEPS(INTEL_BROADWELL_D, 0x5, 0x5, 0x0e000003),

X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
X86_MATCH_VFM_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x3, 0x3, 0x01000136),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x4, 0x4, 0x02000014),
X86_MATCH_VFM_STEPS(INTEL_SKYLAKE_X, 0x5, 0xf, 0),

X86_MATCH_VFM(INTEL_HASWELL, 0x22),
X86_MATCH_VFM(INTEL_HASWELL_L, 0x20),

@ -2582,19 +2582,12 @@ int apic_is_clustered_box(void)
/*
* APIC command line parameters
*/
static int __init setup_disableapic(char *arg)
static int __init setup_nolapic(char *arg)
{
apic_is_disabled = true;
setup_clear_cpu_cap(X86_FEATURE_APIC);
return 0;
}
early_param("disableapic", setup_disableapic);

/* same as disableapic, for compatibility */
static int __init setup_nolapic(char *arg)
{
return setup_disableapic(arg);
}
early_param("nolapic", setup_nolapic);

static int __init parse_lapic_timer_c2_ok(char *arg)

@ -139,9 +139,15 @@ static bool skip_addr(void *dest)
return true;
#endif
#ifdef CONFIG_KEXEC_CORE
# ifdef CONFIG_X86_64
if (dest >= (void *)__relocate_kernel_start &&
dest < (void *)__relocate_kernel_end)
return true;
# else
if (dest >= (void *)relocate_kernel &&
dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
return true;
# endif
#endif
return false;
}

@ -355,10 +355,15 @@ static void bsp_determine_snp(struct cpuinfo_x86 *c)
/*
* RMP table entry format is not architectural and is defined by the
* per-processor PPR. Restrict SNP support on the known CPU models
* for which the RMP table entry format is currently defined for.
* for which the RMP table entry format is currently defined or for
* processors which support the architecturally defined RMPREAD
* instruction.
*/
if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
c->x86 >= 0x19 && snp_probe_rmptable_info()) {
(cpu_feature_enabled(X86_FEATURE_ZEN3) ||
cpu_feature_enabled(X86_FEATURE_ZEN4) ||
cpu_feature_enabled(X86_FEATURE_RMPREAD)) &&
snp_probe_rmptable_info()) {
cc_platform_set(CC_ATTR_HOST_SEV_SNP);
} else {
setup_clear_cpu_cap(X86_FEATURE_SEV_SNP);

@ -795,10 +800,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
clear_rdrand_cpuid_bit(c);
}

static const struct x86_cpu_desc erratum_1386_microcode[] = {
AMD_CPU_DESC(0x17, 0x1, 0x2, 0x0800126e),
AMD_CPU_DESC(0x17, 0x31, 0x0, 0x08301052),
{},
static const struct x86_cpu_id erratum_1386_microcode[] = {
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x01), 0x2, 0x2, 0x0800126e),
X86_MATCH_VFM_STEPS(VFM_MAKE(X86_VENDOR_AMD, 0x17, 0x31), 0x0, 0x0, 0x08301052),
};

static void fix_erratum_1386(struct cpuinfo_x86 *c)

@ -814,7 +818,7 @@ static void fix_erratum_1386(struct cpuinfo_x86 *c)
* Clear the feature flag only on microcode revisions which
* don't have the fix.
*/
if (x86_cpu_has_min_microcode_rev(erratum_1386_microcode))
if (x86_match_min_microcode_rev(erratum_1386_microcode))
return;

clear_cpu_cap(c, X86_FEATURE_XSAVES);

@ -29,6 +29,7 @@

#include <asm/alternative.h>
#include <asm/cmdline.h>
#include <asm/cpuid.h>
#include <asm/perf_event.h>
#include <asm/mmu_context.h>
#include <asm/doublefault.h>

@ -636,9 +637,9 @@ struct cpuid_dependent_feature {

static const struct cpuid_dependent_feature
cpuid_dependent_features[] = {
{ X86_FEATURE_MWAIT, 0x00000005 },
{ X86_FEATURE_DCA, 0x00000009 },
{ X86_FEATURE_XSAVE, 0x0000000d },
{ X86_FEATURE_MWAIT, CPUID_LEAF_MWAIT },
{ X86_FEATURE_DCA, CPUID_LEAF_DCA },
{ X86_FEATURE_XSAVE, CPUID_LEAF_XSTATE },
{ 0, 0 }
};

@ -1201,8 +1202,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define VULNBL(vendor, family, model, blacklist) \
X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist)

#define VULNBL_INTEL_STEPPINGS(vfm, steppings, issues) \
X86_MATCH_VFM_STEPPINGS(vfm, steppings, issues)
#define VULNBL_INTEL_STEPS(vfm, max_stepping, issues) \
X86_MATCH_VFM_STEPS(vfm, X86_STEP_MIN, max_stepping, issues)

#define VULNBL_AMD(family, blacklist) \
VULNBL(AMD, family, X86_MODEL_ANY, blacklist)

@ -1227,43 +1228,43 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
#define RFDS BIT(7)

static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(INTEL_IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_HASWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_D, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL_X, X86_STEPPING_ANY, MMIO),
VULNBL_INTEL_STEPPINGS(INTEL_BROADWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPPINGS(INTEL_CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE_L, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(INTEL_TIGERLAKE, X86_STEPPING_ANY, GDS),
VULNBL_INTEL_STEPPINGS(INTEL_LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPPINGS(INTEL_ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ALDERLAKE_L, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_P, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_RAPTORLAKE_S, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GRACEMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_TREMONT_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_D, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPPINGS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEPPING_ANY, RFDS),
VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_L, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_HASWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_D, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO),
VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS),
VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS),
VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED),
VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_P, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_RAPTORLAKE_S, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GRACEMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_D, X86_STEP_MAX, MMIO | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_TREMONT_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_D, X86_STEP_MAX, RFDS),
VULNBL_INTEL_STEPS(INTEL_ATOM_GOLDMONT_PLUS, X86_STEP_MAX, RFDS),

VULNBL_AMD(0x15, RETBLEED),
VULNBL_AMD(0x16, RETBLEED),

@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p)
seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c));
seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id);
seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id);
seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id);
seq_printf(m, "llc_id: %u\n", c->topo.llc_id);
seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id);
seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id);

@ -599,11 +599,6 @@ static void init_intel(struct cpuinfo_x86 *c)
if (p)
strcpy(c->x86_model_id, p);
}

if (c->x86 == 15)
set_cpu_cap(c, X86_FEATURE_P4);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_P3);
#endif

/* Work around errata */

@ -6,7 +6,7 @@
#include <linux/slab.h>

/**
* x86_match_cpu - match current CPU again an array of x86_cpu_ids
* x86_match_cpu - match current CPU against an array of x86_cpu_ids
* @match: Pointer to array of x86_cpu_ids. Last entry terminated with
* {}.
*

@ -56,33 +56,13 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
}
EXPORT_SYMBOL(x86_match_cpu);

static const struct x86_cpu_desc *
x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
bool x86_match_min_microcode_rev(const struct x86_cpu_id *table)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
const struct x86_cpu_desc *m;
const struct x86_cpu_id *res = x86_match_cpu(table);

for (m = match; m->x86_family | m->x86_model; m++) {
if (c->x86_vendor != m->x86_vendor)
continue;
if (c->x86 != m->x86_family)
continue;
if (c->x86_model != m->x86_model)
continue;
if (c->x86_stepping != m->x86_stepping)
continue;
return m;
}
return NULL;
}

bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
{
const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);

if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
if (!res || res->driver_data > boot_cpu_data.microcode)
return false;

return true;
}
EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
EXPORT_SYMBOL_GPL(x86_match_min_microcode_rev);
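
The new x86_match_min_microcode_rev() reuses ordinary struct x86_cpu_id tables and treats each entry's driver_data as the minimum required microcode revision, as the erratum_1386_microcode conversion earlier in this patch shows. The standalone sketch below models only the comparison semantics; the types, field names and sample values are simplified stand-ins, not the kernel definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cpu_entry {
	uint32_t vfm;		/* packed vendor/family/model */
	uint8_t  min_step, max_step;
	uint64_t driver_data;	/* minimum microcode revision */
};

struct boot_cpu {
	uint32_t vfm;
	uint8_t  stepping;
	uint32_t microcode;
};

static bool min_microcode_rev(const struct boot_cpu *c,
			      const struct cpu_entry *tbl, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (c->vfm != tbl[i].vfm)
			continue;
		if (c->stepping < tbl[i].min_step || c->stepping > tbl[i].max_step)
			continue;
		/* matched: the fix is present if the running revision is new enough */
		return c->microcode >= tbl[i].driver_data;
	}
	return false;	/* unknown CPU: assume the fix is absent */
}

int main(void)
{
	const struct cpu_entry table[] = {
		/* revision value copied from the erratum_1386 table above */
		{ .vfm = 0x021731, .min_step = 0, .max_step = 0,
		  .driver_data = 0x08301052 },
	};
	struct boot_cpu c = { .vfm = 0x021731, .stepping = 0, .microcode = 0x08301055 };

	printf("fix present: %d\n", min_microcode_rev(&c, table, 1));
	return 0;
}
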

@ -423,7 +423,7 @@ void __init mtrr_copy_map(void)
}

/**
* mtrr_overwrite_state - set static MTRR state
* guest_force_mtrr_state - set static MTRR state for a guest
*
* Used to set MTRR state via different means (e.g. with data obtained from
* a hypervisor).

@ -436,8 +436,8 @@ void __init mtrr_copy_map(void)
* @num_var: length of the @var array
* @def_type: default caching type
*/
void mtrr_overwrite_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type)
void guest_force_mtrr_state(struct mtrr_var_range *var, unsigned int num_var,
mtrr_type def_type)
{
unsigned int i;

@ -625,7 +625,7 @@ void mtrr_save_state(void)
static int __init mtrr_init_finalize(void)
{
/*
* Map might exist if mtrr_overwrite_state() has been called or if
* Map might exist if guest_force_mtrr_state() has been called or if
* mtrr_enabled() returns true.
*/
mtrr_copy_map();

@ -234,7 +234,9 @@ static __init bool __get_mem_config_intel(struct rdt_resource *r)
r->membw.throttle_mode = THREAD_THROTTLE_PER_THREAD;
else
r->membw.throttle_mode = THREAD_THROTTLE_MAX;
thread_throttle_mode_init();

resctrl_file_fflags_init("thread_throttle_mode",
RFTYPE_CTRL_INFO | RFTYPE_RES_MB);

r->alloc_capable = true;

@ -961,6 +963,11 @@ static __init bool get_rdt_mon_resources(void)
if (!rdt_mon_features)
return false;

if (is_mbm_local_enabled())
mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else if (is_mbm_total_enabled())
mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;

return !rdt_get_mon_l3_config(r);
}

@ -518,6 +518,76 @@ static int smp_mon_event_count(void *arg)
return 0;
}

ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct rdtgroup *rdtgrp;
int ret = 0;

/* Valid input requires a trailing newline */
if (nbytes == 0 || buf[nbytes - 1] != '\n')
return -EINVAL;
buf[nbytes - 1] = '\0';

rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
rdtgroup_kn_unlock(of->kn);
return -ENOENT;
}
rdt_last_cmd_clear();

if (!strcmp(buf, "mbm_local_bytes")) {
if (is_mbm_local_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
else
ret = -EINVAL;
} else if (!strcmp(buf, "mbm_total_bytes")) {
if (is_mbm_total_enabled())
rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
else
ret = -EINVAL;
} else {
ret = -EINVAL;
}

if (ret)
rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);

rdtgroup_kn_unlock(of->kn);

return ret ?: nbytes;
}

int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
struct rdtgroup *rdtgrp;
int ret = 0;

rdtgrp = rdtgroup_kn_lock_live(of->kn);

if (rdtgrp) {
switch (rdtgrp->mba_mbps_event) {
case QOS_L3_MBM_LOCAL_EVENT_ID:
seq_puts(s, "mbm_local_bytes\n");
break;
case QOS_L3_MBM_TOTAL_EVENT_ID:
seq_puts(s, "mbm_total_bytes\n");
break;
default:
pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
ret = -EINVAL;
break;
}
} else {
ret = -ENOENT;
}

rdtgroup_kn_unlock(of->kn);

return ret;
}

void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
cpumask_t *cpumask, int evtid, int first)
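
With this change, mounting resctrl with the mba_MBps option exposes an mba_MBps_event file in each CTRL_MON group; writing "mbm_local_bytes" or "mbm_total_bytes" (with a trailing newline, as the parser above requires) selects the event the software controller uses. A small userspace sketch, assuming the usual /sys/fs/resctrl mount point and a hypothetical group named grp1:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/fs/resctrl/grp1/mba_MBps_event";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* the kernel side insists on a trailing newline */
	if (fputs("mbm_total_bytes\n", f) == EOF || fclose(f) == EOF) {
		perror(path);
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}
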

@ -283,6 +283,7 @@ struct pseudo_lock_region {
* monitor only or ctrl_mon group
* @mon: mongroup related data
* @mode: mode of resource group
* @mba_mbps_event: input monitoring event id when mba_sc is enabled
* @plr: pseudo-locked region
*/
struct rdtgroup {

@ -295,6 +296,7 @@ struct rdtgroup {
enum rdt_group_type type;
struct mongroup mon;
enum rdtgrp_mode mode;
enum resctrl_event_id mba_mbps_event;
struct pseudo_lock_region *plr;
};

@ -508,6 +510,7 @@ extern struct mutex rdtgroup_mutex;
extern struct rdt_hw_resource rdt_resources_all[];
extern struct rdtgroup rdtgroup_default;
extern struct dentry *debugfs_resctrl;
extern enum resctrl_event_id mba_mbps_default_event;

enum resctrl_res_level {
RDT_RESOURCE_L3,

@ -607,6 +610,10 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_schemata_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off);
int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
struct seq_file *s, void *v);
bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
unsigned long cbm, int closid, bool exclusive);
unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,

@ -647,10 +654,8 @@ void cqm_handle_limbo(struct work_struct *work);
bool has_busy_rmid(struct rdt_mon_domain *d);
void __check_limbo(struct rdt_mon_domain *d, bool force_free);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
void __init thread_throttle_mode_init(void);
void __init mbm_config_rftype_init(const char *config);
void resctrl_file_fflags_init(const char *config, unsigned long fflags);
void rdt_staged_configs_clear(void);
bool closid_allocated(unsigned int closid);
int resctrl_find_cleanest_closid(void);

#endif /* _ASM_X86_RESCTRL_INTERNAL_H */

@ -663,9 +663,12 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
*/
static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
{
u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
struct mbm_state *m = &rr->d->mbm_local[idx];
u64 cur_bw, bytes, cur_bytes;
struct mbm_state *m;

m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
if (WARN_ON_ONCE(!m))
return;

cur_bytes = rr->val;
bytes = cur_bytes - m->prev_bw_bytes;

@ -752,20 +755,20 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
struct rdt_ctrl_domain *dom_mba;
enum resctrl_event_id evt_id;
struct rdt_resource *r_mba;
u32 cur_bw, user_bw, idx;
struct list_head *head;
struct rdtgroup *entry;

if (!is_mbm_local_enabled())
return;
u32 cur_bw, user_bw;

r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
evt_id = rgrp->mba_mbps_event;

closid = rgrp->closid;
rmid = rgrp->mon.rmid;
idx = resctrl_arch_rmid_idx_encode(closid, rmid);
pmbm_data = &dom_mbm->mbm_local[idx];
pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
if (WARN_ON_ONCE(!pmbm_data))
return;

dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
if (!dom_mba) {

@ -784,7 +787,9 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
*/
head = &rgrp->mon.crdtgrp_list;
list_for_each_entry(entry, head, mon.crdtgrp_list) {
cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
if (WARN_ON_ONCE(!cmbm_data))
return;
cur_bw += cmbm_data->prev_bw;
}

@ -813,54 +818,45 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
}

static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid)
static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid, enum resctrl_event_id evtid)
{
struct rmid_read rr = {0};

rr.r = r;
rr.d = d;
rr.evtid = evtid;
rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
if (IS_ERR(rr.arch_mon_ctx)) {
pr_warn_ratelimited("Failed to allocate monitor context: %ld",
PTR_ERR(rr.arch_mon_ctx));
return;
}

__mon_event_count(closid, rmid, &rr);

/*
* This is protected from concurrent reads from user
* as both the user and we hold the global mutex.
* If the software controller is enabled, compute the
* bandwidth for this event id.
*/
if (is_mbm_total_enabled()) {
rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
rr.val = 0;
rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
if (IS_ERR(rr.arch_mon_ctx)) {
pr_warn_ratelimited("Failed to allocate monitor context: %ld",
PTR_ERR(rr.arch_mon_ctx));
return;
}
if (is_mba_sc(NULL))
mbm_bw_count(closid, rmid, &rr);

__mon_event_count(closid, rmid, &rr);
resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}

resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
rr.val = 0;
rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
if (IS_ERR(rr.arch_mon_ctx)) {
pr_warn_ratelimited("Failed to allocate monitor context: %ld",
PTR_ERR(rr.arch_mon_ctx));
return;
}
static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
u32 closid, u32 rmid)
{
/*
* This is protected from concurrent reads from user as both
* the user and overflow handler hold the global mutex.
*/
if (is_mbm_total_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);

__mon_event_count(closid, rmid, &rr);

/*
* Call the MBA software controller only for the
* control groups and when user has enabled
* the software controller explicitly.
*/
if (is_mba_sc(NULL))
mbm_bw_count(closid, rmid, &rr);

resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
}
if (is_mbm_local_enabled())
mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
}

/*

@ -1224,11 +1220,13 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r)

if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
mbm_total_event.configurable = true;
mbm_config_rftype_init("mbm_total_bytes_config");
resctrl_file_fflags_init("mbm_total_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
mbm_local_event.configurable = true;
mbm_config_rftype_init("mbm_local_bytes_config");
resctrl_file_fflags_init("mbm_local_bytes_config",
RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
}
}

@ -459,7 +459,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* increase likelihood that allocated cache portion will be filled
* with associated memory.
*/
native_wbinvd();
wbinvd();

/*
* Always called with interrupts enabled. By disabling interrupts

@ -1205,20 +1205,14 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
plr->cpu = cpu;

if (sel == 1)
thread = kthread_create_on_node(measure_cycles_lat_fn, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
thread = kthread_run_on_cpu(measure_cycles_lat_fn, plr,
cpu, "pseudo_lock_measure/%u");
else if (sel == 2)
thread = kthread_create_on_node(measure_l2_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
thread = kthread_run_on_cpu(measure_l2_residency, plr,
cpu, "pseudo_lock_measure/%u");
else if (sel == 3)
thread = kthread_create_on_node(measure_l3_residency, plr,
cpu_to_node(cpu),
"pseudo_lock_measure/%u",
cpu);
thread = kthread_run_on_cpu(measure_l3_residency, plr,
cpu, "pseudo_lock_measure/%u");
else
goto out;

@ -1226,8 +1220,6 @@ static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
ret = PTR_ERR(thread);
goto out;
}
kthread_bind(thread, cpu);
wake_up_process(thread);

ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);

@ -1315,18 +1307,14 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)

plr->thread_done = 0;

thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
cpu_to_node(plr->cpu),
"pseudo_lock/%u", plr->cpu);
thread = kthread_run_on_cpu(pseudo_lock_fn, rdtgrp,
plr->cpu, "pseudo_lock/%u");
if (IS_ERR(thread)) {
ret = PTR_ERR(thread);
rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
goto out_cstates;
}

kthread_bind(thread, plr->cpu);
wake_up_process(thread);

ret = wait_event_interruptible(plr->lock_thread_wq,
plr->thread_done == 1);
if (ret < 0) {

@ -65,6 +65,15 @@ static void rdtgroup_destroy_root(void);

struct dentry *debugfs_resctrl;

/*
* Memory bandwidth monitoring event to use for the default CTRL_MON group
* and each new CTRL_MON group created by the user. Only relevant when
* the filesystem is mounted with the "mba_MBps" option so it does not
* matter that it remains uninitialized on systems that do not support
* the "mba_MBps" option.
*/
enum resctrl_event_id mba_mbps_default_event;

static bool resctrl_debug;

void rdt_last_cmd_clear(void)

@ -1941,6 +1950,13 @@ static struct rftype res_common_files[] = {
.seq_show = rdtgroup_schemata_show,
.fflags = RFTYPE_CTRL_BASE,
},
{
.name = "mba_MBps_event",
.mode = 0644,
.kf_ops = &rdtgroup_kf_single_ops,
.write = rdtgroup_mba_mbps_event_write,
.seq_show = rdtgroup_mba_mbps_event_show,
},
{
.name = "mode",
.mode = 0644,

@ -2020,24 +2036,13 @@ static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
return NULL;
}

void __init thread_throttle_mode_init(void)
{
struct rftype *rft;

rft = rdtgroup_get_rftype_by_name("thread_throttle_mode");
if (!rft)
return;

rft->fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB;
}

void __init mbm_config_rftype_init(const char *config)
void resctrl_file_fflags_init(const char *config, unsigned long fflags)
{
struct rftype *rft;

rft = rdtgroup_get_rftype_by_name(config);
if (rft)
rft->fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE;
rft->fflags = fflags;
}

/**

@ -2343,7 +2348,7 @@ static bool supports_mba_mbps(void)
struct rdt_resource *rmbm = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;

return (is_mbm_local_enabled() &&
return (is_mbm_enabled() &&
r->alloc_capable && is_mba_linear() &&
r->ctrl_scope == rmbm->mon_scope);
}

@ -2357,6 +2362,7 @@ static int set_mba_sc(bool mba_sc)
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
u32 num_closid = resctrl_arch_get_num_closid(r);
struct rdt_ctrl_domain *d;
unsigned long fflags;
int i;

if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))

@ -2364,11 +2370,16 @@ static int set_mba_sc(bool mba_sc)

r->membw.mba_sc = mba_sc;

rdtgroup_default.mba_mbps_event = mba_mbps_default_event;

list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
for (i = 0; i < num_closid; i++)
d->mbps_val[i] = MBA_MAX_MBPS;
}

fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
resctrl_file_fflags_init("mba_MBps_event", fflags);

return 0;
}

@ -2768,7 +2779,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mbps:
msg = "mba_MBps requires local MBM and linear scale MBA at L3 scope";
msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
if (!supports_mba_mbps())
return invalfc(fc, msg);
ctx->enable_mba_mbps = true;

@ -3622,6 +3633,8 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
rdt_last_cmd_puts("kernfs subdir error\n");
goto out_del_list;
}
if (is_mba_sc(NULL))
rdtgrp->mba_mbps_event = mba_mbps_default_event;
}

goto out_unlock;

@ -428,7 +428,7 @@ void __init topology_apply_cmdline_limits_early(void)
{
unsigned int possible = nr_cpu_ids;

/* 'maxcpus=0' 'nosmp' 'nolapic' 'disableapic' */
/* 'maxcpus=0' 'nosmp' 'nolapic' */
if (!setup_max_cpus || apic_is_disabled)
possible = 1;

@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early)
if (!early) {
c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
}

/* Package relative core ID */

@ -20,6 +20,7 @@
#include <asm/fpu/signal.h>
#include <asm/fpu/xcr.h>

#include <asm/cpuid.h>
#include <asm/tlbflush.h>
#include <asm/prctl.h>
#include <asm/elf.h>

@ -232,7 +233,7 @@ static void __init setup_xstate_cache(void)
xmm_space);

for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) {
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);

xstate_sizes[i] = eax;
xstate_flags[i] = ecx;

@ -398,7 +399,7 @@ int xfeature_size(int xfeature_nr)
u32 eax, ebx, ecx, edx;

CHECK_XFEATURE(xfeature_nr);
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx);
return eax;
}

@ -441,9 +442,9 @@ static void __init __xstate_dump_leaves(void)
* just in case there are some goodies up there
*/
for (i = 0; i < XFEATURE_MAX + 10; i++) {
cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx);
pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
XSTATE_CPUID, i, eax, ebx, ecx, edx);
CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx);
}
}

@ -484,7 +485,7 @@ static int __init check_xtile_data_against_struct(int size)
* Check the maximum palette id:
* eax: the highest numbered palette subleaf.
*/
cpuid_count(TILE_CPUID, 0, &max_palid, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx);

/*
* Cross-check each tile size and find the maximum number of

@ -498,7 +499,7 @@ static int __init check_xtile_data_against_struct(int size)
* eax[31:16]: bytes per title
* ebx[31:16]: the max names (or max number of tiles)
*/
cpuid_count(TILE_CPUID, palid, &eax, &ebx, &edx, &edx);
cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx);
tile_size = eax >> 16;
max = ebx >> 16;

@ -633,7 +634,7 @@ static unsigned int __init get_compacted_size(void)
* are no supervisor states, but XSAVEC still uses compacted
* format.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
return ebx;
}

@ -674,7 +675,7 @@ static unsigned int __init get_xsave_size_user(void)
* containing all the *user* state components
* corresponding to bits currently set in XCR0.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
return ebx;
}

@ -763,21 +764,16 @@ void __init fpu__init_system_xstate(unsigned int legacy_size)
return;
}

if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
WARN_ON_FPU(1);
return;
}

/*
* Find user xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx);
fpu_kernel_cfg.max_features = eax + ((u64)edx << 32);

/*
* Find supervisor xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx);
fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32);

if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {

@ -91,9 +91,11 @@ static inline bool check_la57_support(void)
return true;
}

static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
pmdval_t *pmd,
unsigned long p2v_offset)
{
unsigned long vaddr, vaddr_end;
unsigned long paddr, paddr_end;
int i;

/* Encrypt the kernel and related (if SME is active) */

@ -106,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* attribute.
*/
if (sme_get_me_mask()) {
vaddr = (unsigned long)__start_bss_decrypted;
vaddr_end = (unsigned long)__end_bss_decrypted;
paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);

for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
for (; paddr < paddr_end; paddr += PMD_SIZE) {
/*
* On SNP, transition the page to shared in the RMP table so that
* it is consistent with the page table attribute change.

@ -118,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* mapping (kernel .text). PVALIDATE, by way of
* early_snp_set_memory_shared(), requires a valid virtual
* address but the kernel is currently running off of the identity
* mapping so use __pa() to get a *currently* valid virtual address.
* mapping so use the PA to get a *currently* valid virtual address.
*/
early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD);
early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);

i = pmd_index(vaddr);
i = pmd_index(paddr - p2v_offset);
pmd[i] -= sme_get_me_mask();
}
}

@ -138,12 +140,15 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
* boot-time crashes. To work around this problem, every global pointer must
* be accessed using RIP_REL_REF().
* be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
* by subtracting p2v_offset from the RIP-relative address.
*/
unsigned long __head __startup_64(unsigned long physaddr,
unsigned long __head __startup_64(unsigned long p2v_offset,
struct boot_params *bp)
{
pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
unsigned long va_text, va_end;
unsigned long pgtable_flags;
unsigned long load_delta;
pgdval_t *pgd;

@ -163,13 +168,16 @@ unsigned long __head __startup_64(unsigned long physaddr,
* Compute the delta between the address I am compiled to run at
* and the address I am actually running at.
*/
load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
load_delta = __START_KERNEL_map + p2v_offset;
RIP_REL_REF(phys_base) = load_delta;

/* Is the address not 2M aligned? */
if (load_delta & ~PMD_MASK)
for (;;);

va_text = physaddr - p2v_offset;
va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;

/* Include the SME encryption mask in the fixup value */
load_delta += sme_get_me_mask();

@ -178,7 +186,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pgd = &RIP_REL_REF(early_top_pgt)->pgd;
pgd[pgd_index(__START_KERNEL_map)] += load_delta;

if (la57) {
if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
p4d[MAX_PTRS_PER_P4D - 1] += load_delta;

@ -230,7 +238,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;

for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
int idx = i + (physaddr >> PMD_SHIFT);

pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;

@ -255,11 +263,11 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;

/* invalidate pages before the kernel image */
for (i = 0; i < pmd_index((unsigned long)_text); i++)
for (i = 0; i < pmd_index(va_text); i++)
pmd[i] &= ~_PAGE_PRESENT;

/* fixup pages that are part of the kernel image */
for (; i <= pmd_index((unsigned long)_end); i++)
for (; i <= pmd_index(va_end); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;

@ -267,7 +275,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
for (; i < PTRS_PER_PMD; i++)
pmd[i] &= ~_PAGE_PRESENT;

return sme_postprocess_startup(bp, pmd);
return sme_postprocess_startup(bp, pmd, p2v_offset);
}

/* Wipe all early page tables except for the kernel symbol map */
|
||||
|
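The p2v_offset plumbing above is plain address arithmetic: the offset is the physical address of a symbol minus its link-time virtual address, load_delta is __START_KERNEL_map plus that offset, and any physical address minus the offset gives back the virtual address. A stand-alone sketch with made-up example addresses (the constants are illustrative, not the kernel's real layout):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical values, for illustration only. */
	const uint64_t START_KERNEL_map = 0xffffffff80000000ULL; /* VA base of the kernel map */
	const uint64_t phys_text        = 0x0000000045000000ULL; /* where _text actually landed */
	const uint64_t va_of_text       = 0xffffffff81000000ULL; /* link-time VA of _text */

	/* p2v_offset: subtract it from a physical address to get the VA. */
	uint64_t p2v_offset = phys_text - va_of_text;

	/* load_delta as computed in __startup_64() above. */
	uint64_t load_delta = START_KERNEL_map + p2v_offset;

	/* va_text/va_end in __startup_64() are derived the same way. */
	uint64_t va_text = phys_text - p2v_offset;

	printf("p2v_offset = %#llx\n", (unsigned long long)p2v_offset);
	printf("load_delta = %#llx\n", (unsigned long long)load_delta);
	printf("va_text    = %#llx\n", (unsigned long long)va_text);
	return 0;
}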
@@ -94,13 +94,19 @@ SYM_CODE_START_NOALIGN(startup_64)
	/* Sanitize CPU configuration */
	call verify_cpu

+	/*
+	 * Derive the kernel's physical-to-virtual offset from the physical and
+	 * virtual addresses of common_startup_64().
+	 */
+	leaq	common_startup_64(%rip), %rdi
+	subq	.Lcommon_startup_64(%rip), %rdi
+
	/*
	 * Perform pagetable fixups. Additionally, if SME is active, encrypt
	 * the kernel and retrieve the modifier (SME encryption mask if SME
	 * is active) to be added to the initial pgdir entry that will be
	 * programmed into CR3.
	 */
-	leaq	_text(%rip), %rdi
	movq	%r15, %rsi
	call	__startup_64

@@ -128,11 +134,11 @@ SYM_CODE_START_NOALIGN(startup_64)

	/* Branch to the common startup code at its kernel virtual address */
	ANNOTATE_RETPOLINE_SAFE
-	jmp	*0f(%rip)
+	jmp	*.Lcommon_startup_64(%rip)
SYM_CODE_END(startup_64)

	__INITRODATA
-0:	.quad	common_startup_64
+SYM_DATA_LOCAL(.Lcommon_startup_64, .quad common_startup_64)

	.text
SYM_CODE_START(secondary_startup_64)
@@ -7,6 +7,7 @@
#include <linux/cpu.h>
#include <linux/irq.h>

+#include <asm/cpuid.h>
#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
@@ -927,10 +928,7 @@ static bool __init mwait_pc10_supported(void)
	if (!cpu_feature_enabled(X86_FEATURE_MWAIT))
		return false;

-	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
-		return false;
-
-	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+	cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &mwait_substates);

	return (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
	       (ecx & CPUID5_ECX_INTERRUPT_BREAK) &&
@@ -28,19 +28,19 @@ static ssize_t version_show(struct kobject *kobj,
static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);

static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
-				     struct bin_attribute *bin_attr,
+				     const struct bin_attribute *bin_attr,
				     char *buf, loff_t off, size_t count)
{
	memcpy(buf, (void *)&boot_params + off, count);
	return count;
}

-static struct bin_attribute boot_params_data_attr = {
+static const struct bin_attribute boot_params_data_attr = {
	.attr = {
		.name = "data",
		.mode = S_IRUGO,
	},
-	.read = boot_params_data_read,
+	.read_new = boot_params_data_read,
	.size = sizeof(boot_params),
};

@@ -49,14 +49,14 @@ static struct attribute *boot_params_version_attrs[] = {
	NULL,
};

-static struct bin_attribute *boot_params_data_attrs[] = {
+static const struct bin_attribute *const boot_params_data_attrs[] = {
	&boot_params_data_attr,
	NULL,
};

static const struct attribute_group boot_params_attr_group = {
	.attrs = boot_params_version_attrs,
-	.bin_attrs = boot_params_data_attrs,
+	.bin_attrs_new = boot_params_data_attrs,
};

static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
@@ -172,7 +172,7 @@ static ssize_t type_show(struct kobject *kobj,

static ssize_t setup_data_data_read(struct file *fp,
				    struct kobject *kobj,
-				    struct bin_attribute *bin_attr,
+				    const struct bin_attribute *bin_attr,
				    char *buf,
				    loff_t off, size_t count)
{
@@ -250,7 +250,7 @@ static struct bin_attribute data_attr __ro_after_init = {
		.name = "data",
		.mode = S_IRUGO,
	},
-	.read = setup_data_data_read,
+	.read_new = setup_data_data_read,
};

static struct attribute *setup_data_type_attrs[] = {
@@ -258,14 +258,14 @@ static struct attribute *setup_data_type_attrs[] = {
	NULL,
};

-static struct bin_attribute *setup_data_data_attrs[] = {
+static const struct bin_attribute *const setup_data_data_attrs[] = {
	&data_attr,
	NULL,
};

static const struct attribute_group setup_data_attr_group = {
	.attrs = setup_data_type_attrs,
-	.bin_attrs = setup_data_data_attrs,
+	.bin_attrs_new = setup_data_data_attrs,
};

static int __init create_setup_data_node(struct kobject *parent,
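The ksysfs changes above follow the sysfs transition to const binary attributes: the callback moves from .read to .read_new and the group pointer from .bin_attrs to .bin_attrs_new, so the attribute objects themselves can be const. A minimal sketch of the same pattern for a hypothetical module exposing a read-only blob (the names and the example_blob payload are made up; the group would still be registered with sysfs_create_group() from the module's init code):

#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>

static const char example_blob[] = "hello";	/* hypothetical payload */

static ssize_t example_data_read(struct file *fp, struct kobject *kobj,
				 const struct bin_attribute *bin_attr,
				 char *buf, loff_t off, size_t count)
{
	memcpy(buf, example_blob + off, count);
	return count;
}

static const struct bin_attribute example_data_attr = {
	.attr = { .name = "data", .mode = 0444 },
	.read_new = example_data_read,
	.size = sizeof(example_blob),
};

static const struct bin_attribute *const example_bin_attrs[] = {
	&example_data_attr,
	NULL,
};

static const struct attribute_group example_group = {
	.bin_attrs_new = example_bin_attrs,
};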
@@ -983,7 +983,7 @@ static void __init kvm_init_platform(void)
	x86_platform.apic_post_init = kvm_apic_init;

	/* Set WB as the default cache mode for SEV-SNP and TDX */
-	mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
+	guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
}

#if defined(CONFIG_AMD_MEM_ENCRYPT)
@@ -146,7 +146,8 @@ static void free_transition_pgtable(struct kimage *image)
	image->arch.pte = NULL;
}

-static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
+static int init_transition_pgtable(struct kimage *image, pgd_t *pgd,
+				   unsigned long control_page)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
@@ -156,8 +157,13 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
	pmd_t *pmd;
	pte_t *pte;

-	vaddr = (unsigned long)relocate_kernel;
-	paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
+	/*
+	 * For the transition to the identity mapped page tables, the control
+	 * code page also needs to be mapped at the virtual address it starts
+	 * off running from.
+	 */
+	vaddr = (unsigned long)__va(control_page);
+	paddr = control_page;
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
@@ -216,7 +222,7 @@ static void *alloc_pgt_page(void *data)
	return p;
}

-static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+static int init_pgtable(struct kimage *image, unsigned long control_page)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page	= alloc_pgt_page,
@@ -225,12 +231,12 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
		.kernpg_flag	= _KERNPG_TABLE_NOENC,
	};
	unsigned long mstart, mend;
-	pgd_t *level4p;
	int result;
	int i;

-	level4p = (pgd_t *)__va(start_pgtable);
-	clear_page(level4p);
+	image->arch.pgd = alloc_pgt_page(image);
+	if (!image->arch.pgd)
+		return -ENOMEM;

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		info.page_flag |= _PAGE_ENC;
@@ -244,8 +250,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend = pfn_mapped[i].end << PAGE_SHIFT;

-		result = kernel_ident_mapping_init(&info,
-						   level4p, mstart, mend);
+		result = kernel_ident_mapping_init(&info, image->arch.pgd,
+						   mstart, mend);
		if (result)
			return result;
	}
@@ -260,8 +266,8 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz;

-		result = kernel_ident_mapping_init(&info,
-						   level4p, mstart, mend);
+		result = kernel_ident_mapping_init(&info, image->arch.pgd,
+						   mstart, mend);

		if (result)
			return result;
@@ -271,15 +277,19 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
	 * Prepare EFI systab and ACPI tables for kexec kernel since they are
	 * not covered by pfn_mapped.
	 */
-	result = map_efi_systab(&info, level4p);
+	result = map_efi_systab(&info, image->arch.pgd);
	if (result)
		return result;

-	result = map_acpi_tables(&info, level4p);
+	result = map_acpi_tables(&info, image->arch.pgd);
	if (result)
		return result;

-	return init_transition_pgtable(image, level4p);
+	/*
+	 * This must be last because the intermediate page table pages it
+	 * allocates will not be control pages and may overlap the image.
+	 */
+	return init_transition_pgtable(image, image->arch.pgd, control_page);
}

static void load_segments(void)
@@ -296,22 +306,35 @@ static void load_segments(void)

int machine_kexec_prepare(struct kimage *image)
{
-	unsigned long start_pgtable;
+	void *control_page = page_address(image->control_code_page);
+	unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
+	unsigned long reloc_end = (unsigned long)__relocate_kernel_end;
	int result;

-	/* Calculate the offsets */
-	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
-
	/* Setup the identity mapped 64bit page table */
-	result = init_pgtable(image, start_pgtable);
+	result = init_pgtable(image, __pa(control_page));
	if (result)
		return result;
+	kexec_va_control_page = (unsigned long)control_page;
+	kexec_pa_table_page = (unsigned long)__pa(image->arch.pgd);
+
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT;
+
+	__memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start);
+
+	set_memory_rox((unsigned long)control_page, 1);

	return 0;
}

void machine_kexec_cleanup(struct kimage *image)
{
+	void *control_page = page_address(image->control_code_page);
+
+	set_memory_nx((unsigned long)control_page, 1);
+	set_memory_rw((unsigned long)control_page, 1);
+
	free_transition_pgtable(image);
}

@@ -321,7 +344,12 @@ void machine_kexec_cleanup(struct kimage *image)
 */
void machine_kexec(struct kimage *image)
{
-	unsigned long page_list[PAGES_NR];
+	unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page,
+					     unsigned long pa_control_page,
+					     unsigned long start_address,
+					     unsigned int preserve_context,
+					     unsigned int host_mem_enc_active);
+	unsigned long reloc_start = (unsigned long)__relocate_kernel_start;
	unsigned int host_mem_enc_active;
	int save_ftrace_enabled;
	void *control_page;
@@ -357,17 +385,14 @@ void machine_kexec(struct kimage *image)
#endif
	}

-	control_page = page_address(image->control_code_page) + PAGE_SIZE;
-	__memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
+	control_page = page_address(image->control_code_page);

-	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
-	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
-	page_list[PA_TABLE_PAGE] =
-	  (unsigned long)__pa(page_address(image->control_code_page));
-
-	if (image->type == KEXEC_TYPE_DEFAULT)
-		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
-						<< PAGE_SHIFT);
+	/*
+	 * Allow for the possibility that relocate_kernel might not be at
+	 * the very start of the page.
+	 */
+	relocate_kernel_ptr = control_page + (unsigned long)relocate_kernel -
+			      reloc_start;

	/*
	 * The segment registers are funny things, they have both a
@@ -388,11 +413,11 @@ void machine_kexec(struct kimage *image)
	native_gdt_invalidate();

	/* now call it */
-	image->start = relocate_kernel((unsigned long)image->head,
-				       (unsigned long)page_list,
-				       image->start,
-				       image->preserve_context,
-				       host_mem_enc_active);
+	image->start = relocate_kernel_ptr((unsigned long)image->head,
+					   virt_to_phys(control_page),
+					   image->start,
+					   image->preserve_context,
+					   host_mem_enc_active);

#ifdef CONFIG_KEXEC_JUMP
	if (image->preserve_context)
@@ -573,8 +598,7 @@ static void kexec_mark_crashkres(bool protect)

	/* Don't touch the control code page used in crash_kexec().*/
	control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page));
-	/* Control code page is located in the 2nd page. */
-	kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect);
+	kexec_mark_range(crashk_res.start, control - 1, protect);
	control += KEXEC_CONTROL_PAGE_SIZE;
	kexec_mark_range(control, crashk_res.end, protect);
}
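With the rework above, the whole .text.relocate_kernel/.data.relocate_kernel blob is copied to the control page, so the entry point inside that copy is found by plain offset arithmetic instead of assuming relocate_kernel sits at the start of the page. A tiny sketch of that computation with placeholder addresses (illustrative only, not the kernel's real values):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical addresses, for illustration only. */
	uintptr_t relocate_kernel_start = 0xffffffff81a00000;	/* __relocate_kernel_start */
	uintptr_t relocate_kernel_entry = 0xffffffff81a00040;	/* relocate_kernel symbol */
	uintptr_t control_page          = 0xffff888012345000;	/* VA of the copied blob */

	/* Same arithmetic as machine_kexec(): copy base + offset within the blob. */
	uintptr_t entry = control_page +
			  (relocate_kernel_entry - relocate_kernel_start);

	printf("entry point inside the control page: %#lx\n", (unsigned long)entry);
	return 0;
}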
@@ -123,11 +123,6 @@ static noinstr void pv_native_set_debugreg(int regno, unsigned long val)
	native_set_debugreg(regno, val);
}

-noinstr void pv_native_wbinvd(void)
-{
-	native_wbinvd();
-}
-
static noinstr void pv_native_safe_halt(void)
{
	native_safe_halt();
@@ -155,7 +150,6 @@ struct paravirt_patch_template pv_ops = {
	.cpu.read_cr0 = native_read_cr0,
	.cpu.write_cr0 = native_write_cr0,
	.cpu.write_cr4 = native_write_cr4,
-	.cpu.wbinvd = pv_native_wbinvd,
	.cpu.read_msr = native_read_msr,
	.cpu.write_msr = native_write_msr,
	.cpu.read_msr_safe = native_read_msr_safe,
@@ -108,10 +108,6 @@ void __init pci_iommu_alloc(void)
	swiotlb_init(x86_swiotlb_enable, x86_swiotlb_flags);
}

-/*
- * See <Documentation/arch/x86/x86_64/boot-options.rst> for the iommu kernel
- * parameter documentation.
- */
static __init int iommu_setup(char *p)
{
	iommu_merge = 1;
@@ -30,6 +30,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/entry-common.h>
#include <asm/cpu.h>
+#include <asm/cpuid.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
@@ -825,7 +826,7 @@ void __noreturn stop_this_cpu(void *dummy)
	 * X86_FEATURE_SME due to cmdline options.
	 */
	if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
-		native_wbinvd();
+		wbinvd();

	/*
	 * This brings a cache line back and dirties it, but
@@ -838,7 +839,7 @@ void __noreturn stop_this_cpu(void *dummy)
#ifdef CONFIG_SMP
	if (smp_ops.stop_this_cpu) {
		smp_ops.stop_this_cpu();
-		unreachable();
+		BUG();
	}
#endif

@@ -846,7 +847,7 @@ void __noreturn stop_this_cpu(void *dummy)
	/*
	 * Use native_halt() so that memory contents don't change
	 * (stack usage and variables) after possibly issuing the
-	 * native_wbinvd() above.
+	 * wbinvd() above.
	 */
	native_halt();
}
@@ -877,7 +878,7 @@ static __init bool prefer_mwait_c1_over_halt(void)
	if (boot_cpu_has_bug(X86_BUG_MONITOR) || boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E))
		return false;

-	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+	cpuid(CPUID_LEAF_MWAIT, &eax, &ebx, &ecx, &edx);

	/*
	 * If MWAIT extensions are not available, it is safe to use MWAIT
@@ -883,7 +883,7 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)

	if (smp_ops.stop_this_cpu) {
		smp_ops.stop_this_cpu();
-		unreachable();
+		BUG();
	}

	/* Assume hlt works */
@@ -24,33 +24,30 @@
#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

/*
- * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
- * ~ control_page + PAGE_SIZE are used as data storage and stack for
- * jumping back
+ * The .text.relocate_kernel and .data.relocate_kernel sections are copied
+ * into the control page, and the remainder of the page is used as the stack.
 */
-#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))

+	.section .data.relocate_kernel,"a";
/* Minimal CPU state */
-#define RSP			DATA(0x0)
-#define CR0			DATA(0x8)
-#define CR3			DATA(0x10)
-#define CR4			DATA(0x18)
+SYM_DATA_LOCAL(saved_rsp, .quad 0)
+SYM_DATA_LOCAL(saved_cr0, .quad 0)
+SYM_DATA_LOCAL(saved_cr3, .quad 0)
+SYM_DATA_LOCAL(saved_cr4, .quad 0)
+/* other data */
+SYM_DATA(kexec_va_control_page, .quad 0)
+SYM_DATA(kexec_pa_table_page, .quad 0)
+SYM_DATA(kexec_pa_swap_page, .quad 0)
+SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0)

-/* other data */
-#define CP_PA_TABLE_PAGE	DATA(0x20)
-#define CP_PA_SWAP_PAGE		DATA(0x28)
-#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
-
-	.text
-	.align PAGE_SIZE
+	.section .text.relocate_kernel,"ax";
	.code64
-SYM_CODE_START_NOALIGN(relocate_range)
SYM_CODE_START_NOALIGN(relocate_kernel)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR
	/*
	 * %rdi indirection_page
-	 * %rsi page_list
+	 * %rsi pa_control_page
	 * %rdx start address
	 * %rcx preserve_context
	 * %r8  host_mem_enc_active
@@ -65,51 +62,36 @@ SYM_CODE_START_NOALIGN(relocate_kernel)
	pushq %r15
	pushf

-	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
-	movq	%rsp, RSP(%r11)
-	movq	%cr0, %rax
-	movq	%rax, CR0(%r11)
-	movq	%cr3, %rax
-	movq	%rax, CR3(%r11)
-	movq	%cr4, %rax
-	movq	%rax, CR4(%r11)
-
-	/* Save CR4. Required to enable the right paging mode later. */
-	movq	%rax, %r13
-
	/* zero out flags, and disable interrupts */
	pushq $0
	popfq

-	/* Save SME active flag */
-	movq	%r8, %r12
-
-	/*
-	 * get physical address of control page now
-	 * this is impossible after page table switch
-	 */
-	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
-
-	/* get physical address of page table now too */
-	movq	PTR(PA_TABLE_PAGE)(%rsi), %r9
-
-	/* get physical address of swap page now */
-	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
-
-	/* save some information for jumping back */
-	movq	%r9, CP_PA_TABLE_PAGE(%r11)
-	movq	%r10, CP_PA_SWAP_PAGE(%r11)
-	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
-
	/* Switch to the identity mapped page tables */
	movq	%cr3, %rax
+	movq	kexec_pa_table_page(%rip), %r9
	movq	%r9, %cr3

+	/* Save %rsp and CRs. */
+	movq	%rsp, saved_rsp(%rip)
+	movq	%rax, saved_cr3(%rip)
+	movq	%cr0, %rax
+	movq	%rax, saved_cr0(%rip)
+	/* Leave CR4 in %r13 to enable the right paging mode later. */
+	movq	%cr4, %r13
+	movq	%r13, saved_cr4(%rip)
+
+	/* save indirection list for jumping back */
+	movq	%rdi, pa_backup_pages_map(%rip)
+
+	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
+	movq	%rcx, %r11
+
	/* setup a new stack at the end of the physical control page */
-	lea	PAGE_SIZE(%r8), %rsp
+	lea	PAGE_SIZE(%rsi), %rsp

	/* jump to identity mapped page */
-	addq	$(identity_mapped - relocate_kernel), %r8
-	pushq	%r8
+	addq	$(identity_mapped - relocate_kernel), %rsi
+	pushq	%rsi
	ANNOTATE_UNRET_SAFE
	ret
	int3
@@ -117,6 +99,15 @@ SYM_CODE_END(relocate_kernel)

SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	UNWIND_HINT_END_OF_STACK
+	/*
+	 * %rdi	indirection page
+	 * %rdx	start address
+	 * %r8	host_mem_enc_active
+	 * %r9	page table page
+	 * %r11	preserve_context
+	 * %r13	original CR4 when relocate_kernel() was invoked
+	 */
+
	/* set return address to 0 if not preserving context */
	pushq	$0
	/* store the start address on the stack */
@@ -166,13 +157,11 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	 * entries that will conflict with the now unencrypted memory
	 * used by kexec. Flush the caches before copying the kernel.
	 */
-	testq	%r12, %r12
+	testq	%r8, %r8
	jz .Lsme_off
	wbinvd
.Lsme_off:

-	/* Save the preserve_context to %r11 as swap_pages clobbers %rcx. */
-	movq	%rcx, %r11
	call	swap_pages

	/*
@@ -184,13 +173,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	movq	%cr3, %rax
	movq	%rax, %cr3

+	testq	%r11, %r11	/* preserve_context */
+	jnz	.Lrelocate
+
	/*
	 * set all of the registers to known values
	 * leave %rsp alone
	 */

-	testq	%r11, %r11
-	jnz	.Lrelocate
	xorl	%eax, %eax
	xorl	%ebx, %ebx
	xorl	%ecx, %ecx
@@ -220,13 +210,14 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
	/* get the re-entry point of the peer system */
	movq	0(%rsp), %rbp
	leaq	relocate_kernel(%rip), %r8
-	movq	CP_PA_SWAP_PAGE(%r8), %r10
-	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
-	movq	CP_PA_TABLE_PAGE(%r8), %rax
+	movq	kexec_pa_swap_page(%rip), %r10
+	movq	pa_backup_pages_map(%rip), %rdi
+	movq	kexec_pa_table_page(%rip), %rax
	movq	%rax, %cr3
	lea	PAGE_SIZE(%r8), %rsp
	call	swap_pages
-	movq	$virtual_mapped, %rax
+	movq	kexec_va_control_page(%rip), %rax
+	addq	$(virtual_mapped - relocate_kernel), %rax
	pushq	%rax
	ANNOTATE_UNRET_SAFE
	ret
@@ -236,11 +227,11 @@ SYM_CODE_END(identity_mapped)
SYM_CODE_START_LOCAL_NOALIGN(virtual_mapped)
	UNWIND_HINT_END_OF_STACK
	ANNOTATE_NOENDBR // RET target, above
-	movq	RSP(%r8), %rsp
-	movq	CR4(%r8), %rax
+	movq	saved_rsp(%rip), %rsp
+	movq	saved_cr4(%rip), %rax
	movq	%rax, %cr4
-	movq	CR3(%r8), %rax
-	movq	CR0(%r8), %r8
+	movq	saved_cr3(%rip), %rax
+	movq	saved_cr0(%rip), %r8
	movq	%rax, %cr3
	movq	%r8, %cr0

@@ -270,37 +261,40 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
	movq	%rdi, %rcx	/* Put the indirection_page in %rcx */
	xorl	%edi, %edi
	xorl	%esi, %esi
-	jmp	1f
+	jmp	.Lstart		/* Should start with an indirection record */

-0:	/* top, read another word for the indirection page */
+.Lloop:	/* top, read another word for the indirection page */
	movq	(%rbx), %rcx
	addq	$8, %rbx
-1:
+.Lstart:
	testb	$0x1, %cl	/* is it a destination page? */
-	jz	2f
+	jz	.Lnotdest
	movq	%rcx, %rdi
	andq	$0xfffffffffffff000, %rdi
-	jmp	0b
-2:
+	jmp	.Lloop
+.Lnotdest:
	testb	$0x2, %cl	/* is it an indirection page? */
-	jz	2f
+	jz	.Lnotind
	movq	%rcx, %rbx
	andq	$0xfffffffffffff000, %rbx
-	jmp	0b
-2:
+	jmp	.Lloop
+.Lnotind:
	testb	$0x4, %cl	/* is it the done indicator? */
-	jz	2f
-	jmp	3f
-2:
+	jz	.Lnotdone
+	jmp	.Ldone
+.Lnotdone:
	testb	$0x8, %cl	/* is it the source indicator? */
-	jz	0b		/* Ignore it otherwise */
+	jz	.Lloop		/* Ignore it otherwise */
	movq	%rcx, %rsi	/* For ever source page do a copy */
	andq	$0xfffffffffffff000, %rsi

	movq	%rdi, %rdx	/* Save destination page to %rdx */
	movq	%rsi, %rax	/* Save source page to %rax */

+	testq	%r11, %r11	/* Only actually swap for ::preserve_context */
+	jz	.Lnoswap
+
	/* copy source page to swap page */
	movq	%r10, %rdi
	movl	$512, %ecx
@@ -315,16 +309,14 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages)
	/* copy swap page to destination page */
	movq	%rdx, %rdi
	movq	%r10, %rsi
+.Lnoswap:
	movl	$512, %ecx
	rep ; movsq

	lea	PAGE_SIZE(%rax), %rsi
-	jmp	0b
-3:
+	jmp	.Lloop
+.Ldone:
	ANNOTATE_UNRET_SAFE
	ret
	int3
SYM_CODE_END(swap_pages)

-	.skip KEXEC_CONTROL_CODE_MAX_SIZE - (. - relocate_kernel), 0xcc
-SYM_CODE_END(relocate_range);
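swap_pages above walks the kexec indirection list: each 8-byte entry is a page address with low-bit flags marking it as a destination (0x1), another indirection page (0x2), the end marker (0x4) or a source page to copy (0x8), exactly as the testb instructions check. A rough C rendering of that walk (a simplified sketch with no preserve_context swap, assuming the entries are directly addressable), just to make the control flow explicit:

#include <stdint.h>
#include <string.h>

#define IND_DESTINATION 0x1	/* entry names the next destination page */
#define IND_INDIRECTION 0x2	/* entry points at the next indirection page */
#define IND_DONE	0x4	/* end of the list */
#define IND_SOURCE	0x8	/* entry names a source page to copy */

#define PAGE_SZ		4096UL
#define PAGE_MSK	(~(PAGE_SZ - 1))

static void walk_indirection_list(uint64_t head)
{
	uint64_t *ind = NULL;	/* current indirection page, like %rbx */
	uint64_t entry = head;	/* the head entry is examined first, like .Lstart */
	char *dest = NULL;

	for (;;) {
		if (entry & IND_DESTINATION) {
			dest = (char *)(entry & PAGE_MSK);
		} else if (entry & IND_INDIRECTION) {
			ind = (uint64_t *)(entry & PAGE_MSK);
		} else if (entry & IND_DONE) {
			return;
		} else if (entry & IND_SOURCE) {
			char *src = (char *)(entry & PAGE_MSK);

			memcpy(dest, src, PAGE_SZ);	/* the rep movsq in the asm */
			dest += PAGE_SZ;		/* rep movsq advances %rdi */
		}
		entry = *ind++;		/* read the next word of the list */
	}
}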
@@ -64,6 +64,7 @@

#include <asm/acpi.h>
#include <asm/cacheinfo.h>
+#include <asm/cpuid.h>
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
@@ -1291,10 +1292,8 @@ static inline void mwait_play_dead(void)
		return;
	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
		return;
-	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
-		return;

-	eax = CPUID_MWAIT_LEAF;
+	eax = CPUID_LEAF_MWAIT;
	ecx = 0;
	native_cpuid(&eax, &ebx, &ecx, &edx);
@@ -16,6 +16,7 @@
#include <linux/static_key.h>
#include <linux/static_call.h>

+#include <asm/cpuid.h>
#include <asm/hpet.h>
#include <asm/timer.h>
#include <asm/vgtod.h>
@@ -665,13 +666,13 @@ unsigned long native_calibrate_tsc(void)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 0;

-	if (boot_cpu_data.cpuid_level < 0x15)
+	if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
		return 0;

	eax_denominator = ebx_numerator = ecx_hz = edx = 0;

	/* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */
-	cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);
+	cpuid(CPUID_LEAF_TSC, &eax_denominator, &ebx_numerator, &ecx_hz, &edx);

	if (ebx_numerator == 0 || eax_denominator == 0)
		return 0;
@@ -680,8 +681,8 @@ unsigned long native_calibrate_tsc(void)

	/*
	 * Denverton SoCs don't report crystal clock, and also don't support
-	 * CPUID.0x16 for the calculation below, so hardcode the 25MHz crystal
-	 * clock.
+	 * CPUID_LEAF_FREQ for the calculation below, so hardcode the 25MHz
+	 * crystal clock.
	 */
	if (crystal_khz == 0 &&
	    boot_cpu_data.x86_vfm == INTEL_ATOM_GOLDMONT_D)
@@ -700,10 +701,10 @@ unsigned long native_calibrate_tsc(void)
	 * clock, but we can easily calculate it to a high degree of accuracy
	 * by considering the crystal ratio and the CPU speed.
	 */
-	if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= 0x16) {
+	if (crystal_khz == 0 && boot_cpu_data.cpuid_level >= CPUID_LEAF_FREQ) {
		unsigned int eax_base_mhz, ebx, ecx, edx;

-		cpuid(0x16, &eax_base_mhz, &ebx, &ecx, &edx);
+		cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx, &ecx, &edx);
		crystal_khz = eax_base_mhz * 1000 *
			eax_denominator / ebx_numerator;
	}
@@ -738,12 +739,12 @@ static unsigned long cpu_khz_from_cpuid(void)
	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return 0;

-	if (boot_cpu_data.cpuid_level < 0x16)
+	if (boot_cpu_data.cpuid_level < CPUID_LEAF_FREQ)
		return 0;

	eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0;

-	cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);
+	cpuid(CPUID_LEAF_FREQ, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx);

	return eax_base_mhz * 1000;
}
@@ -1067,10 +1068,8 @@ core_initcall(cpufreq_register_tsc_scaling);

#endif /* CONFIG_CPU_FREQ */

-#define ART_CPUID_LEAF (0x15)
#define ART_MIN_DENOMINATOR (1)

-
/*
 * If ART is present detect the numerator:denominator to convert to TSC
 */
@@ -1078,7 +1077,7 @@ static void __init detect_art(void)
{
	unsigned int unused;

-	if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
+	if (boot_cpu_data.cpuid_level < CPUID_LEAF_TSC)
		return;

	/*
@@ -1091,7 +1090,7 @@ static void __init detect_art(void)
	    tsc_async_resets)
		return;

-	cpuid(ART_CPUID_LEAF, &art_base_clk.denominator,
+	cpuid(CPUID_LEAF_TSC, &art_base_clk.denominator,
	      &art_base_clk.numerator, &art_base_clk.freq_khz, &unused);

	art_base_clk.freq_khz /= KHZ;
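The calibration code above only renames the magic leaf numbers (0x15 becomes CPUID_LEAF_TSC, 0x16 becomes CPUID_LEAF_FREQ); the arithmetic is unchanged: the crystal frequency times the TSC/crystal ratio gives the TSC rate, and when the crystal frequency is not enumerated it is reconstructed from the base frequency reported by leaf 0x16. A stand-alone sketch of that computation (the CPUID values are example numbers, not from real hardware):

#include <stdio.h>

int main(void)
{
	/* Example CPUID register values, illustrative only. */
	unsigned int eax_denominator = 2;	/* CPUID 0x15: EAX, ratio denominator */
	unsigned int ebx_numerator = 188;	/* CPUID 0x15: EBX, ratio numerator */
	unsigned int ecx_hz = 0;		/* CPUID 0x15: ECX, crystal Hz (often 0) */
	unsigned int eax_base_mhz = 2300;	/* CPUID 0x16: EAX, base frequency in MHz */

	unsigned int crystal_khz = ecx_hz / 1000;

	/* If the crystal is not reported, derive it from the base frequency. */
	if (crystal_khz == 0)
		crystal_khz = eax_base_mhz * 1000 * eax_denominator / ebx_numerator;

	unsigned int tsc_khz = crystal_khz * ebx_numerator / eax_denominator;

	printf("crystal: %u kHz, TSC: %u kHz\n", crystal_khz, tsc_khz);
	return 0;
}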
@@ -28,6 +28,7 @@
#include <asm/orc_lookup.h>
#include <asm/cache.h>
#include <asm/boot.h>
+#include <asm/kexec.h>

#undef i386     /* in case the preprocessor is a 32bit one */

@@ -95,7 +96,19 @@ const_pcpu_hot = pcpu_hot;
#define BSS_DECRYPTED

#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_KEXEC_CORE)
+#define KEXEC_RELOCATE_KERNEL					\
+	. = ALIGN(0x100);					\
+	__relocate_kernel_start = .;				\
+	*(.text.relocate_kernel);				\
+	*(.data.relocate_kernel);				\
+	__relocate_kernel_end = .;
+
+ASSERT(__relocate_kernel_end - __relocate_kernel_start <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "relocate_kernel code too large!")
+#else
+#define KEXEC_RELOCATE_KERNEL
+#endif
PHDRS {
	text PT_LOAD FLAGS(5);          /* R_E */
	data PT_LOAD FLAGS(6);          /* RW_ */
@@ -121,19 +134,6 @@ SECTIONS
	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
		_text = .;
		_stext = .;
-		/* bootstrapping code */
-		HEAD_TEXT
-		TEXT_TEXT
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		SOFTIRQENTRY_TEXT
-#ifdef CONFIG_MITIGATION_RETPOLINE
-		*(.text..__x86.indirect_thunk)
-		*(.text..__x86.return_thunk)
-#endif
-		STATIC_CALL_TEXT
-
		ALIGN_ENTRY_TEXT_BEGIN
		*(.text..__x86.rethunk_untrain)
		ENTRY_TEXT
@@ -147,10 +147,26 @@ SECTIONS
		*(.text..__x86.rethunk_safe)
#endif
		ALIGN_ENTRY_TEXT_END

+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		SOFTIRQENTRY_TEXT
+#ifdef CONFIG_MITIGATION_RETPOLINE
+		*(.text..__x86.indirect_thunk)
+		*(.text..__x86.return_thunk)
+#endif
+		STATIC_CALL_TEXT
		*(.gnu.warning)

	} :text = 0xcccccccc

+	/* bootstrapping code */
+	.head.text : AT(ADDR(.head.text) - LOAD_OFFSET) {
+		HEAD_TEXT
+	} :text = 0xcccccccc
+
	/* End of text section, which should occupy whole number of pages */
	_etext = .;
	. = ALIGN(PAGE_SIZE);
@@ -181,6 +197,7 @@ SECTIONS

		DATA_DATA
		CONSTRUCTORS
+		KEXEC_RELOCATE_KERNEL

		/* rarely changed data like cpu maps */
		READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
@@ -3820,7 +3820,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
		goto next_range;
	}

-	unreachable();
+	BUG();
}

static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
@@ -678,7 +678,7 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
		      ASM_CALL_ARG3,
		      , [arg1] "r" (regs), [arg2] "r" (address), [arg3] "r" (&info));

-	unreachable();
+	BUG();
}
#endif